@engine9-io/input-tools 2.0.0 → 2.0.1
This diff shows the published contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
- package/ForEachEntry.js +5 -7
- package/file/FileUtilities.js +899 -951
- package/file/tools.js +283 -312
- package/index.js +2 -2
- package/package.json +1 -1
package/file/FileUtilities.js
CHANGED
@@ -3,7 +3,7 @@ import path from 'node:path';
import zlib from 'node:zlib';
import nodestream from 'node:stream';
import promises from 'node:stream/promises';
+import { parse, stringify } from 'csv';
import debug$0 from 'debug';
import xlstream from 'xlstream';
import JSON5 from 'json5';
@@ -15,1067 +15,1017 @@ import { bool, getTempFilename, getStringArray, getTempDir, makeStrings, streamP
const fsp = fs.promises;
const { Readable, Transform, PassThrough, Writable } = nodestream;
const { pipeline } = promises;
+
const debug = debug$0('@engine9-io/file');
const { getXlsxStream } = xlstream;
+
function Worker({ accountId }) {
+  this.accountId = accountId;
}
class LineReaderTransform extends Transform {
+  constructor(options = {}) {
+    super({ ...options, readableObjectMode: true });
+    this.buffer = '';
+  }
+  _transform(chunk, encoding, callback) {
+    this.buffer += chunk.toString();
+    const lines = this.buffer.split(/\r?\n/);
+    this.buffer = lines.pop();
+    lines.forEach((line) => this.push(line));
+    callback();
+  }
+  _flush(callback) {
+    if (this.buffer) {
+      this.push(this.buffer);
+    }
+    callback();
+  }
}
Worker.prototype.csvToObjectTransforms = function (options) {
+  const transforms = [];
+  const delimiter = options.delimiter || ',';
+  const headerMapping =
+    options.headerMapping ||
+    function (d) {
+      return d;
    };
+  let lastLine = null;
+  let head = null;
+  const skipLinesWithError = bool(options.skip_lines_with_error, false);
+  const parserOptions = {
+    relax: true,
+    skip_empty_lines: true,
+    delimiter,
+    max_limit_on_data_read: 10000000,
+    skip_lines_with_error: skipLinesWithError
+  };
+  if (options.skip) parserOptions.from_line = options.skip;
+  if (options.relax_column_count) parserOptions.relax_column_count = true;
+  if (options.quote_escape) {
+    parserOptions.escape = options.quote_escape;
+  }
+  if (options.limit) {
+    parserOptions.to = options.limit;
+  }
+  debug('Parser options=', parserOptions);
+  const parser = parse(parserOptions);
+  parser.on('error', (error) => {
+    debug('fileToObjectStream: Error parsing csv file');
+    debug(lastLine);
+    throw new Error(error);
+  });
+  const blankAndHeaderCheck = new Transform({
+    objectMode: true,
+    transform(row, enc, cb) {
+      // Blank rows
+      if (row.length === 0) return cb();
+      if (row.length === 1 && !row[0]) return cb();
+      if (!head) {
+        head = row.map(headerMapping);
+        return cb();
+      }
+      const o = {};
+      head.forEach((_h, i) => {
+        const h = _h.trim();
+        if (h) {
+          o[h] = row[i];
        }
+      });
+      lastLine = row.join(delimiter);
+      return cb(null, o);
+    }
+  });
+  transforms.push(parser);
+  transforms.push(blankAndHeaderCheck);
+  return { transforms };
};
Worker.prototype.detectEncoding = async function (options) {
+  if (options.encoding_override) return { encoding: options.encoding_override };
+  // Limit to only the top N bytes -- for perfomance
+  // Be wary, though, as gzip files may require a certain minimum number of bytes to decompress
+  const bytes = 64 * 1024;
+  const buff = Buffer.alloc(bytes);
+  const fd = await fsp.open(options.filename);
+  await fd.read(buff, 0, bytes);
+  let finalBuff = buff;
+  if (options.filename.slice(-3) === '.gz') {
+    // This code deals with scenarios where the buffer coming in may not be exactly the gzip
+    // needed chunk size.
+    finalBuff = await new Promise((resolve, reject) => {
+      const bufferBuilder = [];
+      const decompressStream = zlib
+        .createGunzip()
+        .on('data', (chunk) => {
+          bufferBuilder.push(chunk);
+        })
+        .on('close', () => {
+          resolve(Buffer.concat(bufferBuilder));
+        })
+        .on('error', (err) => {
+          if (err.errno !== -5) {
+            // EOF: expected
+            reject(err);
+          }
        });
+      decompressStream.write(buff);
+      decompressStream.end();
+    });
+  }
+  return languageEncoding(finalBuff);
};
Worker.prototype.detectEncoding.metadata = {
+  options: {
+    filename: { required: true }
+  }
};
Worker.prototype.xlsxToObjectStream = async function (options) {
+  let { filename } = options;
+  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
+    // We need to copy and delete
+    let worker = null;
+    if (filename.startsWith('r2://')) {
+      worker = new R2Worker(this);
+    } else {
+      worker = new S3Worker(this);
+    }
+    const target = getTempFilename({ targetFilename: filename.split('/').pop() });
+    await worker.copy({ filename, target });
+    filename = target;
+  }
+  let stream = await getXlsxStream({
+    filePath: filename,
+    sheet: 0
+  });
+  let keys = null;
+  stream = stream.pipe(
+    new Transform({
+      objectMode: true,
+      transform(d, enc, cb) {
+        if (!keys) {
+          keys = d?.raw.arr;
+          cb();
+        } else {
+          let o = {};
+          keys.forEach((k, i) => {
+            o[k] = d?.raw?.arr?.[i];
+          });
+          cb(null, o);
        }
+      }
+    })
+  );
+  return { stream };
};
Worker.prototype.getFormat = async function (options) {
+  const { sourcePostfix, filename, format: formatOverride } = options;
+  let postfix = sourcePostfix || filename.toLowerCase().split('.').pop();
+  if (postfix === 'gz') {
+    postfix = filename.toLowerCase().split('.');
+    postfix = postfix[postfix.length - 2];
+  }
+  return formatOverride || postfix;
};
/*
Commonly used method to transform a file into a stream of objects.
*/
Worker.prototype.fileToObjectStream = async function (options) {
+  const { filename, columns, limit: limitOption, format: formatOverride } = options;
+  // handle stream item
+  if (options.stream) {
+    if (Array.isArray(options.stream)) {
+      return { stream: Readable.from(options.stream) };
+    }
+    // probably already a stream
+    if (typeof options.stream === 'object') return { stream: options.stream };
+    throw new Error(`Invalid stream type:${typeof options.stream}`);
+  }
+  let limit;
+  if (limitOption) limit = parseInt(limitOption, 10);
+  if (!filename) throw new Error('fileToObjectStream: filename is required');
+  if (filename.split('.').pop().toLowerCase() === 'xlsx') {
+    return this.xlsxToObjectStream(options);
+  }
+  let postfix = options.sourcePostfix || filename.toLowerCase().split('.').pop();
+  if (postfix === 'zip') {
+    debug('Invalid filename:', { filename });
+    throw new Error('Cowardly refusing to turn a .zip file into an object stream, turn into a csv first');
+  }
+  const streamInfo = await this.stream({
+    filename,
+    columns,
+    limit
+  });
+  const { encoding } = streamInfo;
+  let { stream } = streamInfo;
+  if (!stream) throw new Error(`No stream found in fileToObjectStream from filename ${filename}`);
+  if (encoding === 'object') {
+    // already an object
+    return { stream };
+  }
+  let count = 0;
+  let transforms = [];
+  if (postfix === 'gz') {
+    const gunzip = zlib.createGunzip();
+    transforms.push(gunzip);
+    gunzip.setEncoding(encoding);
+    // encoding = null;// Default encoding
+    postfix = filename.toLowerCase().split('.');
+    postfix = postfix[postfix.length - 2];
+    debug(`Using gunzip parser because postfix is .gz, encoding=${encoding}`);
+  } else {
+    stream.setEncoding(encoding);
+  }
+  let format = formatOverride || postfix;
+  debug(`Reading file ${filename} with encoding: ${encoding} and format ${format}`);
+  if (format === 'csv') {
+    const csvTransforms = this.csvToObjectTransforms({ ...options });
+    transforms = transforms.concat(csvTransforms.transforms);
+  } else if (format === 'txt') {
+    const csvTransforms = this.csvToObjectTransforms({ ...options, delimiter: '\t' });
+    transforms = transforms.concat(csvTransforms.transforms);
+  } else if (format === 'jsonl') {
+    /* Type of JSON that has the names in an array in the first record,
    and the values in JSON arrays thereafter
    */
+    let headers = null;
+    const lineReader = new LineReaderTransform();
+    const jsonlTransform = new Transform({
+      objectMode: true,
+      transform(d, enc, cb) {
+        if (!d) return cb();
+        let obj;
+        try {
+          obj = JSON5.parse(d);
+        } catch (e) {
+          debug('Invalid line:');
+          debug(d);
+          throw e;
+        }
+        /* JSONL could potentially start with an array of names,
        in which case we need to map the subsequent values
        */
+        if (headers === null) {
+          if (Array.isArray(obj)) {
+            headers = obj;
+            return cb();
+          }
+          headers = false;
+        }
+        if (headers) {
+          const mapped = {};
+          headers.forEach((name, i) => {
+            mapped[name] = obj[i];
+          });
+          this.push(mapped);
+        } else {
+          this.push(obj);
+        }
+        return cb();
+      }
+    });
+    transforms.push(lineReader);
+    transforms.push(jsonlTransform);
+  } else {
+    throw new Error(`Unsupported file type: ${postfix}`);
+  }
+  const countAndDebug = new Transform({
+    objectMode: true,
+    transform(d, enc, cb) {
+      if (count === 0) {
+        debug('Sample object from file:', d);
+      }
+      count += 1;
+      if ((count < 5000 && count % 1000 === 0) || count % 50000 === 0) {
+        debug(`fileToObjectStream transformed ${count} lines`);
+      }
+      this.push(d);
+      cb();
+    },
+    flush(cb) {
+      // If there's no records at all, push a dummy record, and specify 0 records
+      // Don't push dummy records anymore -- legacy cruft
+      debug(`Completed reading file, records=${count}`);
+      /* if (count === 0) {
        const o = { _is_placeholder: true };

        if (head) head.forEach((c) => { o[c] = null; });
        this.push(o);
      } */
+      cb();
+    }
+  });
+  transforms.push(countAndDebug);
+  transforms.forEach((t) => {
+    stream = stream.pipe(t);
+  });
+  return { stream };
};
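For reference, a minimal usage sketch of the reworked fileToObjectStream flow, importing the default-exported Worker from this file; the accountId value and ./input.csv path are illustrative, not part of the published package:

import Worker from './FileUtilities.js';

const worker = new Worker({ accountId: 'engine9' });
// Each CSV row arrives as a header-keyed object; .gz, .txt, .jsonl and .xlsx take the same path.
const { stream } = await worker.fileToObjectStream({ filename: './input.csv', limit: 100 });
for await (const row of stream) {
  console.log(row);
}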
Worker.prototype.getFileWriterStream = async function (options = {}) {
+  const accountId = options.accountId || this.accountId;
+  if (!accountId) throw new Error('getFileWriterStream has no accountId');
+  const targetFormat = options.targetFormat || 'csv';
+  const tempDir = await getTempDir({ accountId });
+  let { fileExtendedType } = options;
+  if (fileExtendedType) fileExtendedType += '.';
+  else fileExtendedType = '';
+  // So, this could change, but it's easier to read
+  // dates in a filename than UUIDs, so this is
+  // a unique-ish filename generator
+  const uniqueNumberedDate = `${new Date().toISOString().replace(/[^0-9]*/g, '')}.${Math.floor(Math.random() * 1000)}`;
+  let filename = `${tempDir}${path.sep}${uniqueNumberedDate}.${fileExtendedType}${targetFormat}`;
+  if (bool(options.gzip, false)) filename += '.gz';
+  const stream = fs.createWriteStream(filename);
+  debug('FileWriterStream writing to file ', filename);
+  return { filename, stream };
};
Worker.prototype.getOutputStreams = async function (options) {
+  const { filename, stream: fileWriterStream } = await this.getFileWriterStream(options);
+  let { transform } = options;
+  if (typeof options.transform === 'function') {
+    if (options.transform.length === 3) {
+      transform = new Transform({
+        objectMode: true,
+        async transform(item, encoding, cb) {
+          options.transform(item, encoding, cb);
        }
+      });
+    } else {
+      transform = new Transform({
+        objectMode: true,
+        async transform(item, encoding, cb) {
+          cb(null, options.transform(item));
        }
+      });
+    }
+  } else if (options.transform) {
+    transform = options.transform;
+  }
+  const { flatten } = options;
+  let flattenTransform = null;
+  if (bool(flatten, false)) {
+    flattenTransform = new Transform({
+      objectMode: true,
+      async transform(item, enc, cb) {
+        // first item establishes the keys to use
+        let o = {};
+        Object.keys(item).forEach((k) => {
+          let v = item[k];
+          if (!o[k]) {
+            if (typeof v === 'object') {
+              while (Array.isArray(v)) [v] = v; // get first array item
+              o = { ...o, ...v };
+            } else {
+              o[k] = v;
            }
+          }
        });
+        cb(null, o);
+      }
+    });
+  }
+  const stats = {
+    records: 0
+  };
+  let stringifier;
+  if (options.targetFormat === 'jsonl') {
+    stringifier = new Transform({
+      objectMode: true,
+      transform(d, encoding, cb) {
+        cb(false, `${JSON.stringify(d)}\n`);
+      }
+    });
+  } else {
+    stringifier = stringify({ header: true });
+  }
+  let gzip = new PassThrough();
+  if (options.gzip) {
+    gzip = zlib.createGzip();
+  }
+  const streams = [
+    transform,
+    flattenTransform,
+    new Transform({
+      objectMode: true,
+      transform(d, enc, cb) {
+        stats.records += 1;
+        cb(null, d);
+      }
+    }),
+    stringifier,
+    gzip,
+    fileWriterStream
+  ].filter(Boolean);
+  return { filename, streams, stats };
};
Worker.prototype.objectStreamToFile = async function (options) {
+  const { filename, streams, stats } = await this.getOutputStreams(options);
+  const { stream: inStream } = options;
+  streams.unshift(inStream);
+  await pipeline(streams);
+  return { filename, records: stats.records };
};
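A sketch of the new getOutputStreams/objectStreamToFile path, writing an in-memory object stream to a gzipped CSV; the records and accountId below are illustrative:

import { Readable } from 'node:stream';
import Worker from './FileUtilities.js';

const worker = new Worker({ accountId: 'engine9' });
const stream = Readable.from([
  { id: 1, name: 'Ada' },
  { id: 2, name: 'Grace' }
]);
// Writes to an auto-generated temp filename; gzip: true appends .gz and pipes through createGzip.
const { filename, records } = await worker.objectStreamToFile({ stream, targetFormat: 'csv', gzip: true });
console.log(filename, records); // temp file path and record count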
Worker.prototype.transform = async function (options) {
+  const worker = this;
+  const { filename } = options;
+  debug(`Transforming ${filename}`);
+  options.filename = filename;
+  let { stream } = await worker.fileToObjectStream(options);
+  if (typeof stream.pipe !== 'function') {
+    debug(stream);
+    throw new Error('No pipe in stream');
+  }
+  let t = options.transform;
+  // No longer need this
+  delete options.transform;
+  if (!t) {
+    t = function (d, enc, cb) {
+      d.is_test_transform = true;
+      cb(null, d);
+    };
+  }
+  if (!Array.isArray(t)) t = [t];
+  Object.keys(t).forEach((key) => {
+    let f = t[key];
+    if (typeof f === 'function') {
+      f = new Transform({
+        objectMode: true,
+        transform: f
+      });
+    }
+    stream = stream.pipe(f);
+  });
+  const { targetFormat } = options;
+  if (
+    !targetFormat &&
+    (filename.toLowerCase().slice(-4) === '.csv' || filename.toLowerCase().slice(-7) === '.csv.gz')
+  ) {
+    options.targetFormat = 'csv';
+  }
+  return worker.objectStreamToFile({ ...options, stream });
};
Worker.prototype.transform.metadata = {
+  options: {
+    sourcePostfix: { description: "Override the source postfix, if for example it's a csv" },
+    encoding: { description: 'Manual override of source file encoding' },
+    names: { description: 'Target field names (e.g. my_new_field,x,y,z)' },
+    values: {
+      description:
+        "Comma delimited source field name, or Handlebars [[ ]] merge fields (e.g. 'my_field,x,y,z', '[[field1]]-[[field2]]', etc)"
+    },
+    targetFilename: { description: 'Custom name of the output file (default auto-generated)' },
+    targetFormat: { description: 'Output format -- csv supported, or none for txt (default)' },
+    targetRowDelimiter: { description: 'Row delimiter (default \n)' },
+    targetFieldDelimiter: { description: 'Field delimiter (default \t or ,)' }
+  }
};
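A sketch of transform(), which now wraps a plain (item, enc, cb) function in an object-mode Transform automatically; ./people.csv and the derived column are illustrative:

import Worker from './FileUtilities.js';

const worker = new Worker({ accountId: 'engine9' });
const { filename, records } = await worker.transform({
  filename: './people.csv',
  transform(d, enc, cb) {
    // add a derived column to every row before it is re-serialized
    d.full_name = `${d.given_name || ''} ${d.family_name || ''}`.trim();
    cb(null, d);
  }
});
console.log(filename, records);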
Worker.prototype.testTransform = async function (options) {
+  return this.transform({
+    ...options,
+    transform(d, enc, cb) {
+      d.transform_time = new Date();
+      cb(null, d);
+    }
+  });
};
Worker.prototype.testTransform.metadata = {
+  options: {
+    filename: true
+  }
};
/* Get a stream from an actual stream, or an array, or a file */
Worker.prototype.stream = async function (options) {
+  const { stream: inputStream, packet, type, columns, limit, filename: filenameOpt } = options;
+  let filename = filenameOpt;
+  if (inputStream) {
+    if (Array.isArray(inputStream)) {
+      return { stream: Readable.from(inputStream) };
+    }
+    // probably already a stream
+    if (typeof inputStream === 'object') return { stream: inputStream, encoding: 'object' };
+    throw new Error(`Invalid stream type:${typeof inputStream}`);
+  } else if (filename) {
+    if (filename.startsWith('engine9-accounts/')) {
+      filename = `${process.env.ENGINE9_ACCOUNT_DIR}/${filename.slice('engine9-accounts/'.length)}`;
+      // debug(`Prepending file with ${process.env.ENGINE9_ACCOUNT_DIR}, filename=${filename}`);
+    } else {
+      // debug(`Not prepending filename:${filename}`);
+    }
+    let encoding;
+    let stream;
+    if (filename.slice(-8) === '.parquet') {
+      const pq = new ParquetWorker(this);
+      stream = (await pq.stream({ filename, columns, limit })).stream;
+      encoding = 'object';
+    } else if (filename.startsWith('s3://')) {
+      const s3Worker = new S3Worker(this);
+      stream = (await s3Worker.stream({ filename, columns, limit })).stream;
+      encoding = 'UTF-8';
+    } else if (filename.startsWith('r2://')) {
+      const r2Worker = new R2Worker(this);
+      stream = (await r2Worker.stream({ filename, columns, limit })).stream;
+      encoding = 'UTF-8';
+    } else {
+      // Check if the file exists, and fast fail if not
+      // Otherwise the stream hangs out as a handle
+      try {
+        await fsp.stat(filename);
+      } catch (e) {
+        debug(`Error reading file ${filename}, current directory: ${process.cwd()},__dirname:${__dirname}`);
+        throw e;
+      }
+      stream = fs.createReadStream(filename);
+      encoding = (await this.detectEncoding({ filename })).encoding;
+    }
+    return { stream, encoding };
+  } else if (packet) {
+    let { stream: packetStream } = await streamPacket({ packet, type, limit });
+    const { transforms } = this.csvToObjectTransforms({});
+    transforms.forEach((t) => {
+      packetStream = packetStream.pipe(t);
+    });
+    return { stream: packetStream };
+  } else {
+    throw new Error('stream must be passed a stream, filename, or packet');
+  }
};
Worker.prototype.sample = async function (opts) {
+  opts.limit = opts.limit || 10;
+  const { stream } = await this.fileToObjectStream(opts);
+  return stream.toArray();
};
Worker.prototype.sample.metadata = {
+  options: {
+    filename: {}
+  }
};
Worker.prototype.toArray = async function (opts) {
+  const { stream } = await this.fileToObjectStream(opts);
+  return stream.toArray();
};
Worker.prototype.toArray.metadata = {
+  options: {
+    filename: {}
+  }
};
Worker.prototype.write = async function (opts) {
+  const { filename, content } = opts;
+  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
+    const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
+    const parts = filename.split('/');
+    const directory = parts.slice(0, -1).join('/');
+    const file = parts.slice(-1)[0];
+    // debug(JSON.stringify({ parts, directory, file }));
+    await worker.write({
+      directory,
+      file,
+      content
+    });
+  } else {
+    const directory = path.dirname(filename);
+    await fsp.mkdir(directory, { recursive: true });
+    await fsp.writeFile(filename, content);
+  }
+  return { success: true, filename };
};
Worker.prototype.write.metadata = {
+  options: {
+    filename: { description: 'Location to write content to, can be local or s3:// or r2://' },
+    content: {}
+  }
};
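A sketch of the updated write(), which creates local parent directories and routes s3:// or r2:// filenames to the matching object-store worker; the bucket and paths are illustrative:

import Worker from './FileUtilities.js';

const worker = new Worker({ accountId: 'engine9' });
// local: the parent directory is created with { recursive: true }
await worker.write({ filename: './out/report.txt', content: 'hello' });
// object store: the prefix picks R2Worker or S3Worker under the hood
await worker.write({ filename: 'r2://example-bucket/reports/report.txt', content: 'hello' });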
async function streamToString(stream) {
+  // lets have a ReadableStream as a stream variable
+  const chunks = [];
+  for await (const chunk of stream) {
+    chunks.push(Buffer.from(chunk));
+  }
+  return Buffer.concat(chunks).toString('utf-8');
}
/*
Retrieves and parsed
*/
Worker.prototype.json = async function (opts) {
+  const { stream } = await this.stream(opts);
+  const str = await streamToString(stream);
+  try {
+    return JSON5.parse(str);
+  } catch (e) {
+    debug(e);
+    throw new Error(`Unparseable JSON received: ${opts.filename || '(native stream)'}`);
+  }
};
Worker.prototype.json.metadata = {
+  options: {
+    filename: { description: 'Get a javascript object from a file' }
+  }
};
Worker.prototype.list = async function ({ directory, start: s, end: e }) {
+  if (!directory) throw new Error('directory is required');
+  let start = null;
+  let end = null;
+  if (s) start = relativeDate(s);
+  if (e) end = relativeDate(e);
+  if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
+    const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
+    return worker.list({ directory, start, end });
+  }
+  const a = await fsp.readdir(directory, { withFileTypes: true });
+  const withModified = [];
+  for (const file of a) {
+    const fullPath = path.join(directory, file.name);
+    const stats = await fsp.stat(fullPath);
+    if (start && stats.mtime < start.getTime()) {
+      //do not include
+    } else if (end && stats.mtime > end.getTime()) {
+      //do nothing
+    } else {
+      withModified.push({
+        name: file.name,
+        type: file.isDirectory() ? 'directory' : 'file',
+        modifiedAt: new Date(stats.mtime).toISOString()
+      });
+    }
+  }
+  return withModified;
};
Worker.prototype.list.metadata = {
+  options: {
+    directory: { required: true }
+  }
};
Worker.prototype.listAll = async function ({ directory, start: s, end: e }) {
+  if (!directory) throw new Error('directory is required');
+  let start = null;
+  let end = null;
+  if (s) start = relativeDate(s).getTime();
+  if (e) end = relativeDate(e).getTime();
+  if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
+    const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
+    return worker.listAll({ directory, start, end });
+  }
+  const a = await fsp.readdir(directory, { recursive: true });
+  let files = a.map((f) => `${directory}/${f}`);
+  if (!start && !end) {
+    return files;
+  }
+  const pLimit = await import('p-limit');
+  const limitedMethod = pLimit.default(10);
+  const filesWithinLimit = [];
+  await Promise.all(
+    files.map((filename) =>
+      limitedMethod(async () => {
        const stats = await fsp.stat(filename);
        if (start && stats.mtime < start) {
+          //do not include
+        } else if (end && stats.mtime > end) {
+          //do nothing
+        } else {
+          filesWithinLimit.push({
+            name: filename,
+            type: stats.isDirectory() ? 'directory' : 'file',
+            modifiedAt: new Date(stats.mtime).toISOString()
+          });
        }
+      })
+    )
+  );
+  return filesWithinLimit;
};
Worker.prototype.listAll.metadata = {
+  options: {
+    directory: { required: true },
+    start: {},
+    end: {}
+  }
};
Worker.prototype.moveAll = async function (options) {
+  const { directory, targetDirectory } = options;
+  if (!directory) throw new Error('directory is required');
+  if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
+    const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
+    return worker.moveAll(options);
+  }
+  const a = await this.listAll(options);
+  let configs = a.map((f) => {
+    let filename = typeof f === 'string' ? f : f.filename;
+    return {
+      filename,
+      target: filename.replace(directory, targetDirectory)
+    };
+  });
+  const pLimit = await import('p-limit');
+  const limitedMethod = pLimit.default(10);
+  return Promise.all(configs.map(({ filename, target }) => limitedMethod(async () => this.move({ filename, target }))));
};
Worker.prototype.moveAll.metadata = {
+  options: {
+    directory: { required: true },
+    targetDirectory: { required: true }
+  }
};
Worker.prototype.empty = async function ({ directory }) {
+  if (!directory) throw new Error('directory is required');
+  if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
+    // currently not emptying S3 this way -- dangerous
+    throw new Error('Cannot empty an s3:// or r2:// directory');
+  }
+  const removed = [];
+  for (const file of await fsp.readdir(directory)) {
+    removed.push(file);
+    await fsp.unlink(path.join(directory, file));
+  }
+  return { directory, removed };
};
Worker.prototype.empty.metadata = {
+  options: {
+    directory: { required: true }
+  }
};
Worker.prototype.removeAll = async function (options) {
+  const filenames = await this.listAll(options);
+  const pLimit = await import('p-limit');
+  const limitedMethod = pLimit.default(10);
+  return Promise.all(filenames.map((filename) => limitedMethod(async () => this.remove({ filename }))));
};
Worker.prototype.removeAll.metadata = {
+  options: {
+    directory: { required: true },
+    start: {},
+    end: {}
+  }
};
Worker.prototype.remove = async function ({ filename }) {
+  if (!filename) throw new Error('filename is required');
+  if (typeof filename !== 'string') throw new Error(`filename isn't a string:${JSON.stringify(filename)}`);
+  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
+    let worker = null;
+    if (filename.startsWith('r2://')) {
+      worker = new R2Worker(this);
+    } else {
+      worker = new S3Worker(this);
+    }
+    await worker.remove({ filename });
+  } else {
+    fsp.unlink(filename);
+  }
+  return { removed: filename };
};
Worker.prototype.remove.metadata = {
+  options: {
+    filename: {}
+  }
};
Worker.prototype.move = async function ({ filename, target, remove = true }) {
+  if (!target) throw new Error('target is required');
+  if (typeof target !== 'string') throw new Error(`target isn't a string:${JSON.stringify(target)}`);
+  if (target.startsWith('s3://') || target.startsWith('r2://')) {
+    if (
+      (target.startsWith('s3://') && filename.startsWith('r2://')) ||
+      (target.startsWith('r2://') && filename.startsWith('s3://'))
+    ) {
+      throw new Error('Cowardly not copying between services');
+    }
+    let worker = null;
+    if (target.startsWith('r2://')) {
+      worker = new R2Worker(this);
+    } else {
+      worker = new S3Worker(this);
    }
+    if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
+      // We need to copy and delete
+      const output = await worker.copy({ filename, target });
+      if (remove) await worker.remove({ filename });
+      return output;
+    }
+    const parts = target.split('/');
+    return worker.put({ filename, directory: parts.slice(0, -1).join('/'), file: parts.slice(-1)[0] });
+  }
+  await fsp.mkdir(path.dirname(target), { recursive: true });
+  if (remove) {
+    try {
+      await fsp.rename(filename, target);
+    } catch (e) {
+      //it may be a filesystem issue moving between items
+      debug('Assuming this is a filesystem crosslink error, ignoring ', e.getMessage());
+      await fsp.copyFile(filename, target);
+      await fsp.unlink(filename);
+    }
+  } else {
+    await fsp.copyFile(filename, target);
+  }
+  return { filename: target };
};
Worker.prototype.move.metadata = {
+  options: {
+    filename: {},
+    target: {}
+  }
};
Worker.prototype.copy = async function (opts) {
+  return this.move({ ...opts, remove: false });
};
Worker.prototype.copy.metadata = {
+  options: {
+    filename: {},
+    target: {}
+  }
};
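A sketch of move() and copy() (copy() is now just move() with remove: false); the bucket and paths are illustrative:

import Worker from './FileUtilities.js';

const worker = new Worker({ accountId: 'engine9' });
// local -> object store goes through the store worker's put()
await worker.copy({ filename: './out/report.csv', target: 's3://example-bucket/reports/report.csv' });
// local -> local renames, falling back to copy + unlink across filesystems
await worker.move({ filename: './out/old.csv', target: './archive/old.csv' });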
Worker.prototype.stat = async function ({ filename }) {
+  if (!filename) throw new Error('filename is required');
+  const output = {};
+  if (filename.slice(-8) === '.parquet') {
+    const pq = new ParquetWorker(this);
+    output.schema = (await pq.schema({ filename }))?.schema;
+    output.records = (await pq.meta({ filename }))?.records;
+  }
+  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
+    const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
+    Object.assign(output, await worker.stat({ filename }));
+  } else {
+    const { ctime, birthtime, size } = await fsp.stat(filename);
+    const modifiedAt = new Date(ctime);
+    let createdAt = birthtime;
+    if (createdAt === 0 || !createdAt) createdAt = ctime;
+    createdAt = new Date(createdAt);
+    Object.assign(output, {
+      createdAt,
+      modifiedAt,
+      size
+    });
+  }
+  return output;
};
Worker.prototype.stat.metadata = {
+  options: {
+    filename: {}
+  }
};
Worker.prototype.download = async function ({ filename }) {
+  if (!filename) throw new Error('filename is required');
+  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
+    const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
+    return worker.download({ filename });
+  }
+  throw new Error('Cannot download a local file');
};
Worker.prototype.download.metadata = {
+  options: {
+    filename: {}
+  }
};
Worker.prototype.head = async function (options) {
+  const limit = options.limit || 3;
+  const { stream } = await this.fileToObjectStream({ ...options, limit });
+  const chunks = [];
+  let counter = 0;
+  for await (const chunk of stream) {
+    chunks.push(chunk);
+    counter += 1;
+    if (counter >= limit) break;
+  }
+  return chunks;
};
Worker.prototype.head.metadata = {
+  options: {
+    filename: { required: true }
+  }
};
Worker.prototype.columns = async function (options) {
+  const head = await this.head(options);
+  if (head.length == 0) {
    return {
+      records: 0,
+      likelyHeaderLines: 0,
+      columns: []
    };
+  }
+  let likelyHeaderLines = 1;
+  const columns = Object.keys(head[0]);
+  let s = columns.join(',');
+  if (s.match(/[()@#%!]/)) {
+    likelyHeaderLines = 0;
+  }
+  return {
+    likelyHeaderLines,
+    columns
+  };
};
Worker.prototype.columns.metadata = {
+  options: {
+    filename: { required: true }
+  }
};
Worker.prototype.count = async function (options) {
+  const { stream } = await this.fileToObjectStream(options);
+  const sample = [];
+  const limit = options.limit || 5;
+  let records = 0;
+  for await (const chunk of stream) {
+    records += 1;
+    if (records < limit) {
+      sample.push(chunk);
+    }
+  }
+  return { sample, records };
};
Worker.prototype.count.metadata = {
+  options: {
+    filename: { required: true }
+  }
};
// Get a set of unique entries from a uniqueFunction
// This could be large
Worker.prototype.getUniqueSet = async function (options) {
+  const existingFiles = getStringArray(options.filenames);
+  const sample = {};
+  let { uniqueFunction } = options;
+  if (!uniqueFunction) {
+    uniqueFunction = (o) => JSON.stringify(o);
+  }
+  const uniqueSet = new Set();
+  for (const filename of existingFiles) {
+    const { stream: existsStream } = await this.fileToObjectStream({ filename });
+    await pipeline(
+      existsStream,
+      new Transform({
        objectMode: true,
        transform(d, enc, cb) {
+          const v = uniqueFunction(makeStrings(d)) || '';
+          if (uniqueSet.size < 3) {
+            sample[v] = d;
+          }
+          uniqueSet.add(v);
+          cb(null, d);
+        }
+      }),
+      new Writable({
+        objectMode: true,
+        write(d, enc, cb) {
+          cb();
        }
+      })
+    );
+    debug(`Finished loading ${filename}`);
+  }
+  return { uniqueFunction, uniqueSet, sample };
};
+Worker.prototype.getUniqueStream = async function (options) {
+  const includeDuplicateSourceRecords = bool(options.includeDuplicateSourceRecords, false);
+  const { uniqueSet, uniqueFunction, sample } = await this.getUniqueSet({
+    filenames: options.existingFiles,
+    uniqueFunction: options.uniqueFunction
+  });
+  const { stream: inStream } = await this.fileToObjectStream(options);
+  const uniqueStream = inStream.pipe(
+    new Transform({
+      objectMode: true,
+      transform(d, enc, cb) {
+        const v = uniqueFunction(makeStrings(d)) || '';
+        if (!v) {
+          // falsey unique function includes
+          // by default
+          cb(null, d);
+        } else if (uniqueSet.has(v)) {
+          // do nothing
+          cb();
+        } else {
+          if (!includeDuplicateSourceRecords) {
+            // add it to the set for the next time
+            uniqueSet.add(v);
+          }
+          cb(null, d);
        }
+      }
+    })
+  );
+  return { stream: uniqueStream, sample };
+};
+Worker.prototype.getUniqueStream.metadata = {
+  options: {
+    existingFiles: {},
+    uniqueFunction: {},
+    filename: { description: 'Specify a source filename or a stream' },
+    stream: { description: 'Specify a source filename or a stream' },
+    includeDuplicateSourceRecords: {
+      description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
+    }
+  }
};
Worker.prototype.getUniqueFile = async function (options) {
+  const { stream, sample } = await this.getUniqueStream(options);
+  const { filename, records } = await this.objectStreamToFile({ stream });
+  return { filename, records, sample };
};
Worker.prototype.getUniqueFile.metadata = {
+  options: {
+    existingFiles: {},
+    uniqueFunction: {},
+    filename: { description: 'Specify a source filename or a stream' },
+    stream: { description: 'Specify a source filename or a stream' },
+    includeDuplicateSourceRecords: {
+      description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
+    }
+  }
};
/*
diff that allows for unordered files, and doesn't store full objects in memory.
@@ -1083,42 +1033,40 @@ Requires 2 passes of the files,
but that's a better tradeoff than trying to store huge files in memory
*/
Worker.prototype.diff = async function (options) {
+  const { fileA, fileB, uniqueFunction: ufOpt, columns, includeDuplicateSourceRecords } = options;
+  if (options.fields) throw new Error('fields is deprecated, use columns');
+  if (ufOpt && columns) throw new Error('fields and uniqueFunction cannot both be specified');
+  let uniqueFunction = ufOpt;
+  if (!uniqueFunction && columns) {
+    const farr = getStringArray(columns);
+    uniqueFunction = (o) => farr.map((f) => o[f] || '').join('.');
+  }
+  const left = await this.getUniqueFile({
+    existingFiles: [fileB],
+    filename: fileA,
+    uniqueFunction,
+    includeDuplicateSourceRecords
+  });
+  const right = await this.getUniqueFile({
+    existingFiles: [fileA],
+    filename: fileB,
+    uniqueFunction,
+    includeDuplicateSourceRecords
+  });
+  return {
+    left,
+    right
+  };
};
Worker.prototype.diff.metadata = {
+  options: {
+    fileA: {},
+    fileB: {},
+    columns: { description: 'Columns to use for uniqueness -- aka primary key. Defaults to JSON of line' },
+    uniqueFunction: {},
+    includeDuplicateSourceRecords: {
+      description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
+    }
+  }
};
export default Worker;
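Finally, a sketch of the reworked diff(), which takes columns (fields is rejected as deprecated) and returns per-side unique files; a.csv, b.csv and the email column are illustrative:

import Worker from './FileUtilities.js';

const worker = new Worker({ accountId: 'engine9' });
const { left, right } = await worker.diff({
  fileA: './a.csv',
  fileB: './b.csv',
  columns: 'email' // rows match when their email values match
});
// left: rows only in fileA; right: rows only in fileB -- each is { filename, records, sample }
console.log(left.records, right.records);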