@engine9-io/input-tools 1.9.11 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1196 +1,1124 @@
1
- const fs = require('node:fs');
2
-
1
+ import fs from 'node:fs';
2
+ import path from 'node:path';
3
+ import zlib from 'node:zlib';
4
+ import nodestream from 'node:stream';
5
+ import promises from 'node:stream/promises';
6
+ import csv$0 from 'csv';
7
+ import debug$0 from 'debug';
8
+ import xlstream from 'xlstream';
9
+ import JSON5 from 'json5';
10
+ import languageEncoding from 'detect-file-encoding-and-language';
11
+ import R2Worker from './R2.js';
12
+ import S3Worker from './S3.js';
13
+ import ParquetWorker from './Parquet.js';
14
+ import { bool, getTempFilename, getStringArray, getTempDir, makeStrings, streamPacket, relativeDate } from './tools.js';
3
15
  const fsp = fs.promises;
4
- const path = require('node:path');
5
- const zlib = require('node:zlib');
6
- const { Readable, Transform, PassThrough, Writable } = require('node:stream');
7
- const { pipeline } = require('node:stream/promises');
8
- const { stringify } = require('csv');
9
-
10
- const debug = require('debug')('@engine9-io/file');
11
-
12
- const { getXlsxStream } = require('xlstream');
13
- const csv = require('csv');
14
- const JSON5 = require('json5');
15
-
16
- const languageEncoding = require('detect-file-encoding-and-language');
17
- const R2Worker = require('./R2');
18
- const S3Worker = require('./S3');
19
- const ParquetWorker = require('./Parquet');
20
-
21
- const {
22
- bool,
23
- getTempFilename,
24
- getStringArray,
25
- getTempDir,
26
- makeStrings,
27
- streamPacket,
28
- relativeDate
29
- } = require('./tools');
30
-
16
+ const { Readable, Transform, PassThrough, Writable } = nodestream;
17
+ const { pipeline } = promises;
18
+ const { stringify } = csv$0;
19
+ const debug = debug$0('@engine9-io/file');
20
+ const { getXlsxStream } = xlstream;
21
+ const csv = csv$0;
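Version 2.0.0 ships this file as an ES module: the CommonJS `require` calls above become `import` statements, with transpiler-style aliases (`csv$0`, `debug$0`, `nodestream`, `promises`) destructured back into the original local names. A minimal consumption sketch, assuming the package exposes this Worker from its entry point (the export statement and entry path are not visible in this hunk, so both are assumptions):

    // ESM consumer; a CommonJS caller would need `await import(...)` instead of require()
    import FileWorker from '@engine9-io/input-tools';         // assumed entry point and default export
    const worker = new FileWorker({ accountId: 'acct1' });    // accountId value is made up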
31
22
  function Worker({ accountId }) {
32
- this.accountId = accountId;
23
+ this.accountId = accountId;
33
24
  }
34
-
35
25
  class LineReaderTransform extends Transform {
36
- constructor(options = {}) {
37
- super({ ...options, readableObjectMode: true });
38
- this.buffer = '';
39
- }
40
-
41
- _transform(chunk, encoding, callback) {
42
- this.buffer += chunk.toString();
43
- const lines = this.buffer.split(/\r?\n/);
44
- this.buffer = lines.pop();
45
- lines.forEach((line) => this.push(line));
46
- callback();
47
- }
48
-
49
- _flush(callback) {
50
- if (this.buffer) {
51
- this.push(this.buffer);
52
- }
53
- callback();
54
- }
26
+ constructor(options = {}) {
27
+ super({ ...options, readableObjectMode: true });
28
+ this.buffer = '';
29
+ }
30
+ _transform(chunk, encoding, callback) {
31
+ this.buffer += chunk.toString();
32
+ const lines = this.buffer.split(/\r?\n/);
33
+ this.buffer = lines.pop();
34
+ lines.forEach((line) => this.push(line));
35
+ callback();
36
+ }
37
+ _flush(callback) {
38
+ if (this.buffer) {
39
+ this.push(this.buffer);
40
+ }
41
+ callback();
42
+ }
55
43
  }
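LineReaderTransform buffers incoming chunks and, in object mode, emits one string per line (handling both \n and \r\n and flushing any trailing partial line). A small behavior sketch; the class appears to be module-internal, and the input file is made up:

    import fs from 'node:fs';
    const lines = fs.createReadStream('notes.txt')             // hypothetical file
      .pipe(new LineReaderTransform());
    lines.on('data', (line) => console.log(line));             // one event per line, newline stripped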
56
-
57
44
  Worker.prototype.csvToObjectTransforms = function (options) {
58
- const transforms = [];
59
- const delimiter = options.delimiter || ',';
60
-
61
- const headerMapping =
62
- options.headerMapping ||
63
- function (d) {
64
- return d;
45
+ const transforms = [];
46
+ const delimiter = options.delimiter || ',';
47
+ const headerMapping = options.headerMapping ||
48
+ function (d) {
49
+ return d;
50
+ };
51
+ let lastLine = null;
52
+ let head = null;
53
+ const skipLinesWithError = bool(options.skip_lines_with_error, false);
54
+ const parserOptions = {
55
+ relax: true,
56
+ skip_empty_lines: true,
57
+ delimiter,
58
+ max_limit_on_data_read: 10000000,
59
+ skip_lines_with_error: skipLinesWithError
65
60
  };
66
- let lastLine = null;
67
- let head = null;
68
-
69
- const skipLinesWithError = bool(options.skip_lines_with_error, false);
70
- const parserOptions = {
71
- relax: true,
72
- skip_empty_lines: true,
73
- delimiter,
74
- max_limit_on_data_read: 10000000,
75
- skip_lines_with_error: skipLinesWithError
76
- };
77
- if (options.skip) parserOptions.from_line = options.skip;
78
- if (options.relax_column_count) parserOptions.relax_column_count = true;
79
- if (options.quote_escape) {
80
- parserOptions.escape = options.quote_escape;
81
- }
82
- if (options.limit) {
83
- parserOptions.to = options.limit;
84
- }
85
-
86
- debug('Parser options=', parserOptions);
87
- const parser = csv.parse(parserOptions);
88
- parser.on('error', (error) => {
89
- debug('fileToObjectStream: Error parsing csv file');
90
- debug(lastLine);
91
- throw new Error(error);
92
- });
93
-
94
- const blankAndHeaderCheck = new Transform({
95
- objectMode: true,
96
- transform(row, enc, cb) {
97
- // Blank rows
98
- if (row.length === 0) return cb();
99
- if (row.length === 1 && !row[0]) return cb();
100
-
101
- if (!head) {
102
- head = row.map(headerMapping);
103
- return cb();
104
- }
105
-
106
- const o = {};
107
- head.forEach((_h, i) => {
108
- const h = _h.trim();
109
- if (h) {
110
- o[h] = row[i];
61
+ if (options.skip)
62
+ parserOptions.from_line = options.skip;
63
+ if (options.relax_column_count)
64
+ parserOptions.relax_column_count = true;
65
+ if (options.quote_escape) {
66
+ parserOptions.escape = options.quote_escape;
67
+ }
68
+ if (options.limit) {
69
+ parserOptions.to = options.limit;
70
+ }
71
+ debug('Parser options=', parserOptions);
72
+ const parser = csv.parse(parserOptions);
73
+ parser.on('error', (error) => {
74
+ debug('fileToObjectStream: Error parsing csv file');
75
+ debug(lastLine);
76
+ throw new Error(error);
77
+ });
78
+ const blankAndHeaderCheck = new Transform({
79
+ objectMode: true,
80
+ transform(row, enc, cb) {
81
+ // Blank rows
82
+ if (row.length === 0)
83
+ return cb();
84
+ if (row.length === 1 && !row[0])
85
+ return cb();
86
+ if (!head) {
87
+ head = row.map(headerMapping);
88
+ return cb();
89
+ }
90
+ const o = {};
91
+ head.forEach((_h, i) => {
92
+ const h = _h.trim();
93
+ if (h) {
94
+ o[h] = row[i];
95
+ }
96
+ });
97
+ lastLine = row.join(delimiter);
98
+ return cb(null, o);
111
99
  }
112
- });
113
-
114
- lastLine = row.join(delimiter);
115
- return cb(null, o);
116
- }
117
- });
118
-
119
- transforms.push(parser);
120
- transforms.push(blankAndHeaderCheck);
121
-
122
- return { transforms };
100
+ });
101
+ transforms.push(parser);
102
+ transforms.push(blankAndHeaderCheck);
103
+ return { transforms };
123
104
  };
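csvToObjectTransforms returns parser transforms rather than a finished stream, so callers compose them onto their own byte stream. A usage sketch, assuming `Worker` is this file's constructor; the file path and option values are made up:

    import fs from 'node:fs';
    const worker = new Worker({ accountId: 'acct1' });
    const { transforms } = worker.csvToObjectTransforms({ delimiter: ',', limit: 1000 });
    let stream = fs.createReadStream('contacts.csv');
    for (const t of transforms) stream = stream.pipe(t);
    stream.on('data', (row) => console.log(row));              // objects keyed by the (trimmed) header row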
124
-
125
105
  Worker.prototype.detectEncoding = async function (options) {
126
- if (options.encoding_override) return { encoding: options.encoding_override };
127
- // Limit to only the top N bytes -- for performance
128
- // Be wary, though, as gzip files may require a certain minimum number of bytes to decompress
129
- const bytes = 64 * 1024;
130
- const buff = Buffer.alloc(bytes);
131
- const fd = await fsp.open(options.filename);
132
- await fd.read(buff, 0, bytes);
133
- let finalBuff = buff;
134
- if (options.filename.slice(-3) === '.gz') {
135
- // This code deals with scenarios where the buffer coming in may not be exactly the gzip
136
- // needed chunk size.
137
- finalBuff = await new Promise((resolve, reject) => {
138
- const bufferBuilder = [];
139
- const decompressStream = zlib
140
- .createGunzip()
141
- .on('data', (chunk) => {
142
- bufferBuilder.push(chunk);
143
- })
144
- .on('close', () => {
145
- resolve(Buffer.concat(bufferBuilder));
146
- })
147
- .on('error', (err) => {
148
- if (err.errno !== -5) {
149
- // EOF: expected
150
- reject(err);
151
- }
106
+ if (options.encoding_override)
107
+ return { encoding: options.encoding_override };
108
+ // Limit to only the top N bytes -- for performance
109
+ // Be wary, though, as gzip files may require a certain minimum number of bytes to decompress
110
+ const bytes = 64 * 1024;
111
+ const buff = Buffer.alloc(bytes);
112
+ const fd = await fsp.open(options.filename);
113
+ await fd.read(buff, 0, bytes);
114
+ let finalBuff = buff;
115
+ if (options.filename.slice(-3) === '.gz') {
116
+ // This code deals with scenarios where the buffer coming in may not be exactly the gzip
117
+ // needed chunk size.
118
+ finalBuff = await new Promise((resolve, reject) => {
119
+ const bufferBuilder = [];
120
+ const decompressStream = zlib
121
+ .createGunzip()
122
+ .on('data', (chunk) => {
123
+ bufferBuilder.push(chunk);
124
+ })
125
+ .on('close', () => {
126
+ resolve(Buffer.concat(bufferBuilder));
127
+ })
128
+ .on('error', (err) => {
129
+ if (err.errno !== -5) {
130
+ // EOF: expected
131
+ reject(err);
132
+ }
133
+ });
134
+ decompressStream.write(buff);
135
+ decompressStream.end();
152
136
  });
153
- decompressStream.write(buff);
154
- decompressStream.end();
155
- });
156
- }
157
-
158
- return languageEncoding(finalBuff);
137
+ }
138
+ return languageEncoding(finalBuff);
159
139
  };
160
-
161
140
  Worker.prototype.detectEncoding.metadata = {
162
- options: {
163
- filename: { required: true }
164
- }
141
+ options: {
142
+ filename: { required: true }
143
+ }
165
144
  };
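detectEncoding samples only the first 64 KiB of the file (gunzipping it when the name ends in .gz) and hands the buffer to detect-file-encoding-and-language; encoding_override skips detection entirely. A sketch with a made-up filename:

    const worker = new Worker({ accountId: 'acct1' });
    const { encoding } = await worker.detectEncoding({ filename: 'export.csv.gz' });
    console.log(encoding);                                     // e.g. 'UTF-8'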
166
-
167
145
  Worker.prototype.xlsxToObjectStream = async function (options) {
168
- let { filename } = options;
169
-
170
- if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
171
- // We need to copy and delete
172
- let worker = null;
173
- if (filename.startsWith('r2://')) {
174
- worker = new R2Worker(this);
175
- } else {
176
- worker = new S3Worker(this);
177
- }
178
- const target = getTempFilename({ targetFilename: filename.split('/').pop() });
179
-
180
- await worker.copy({ filename, target });
181
- filename = target;
182
- }
183
- let stream = await getXlsxStream({
184
- filePath: filename,
185
- sheet: 0
186
- });
187
- let keys = null;
188
- stream = stream.pipe(
189
- new Transform({
190
- objectMode: true,
191
- transform(d, enc, cb) {
192
- if (!keys) {
193
- keys = d?.raw.arr;
194
- cb();
195
- } else {
196
- let o = {};
197
- keys.forEach((k, i) => {
198
- o[k] = d?.raw?.arr?.[i];
199
- });
200
- cb(null, o);
146
+ let { filename } = options;
147
+ if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
148
+ // We need to copy and delete
149
+ let worker = null;
150
+ if (filename.startsWith('r2://')) {
151
+ worker = new R2Worker(this);
152
+ }
153
+ else {
154
+ worker = new S3Worker(this);
155
+ }
156
+ const target = getTempFilename({ targetFilename: filename.split('/').pop() });
157
+ await worker.copy({ filename, target });
158
+ filename = target;
159
+ }
160
+ let stream = await getXlsxStream({
161
+ filePath: filename,
162
+ sheet: 0
163
+ });
164
+ let keys = null;
165
+ stream = stream.pipe(new Transform({
166
+ objectMode: true,
167
+ transform(d, enc, cb) {
168
+ if (!keys) {
169
+ keys = d?.raw.arr;
170
+ cb();
171
+ }
172
+ else {
173
+ let o = {};
174
+ keys.forEach((k, i) => {
175
+ o[k] = d?.raw?.arr?.[i];
176
+ });
177
+ cb(null, o);
178
+ }
201
179
  }
202
- }
203
- })
204
- );
205
-
206
- return { stream };
180
+ }));
181
+ return { stream };
207
182
  };
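xlsxToObjectStream copies an s3:// or r2:// workbook to a temp file first, then streams sheet 0 and uses the first row as object keys. A sketch with a made-up local file:

    const worker = new Worker({ accountId: 'acct1' });
    const { stream } = await worker.xlsxToObjectStream({ filename: 'report.xlsx' });
    for await (const row of stream) console.log(row);          // { <first-row header>: <cell value>, ... }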
208
-
209
183
  Worker.prototype.getFormat = async function (options) {
210
- const { sourcePostfix, filename, format: formatOverride } = options;
211
- let postfix = sourcePostfix || filename.toLowerCase().split('.').pop();
212
-
213
- if (postfix === 'gz') {
214
- postfix = filename.toLowerCase().split('.');
215
- postfix = postfix[postfix.length - 2];
216
- }
217
- return formatOverride || postfix;
184
+ const { sourcePostfix, filename, format: formatOverride } = options;
185
+ let postfix = sourcePostfix || filename.toLowerCase().split('.').pop();
186
+ if (postfix === 'gz') {
187
+ postfix = filename.toLowerCase().split('.');
188
+ postfix = postfix[postfix.length - 2];
189
+ }
190
+ return formatOverride || postfix;
218
191
  };
219
-
220
192
  /*
221
193
  Commonly used method to transform a file into a stream of objects.
222
194
  */
223
195
  Worker.prototype.fileToObjectStream = async function (options) {
224
- const { filename, columns, limit: limitOption, format: formatOverride } = options;
225
-
226
- // handle stream item
227
- if (options.stream) {
228
- if (Array.isArray(options.stream)) {
229
- return { stream: Readable.from(options.stream) };
230
- }
231
- // probably already a stream
232
- if (typeof options.stream === 'object') return { stream: options.stream };
233
- throw new Error(`Invalid stream type:${typeof options.stream}`);
234
- }
235
- let limit;
236
- if (limitOption) limit = parseInt(limitOption, 10);
237
- if (!filename) throw new Error('fileToObjectStream: filename is required');
238
- if (filename.split('.').pop().toLowerCase() === 'xlsx') {
239
- return this.xlsxToObjectStream(options);
240
- }
241
- let postfix = options.sourcePostfix || filename.toLowerCase().split('.').pop();
242
- if (postfix === 'zip') {
243
- debug('Invalid filename:', { filename });
244
- throw new Error('Cowardly refusing to turn a .zip file into an object stream, turn into a csv first');
245
- }
246
-
247
- const streamInfo = await this.stream({
248
- filename,
249
- columns,
250
- limit
251
- });
252
- const { encoding } = streamInfo;
253
- let { stream } = streamInfo;
254
- if (!stream) throw new Error(`No stream found in fileToObjectStream from filename ${filename}`);
255
- if (encoding === 'object') {
256
- // already an object
257
- return { stream };
258
- }
259
-
260
- let count = 0;
261
-
262
- let transforms = [];
263
-
264
- if (postfix === 'gz') {
265
- const gunzip = zlib.createGunzip();
266
- transforms.push(gunzip);
267
- gunzip.setEncoding(encoding);
268
- // encoding = null;// Default encoding
269
- postfix = filename.toLowerCase().split('.');
270
- postfix = postfix[postfix.length - 2];
271
- debug(`Using gunzip parser because postfix is .gz, encoding=${encoding}`);
272
- } else {
273
- stream.setEncoding(encoding);
274
- }
275
- let format = formatOverride || postfix;
276
-
277
- debug(`Reading file ${filename} with encoding: ${encoding} and format ${format}`);
278
-
279
- if (format === 'csv') {
280
- const csvTransforms = this.csvToObjectTransforms({ ...options });
281
- transforms = transforms.concat(csvTransforms.transforms);
282
- } else if (format === 'txt') {
283
- const csvTransforms = this.csvToObjectTransforms({ ...options, delimiter: '\t' });
284
- transforms = transforms.concat(csvTransforms.transforms);
285
- } else if (format === 'jsonl') {
286
- /* Type of JSON that has the names in an array in the first record,
287
- and the values in JSON arrays thereafter
288
- */
289
- let headers = null;
290
-
291
- const lineReader = new LineReaderTransform();
292
-
293
- const jsonlTransform = new Transform({
294
- objectMode: true,
295
- transform(d, enc, cb) {
296
- if (!d) return cb();
297
- let obj;
298
- try {
299
- obj = JSON5.parse(d);
300
- } catch (e) {
301
- debug('Invalid line:');
302
- debug(d);
303
- throw e;
304
- }
305
- /* JSONL could potentially start with an array of names,
306
- in which case we need to map the subsequent values
307
- */
308
- if (headers === null) {
309
- if (Array.isArray(obj)) {
310
- headers = obj;
311
- return cb();
312
- }
313
- headers = false;
196
+ const { filename, columns, limit: limitOption, format: formatOverride } = options;
197
+ // handle stream item
198
+ if (options.stream) {
199
+ if (Array.isArray(options.stream)) {
200
+ return { stream: Readable.from(options.stream) };
314
201
  }
315
- if (headers) {
316
- const mapped = {};
317
- headers.forEach((name, i) => {
318
- mapped[name] = obj[i];
319
- });
320
- this.push(mapped);
321
- } else {
322
- this.push(obj);
202
+ // probably already a stream
203
+ if (typeof options.stream === 'object')
204
+ return { stream: options.stream };
205
+ throw new Error(`Invalid stream type:${typeof options.stream}`);
206
+ }
207
+ let limit;
208
+ if (limitOption)
209
+ limit = parseInt(limitOption, 10);
210
+ if (!filename)
211
+ throw new Error('fileToObjectStream: filename is required');
212
+ if (filename.split('.').pop().toLowerCase() === 'xlsx') {
213
+ return this.xlsxToObjectStream(options);
214
+ }
215
+ let postfix = options.sourcePostfix || filename.toLowerCase().split('.').pop();
216
+ if (postfix === 'zip') {
217
+ debug('Invalid filename:', { filename });
218
+ throw new Error('Cowardly refusing to turn a .zip file into an object stream, turn into a csv first');
219
+ }
220
+ const streamInfo = await this.stream({
221
+ filename,
222
+ columns,
223
+ limit
224
+ });
225
+ const { encoding } = streamInfo;
226
+ let { stream } = streamInfo;
227
+ if (!stream)
228
+ throw new Error(`No stream found in fileToObjectStream from filename ${filename}`);
229
+ if (encoding === 'object') {
230
+ // already an object
231
+ return { stream };
232
+ }
233
+ let count = 0;
234
+ let transforms = [];
235
+ if (postfix === 'gz') {
236
+ const gunzip = zlib.createGunzip();
237
+ transforms.push(gunzip);
238
+ gunzip.setEncoding(encoding);
239
+ // encoding = null;// Default encoding
240
+ postfix = filename.toLowerCase().split('.');
241
+ postfix = postfix[postfix.length - 2];
242
+ debug(`Using gunzip parser because postfix is .gz, encoding=${encoding}`);
243
+ }
244
+ else {
245
+ stream.setEncoding(encoding);
246
+ }
247
+ let format = formatOverride || postfix;
248
+ debug(`Reading file ${filename} with encoding: ${encoding} and format ${format}`);
249
+ if (format === 'csv') {
250
+ const csvTransforms = this.csvToObjectTransforms({ ...options });
251
+ transforms = transforms.concat(csvTransforms.transforms);
252
+ }
253
+ else if (format === 'txt') {
254
+ const csvTransforms = this.csvToObjectTransforms({ ...options, delimiter: '\t' });
255
+ transforms = transforms.concat(csvTransforms.transforms);
256
+ }
257
+ else if (format === 'jsonl') {
258
+ /* Type of JSON that has the names in an array in the first record,
259
+ and the values in JSON arrays thereafter
260
+ */
261
+ let headers = null;
262
+ const lineReader = new LineReaderTransform();
263
+ const jsonlTransform = new Transform({
264
+ objectMode: true,
265
+ transform(d, enc, cb) {
266
+ if (!d)
267
+ return cb();
268
+ let obj;
269
+ try {
270
+ obj = JSON5.parse(d);
271
+ }
272
+ catch (e) {
273
+ debug('Invalid line:');
274
+ debug(d);
275
+ throw e;
276
+ }
277
+ /* JSONL could potentially start with an array of names,
278
+ in which case we need to map the subsequent values
279
+ */
280
+ if (headers === null) {
281
+ if (Array.isArray(obj)) {
282
+ headers = obj;
283
+ return cb();
284
+ }
285
+ headers = false;
286
+ }
287
+ if (headers) {
288
+ const mapped = {};
289
+ headers.forEach((name, i) => {
290
+ mapped[name] = obj[i];
291
+ });
292
+ this.push(mapped);
293
+ }
294
+ else {
295
+ this.push(obj);
296
+ }
297
+ return cb();
298
+ }
299
+ });
300
+ transforms.push(lineReader);
301
+ transforms.push(jsonlTransform);
302
+ }
303
+ else {
304
+ throw new Error(`Unsupported file type: ${postfix}`);
305
+ }
306
+ const countAndDebug = new Transform({
307
+ objectMode: true,
308
+ transform(d, enc, cb) {
309
+ if (count === 0) {
310
+ debug('Sample object from file:', d);
311
+ }
312
+ count += 1;
313
+ if ((count < 5000 && count % 1000 === 0) || count % 50000 === 0) {
314
+ debug(`fileToObjectStream transformed ${count} lines`);
315
+ }
316
+ this.push(d);
317
+ cb();
318
+ },
319
+ flush(cb) {
320
+ // If there's no records at all, push a dummy record, and specify 0 records
321
+ // Don't push dummy records anymore -- legacy cruft
322
+ debug(`Completed reading file, records=${count}`);
323
+ /* if (count === 0) {
324
+ const o = { _is_placeholder: true };
325
+
326
+ if (head) head.forEach((c) => { o[c] = null; });
327
+ this.push(o);
328
+ } */
329
+ cb();
323
330
  }
324
- return cb();
325
- }
326
331
  });
327
-
328
- transforms.push(lineReader);
329
- transforms.push(jsonlTransform);
330
- } else {
331
- throw new Error(`Unsupported file type: ${postfix}`);
332
- }
333
- const countAndDebug = new Transform({
334
- objectMode: true,
335
- transform(d, enc, cb) {
336
- if (count === 0) {
337
- debug('Sample object from file:', d);
338
- }
339
- count += 1;
340
- if ((count < 5000 && count % 1000 === 0) || count % 50000 === 0) {
341
- debug(`fileToObjectStream transformed ${count} lines`);
342
- }
343
- this.push(d);
344
- cb();
345
- },
346
- flush(cb) {
347
- // If there's no records at all, push a dummy record, and specify 0 records
348
- // Don't push dummy records anymore -- legacy cruft
349
- debug(`Completed reading file, records=${count}`);
350
- /* if (count === 0) {
351
- const o = { _is_placeholder: true };
352
-
353
- if (head) head.forEach((c) => { o[c] = null; });
354
- this.push(o);
355
- } */
356
- cb();
357
- }
358
- });
359
-
360
- transforms.push(countAndDebug);
361
- transforms.forEach((t) => {
362
- stream = stream.pipe(t);
363
- });
364
-
365
- return { stream };
332
+ transforms.push(countAndDebug);
333
+ transforms.forEach((t) => {
334
+ stream = stream.pipe(t);
335
+ });
336
+ return { stream };
366
337
  };
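fileToObjectStream is the main read path: it resolves the source (array, existing stream, or file), picks a parser from the extension or the format override (csv, txt as tab-delimited, jsonl, xlsx), gunzips .gz wrappers, and refuses .zip. A sketch with a made-up file; for csv sources, limit maps to the parser's `to` option:

    const worker = new Worker({ accountId: 'acct1' });
    const { stream } = await worker.fileToObjectStream({ filename: 'daily.csv', limit: 100 });
    const rows = await stream.toArray();                       // requires a Node version with Readable.prototype.toArray()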
367
338
  Worker.prototype.getFileWriterStream = async function (options = {}) {
368
- const accountId = options.accountId || this.accountId;
369
- if (!accountId) throw new Error('getFileWriterStream has no accountId');
370
- const targetFormat = options.targetFormat || 'csv';
371
- const tempDir = await getTempDir({ accountId });
372
- let { fileExtendedType } = options;
373
- if (fileExtendedType) fileExtendedType += '.';
374
- else fileExtendedType = '';
375
- // So, this could change, but it's easier to read
376
- // dates in a filename than UUIDs, so this is
377
- // a unique-ish filename generator
378
- const uniqueNumberedDate = `${new Date().toISOString().replace(/[^0-9]*/g, '')}.${Math.floor(Math.random() * 1000)}`;
379
- let filename = `${tempDir}${path.sep}${uniqueNumberedDate}.${fileExtendedType}${targetFormat}`;
380
- if (bool(options.gzip, false)) filename += '.gz';
381
- const stream = fs.createWriteStream(filename);
382
- debug('FileWriterStream writing to file ', filename);
383
-
384
- return { filename, stream };
339
+ const accountId = options.accountId || this.accountId;
340
+ if (!accountId)
341
+ throw new Error('getFileWriterStream has no accountId');
342
+ const targetFormat = options.targetFormat || 'csv';
343
+ const tempDir = await getTempDir({ accountId });
344
+ let { fileExtendedType } = options;
345
+ if (fileExtendedType)
346
+ fileExtendedType += '.';
347
+ else
348
+ fileExtendedType = '';
349
+ // So, this could change, but it's easier to read
350
+ // dates in a filename than UUIDs, so this is
351
+ // a unique-ish filename generator
352
+ const uniqueNumberedDate = `${new Date().toISOString().replace(/[^0-9]*/g, '')}.${Math.floor(Math.random() * 1000)}`;
353
+ let filename = `${tempDir}${path.sep}${uniqueNumberedDate}.${fileExtendedType}${targetFormat}`;
354
+ if (bool(options.gzip, false))
355
+ filename += '.gz';
356
+ const stream = fs.createWriteStream(filename);
357
+ debug('FileWriterStream writing to file ', filename);
358
+ return { filename, stream };
385
359
  };
386
-
387
360
  Worker.prototype.getOutputStreams = async function (options) {
388
- const { filename, stream: fileWriterStream } = await this.getFileWriterStream(options);
389
-
390
- let { transform } = options;
391
- if (typeof options.transform === 'function') {
392
- if (options.transform.length === 3) {
393
- transform = new Transform({
394
- objectMode: true,
395
- async transform(item, encoding, cb) {
396
- options.transform(item, encoding, cb);
361
+ const { filename, stream: fileWriterStream } = await this.getFileWriterStream(options);
362
+ let { transform } = options;
363
+ if (typeof options.transform === 'function') {
364
+ if (options.transform.length === 3) {
365
+ transform = new Transform({
366
+ objectMode: true,
367
+ async transform(item, encoding, cb) {
368
+ options.transform(item, encoding, cb);
369
+ }
370
+ });
397
371
  }
398
- });
399
- } else {
400
- transform = new Transform({
401
- objectMode: true,
402
- async transform(item, encoding, cb) {
403
- cb(null, options.transform(item));
372
+ else {
373
+ transform = new Transform({
374
+ objectMode: true,
375
+ async transform(item, encoding, cb) {
376
+ cb(null, options.transform(item));
377
+ }
378
+ });
404
379
  }
405
- });
406
- }
407
- } else if (options.transform) {
408
- transform = options.transform;
409
- }
410
- const { flatten } = options;
411
- let flattenTransform = null;
412
-
413
- if (bool(flatten, false)) {
414
- flattenTransform = new Transform({
415
- objectMode: true,
416
- async transform(item, enc, cb) {
417
- // first item establishes the keys to use
418
- let o = {};
419
- Object.keys(item).forEach((k) => {
420
- let v = item[k];
421
- if (!o[k]) {
422
- if (typeof v === 'object') {
423
- while (Array.isArray(v)) [v] = v; // get first array item
424
- o = { ...o, ...v };
425
- } else {
426
- o[k] = v;
380
+ }
381
+ else if (options.transform) {
382
+ transform = options.transform;
383
+ }
384
+ const { flatten } = options;
385
+ let flattenTransform = null;
386
+ if (bool(flatten, false)) {
387
+ flattenTransform = new Transform({
388
+ objectMode: true,
389
+ async transform(item, enc, cb) {
390
+ // first item establishes the keys to use
391
+ let o = {};
392
+ Object.keys(item).forEach((k) => {
393
+ let v = item[k];
394
+ if (!o[k]) {
395
+ if (typeof v === 'object') {
396
+ while (Array.isArray(v))
397
+ [v] = v; // get first array item
398
+ o = { ...o, ...v };
399
+ }
400
+ else {
401
+ o[k] = v;
402
+ }
403
+ }
404
+ });
405
+ cb(null, o);
427
406
  }
428
- }
429
407
  });
430
- cb(null, o);
431
- }
432
- });
433
- }
434
-
435
- const stats = {
436
- records: 0
437
- };
438
- let stringifier;
439
- if (options.targetFormat === 'jsonl') {
440
- stringifier = new Transform({
441
- objectMode: true,
442
- transform(d, encoding, cb) {
443
- cb(false, `${JSON.stringify(d)}\n`);
444
- }
445
- });
446
- } else {
447
- stringifier = stringify({ header: true });
448
- }
449
- let gzip = new PassThrough();
450
- if (options.gzip) {
451
- gzip = zlib.createGzip();
452
- }
453
- const streams = [
454
- transform,
455
- flattenTransform,
456
- new Transform({
457
- objectMode: true,
458
- transform(d, enc, cb) {
459
- stats.records += 1;
460
- cb(null, d);
461
- }
462
- }),
463
- stringifier,
464
- gzip,
465
- fileWriterStream
466
- ].filter(Boolean);
467
- return { filename, streams, stats };
408
+ }
409
+ const stats = {
410
+ records: 0
411
+ };
412
+ let stringifier;
413
+ if (options.targetFormat === 'jsonl') {
414
+ stringifier = new Transform({
415
+ objectMode: true,
416
+ transform(d, encoding, cb) {
417
+ cb(false, `${JSON.stringify(d)}\n`);
418
+ }
419
+ });
420
+ }
421
+ else {
422
+ stringifier = stringify({ header: true });
423
+ }
424
+ let gzip = new PassThrough();
425
+ if (options.gzip) {
426
+ gzip = zlib.createGzip();
427
+ }
428
+ const streams = [
429
+ transform,
430
+ flattenTransform,
431
+ new Transform({
432
+ objectMode: true,
433
+ transform(d, enc, cb) {
434
+ stats.records += 1;
435
+ cb(null, d);
436
+ }
437
+ }),
438
+ stringifier,
439
+ gzip,
440
+ fileWriterStream
441
+ ].filter(Boolean);
442
+ return { filename, streams, stats };
468
443
  };
469
444
  Worker.prototype.objectStreamToFile = async function (options) {
470
- const { filename, streams, stats } = await this.getOutputStreams(options);
471
- const { stream: inStream } = options;
472
- streams.unshift(inStream);
473
- await pipeline(streams);
474
- return { filename, records: stats.records };
445
+ const { filename, streams, stats } = await this.getOutputStreams(options);
446
+ const { stream: inStream } = options;
447
+ streams.unshift(inStream);
448
+ await pipeline(streams);
449
+ return { filename, records: stats.records };
475
450
  };
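objectStreamToFile pipelines the input object stream through the optional transform and flatten stages, a record counter, a stringifier (csv-stringify with headers, or JSON lines when targetFormat is 'jsonl'), optional gzip, and the temp-file writer from getFileWriterStream. A sketch with made-up rows:

    import { Readable } from 'node:stream';
    const worker = new Worker({ accountId: 'acct1' });
    const rows = Readable.from([{ id: 1, name: 'a' }, { id: 2, name: 'b' }]);
    const out = await worker.objectStreamToFile({ stream: rows, targetFormat: 'csv', gzip: true });
    // out resembles { filename: '<tempdir>/<timestamp>.csv.gz', records: 2 }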
476
-
477
451
  Worker.prototype.transform = async function (options) {
478
- const worker = this;
479
-
480
- const { filename } = options;
481
-
482
- debug(`Transforming ${filename}`);
483
-
484
- options.filename = filename;
485
- let { stream } = await worker.fileToObjectStream(options);
486
- if (typeof stream.pipe !== 'function') {
487
- debug(stream);
488
- throw new Error('No pipe in stream');
489
- }
490
-
491
- let t = options.transform;
492
-
493
- // No longer need this
494
- delete options.transform;
495
- if (!t) {
496
- t = function (d, enc, cb) {
497
- d.is_test_transform = true;
498
- cb(null, d);
499
- };
500
- }
501
-
502
- if (!Array.isArray(t)) t = [t];
503
- Object.keys(t).forEach((key) => {
504
- let f = t[key];
505
- if (typeof f === 'function') {
506
- f = new Transform({
507
- objectMode: true,
508
- transform: f
509
- });
510
- }
511
-
512
- stream = stream.pipe(f);
513
- });
514
-
515
- const { targetFormat } = options;
516
-
517
- if (
518
- !targetFormat &&
519
- (filename.toLowerCase().slice(-4) === '.csv' || filename.toLowerCase().slice(-7) === '.csv.gz')
520
- ) {
521
- options.targetFormat = 'csv';
522
- }
523
-
524
- return worker.objectStreamToFile({ ...options, stream });
452
+ const worker = this;
453
+ const { filename } = options;
454
+ debug(`Transforming ${filename}`);
455
+ options.filename = filename;
456
+ let { stream } = await worker.fileToObjectStream(options);
457
+ if (typeof stream.pipe !== 'function') {
458
+ debug(stream);
459
+ throw new Error('No pipe in stream');
460
+ }
461
+ let t = options.transform;
462
+ // No longer need this
463
+ delete options.transform;
464
+ if (!t) {
465
+ t = function (d, enc, cb) {
466
+ d.is_test_transform = true;
467
+ cb(null, d);
468
+ };
469
+ }
470
+ if (!Array.isArray(t))
471
+ t = [t];
472
+ Object.keys(t).forEach((key) => {
473
+ let f = t[key];
474
+ if (typeof f === 'function') {
475
+ f = new Transform({
476
+ objectMode: true,
477
+ transform: f
478
+ });
479
+ }
480
+ stream = stream.pipe(f);
481
+ });
482
+ const { targetFormat } = options;
483
+ if (!targetFormat &&
484
+ (filename.toLowerCase().slice(-4) === '.csv' || filename.toLowerCase().slice(-7) === '.csv.gz')) {
485
+ options.targetFormat = 'csv';
486
+ }
487
+ return worker.objectStreamToFile({ ...options, stream });
525
488
  };
526
-
527
489
  Worker.prototype.transform.metadata = {
528
- options: {
529
- sourcePostfix: { description: "Override the source postfix, if for example it's a csv" },
530
- encoding: { description: 'Manual override of source file encoding' },
531
- names: { description: 'Target field names (e.g. my_new_field,x,y,z)' },
532
- values: {
533
- description:
534
- "Comma delimited source field name, or Handlebars [[ ]] merge fields (e.g. 'my_field,x,y,z', '[[field1]]-[[field2]]', etc)"
535
- },
536
- targetFilename: { description: 'Custom name of the output file (default auto-generated)' },
537
- targetFormat: { description: 'Output format -- csv supported, or none for txt (default)' },
538
- targetRowDelimiter: { description: 'Row delimiter (default \n)' },
539
- targetFieldDelimiter: { description: 'Field delimiter (default \t or ,)' }
540
- }
490
+ options: {
491
+ sourcePostfix: { description: "Override the source postfix, if for example it's a csv" },
492
+ encoding: { description: 'Manual override of source file encoding' },
493
+ names: { description: 'Target field names (e.g. my_new_field,x,y,z)' },
494
+ values: {
495
+ description: "Comma delimited source field name, or Handlebars [[ ]] merge fields (e.g. 'my_field,x,y,z', '[[field1]]-[[field2]]', etc)"
496
+ },
497
+ targetFilename: { description: 'Custom name of the output file (default auto-generated)' },
498
+ targetFormat: { description: 'Output format -- csv supported, or none for txt (default)' },
499
+ targetRowDelimiter: { description: 'Row delimiter (default \n)' },
500
+ targetFieldDelimiter: { description: 'Field delimiter (default \t or ,)' }
501
+ }
541
502
  };
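transform() chains fileToObjectStream, the supplied transform function(s), and objectStreamToFile, defaulting the output format to csv when the input name ends in .csv or .csv.gz. A sketch; the file and field names are made up:

    const worker = new Worker({ accountId: 'acct1' });
    const result = await worker.transform({
      filename: 'contacts.csv',
      transform(d, enc, cb) {
        d.email = (d.email || '').toLowerCase();               // illustrative mutation only
        cb(null, d);
      },
    });
    // result resembles { filename: '<temp>.csv', records: <row count> }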
542
503
  Worker.prototype.testTransform = async function (options) {
543
- return this.transform({
544
- ...options,
545
- transform(d, enc, cb) {
546
- d.transform_time = new Date();
547
- cb(null, d);
548
- }
549
- });
504
+ return this.transform({
505
+ ...options,
506
+ transform(d, enc, cb) {
507
+ d.transform_time = new Date();
508
+ cb(null, d);
509
+ }
510
+ });
550
511
  };
551
512
  Worker.prototype.testTransform.metadata = {
552
- options: {
553
- filename: true
554
- }
513
+ options: {
514
+ filename: true
515
+ }
555
516
  };
556
-
557
517
  /* Get a stream from an actual stream, or an array, or a file */
558
518
  Worker.prototype.stream = async function (options) {
559
- const { stream: inputStream, packet, type, columns, limit, filename: filenameOpt } = options;
560
- let filename = filenameOpt;
561
-
562
- if (inputStream) {
563
- if (Array.isArray(inputStream)) {
564
- return { stream: Readable.from(inputStream) };
565
- }
566
- // probably already a stream
567
- if (typeof inputStream === 'object') return { stream: inputStream, encoding: 'object' };
568
- throw new Error(`Invalid stream type:${typeof inputStream}`);
569
- } else if (filename) {
570
- if (filename.startsWith('engine9-accounts/')) {
571
- filename = `${process.env.ENGINE9_ACCOUNT_DIR}/${filename.slice('engine9-accounts/'.length)}`;
572
- // debug(`Prepending file with ${process.env.ENGINE9_ACCOUNT_DIR}, filename=${filename}`);
573
- } else {
574
- // debug(`Not prepending filename:${filename}`);
575
- }
576
- let encoding;
577
- let stream;
578
- if (filename.slice(-8) === '.parquet') {
579
- const pq = new ParquetWorker(this);
580
- stream = (await pq.stream({ filename, columns, limit })).stream;
581
- encoding = 'object';
582
- } else if (filename.startsWith('s3://')) {
583
- const s3Worker = new S3Worker(this);
584
- stream = (await s3Worker.stream({ filename, columns, limit })).stream;
585
- encoding = 'UTF-8';
586
- } else if (filename.startsWith('r2://')) {
587
- const r2Worker = new R2Worker(this);
588
- stream = (await r2Worker.stream({ filename, columns, limit })).stream;
589
- encoding = 'UTF-8';
590
- } else {
591
- // Check if the file exists, and fast fail if not
592
- // Otherwise the stream hangs out as a handle
593
- try {
594
- await fsp.stat(filename);
595
- } catch (e) {
596
- debug(`Error reading file ${filename}, current directory: ${process.cwd()},__dirname:${__dirname}`);
597
- throw e;
598
- }
599
- stream = fs.createReadStream(filename);
600
- encoding = (await this.detectEncoding({ filename })).encoding;
601
- }
602
- return { stream, encoding };
603
- } else if (packet) {
604
- let { stream: packetStream } = await streamPacket({ packet, type, limit });
605
- const { transforms } = this.csvToObjectTransforms({});
606
- transforms.forEach((t) => {
607
- packetStream = packetStream.pipe(t);
608
- });
609
- return { stream: packetStream };
610
- } else {
611
- throw new Error('stream must be passed a stream, filename, or packet');
612
- }
519
+ const { stream: inputStream, packet, type, columns, limit, filename: filenameOpt } = options;
520
+ let filename = filenameOpt;
521
+ if (inputStream) {
522
+ if (Array.isArray(inputStream)) {
523
+ return { stream: Readable.from(inputStream) };
524
+ }
525
+ // probably already a stream
526
+ if (typeof inputStream === 'object')
527
+ return { stream: inputStream, encoding: 'object' };
528
+ throw new Error(`Invalid stream type:${typeof inputStream}`);
529
+ }
530
+ else if (filename) {
531
+ if (filename.startsWith('engine9-accounts/')) {
532
+ filename = `${process.env.ENGINE9_ACCOUNT_DIR}/${filename.slice('engine9-accounts/'.length)}`;
533
+ // debug(`Prepending file with ${process.env.ENGINE9_ACCOUNT_DIR}, filename=${filename}`);
534
+ }
535
+ else {
536
+ // debug(`Not prepending filename:${filename}`);
537
+ }
538
+ let encoding;
539
+ let stream;
540
+ if (filename.slice(-8) === '.parquet') {
541
+ const pq = new ParquetWorker(this);
542
+ stream = (await pq.stream({ filename, columns, limit })).stream;
543
+ encoding = 'object';
544
+ }
545
+ else if (filename.startsWith('s3://')) {
546
+ const s3Worker = new S3Worker(this);
547
+ stream = (await s3Worker.stream({ filename, columns, limit })).stream;
548
+ encoding = 'UTF-8';
549
+ }
550
+ else if (filename.startsWith('r2://')) {
551
+ const r2Worker = new R2Worker(this);
552
+ stream = (await r2Worker.stream({ filename, columns, limit })).stream;
553
+ encoding = 'UTF-8';
554
+ }
555
+ else {
556
+ // Check if the file exists, and fast fail if not
557
+ // Otherwise the stream hangs out as a handle
558
+ try {
559
+ await fsp.stat(filename);
560
+ }
561
+ catch (e) {
562
+ debug(`Error reading file ${filename}, current directory: ${process.cwd()},__dirname:${__dirname}`);
563
+ throw e;
564
+ }
565
+ stream = fs.createReadStream(filename);
566
+ encoding = (await this.detectEncoding({ filename })).encoding;
567
+ }
568
+ return { stream, encoding };
569
+ }
570
+ else if (packet) {
571
+ let { stream: packetStream } = await streamPacket({ packet, type, limit });
572
+ const { transforms } = this.csvToObjectTransforms({});
573
+ transforms.forEach((t) => {
574
+ packetStream = packetStream.pipe(t);
575
+ });
576
+ return { stream: packetStream };
577
+ }
578
+ else {
579
+ throw new Error('stream must be passed a stream, filename, or packet');
580
+ }
613
581
  };
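stream() is the lower-level source resolver: arrays become Readable.from streams, existing streams pass straight through, filenames dispatch to the Parquet/S3/R2 workers or to the local filesystem with encoding detection, and packets get the CSV transforms applied. A sketch with made-up paths:

    const worker = new Worker({ accountId: 'acct1' });
    const local = await worker.stream({ filename: './data/contacts.csv' });          // { stream, encoding: <detected> }
    const remote = await worker.stream({ filename: 's3://my-bucket/contacts.csv' }); // { stream, encoding: 'UTF-8' }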
614
-
615
582
  Worker.prototype.sample = async function (opts) {
616
- opts.limit = opts.limit || 10;
617
- const { stream } = await this.fileToObjectStream(opts);
618
- return stream.toArray();
583
+ opts.limit = opts.limit || 10;
584
+ const { stream } = await this.fileToObjectStream(opts);
585
+ return stream.toArray();
619
586
  };
620
587
  Worker.prototype.sample.metadata = {
621
- options: {
622
- filename: {}
623
- }
588
+ options: {
589
+ filename: {}
590
+ }
624
591
  };
625
592
  Worker.prototype.toArray = async function (opts) {
626
- const { stream } = await this.fileToObjectStream(opts);
627
- return stream.toArray();
593
+ const { stream } = await this.fileToObjectStream(opts);
594
+ return stream.toArray();
628
595
  };
629
596
  Worker.prototype.toArray.metadata = {
630
- options: {
631
- filename: {}
632
- }
597
+ options: {
598
+ filename: {}
599
+ }
633
600
  };
634
-
635
601
  Worker.prototype.write = async function (opts) {
636
- const { filename, content } = opts;
637
- if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
638
- const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
639
- const parts = filename.split('/');
640
- const directory = parts.slice(0, -1).join('/');
641
- const file = parts.slice(-1)[0];
642
- // debug(JSON.stringify({ parts, directory, file }));
643
- await worker.write({
644
- directory,
645
- file,
646
- content
647
- });
648
- } else {
649
- const directory = path.dirname(filename);
650
- await fsp.mkdir(directory, { recursive: true });
651
- await fsp.writeFile(filename, content);
652
- }
653
- return { success: true, filename };
602
+ const { filename, content } = opts;
603
+ if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
604
+ const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
605
+ const parts = filename.split('/');
606
+ const directory = parts.slice(0, -1).join('/');
607
+ const file = parts.slice(-1)[0];
608
+ // debug(JSON.stringify({ parts, directory, file }));
609
+ await worker.write({
610
+ directory,
611
+ file,
612
+ content
613
+ });
614
+ }
615
+ else {
616
+ const directory = path.dirname(filename);
617
+ await fsp.mkdir(directory, { recursive: true });
618
+ await fsp.writeFile(filename, content);
619
+ }
620
+ return { success: true, filename };
654
621
  };
655
622
  Worker.prototype.write.metadata = {
656
- options: {
657
- filename: { description: 'Location to write content to, can be local or s3:// or r2://' },
658
- content: {}
659
- }
623
+ options: {
624
+ filename: { description: 'Location to write content to, can be local or s3:// or r2://' },
625
+ content: {}
626
+ }
660
627
  };
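write() splits s3:// and r2:// targets into directory and file for the corresponding worker, and otherwise creates the local directory and writes the content. A sketch with made-up paths:

    const worker = new Worker({ accountId: 'acct1' });
    await worker.write({ filename: '/tmp/exports/hello.txt', content: 'hello' });
    await worker.write({ filename: 'r2://my-bucket/exports/hello.txt', content: 'hello' });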
661
-
662
628
  async function streamToString(stream) {
663
- // let's have a ReadableStream as a stream variable
664
- const chunks = [];
665
-
666
- for await (const chunk of stream) {
667
- chunks.push(Buffer.from(chunk));
668
- }
669
-
670
- return Buffer.concat(chunks).toString('utf-8');
629
+ // lets have a ReadableStream as a stream variable
630
+ const chunks = [];
631
+ for await (const chunk of stream) {
632
+ chunks.push(Buffer.from(chunk));
633
+ }
634
+ return Buffer.concat(chunks).toString('utf-8');
671
635
  }
672
636
  /*
673
637
  Retrieves and parses
674
638
  */
675
639
  Worker.prototype.json = async function (opts) {
676
- const { stream } = await this.stream(opts);
677
- const str = await streamToString(stream);
678
- try {
679
- return JSON5.parse(str);
680
- } catch (e) {
681
- debug(e);
682
- throw new Error(`Unparseable JSON received: ${opts.filename || '(native stream)'}`);
683
- }
640
+ const { stream } = await this.stream(opts);
641
+ const str = await streamToString(stream);
642
+ try {
643
+ return JSON5.parse(str);
644
+ }
645
+ catch (e) {
646
+ debug(e);
647
+ throw new Error(`Unparseable JSON received: ${opts.filename || '(native stream)'}`);
648
+ }
684
649
  };
685
650
  Worker.prototype.json.metadata = {
686
- options: {
687
- filename: { description: 'Get a javascript object from a file' }
688
- }
651
+ options: {
652
+ filename: { description: 'Get a javascript object from a file' }
653
+ }
689
654
  };
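json() streams the source to a string and parses it with JSON5, so comments and trailing commas in the file are tolerated. A sketch with a made-up file:

    const worker = new Worker({ accountId: 'acct1' });
    const settings = await worker.json({ filename: 'settings.json5' });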
690
-
691
655
  Worker.prototype.list = async function ({ directory, start: s, end: e }) {
692
- if (!directory) throw new Error('directory is required');
693
- let start = null;
694
- let end = null;
695
- if (s) start = relativeDate(s);
696
- if (e) end = relativeDate(e);
697
-
698
- if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
699
- const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
700
- return worker.list({ directory, start, end });
701
- }
702
- const a = await fsp.readdir(directory, { withFileTypes: true });
703
-
704
- const withModified = [];
705
- for (const file of a) {
706
- const fullPath = path.join(directory, file.name);
707
- const stats = await fsp.stat(fullPath);
708
- if (start && stats.mtime < start.getTime()) {
709
- //do not include
710
- } else if (end && stats.mtime > end.getTime()) {
711
- //do nothing
712
- } else {
713
- withModified.push({
714
- name: file.name,
715
- type: file.isDirectory() ? 'directory' : 'file',
716
- modifiedAt: new Date(stats.mtime).toISOString()
717
- });
718
- }
719
- }
720
-
721
- return withModified;
656
+ if (!directory)
657
+ throw new Error('directory is required');
658
+ let start = null;
659
+ let end = null;
660
+ if (s)
661
+ start = relativeDate(s);
662
+ if (e)
663
+ end = relativeDate(e);
664
+ if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
665
+ const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
666
+ return worker.list({ directory, start, end });
667
+ }
668
+ const a = await fsp.readdir(directory, { withFileTypes: true });
669
+ const withModified = [];
670
+ for (const file of a) {
671
+ const fullPath = path.join(directory, file.name);
672
+ const stats = await fsp.stat(fullPath);
673
+ if (start && stats.mtime < start.getTime()) {
674
+ //do not include
675
+ }
676
+ else if (end && stats.mtime > end.getTime()) {
677
+ //do nothing
678
+ }
679
+ else {
680
+ withModified.push({
681
+ name: file.name,
682
+ type: file.isDirectory() ? 'directory' : 'file',
683
+ modifiedAt: new Date(stats.mtime).toISOString()
684
+ });
685
+ }
686
+ }
687
+ return withModified;
722
688
  };
723
689
  Worker.prototype.list.metadata = {
724
- options: {
725
- directory: { required: true }
726
- }
690
+ options: {
691
+ directory: { required: true }
692
+ }
727
693
  };
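list() returns name/type/modifiedAt entries for a local or s3://, r2:// directory, optionally windowed by start and end values run through relativeDate(). A sketch; the directory is made up, and the '-7d' shorthand is an assumption about what relativeDate accepts:

    const worker = new Worker({ accountId: 'acct1' });
    const recent = await worker.list({ directory: '/tmp/exports', start: '-7d' });
    // [{ name, type: 'file' | 'directory', modifiedAt }, ...]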
728
-
729
694
  Worker.prototype.listAll = async function ({ directory, start: s, end: e }) {
730
- if (!directory) throw new Error('directory is required');
731
- let start = null;
732
- let end = null;
733
- if (s) start = relativeDate(s).getTime();
734
- if (e) end = relativeDate(e).getTime();
735
- if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
736
- const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
737
- return worker.listAll({ directory, start, end });
738
- }
739
- const a = await fsp.readdir(directory, { recursive: true });
740
-
741
- let files = a.map((f) => `${directory}/${f}`);
742
- if (!start && !end) {
743
- return files;
744
- }
745
- const pLimit = await import('p-limit');
746
-
747
- const limitedMethod = pLimit.default(10);
748
- const filesWithinLimit = [];
749
-
750
- await Promise.all(
751
- files.map((filename) =>
752
- limitedMethod(async () => {
695
+ if (!directory)
696
+ throw new Error('directory is required');
697
+ let start = null;
698
+ let end = null;
699
+ if (s)
700
+ start = relativeDate(s).getTime();
701
+ if (e)
702
+ end = relativeDate(e).getTime();
703
+ if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
704
+ const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
705
+ return worker.listAll({ directory, start, end });
706
+ }
707
+ const a = await fsp.readdir(directory, { recursive: true });
708
+ let files = a.map((f) => `${directory}/${f}`);
709
+ if (!start && !end) {
710
+ return files;
711
+ }
712
+ const pLimit = await import('p-limit');
713
+ const limitedMethod = pLimit.default(10);
714
+ const filesWithinLimit = [];
715
+ await Promise.all(files.map((filename) => limitedMethod(async () => {
753
716
  const stats = await fsp.stat(filename);
754
717
  if (start && stats.mtime < start) {
755
- //do not include
756
- } else if (end && stats.mtime > end) {
757
- //do nothing
758
- } else {
759
- filesWithinLimit.push({
760
- name: filename,
761
- type: stats.isDirectory() ? 'directory' : 'file',
762
- modifiedAt: new Date(stats.mtime).toISOString()
763
- });
718
+ //do not include
719
+ }
720
+ else if (end && stats.mtime > end) {
721
+ //do nothing
764
722
  }
765
- })
766
- )
767
- );
768
- return filesWithinLimit;
723
+ else {
724
+ filesWithinLimit.push({
725
+ name: filename,
726
+ type: stats.isDirectory() ? 'directory' : 'file',
727
+ modifiedAt: new Date(stats.mtime).toISOString()
728
+ });
729
+ }
730
+ })));
731
+ return filesWithinLimit;
769
732
  };
770
733
  Worker.prototype.listAll.metadata = {
771
- options: {
772
- directory: { required: true },
773
- start: {},
774
- end: {}
775
- }
734
+ options: {
735
+ directory: { required: true },
736
+ start: {},
737
+ end: {}
738
+ }
776
739
  };
777
-
778
740
  Worker.prototype.moveAll = async function (options) {
779
- const { directory, targetDirectory } = options;
780
- if (!directory) throw new Error('directory is required');
781
- if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
782
- const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
783
- return worker.moveAll(options);
784
- }
785
- const a = await this.listAll(options);
786
-
787
- let configs = a.map((f) => {
788
- let filename = typeof f === 'string' ? f : f.filename;
789
- return {
790
- filename,
791
- target: filename.replace(directory, targetDirectory)
792
- };
793
- });
794
- const pLimit = await import('p-limit');
795
-
796
- const limitedMethod = pLimit.default(10);
797
-
798
- return Promise.all(configs.map(({ filename, target }) => limitedMethod(async () => this.move({ filename, target }))));
741
+ const { directory, targetDirectory } = options;
742
+ if (!directory)
743
+ throw new Error('directory is required');
744
+ if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
745
+ const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
746
+ return worker.moveAll(options);
747
+ }
748
+ const a = await this.listAll(options);
749
+ let configs = a.map((f) => {
750
+ let filename = typeof f === 'string' ? f : f.filename;
751
+ return {
752
+ filename,
753
+ target: filename.replace(directory, targetDirectory)
754
+ };
755
+ });
756
+ const pLimit = await import('p-limit');
757
+ const limitedMethod = pLimit.default(10);
758
+ return Promise.all(configs.map(({ filename, target }) => limitedMethod(async () => this.move({ filename, target }))));
799
759
  };
800
760
  Worker.prototype.moveAll.metadata = {
801
- options: {
802
- directory: { required: true },
803
- targetDirectory: { required: true }
804
- }
761
+ options: {
762
+ directory: { required: true },
763
+ targetDirectory: { required: true }
764
+ }
805
765
  };
806
-
807
766
  Worker.prototype.empty = async function ({ directory }) {
808
- if (!directory) throw new Error('directory is required');
809
- if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
810
- // currently not emptying S3 this way -- dangerous
811
- throw new Error('Cannot empty an s3:// or r2:// directory');
812
- }
813
- const removed = [];
814
-
815
- for (const file of await fsp.readdir(directory)) {
816
- removed.push(file);
817
- await fsp.unlink(path.join(directory, file));
818
- }
819
- return { directory, removed };
767
+ if (!directory)
768
+ throw new Error('directory is required');
769
+ if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
770
+ // currently not emptying S3 this way -- dangerous
771
+ throw new Error('Cannot empty an s3:// or r2:// directory');
772
+ }
773
+ const removed = [];
774
+ for (const file of await fsp.readdir(directory)) {
775
+ removed.push(file);
776
+ await fsp.unlink(path.join(directory, file));
777
+ }
778
+ return { directory, removed };
820
779
  };
821
780
  Worker.prototype.empty.metadata = {
822
- options: {
823
- directory: { required: true }
824
- }
781
+ options: {
782
+ directory: { required: true }
783
+ }
825
784
  };
826
-
827
785
  Worker.prototype.removeAll = async function (options) {
828
- const filenames = await this.listAll(options);
829
-
830
- const pLimit = await import('p-limit');
831
-
832
- const limitedMethod = pLimit.default(10);
833
-
834
- return Promise.all(filenames.map((filename) => limitedMethod(async () => this.remove({ filename }))));
786
+ const filenames = await this.listAll(options);
787
+ const pLimit = await import('p-limit');
788
+ const limitedMethod = pLimit.default(10);
789
+ return Promise.all(filenames.map((filename) => limitedMethod(async () => this.remove({ filename }))));
835
790
  };
836
791
  Worker.prototype.removeAll.metadata = {
837
- options: {
838
- directory: { required: true },
839
- start: {},
840
- end: {}
841
- }
792
+ options: {
793
+ directory: { required: true },
794
+ start: {},
795
+ end: {}
796
+ }
842
797
  };
843
-
844
798
  Worker.prototype.remove = async function ({ filename }) {
845
- if (!filename) throw new Error('filename is required');
846
- if (typeof filename !== 'string') throw new Error(`filename isn't a string:${JSON.stringify(filename)}`);
847
- if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
848
- let worker = null;
849
- if (filename.startsWith('r2://')) {
850
- worker = new R2Worker(this);
851
- } else {
852
- worker = new S3Worker(this);
853
- }
854
-
855
- await worker.remove({ filename });
856
- } else {
857
- fsp.unlink(filename);
858
- }
859
-
860
- return { removed: filename };
799
+ if (!filename)
800
+ throw new Error('filename is required');
801
+ if (typeof filename !== 'string')
802
+ throw new Error(`filename isn't a string:${JSON.stringify(filename)}`);
803
+ if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
804
+ let worker = null;
805
+ if (filename.startsWith('r2://')) {
806
+ worker = new R2Worker(this);
807
+ }
808
+ else {
809
+ worker = new S3Worker(this);
810
+ }
811
+ await worker.remove({ filename });
812
+ }
813
+ else {
814
+ fsp.unlink(filename);
815
+ }
816
+ return { removed: filename };
861
817
  };
862
818
  Worker.prototype.remove.metadata = {
863
- options: {
864
- filename: {}
865
- }
819
+ options: {
820
+ filename: {}
821
+ }
866
822
  };
867
-
868
823
  Worker.prototype.move = async function ({ filename, target, remove = true }) {
869
- if (!target) throw new Error('target is required');
870
- if (typeof target !== 'string') throw new Error(`target isn't a string:${JSON.stringify(target)}`);
871
- if (target.startsWith('s3://') || target.startsWith('r2://')) {
872
- if (
873
- (target.startsWith('s3://') && filename.startsWith('r2://')) ||
874
- (target.startsWith('r2://') && filename.startsWith('s3://'))
875
- ) {
876
- throw new Error('Cowardly not copying between services');
877
- }
878
-
879
- let worker = null;
880
- if (target.startsWith('r2://')) {
881
- worker = new R2Worker(this);
882
- } else {
883
- worker = new S3Worker(this);
884
- }
885
-
886
- if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
887
- // We need to copy and delete
888
- const output = await worker.copy({ filename, target });
889
- if (remove) await worker.remove({ filename });
890
- return output;
891
- }
892
- const parts = target.split('/');
893
- return worker.put({ filename, directory: parts.slice(0, -1).join('/'), file: parts.slice(-1)[0] });
894
- }
895
- await fsp.mkdir(path.dirname(target), { recursive: true });
896
- if (remove) {
897
- try {
898
- await fsp.rename(filename, target);
899
- } catch (e) {
900
- //it may be a filesystem issue moving between items
901
- debug('Assuming this is a filesystem crosslink error, ignoring ', e.getMessage());
902
- await fsp.copyFile(filename, target);
903
- await fsp.unlink(filename);
904
- }
905
- } else {
906
- await fsp.copyFile(filename, target);
907
- }
908
- return { filename: target };
824
+ if (!target)
825
+ throw new Error('target is required');
826
+ if (typeof target !== 'string')
827
+ throw new Error(`target isn't a string:${JSON.stringify(target)}`);
828
+ if (target.startsWith('s3://') || target.startsWith('r2://')) {
829
+ if ((target.startsWith('s3://') && filename.startsWith('r2://')) ||
830
+ (target.startsWith('r2://') && filename.startsWith('s3://'))) {
831
+ throw new Error('Cowardly not copying between services');
832
+ }
833
+ let worker = null;
834
+ if (target.startsWith('r2://')) {
835
+ worker = new R2Worker(this);
836
+ }
837
+ else {
838
+ worker = new S3Worker(this);
839
+ }
840
+ if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
841
+ // We need to copy and delete
842
+ const output = await worker.copy({ filename, target });
843
+ if (remove)
844
+ await worker.remove({ filename });
845
+ return output;
846
+ }
847
+ const parts = target.split('/');
848
+ return worker.put({ filename, directory: parts.slice(0, -1).join('/'), file: parts.slice(-1)[0] });
849
+ }
850
+ await fsp.mkdir(path.dirname(target), { recursive: true });
851
+ if (remove) {
852
+ try {
853
+ await fsp.rename(filename, target);
854
+ }
855
+ catch (e) {
856
+ //it may be a filesystem issue moving between items
857
+ debug('Assuming this is a filesystem crosslink error, ignoring ', e.getMessage());
858
+ await fsp.copyFile(filename, target);
859
+ await fsp.unlink(filename);
860
+ }
861
+ }
862
+ else {
863
+ await fsp.copyFile(filename, target);
864
+ }
865
+ return { filename: target };
909
866
  };
910
867
  Worker.prototype.move.metadata = {
911
- options: {
912
- filename: {},
913
- target: {}
914
- }
868
+ options: {
869
+ filename: {},
870
+ target: {}
871
+ }
915
872
  };
916
-
917
873
  Worker.prototype.copy = async function (opts) {
918
- return this.move({ ...opts, remove: false });
874
+ return this.move({ ...opts, remove: false });
919
875
  };
920
876
  Worker.prototype.copy.metadata = {
921
- options: {
922
- filename: {},
923
- target: {}
924
- }
877
+ options: {
878
+ filename: {},
879
+ target: {}
880
+ }
925
881
  };
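move() renames locally (falling back to copy-then-unlink when rename fails across filesystems), uploads to S3/R2 via put, or copies between objects on the same service; copy() is move() with remove: false. A sketch with made-up paths:

    const worker = new Worker({ accountId: 'acct1' });
    await worker.copy({ filename: '/tmp/out.csv', target: '/archive/out.csv' });
    await worker.move({ filename: '/tmp/out.csv', target: 's3://my-bucket/incoming/out.csv' });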
926
-
927
882
  Worker.prototype.stat = async function ({ filename }) {
928
- if (!filename) throw new Error('filename is required');
929
- const output = {};
930
-
931
- if (filename.slice(-8) === '.parquet') {
932
- const pq = new ParquetWorker(this);
933
- output.schema = (await pq.schema({ filename }))?.schema;
934
- output.records = (await pq.meta({ filename }))?.records;
935
- }
936
-
937
- if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
938
- const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
939
- Object.assign(output, await worker.stat({ filename }));
940
- } else {
941
- const { ctime, birthtime, size } = await fsp.stat(filename);
942
- const modifiedAt = new Date(ctime);
943
- let createdAt = birthtime;
944
- if (createdAt === 0 || !createdAt) createdAt = ctime;
945
- createdAt = new Date(createdAt);
946
- Object.assign(output, {
947
- createdAt,
948
- modifiedAt,
949
- size
950
- });
951
- }
952
- return output;
883
+ if (!filename)
884
+ throw new Error('filename is required');
885
+ const output = {};
886
+ if (filename.slice(-8) === '.parquet') {
887
+ const pq = new ParquetWorker(this);
888
+ output.schema = (await pq.schema({ filename }))?.schema;
889
+ output.records = (await pq.meta({ filename }))?.records;
890
+ }
891
+ if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
892
+ const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
893
+ Object.assign(output, await worker.stat({ filename }));
894
+ }
895
+ else {
896
+ const { ctime, birthtime, size } = await fsp.stat(filename);
897
+ const modifiedAt = new Date(ctime);
898
+ let createdAt = birthtime;
899
+ if (createdAt === 0 || !createdAt)
900
+ createdAt = ctime;
901
+ createdAt = new Date(createdAt);
902
+ Object.assign(output, {
903
+ createdAt,
904
+ modifiedAt,
905
+ size
906
+ });
907
+ }
908
+ return output;
953
909
  };
954
910
  Worker.prototype.stat.metadata = {
955
- options: {
956
- filename: {}
957
- }
911
+ options: {
912
+ filename: {}
913
+ }
958
914
  };
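
stat branches on the filename in three ways: .parquet files additionally get schema and record counts from the Parquet worker, s3:// and r2:// paths are delegated to the corresponding object-store worker, and anything else is read with fs.promises.stat (modifiedAt is derived from ctime, and createdAt falls back to ctime when birthtime is unavailable). A short sketch under the same illustrative assumptions as above:

import Worker from './file.js'; // illustrative path

const worker = new Worker({ accountId: 'example-account' });

// Local file: { createdAt, modifiedAt, size } from fs.promises.stat
const localInfo = await worker.stat({ filename: '/data/reports/report.csv' });

// Local Parquet file: the same call also returns { schema, records } via ParquetWorker
const parquetInfo = await worker.stat({ filename: '/data/events.parquet' });
console.log(parquetInfo.records, parquetInfo.schema);
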
959
-
960
915
  Worker.prototype.download = async function ({ filename }) {
961
- if (!filename) throw new Error('filename is required');
962
- if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
963
- const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
964
- return worker.download({ filename });
965
- }
966
- throw new Error('Cannot download a local file');
916
+ if (!filename)
917
+ throw new Error('filename is required');
918
+ if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
919
+ const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
920
+ return worker.download({ filename });
921
+ }
922
+ throw new Error('Cannot download a local file');
967
923
  };
968
924
  Worker.prototype.download.metadata = {
969
- options: {
970
- filename: {}
971
- }
925
+ options: {
926
+ filename: {}
927
+ }
972
928
  };
973
-
974
929
  Worker.prototype.head = async function (options) {
975
- const limit = options.limit || 3;
976
- const { stream } = await this.fileToObjectStream({ ...options, limit });
977
- const chunks = [];
978
-
979
- let counter = 0;
980
-
981
- for await (const chunk of stream) {
982
- chunks.push(chunk);
983
- counter += 1;
984
- if (counter >= limit) break;
985
- }
986
-
987
- return chunks;
930
+ const limit = options.limit || 3;
931
+ const { stream } = await this.fileToObjectStream({ ...options, limit });
932
+ const chunks = [];
933
+ let counter = 0;
934
+ for await (const chunk of stream) {
935
+ chunks.push(chunk);
936
+ counter += 1;
937
+ if (counter >= limit)
938
+ break;
939
+ }
940
+ return chunks;
988
941
  };
989
-
990
942
  Worker.prototype.head.metadata = {
991
- options: {
992
- filename: { required: true }
993
- }
943
+ options: {
944
+ filename: { required: true }
945
+ }
994
946
  };
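
head streams the file as objects via fileToObjectStream and returns at most limit rows (default 3) without reading the rest of the file. A small sketch, with the filename again illustrative:

import Worker from './file.js'; // illustrative path

const worker = new Worker({ accountId: 'example-account' });
const firstRows = await worker.head({ filename: '/data/reports/report.csv', limit: 5 });
// firstRows is an array of up to 5 parsed row objects
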
995
-
996
947
  Worker.prototype.columns = async function (options) {
997
- const head = await this.head(options);
998
- if (head.length == 0) {
948
+ const head = await this.head(options);
949
+ if (head.length == 0) {
950
+ return {
951
+ records: 0,
952
+ likelyHeaderLines: 0,
953
+ columns: []
954
+ };
955
+ }
956
+ let likelyHeaderLines = 1;
957
+ const columns = Object.keys(head[0]);
958
+ let s = columns.join(',');
959
+ if (s.match(/[()@#%!]/)) {
960
+ likelyHeaderLines = 0;
961
+ }
999
962
  return {
1000
- records: 0,
1001
- likelyHeaderLines: 0,
1002
- columns: []
963
+ likelyHeaderLines,
964
+ columns
1003
965
  };
1004
- }
1005
-
1006
- let likelyHeaderLines = 1;
1007
- const columns = Object.keys(head[0]);
1008
- let s = columns.join(',');
1009
- if (s.match(/[()@#%!]/)) {
1010
- likelyHeaderLines = 0;
1011
- }
1012
- return {
1013
- likelyHeaderLines,
1014
- columns
1015
- };
1016
966
  };
1017
-
1018
967
  Worker.prototype.columns.metadata = {
1019
- options: {
1020
- filename: { required: true }
1021
- }
968
+ options: {
969
+ filename: { required: true }
970
+ }
1022
971
  };
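
columns builds on head: it takes the keys of the first parsed row as the column list and guesses likelyHeaderLines, flipping it from 1 to 0 when the joined column names contain characters such as ( ) @ # % ! that look more like data than header labels; an empty file yields records: 0 and no columns. A sketch of the intended usage (filename and values illustrative):

import Worker from './file.js'; // illustrative path

const worker = new Worker({ accountId: 'example-account' });
const { columns, likelyHeaderLines } = await worker.columns({ filename: '/data/reports/report.csv' });
// e.g. columns = ['id', 'email', 'created_at'], likelyHeaderLines = 1
// If the parser picked up a data line such as "123,person@example.com,2024-01-01" as the header,
// the '@' would push likelyHeaderLines down to 0
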
1023
-
1024
972
  Worker.prototype.count = async function (options) {
1025
- const { stream } = await this.fileToObjectStream(options);
1026
- const sample = [];
1027
-
1028
- const limit = options.limit || 5;
1029
- let records = 0;
1030
-
1031
- for await (const chunk of stream) {
1032
- records += 1;
1033
- if (records < limit) {
1034
- sample.push(chunk);
1035
- }
1036
- }
1037
-
1038
- return { sample, records };
973
+ const { stream } = await this.fileToObjectStream(options);
974
+ const sample = [];
975
+ const limit = options.limit || 5;
976
+ let records = 0;
977
+ for await (const chunk of stream) {
978
+ records += 1;
979
+ if (records < limit) {
980
+ sample.push(chunk);
981
+ }
982
+ }
983
+ return { sample, records };
1039
984
  };
1040
-
1041
985
  Worker.prototype.count.metadata = {
1042
- options: {
1043
- filename: { required: true }
1044
- }
986
+ options: {
987
+ filename: { required: true }
988
+ }
1045
989
  };
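
count consumes the whole object stream to produce an exact record count, keeping only the first handful of rows (fewer than limit, default 5) as a sample rather than buffering the file. A sketch:

import Worker from './file.js'; // illustrative path

const worker = new Worker({ accountId: 'example-account' });
const { records, sample } = await worker.count({ filename: '/data/reports/report.csv' });
console.log(`${records} rows; first rows:`, sample);
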
1046
-
1047
990
  // Get a set of unique entries from a uniqueFunction
1048
991
  // This could be large
1049
992
  Worker.prototype.getUniqueSet = async function (options) {
1050
- const existingFiles = getStringArray(options.filenames);
1051
- const sample = {};
1052
-
1053
- let { uniqueFunction } = options;
1054
- if (!uniqueFunction) {
1055
- uniqueFunction = (o) => JSON.stringify(o);
1056
- }
1057
- const uniqueSet = new Set();
1058
-
1059
- for (const filename of existingFiles) {
1060
- const { stream: existsStream } = await this.fileToObjectStream({ filename });
1061
- await pipeline(
1062
- existsStream,
1063
- new Transform({
1064
- objectMode: true,
1065
- transform(d, enc, cb) {
1066
- const v = uniqueFunction(makeStrings(d)) || '';
1067
- if (uniqueSet.size < 3) {
1068
- sample[v] = d;
1069
- }
1070
- uniqueSet.add(v);
1071
- cb(null, d);
1072
- }
1073
- }),
1074
- new Writable({
1075
- objectMode: true,
1076
- write(d, enc, cb) {
1077
- cb();
1078
- }
1079
- })
1080
- );
1081
- debug(`Finished loading ${filename}`);
1082
- }
1083
- return { uniqueFunction, uniqueSet, sample };
993
+ const existingFiles = getStringArray(options.filenames);
994
+ const sample = {};
995
+ let { uniqueFunction } = options;
996
+ if (!uniqueFunction) {
997
+ uniqueFunction = (o) => JSON.stringify(o);
998
+ }
999
+ const uniqueSet = new Set();
1000
+ for (const filename of existingFiles) {
1001
+ const { stream: existsStream } = await this.fileToObjectStream({ filename });
1002
+ await pipeline(existsStream, new Transform({
1003
+ objectMode: true,
1004
+ transform(d, enc, cb) {
1005
+ const v = uniqueFunction(makeStrings(d)) || '';
1006
+ if (uniqueSet.size < 3) {
1007
+ sample[v] = d;
1008
+ }
1009
+ uniqueSet.add(v);
1010
+ cb(null, d);
1011
+ }
1012
+ }), new Writable({
1013
+ objectMode: true,
1014
+ write(d, enc, cb) {
1015
+ cb();
1016
+ }
1017
+ }));
1018
+ debug(`Finished loading ${filename}`);
1019
+ }
1020
+ return { uniqueFunction, uniqueSet, sample };
1084
1021
  };
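
getUniqueSet streams each file in filenames through a pass-through pipeline and collects the value of uniqueFunction (default: JSON.stringify of the stringified row) for every record into a Set, plus a sample of up to three rows keyed by those values; only the keys are held in memory, not full rows. A sketch, with the files and key function illustrative:

import Worker from './file.js'; // illustrative path

const worker = new Worker({ accountId: 'example-account' });
const { uniqueSet, sample } = await worker.getUniqueSet({
  filenames: ['/data/existing-a.csv', '/data/existing-b.csv'],
  uniqueFunction: (o) => o.email, // defaults to JSON.stringify of the row
});
uniqueSet.has('person@example.com'); // true if that email appears in either file
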
1085
-
1086
1022
  Worker.prototype.getUniqueStream = async function (options) {
1087
- const includeDuplicateSourceRecords = bool(options.includeDuplicateSourceRecords, false);
1088
-
1089
- const { uniqueSet, uniqueFunction, sample } = await this.getUniqueSet({
1090
- filenames: options.existingFiles,
1091
- uniqueFunction: options.uniqueFunction
1092
- });
1093
-
1094
- const { stream: inStream } = await this.fileToObjectStream(options);
1095
- const uniqueStream = inStream.pipe(
1096
- new Transform({
1097
- objectMode: true,
1098
- transform(d, enc, cb) {
1099
- const v = uniqueFunction(makeStrings(d)) || '';
1100
-
1101
- if (!v) {
1102
- // falsey unique function includes
1103
- // by default
1104
- cb(null, d);
1105
- } else if (uniqueSet.has(v)) {
1106
- // do nothing
1107
- cb();
1108
- } else {
1109
- if (!includeDuplicateSourceRecords) {
1110
- // add it to the set for the next time
1111
- uniqueSet.add(v);
1112
- }
1113
- cb(null, d);
1023
+ const includeDuplicateSourceRecords = bool(options.includeDuplicateSourceRecords, false);
1024
+ const { uniqueSet, uniqueFunction, sample } = await this.getUniqueSet({
1025
+ filenames: options.existingFiles,
1026
+ uniqueFunction: options.uniqueFunction
1027
+ });
1028
+ const { stream: inStream } = await this.fileToObjectStream(options);
1029
+ const uniqueStream = inStream.pipe(new Transform({
1030
+ objectMode: true,
1031
+ transform(d, enc, cb) {
1032
+ const v = uniqueFunction(makeStrings(d)) || '';
1033
+ if (!v) {
1034
+ // falsey unique function includes
1035
+ // by default
1036
+ cb(null, d);
1037
+ }
1038
+ else if (uniqueSet.has(v)) {
1039
+ // do nothing
1040
+ cb();
1041
+ }
1042
+ else {
1043
+ if (!includeDuplicateSourceRecords) {
1044
+ // add it to the set for the next time
1045
+ uniqueSet.add(v);
1046
+ }
1047
+ cb(null, d);
1048
+ }
1114
1049
  }
1115
- }
1116
- })
1117
- );
1118
- return { stream: uniqueStream, sample };
1050
+ }));
1051
+ return { stream: uniqueStream, sample };
1119
1052
  };
1120
-
1121
1053
  Worker.prototype.getUniqueStream.metadata = {
1122
- options: {
1123
- existingFiles: {},
1124
- uniqueFunction: {},
1125
- filename: { description: 'Specify a source filename or a stream' },
1126
- stream: { description: 'Specify a source filename or a stream' },
1127
- includeDuplicateSourceRecords: {
1128
- description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
1129
- }
1130
- }
1054
+ options: {
1055
+ existingFiles: {},
1056
+ uniqueFunction: {},
1057
+ filename: { description: 'Specify a source filename or a stream' },
1058
+ stream: { description: 'Specify a source filename or a stream' },
1059
+ includeDuplicateSourceRecords: {
1060
+ description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
1061
+ }
1062
+ }
1131
1063
  };
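
getUniqueStream first builds the key set from existingFiles with getUniqueSet, then pipes the source file (or stream) through a Transform that drops any row whose key is already known; rows with a falsey key always pass through, and unless includeDuplicateSourceRecords is true each emitted key is added to the set so later repeats within the source are dropped too (getUniqueFile is the same call with the result written out via objectStreamToFile). A sketch under the same illustrative assumptions:

import Worker from './file.js'; // illustrative path

const worker = new Worker({ accountId: 'example-account' });
const { stream } = await worker.getUniqueStream({
  filename: '/data/new.csv',
  existingFiles: ['/data/existing-a.csv'],
  uniqueFunction: (o) => o.email,
  includeDuplicateSourceRecords: false, // also drop repeats within new.csv itself
});
for await (const row of stream) {
  // row's email appears in new.csv but not in existing-a.csv
}
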
1132
1064
  Worker.prototype.getUniqueFile = async function (options) {
1133
- const { stream, sample } = await this.getUniqueStream(options);
1134
- const { filename, records } = await this.objectStreamToFile({ stream });
1135
- return { filename, records, sample };
1065
+ const { stream, sample } = await this.getUniqueStream(options);
1066
+ const { filename, records } = await this.objectStreamToFile({ stream });
1067
+ return { filename, records, sample };
1136
1068
  };
1137
-
1138
1069
  Worker.prototype.getUniqueFile.metadata = {
1139
- options: {
1140
- existingFiles: {},
1141
- uniqueFunction: {},
1142
- filename: { description: 'Specify a source filename or a stream' },
1143
- stream: { description: 'Specify a source filename or a stream' },
1144
- includeDuplicateSourceRecords: {
1145
- description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
1146
- }
1147
- }
1070
+ options: {
1071
+ existingFiles: {},
1072
+ uniqueFunction: {},
1073
+ filename: { description: 'Specify a source filename or a stream' },
1074
+ stream: { description: 'Specify a source filename or a stream' },
1075
+ includeDuplicateSourceRecords: {
1076
+ description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
1077
+ }
1078
+ }
1148
1079
  };
1149
-
1150
1080
  /*
1151
1081
  diff that allows for unordered files, and doesn't store full objects in memory.
1152
1082
  Requires 2 passes of the files,
1153
1083
  but that's a better tradeoff than trying to store huge files in memory
1154
1084
  */
1155
1085
  Worker.prototype.diff = async function (options) {
1156
- const { fileA, fileB, uniqueFunction: ufOpt, columns, includeDuplicateSourceRecords } = options;
1157
- if (options.fields) throw new Error('fields is deprecated, use columns');
1158
-
1159
- if (ufOpt && columns) throw new Error('fields and uniqueFunction cannot both be specified');
1160
- let uniqueFunction = ufOpt;
1161
- if (!uniqueFunction && columns) {
1162
- const farr = getStringArray(columns);
1163
- uniqueFunction = (o) => farr.map((f) => o[f] || '').join('.');
1164
- }
1165
-
1166
- const left = await this.getUniqueFile({
1167
- existingFiles: [fileB],
1168
- filename: fileA,
1169
- uniqueFunction,
1170
- includeDuplicateSourceRecords
1171
- });
1172
- const right = await this.getUniqueFile({
1173
- existingFiles: [fileA],
1174
- filename: fileB,
1175
- uniqueFunction,
1176
- includeDuplicateSourceRecords
1177
- });
1178
-
1179
- return {
1180
- left,
1181
- right
1182
- };
1086
+ const { fileA, fileB, uniqueFunction: ufOpt, columns, includeDuplicateSourceRecords } = options;
1087
+ if (options.fields)
1088
+ throw new Error('fields is deprecated, use columns');
1089
+ if (ufOpt && columns)
1090
+ throw new Error('columns and uniqueFunction cannot both be specified');
1091
+ let uniqueFunction = ufOpt;
1092
+ if (!uniqueFunction && columns) {
1093
+ const farr = getStringArray(columns);
1094
+ uniqueFunction = (o) => farr.map((f) => o[f] || '').join('.');
1095
+ }
1096
+ const left = await this.getUniqueFile({
1097
+ existingFiles: [fileB],
1098
+ filename: fileA,
1099
+ uniqueFunction,
1100
+ includeDuplicateSourceRecords
1101
+ });
1102
+ const right = await this.getUniqueFile({
1103
+ existingFiles: [fileA],
1104
+ filename: fileB,
1105
+ uniqueFunction,
1106
+ includeDuplicateSourceRecords
1107
+ });
1108
+ return {
1109
+ left,
1110
+ right
1111
+ };
1183
1112
  };
1184
1113
  Worker.prototype.diff.metadata = {
1185
- options: {
1186
- fileA: {},
1187
- fileB: {},
1188
- columns: { description: 'Columns to use for uniqueness -- aka primary key. Defaults to JSON of line' },
1189
- uniqueFunction: {},
1190
- includeDuplicateSourceRecords: {
1191
- description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
1192
- }
1193
- }
1114
+ options: {
1115
+ fileA: {},
1116
+ fileB: {},
1117
+ columns: { description: 'Columns to use for uniqueness -- aka primary key. Defaults to JSON of line' },
1118
+ uniqueFunction: {},
1119
+ includeDuplicateSourceRecords: {
1120
+ description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
1121
+ }
1122
+ }
1194
1123
  };
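
diff is built from two getUniqueFile passes rather than an in-memory comparison: left holds the rows of fileA whose key never appears in fileB, right holds the rows of fileB whose key never appears in fileA, and the key comes either from columns (values joined with '.') or from a custom uniqueFunction, never both. A sketch with illustrative filenames and key column:

import Worker from './file.js'; // illustrative path

const worker = new Worker({ accountId: 'example-account' });
const { left, right } = await worker.diff({
  fileA: '/data/yesterday.csv',
  fileB: '/data/today.csv',
  columns: ['id'], // or uniqueFunction: (o) => ...
});
// left and right are each { filename, records, sample } from getUniqueFile:
// left  -> rows only in fileA, right -> rows only in fileB
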
1195
-
1196
- module.exports = Worker;
1124
+ export default Worker;
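
The final hunk is the visible edge of the 2.0.0 CommonJS-to-ESM migration: the worker is now the module's default export rather than module.exports. A hedged before/after for consumers, with the relative path purely illustrative:

// 1.9.x (CommonJS)
// const Worker = require('./file');

// 2.0.0 (ES modules)
import Worker from './file.js';
const worker = new Worker({ accountId: 'example-account' });
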