@engine9-io/input-tools 1.9.11 → 2.0.1

This diff shows the changes between the publicly released 1.9.11 and 2.0.1 versions of the package, as published to the registry.
@@ -1,43 +1,32 @@
- const fs = require('node:fs');
-
+ import fs from 'node:fs';
+ import path from 'node:path';
+ import zlib from 'node:zlib';
+ import nodestream from 'node:stream';
+ import promises from 'node:stream/promises';
+ import { parse, stringify } from 'csv';
+ import debug$0 from 'debug';
+ import xlstream from 'xlstream';
+ import JSON5 from 'json5';
+ import languageEncoding from 'detect-file-encoding-and-language';
+ import R2Worker from './R2.js';
+ import S3Worker from './S3.js';
+ import ParquetWorker from './Parquet.js';
+ import { bool, getTempFilename, getStringArray, getTempDir, makeStrings, streamPacket, relativeDate } from './tools.js';
  const fsp = fs.promises;
- const path = require('node:path');
- const zlib = require('node:zlib');
- const { Readable, Transform, PassThrough, Writable } = require('node:stream');
- const { pipeline } = require('node:stream/promises');
- const { stringify } = require('csv');
-
- const debug = require('debug')('@engine9-io/file');
-
- const { getXlsxStream } = require('xlstream');
- const csv = require('csv');
- const JSON5 = require('json5');
-
- const languageEncoding = require('detect-file-encoding-and-language');
- const R2Worker = require('./R2');
- const S3Worker = require('./S3');
- const ParquetWorker = require('./Parquet');
+ const { Readable, Transform, PassThrough, Writable } = nodestream;
+ const { pipeline } = promises;

- const {
- bool,
- getTempFilename,
- getStringArray,
- getTempDir,
- makeStrings,
- streamPacket,
- relativeDate
- } = require('./tools');
+ const debug = debug$0('@engine9-io/file');
+ const { getXlsxStream } = xlstream;

  function Worker({ accountId }) {
  this.accountId = accountId;
  }
-
  class LineReaderTransform extends Transform {
  constructor(options = {}) {
  super({ ...options, readableObjectMode: true });
  this.buffer = '';
  }
-
  _transform(chunk, encoding, callback) {
  this.buffer += chunk.toString();
  const lines = this.buffer.split(/\r?\n/);
@@ -45,7 +34,6 @@ class LineReaderTransform extends Transform {
  lines.forEach((line) => this.push(line));
  callback();
  }
-
  _flush(callback) {
  if (this.buffer) {
  this.push(this.buffer);
@@ -53,11 +41,9 @@ class LineReaderTransform extends Transform {
  callback();
  }
  }
-
  Worker.prototype.csvToObjectTransforms = function (options) {
  const transforms = [];
  const delimiter = options.delimiter || ',';
-
  const headerMapping =
  options.headerMapping ||
  function (d) {
@@ -65,7 +51,6 @@ Worker.prototype.csvToObjectTransforms = function (options) {
  };
  let lastLine = null;
  let head = null;
-
  const skipLinesWithError = bool(options.skip_lines_with_error, false);
  const parserOptions = {
  relax: true,
@@ -82,27 +67,23 @@ Worker.prototype.csvToObjectTransforms = function (options) {
  if (options.limit) {
  parserOptions.to = options.limit;
  }
-
  debug('Parser options=', parserOptions);
- const parser = csv.parse(parserOptions);
+ const parser = parse(parserOptions);
  parser.on('error', (error) => {
  debug('fileToObjectStream: Error parsing csv file');
  debug(lastLine);
  throw new Error(error);
  });
-
  const blankAndHeaderCheck = new Transform({
  objectMode: true,
  transform(row, enc, cb) {
  // Blank rows
  if (row.length === 0) return cb();
  if (row.length === 1 && !row[0]) return cb();
-
  if (!head) {
  head = row.map(headerMapping);
  return cb();
  }
-
  const o = {};
  head.forEach((_h, i) => {
  const h = _h.trim();
@@ -110,18 +91,14 @@ Worker.prototype.csvToObjectTransforms = function (options) {
  o[h] = row[i];
  }
  });
-
  lastLine = row.join(delimiter);
  return cb(null, o);
  }
  });
-
  transforms.push(parser);
  transforms.push(blankAndHeaderCheck);
-
  return { transforms };
  };
-
  Worker.prototype.detectEncoding = async function (options) {
  if (options.encoding_override) return { encoding: options.encoding_override };
  // Limit to only the top N bytes -- for perfomance
@@ -154,19 +131,15 @@ Worker.prototype.detectEncoding = async function (options) {
  decompressStream.end();
  });
  }
-
  return languageEncoding(finalBuff);
  };
-
  Worker.prototype.detectEncoding.metadata = {
  options: {
  filename: { required: true }
  }
  };
-
  Worker.prototype.xlsxToObjectStream = async function (options) {
  let { filename } = options;
-
  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
  // We need to copy and delete
  let worker = null;
@@ -176,7 +149,6 @@ Worker.prototype.xlsxToObjectStream = async function (options) {
  worker = new S3Worker(this);
  }
  const target = getTempFilename({ targetFilename: filename.split('/').pop() });
-
  await worker.copy({ filename, target });
  filename = target;
  }
@@ -202,27 +174,22 @@ Worker.prototype.xlsxToObjectStream = async function (options) {
  }
  })
  );
-
  return { stream };
  };
-
  Worker.prototype.getFormat = async function (options) {
  const { sourcePostfix, filename, format: formatOverride } = options;
  let postfix = sourcePostfix || filename.toLowerCase().split('.').pop();
-
  if (postfix === 'gz') {
  postfix = filename.toLowerCase().split('.');
  postfix = postfix[postfix.length - 2];
  }
  return formatOverride || postfix;
  };
-
  /*
  Commonly used method to transform a file into a stream of objects.
  */
  Worker.prototype.fileToObjectStream = async function (options) {
  const { filename, columns, limit: limitOption, format: formatOverride } = options;
-
  // handle stream item
  if (options.stream) {
  if (Array.isArray(options.stream)) {
@@ -243,7 +210,6 @@ Worker.prototype.fileToObjectStream = async function (options) {
  debug('Invalid filename:', { filename });
  throw new Error('Cowardly refusing to turn a .zip file into an object stream, turn into a csv first');
  }
-
  const streamInfo = await this.stream({
  filename,
  columns,
@@ -256,11 +222,8 @@ Worker.prototype.fileToObjectStream = async function (options) {
  // already an object
  return { stream };
  }
-
  let count = 0;
-
  let transforms = [];
-
  if (postfix === 'gz') {
  const gunzip = zlib.createGunzip();
  transforms.push(gunzip);
@@ -273,9 +236,7 @@ Worker.prototype.fileToObjectStream = async function (options) {
  stream.setEncoding(encoding);
  }
  let format = formatOverride || postfix;
-
  debug(`Reading file ${filename} with encoding: ${encoding} and format ${format}`);
-
  if (format === 'csv') {
  const csvTransforms = this.csvToObjectTransforms({ ...options });
  transforms = transforms.concat(csvTransforms.transforms);
@@ -284,12 +245,10 @@ Worker.prototype.fileToObjectStream = async function (options) {
  transforms = transforms.concat(csvTransforms.transforms);
  } else if (format === 'jsonl') {
  /* Type of JSON that has the names in an array in the first record,
- and the values in JSON arrays thereafter
- */
+ and the values in JSON arrays thereafter
+ */
  let headers = null;
-
  const lineReader = new LineReaderTransform();
-
  const jsonlTransform = new Transform({
  objectMode: true,
  transform(d, enc, cb) {
@@ -303,8 +262,8 @@ Worker.prototype.fileToObjectStream = async function (options) {
  throw e;
  }
  /* JSONL could potentially start with an array of names,
- in which case we need to map the subsequent values
- */
+ in which case we need to map the subsequent values
+ */
  if (headers === null) {
  if (Array.isArray(obj)) {
  headers = obj;
@@ -324,7 +283,6 @@ Worker.prototype.fileToObjectStream = async function (options) {
  return cb();
  }
  });
-
  transforms.push(lineReader);
  transforms.push(jsonlTransform);
  } else {
@@ -348,20 +306,18 @@ Worker.prototype.fileToObjectStream = async function (options) {
  // Don't push dummy records anymore -- legacy cruft
  debug(`Completed reading file, records=${count}`);
  /* if (count === 0) {
- const o = { _is_placeholder: true };
-
- if (head) head.forEach((c) => { o[c] = null; });
- this.push(o);
- } */
+ const o = { _is_placeholder: true };
+
+ if (head) head.forEach((c) => { o[c] = null; });
+ this.push(o);
+ } */
  cb();
  }
  });
-
  transforms.push(countAndDebug);
  transforms.forEach((t) => {
  stream = stream.pipe(t);
  });
-
  return { stream };
  };
  Worker.prototype.getFileWriterStream = async function (options = {}) {
@@ -380,13 +336,10 @@ Worker.prototype.getFileWriterStream = async function (options = {}) {
  if (bool(options.gzip, false)) filename += '.gz';
  const stream = fs.createWriteStream(filename);
  debug('FileWriterStream writing to file ', filename);
-
  return { filename, stream };
  };
-
  Worker.prototype.getOutputStreams = async function (options) {
  const { filename, stream: fileWriterStream } = await this.getFileWriterStream(options);
-
  let { transform } = options;
  if (typeof options.transform === 'function') {
  if (options.transform.length === 3) {
@@ -409,7 +362,6 @@ Worker.prototype.getOutputStreams = async function (options) {
  }
  const { flatten } = options;
  let flattenTransform = null;
-
  if (bool(flatten, false)) {
  flattenTransform = new Transform({
  objectMode: true,
@@ -431,7 +383,6 @@ Worker.prototype.getOutputStreams = async function (options) {
  }
  });
  }
-
  const stats = {
  records: 0
  };
@@ -473,23 +424,17 @@ Worker.prototype.objectStreamToFile = async function (options) {
  await pipeline(streams);
  return { filename, records: stats.records };
  };
-
  Worker.prototype.transform = async function (options) {
  const worker = this;
-
  const { filename } = options;
-
  debug(`Transforming ${filename}`);
-
  options.filename = filename;
  let { stream } = await worker.fileToObjectStream(options);
  if (typeof stream.pipe !== 'function') {
  debug(stream);
  throw new Error('No pipe in stream');
  }
-
  let t = options.transform;
-
  // No longer need this
  delete options.transform;
  if (!t) {
@@ -498,7 +443,6 @@ Worker.prototype.transform = async function (options) {
  cb(null, d);
  };
  }
-
  if (!Array.isArray(t)) t = [t];
  Object.keys(t).forEach((key) => {
  let f = t[key];
@@ -508,22 +452,17 @@ Worker.prototype.transform = async function (options) {
  transform: f
  });
  }
-
  stream = stream.pipe(f);
  });
-
  const { targetFormat } = options;
-
  if (
  !targetFormat &&
  (filename.toLowerCase().slice(-4) === '.csv' || filename.toLowerCase().slice(-7) === '.csv.gz')
  ) {
  options.targetFormat = 'csv';
  }
-
  return worker.objectStreamToFile({ ...options, stream });
  };
-
  Worker.prototype.transform.metadata = {
  options: {
  sourcePostfix: { description: "Override the source postfix, if for example it's a csv" },
@@ -553,12 +492,10 @@ Worker.prototype.testTransform.metadata = {
  filename: true
  }
  };
-
  /* Get a stream from an actual stream, or an array, or a file */
  Worker.prototype.stream = async function (options) {
  const { stream: inputStream, packet, type, columns, limit, filename: filenameOpt } = options;
  let filename = filenameOpt;
-
  if (inputStream) {
  if (Array.isArray(inputStream)) {
  return { stream: Readable.from(inputStream) };
@@ -611,7 +548,6 @@ Worker.prototype.stream = async function (options) {
  throw new Error('stream must be passed a stream, filename, or packet');
  }
  };
-
  Worker.prototype.sample = async function (opts) {
  opts.limit = opts.limit || 10;
  const { stream } = await this.fileToObjectStream(opts);
@@ -631,7 +567,6 @@ Worker.prototype.toArray.metadata = {
  filename: {}
  }
  };
-
  Worker.prototype.write = async function (opts) {
  const { filename, content } = opts;
  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
@@ -658,15 +593,12 @@ Worker.prototype.write.metadata = {
  content: {}
  }
  };
-
  async function streamToString(stream) {
  // lets have a ReadableStream as a stream variable
  const chunks = [];
-
  for await (const chunk of stream) {
  chunks.push(Buffer.from(chunk));
  }
-
  return Buffer.concat(chunks).toString('utf-8');
  }
  /*
@@ -687,20 +619,17 @@ Worker.prototype.json.metadata = {
  filename: { description: 'Get a javascript object from a file' }
  }
  };
-
  Worker.prototype.list = async function ({ directory, start: s, end: e }) {
  if (!directory) throw new Error('directory is required');
  let start = null;
  let end = null;
  if (s) start = relativeDate(s);
  if (e) end = relativeDate(e);
-
  if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
  const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
  return worker.list({ directory, start, end });
  }
  const a = await fsp.readdir(directory, { withFileTypes: true });
-
  const withModified = [];
  for (const file of a) {
  const fullPath = path.join(directory, file.name);
@@ -717,7 +646,6 @@ Worker.prototype.list = async function ({ directory, start: s, end: e }) {
  });
  }
  }
-
  return withModified;
  };
  Worker.prototype.list.metadata = {
@@ -725,7 +653,6 @@ Worker.prototype.list.metadata = {
  directory: { required: true }
  }
  };
-
  Worker.prototype.listAll = async function ({ directory, start: s, end: e }) {
  if (!directory) throw new Error('directory is required');
  let start = null;
@@ -737,16 +664,13 @@ Worker.prototype.listAll = async function ({ directory, start: s, end: e }) {
  return worker.listAll({ directory, start, end });
  }
  const a = await fsp.readdir(directory, { recursive: true });
-
  let files = a.map((f) => `${directory}/${f}`);
  if (!start && !end) {
  return files;
  }
  const pLimit = await import('p-limit');
-
  const limitedMethod = pLimit.default(10);
  const filesWithinLimit = [];
-
  await Promise.all(
  files.map((filename) =>
  limitedMethod(async () => {
@@ -774,7 +698,6 @@ Worker.prototype.listAll.metadata = {
  end: {}
  }
  };
-
  Worker.prototype.moveAll = async function (options) {
  const { directory, targetDirectory } = options;
  if (!directory) throw new Error('directory is required');
@@ -783,7 +706,6 @@ Worker.prototype.moveAll = async function (options) {
  return worker.moveAll(options);
  }
  const a = await this.listAll(options);
-
  let configs = a.map((f) => {
  let filename = typeof f === 'string' ? f : f.filename;
  return {
@@ -792,9 +714,7 @@ Worker.prototype.moveAll = async function (options) {
  };
  });
  const pLimit = await import('p-limit');
-
  const limitedMethod = pLimit.default(10);
-
  return Promise.all(configs.map(({ filename, target }) => limitedMethod(async () => this.move({ filename, target }))));
  };
  Worker.prototype.moveAll.metadata = {
@@ -803,7 +723,6 @@ Worker.prototype.moveAll.metadata = {
  targetDirectory: { required: true }
  }
  };
-
  Worker.prototype.empty = async function ({ directory }) {
  if (!directory) throw new Error('directory is required');
  if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
@@ -811,7 +730,6 @@ Worker.prototype.empty = async function ({ directory }) {
  throw new Error('Cannot empty an s3:// or r2:// directory');
  }
  const removed = [];
-
  for (const file of await fsp.readdir(directory)) {
  removed.push(file);
  await fsp.unlink(path.join(directory, file));
@@ -823,14 +741,10 @@ Worker.prototype.empty.metadata = {
  directory: { required: true }
  }
  };
-
  Worker.prototype.removeAll = async function (options) {
  const filenames = await this.listAll(options);
-
  const pLimit = await import('p-limit');
-
  const limitedMethod = pLimit.default(10);
-
  return Promise.all(filenames.map((filename) => limitedMethod(async () => this.remove({ filename }))));
  };
  Worker.prototype.removeAll.metadata = {
@@ -840,7 +754,6 @@ Worker.prototype.removeAll.metadata = {
  end: {}
  }
  };
-
  Worker.prototype.remove = async function ({ filename }) {
  if (!filename) throw new Error('filename is required');
  if (typeof filename !== 'string') throw new Error(`filename isn't a string:${JSON.stringify(filename)}`);
@@ -851,12 +764,10 @@ Worker.prototype.remove = async function ({ filename }) {
  } else {
  worker = new S3Worker(this);
  }
-
  await worker.remove({ filename });
  } else {
  fsp.unlink(filename);
  }
-
  return { removed: filename };
  };
  Worker.prototype.remove.metadata = {
@@ -864,7 +775,6 @@ Worker.prototype.remove.metadata = {
  filename: {}
  }
  };
-
  Worker.prototype.move = async function ({ filename, target, remove = true }) {
  if (!target) throw new Error('target is required');
  if (typeof target !== 'string') throw new Error(`target isn't a string:${JSON.stringify(target)}`);
@@ -875,14 +785,12 @@ Worker.prototype.move = async function ({ filename, target, remove = true }) {
  ) {
  throw new Error('Cowardly not copying between services');
  }
-
  let worker = null;
  if (target.startsWith('r2://')) {
  worker = new R2Worker(this);
  } else {
  worker = new S3Worker(this);
  }
-
  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
  // We need to copy and delete
  const output = await worker.copy({ filename, target });
@@ -913,7 +821,6 @@ Worker.prototype.move.metadata = {
  target: {}
  }
  };
-
  Worker.prototype.copy = async function (opts) {
  return this.move({ ...opts, remove: false });
  };
@@ -923,17 +830,14 @@ Worker.prototype.copy.metadata = {
  target: {}
  }
  };
-
  Worker.prototype.stat = async function ({ filename }) {
  if (!filename) throw new Error('filename is required');
  const output = {};
-
  if (filename.slice(-8) === '.parquet') {
  const pq = new ParquetWorker(this);
  output.schema = (await pq.schema({ filename }))?.schema;
  output.records = (await pq.meta({ filename }))?.records;
  }
-
  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
  const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
  Object.assign(output, await worker.stat({ filename }));
@@ -956,7 +860,6 @@ Worker.prototype.stat.metadata = {
  filename: {}
  }
  };
-
  Worker.prototype.download = async function ({ filename }) {
  if (!filename) throw new Error('filename is required');
  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
@@ -970,29 +873,23 @@ Worker.prototype.download.metadata = {
  filename: {}
  }
  };
-
  Worker.prototype.head = async function (options) {
  const limit = options.limit || 3;
  const { stream } = await this.fileToObjectStream({ ...options, limit });
  const chunks = [];
-
  let counter = 0;
-
  for await (const chunk of stream) {
  chunks.push(chunk);
  counter += 1;
  if (counter >= limit) break;
  }
-
  return chunks;
  };
-
  Worker.prototype.head.metadata = {
  options: {
  filename: { required: true }
  }
  };
-
  Worker.prototype.columns = async function (options) {
  const head = await this.head(options);
  if (head.length == 0) {
@@ -1002,7 +899,6 @@ Worker.prototype.columns = async function (options) {
  columns: []
  };
  }
-
  let likelyHeaderLines = 1;
  const columns = Object.keys(head[0]);
  let s = columns.join(',');
@@ -1014,48 +910,39 @@ Worker.prototype.columns = async function (options) {
  columns
  };
  };
-
  Worker.prototype.columns.metadata = {
  options: {
  filename: { required: true }
  }
  };
-
  Worker.prototype.count = async function (options) {
  const { stream } = await this.fileToObjectStream(options);
  const sample = [];
-
  const limit = options.limit || 5;
  let records = 0;
-
  for await (const chunk of stream) {
  records += 1;
  if (records < limit) {
  sample.push(chunk);
  }
  }
-
  return { sample, records };
  };
-
  Worker.prototype.count.metadata = {
  options: {
  filename: { required: true }
  }
  };
-
  // Get a set of unique entries from a uniqueFunction
  // This could be large
  Worker.prototype.getUniqueSet = async function (options) {
  const existingFiles = getStringArray(options.filenames);
  const sample = {};
-
  let { uniqueFunction } = options;
  if (!uniqueFunction) {
  uniqueFunction = (o) => JSON.stringify(o);
  }
  const uniqueSet = new Set();
-
  for (const filename of existingFiles) {
  const { stream: existsStream } = await this.fileToObjectStream({ filename });
  await pipeline(
@@ -1082,22 +969,18 @@ Worker.prototype.getUniqueSet = async function (options) {
  }
  return { uniqueFunction, uniqueSet, sample };
  };
-
  Worker.prototype.getUniqueStream = async function (options) {
  const includeDuplicateSourceRecords = bool(options.includeDuplicateSourceRecords, false);
-
  const { uniqueSet, uniqueFunction, sample } = await this.getUniqueSet({
  filenames: options.existingFiles,
  uniqueFunction: options.uniqueFunction
  });
-
  const { stream: inStream } = await this.fileToObjectStream(options);
  const uniqueStream = inStream.pipe(
  new Transform({
  objectMode: true,
  transform(d, enc, cb) {
  const v = uniqueFunction(makeStrings(d)) || '';
-
  if (!v) {
  // falsey unique function includes
  // by default
@@ -1117,7 +1000,6 @@ Worker.prototype.getUniqueStream = async function (options) {
  );
  return { stream: uniqueStream, sample };
  };
-
  Worker.prototype.getUniqueStream.metadata = {
  options: {
  existingFiles: {},
@@ -1134,7 +1016,6 @@ Worker.prototype.getUniqueFile = async function (options) {
  const { filename, records } = await this.objectStreamToFile({ stream });
  return { filename, records, sample };
  };
-
  Worker.prototype.getUniqueFile.metadata = {
  options: {
  existingFiles: {},
@@ -1146,7 +1027,6 @@ Worker.prototype.getUniqueFile.metadata = {
  }
  }
  };
-
  /*
  diff that allows for unordered files, and doesn't store full objects in memory.
  Requires 2 passes of the files,
@@ -1155,14 +1035,12 @@ but that's a better tradeoff than trying to store huge files in memory
  Worker.prototype.diff = async function (options) {
  const { fileA, fileB, uniqueFunction: ufOpt, columns, includeDuplicateSourceRecords } = options;
  if (options.fields) throw new Error('fields is deprecated, use columns');
-
  if (ufOpt && columns) throw new Error('fields and uniqueFunction cannot both be specified');
  let uniqueFunction = ufOpt;
  if (!uniqueFunction && columns) {
  const farr = getStringArray(columns);
  uniqueFunction = (o) => farr.map((f) => o[f] || '').join('.');
  }
-
  const left = await this.getUniqueFile({
  existingFiles: [fileB],
  filename: fileA,
@@ -1175,7 +1053,6 @@ Worker.prototype.diff = async function (options) {
  uniqueFunction,
  includeDuplicateSourceRecords
  });
-
  return {
  left,
  right
@@ -1192,5 +1069,4 @@ Worker.prototype.diff.metadata = {
  }
  }
  };
-
- module.exports = Worker;
+ export default Worker;
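
Taken together, the hunks are a CommonJS-to-ESM conversion: require calls become imports, local modules gain explicit .js extensions, and the final hunk replaces module.exports with export default, which is the breaking change behind the major version bump. A minimal consumer-side sketch of the migration, assuming the package entry point exposes this Worker as its default export (the accountId and filename values below are hypothetical):

// Before (1.9.11, CommonJS):
// const Worker = require('@engine9-io/input-tools');

// After (2.0.1, ESM): from an .mjs file, or with "type": "module" in package.json
import Worker from '@engine9-io/input-tools';

const worker = new Worker({ accountId: 'engine9' }); // hypothetical accountId
const { stream } = await worker.fileToObjectStream({ filename: 'input.csv' }); // hypothetical file

CommonJS callers that cannot convert yet can still load an ESM default export via dynamic import: const Worker = (await import('@engine9-io/input-tools')).default;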