@engine9-io/input-tools 1.5.3 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,22 +5,23 @@ const fsp = fs.promises;
  const path = require('node:path');
  const zlib = require('node:zlib');
  const {
-   Readable, Transform, PassThrough,
+   Readable, Transform, PassThrough, Writable,
  } = require('node:stream');
  const { pipeline } = require('node:stream/promises');
  const { stringify } = require('csv');

  const debug = require('debug')('FileWorker');
- // const through2 = require('through2');
+
  const csv = require('csv');
- const JSON5 = require('json5');// Useful for parsing extended JSON
+ const JSON5 = require('json5');
  const languageEncoding = require('detect-file-encoding-and-language');
  const R2Worker = require('./R2');
  const S3Worker = require('./S3');
  const ParquetWorker = require('./Parquet');
- const { streamPacket } = require('./tools');

- const { bool, getTempDir } = require('./tools');
+ const {
+   bool, getStringArray, getTempDir, makeStrings, streamPacket,
+ } = require('./tools');

  function Worker({ accountId }) { this.accountId = accountId; }

@@ -543,6 +544,15 @@ Worker.prototype.sample.metadata = {

  },
  };
+ Worker.prototype.toArray = async function (opts) {
+   const { stream } = await this.fileToObjectStream(opts);
+   return stream.toArray();
+ };
+ Worker.prototype.toArray.metadata = {
+   options: {
+     filename: {},
+   },
+ };

  Worker.prototype.write = async function (opts) {
    const { filename, content } = opts;
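A note on the new `toArray` above: it buffers the entire parsed file in memory via the `toArray()` method that Node.js Readable streams have offered since v16.15 (assuming `fileToObjectStream` returns a standard Readable), so it suits modestly sized files. A minimal usage sketch, not from the package docs -- the worker construction and file path are illustrative assumptions:

  // Hypothetical usage: read a whole delimited file as an array of objects
  const worker = new Worker({ accountId: 'engine9' });
  worker.toArray({ filename: './people.csv' })
    .then((rows) => console.log(`Loaded ${rows.length} records`));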
@@ -774,4 +784,151 @@ Worker.prototype.count.metadata = {
  },
  };

+ // Get a set of unique entries from a uniqueFunction
+ // This could be large
+ Worker.prototype.getUniqueSet = async function (options) {
+   const existingFiles = getStringArray(options.filenames);
+   const sample = {};
+
+   let { uniqueFunction } = options;
+   if (!uniqueFunction) {
+     uniqueFunction = ((o) => JSON.stringify(o));
+   }
+   const uniqueSet = new Set();
+   // eslint-disable-next-line no-restricted-syntax, guard-for-in
+   for (const filename of existingFiles) {
+     const { stream: existsStream } = await this.fileToObjectStream({ filename });
+     await pipeline(
+       existsStream,
+       new Transform({
+         objectMode: true,
+         transform(d, enc, cb) {
+           const v = uniqueFunction(makeStrings(d)) || '';
+           if (uniqueSet.size < 3) {
+             sample[v] = d;
+           }
+           uniqueSet.add(v);
+           cb(null, d);
+         },
+       }),
+       new Writable({
+         objectMode: true,
+         write(d, enc, cb) {
+           cb();
+         },
+       }),
+     );
+     debug(`Finished loading ${filename}`);
+   }
+   return { uniqueFunction, uniqueSet, sample };
+ };
+
+ Worker.prototype.getUniqueStream = async function (options) {
+   const includeDuplicateSourceRecords = bool(options.includeDuplicateSourceRecords, false);
+
+   const { uniqueSet, uniqueFunction, sample } = await this.getUniqueSet({
+     filenames: options.existingFiles,
+     uniqueFunction: options.uniqueFunction,
+   });
+
+   const { stream: inStream } = await this.fileToObjectStream(options);
+   const uniqueStream = inStream.pipe(
+     new Transform({
+       objectMode: true,
+       transform(d, enc, cb) {
+         const v = uniqueFunction(makeStrings(d)) || '';
+
+         if (!v) {
+           // falsey unique function includes
+           // by default
+           cb(null, d);
+         } else if (uniqueSet.has(v)) {
+           // do nothing
+           cb();
+         } else {
+           if (!includeDuplicateSourceRecords) {
+             // add it to the set for the next time
+             uniqueSet.add(v);
+           }
+           cb(null, d);
+         }
+       },
+     }),
+   );
+   return { stream: uniqueStream, sample };
+ };
+
+ Worker.prototype.getUniqueStream.metadata = {
+   options: {
+     existingFiles: {},
+     uniqueFunction: {},
+     filename: { description: 'Specify a source filename or a stream' },
+     stream: { description: 'Specify a source filename or a stream' },
+     includeDuplicateSourceRecords: {
+       description: 'Sometimes you want the output to include source dupes, sometimes not, default false',
+     },
+   },
+ };
+ Worker.prototype.getUniqueFile = async function (options) {
+   const { stream, sample } = await this.getUniqueStream(options);
+   const { filename, records } = await this.objectStreamToFile({ stream });
+   return { filename, records, sample };
+ };
+
+ Worker.prototype.getUniqueFile.metadata = {
+   options: {
+     existingFiles: {},
+     uniqueFunction: {},
+     filename: { description: 'Specify a source filename or a stream' },
+     stream: { description: 'Specify a source filename or a stream' },
+     includeDuplicateSourceRecords: {
+       description: 'Sometimes you want the output to include source dupes, sometimes not, default false',
+     },
+   },
+ };
+
+ /*
+   diff that allows for unordered files, and doesn't store full objects in memory.
+   Requires 2 passes of the files,
+   but that's a better tradeoff than trying to store huge files in memory
+ */
+ Worker.prototype.diff = async function ({
+   fileA, fileB, uniqueFunction: ufOpt, fields, includeDuplicateSourceRecords,
+ }) {
+   if (ufOpt && fields) throw new Error('fields and uniqueFunction cannot both be specified');
+   let uniqueFunction = ufOpt;
+   if (!uniqueFunction && fields) {
+     const farr = getStringArray(fields);
+     uniqueFunction = (o) => farr.map((f) => o[f] || '').join('.');
+   }
+
+   const left = await this.getUniqueFile({
+     existingFiles: [fileB],
+     filename: fileA,
+     uniqueFunction,
+     includeDuplicateSourceRecords,
+   });
+   const right = await this.getUniqueFile({
+     existingFiles: [fileA],
+     filename: fileB,
+     uniqueFunction,
+     includeDuplicateSourceRecords,
+   });
+
+   return {
+     left, right,
+   };
+ };
+ Worker.prototype.diff.metadata = {
+   options: {
+     fileA: {},
+     fileB: {},
+     fields: { description: 'Fields to use for uniqueness -- aka primary key. Defaults to JSON of line' },
+     uniqueFunction: {},
+     includeDuplicateSourceRecords: {
+       description: 'Sometimes you want the output to include source dupes, sometimes not, default false',
+     },
+   },
+ };
+

  module.exports = Worker;
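Taken together, the methods above implement the two-pass diff described in the comment block: `getUniqueSet` streams each existing file once and keeps only each record's unique key (by default its JSON string) in a Set; `getUniqueStream` then re-streams the source file and drops records whose key is already in that set; and `diff` runs the filter in both directions, so `left` holds records present only in `fileA` and `right` those only in `fileB`. A usage sketch under those assumptions -- the file paths and worker construction are hypothetical:

  // Inside an async function:
  const worker = new Worker({ accountId: 'engine9' });
  const { left, right } = await worker.diff({
    fileA: 'subscribers-march.csv',
    fileB: 'subscribers-april.csv',
    fields: 'email', // key records on the email column; defaults to the JSON of the whole line
  });
  // Each side is getUniqueFile's output: { filename, records, sample }
  console.log(`${left.records} records only in fileA, written to ${left.filename}`);
  console.log(`${right.records} records only in fileB, written to ${right.filename}`);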
package/file/tools.js CHANGED
@@ -222,16 +222,38 @@ function bool(x, _defaultVal) {
    const y = x.toLowerCase();
    return !!(y.indexOf('y') + 1) || !!(y.indexOf('t') + 1);
  }
+ function getStringArray(s, nonZeroLength) {
+   let a = s || [];
+   if (typeof a === 'number') a = String(a);
+   if (typeof a === 'string') a = [a];
+
+   if (typeof s === 'string') a = s.split(',');
+   a = a.map((x) => x.toString().trim()).filter(Boolean);
+   if (nonZeroLength && a.length === 0) a = [0];
+   return a;
+ }
+ /*
+   When comparing two objects, some may come from a file (thus strings), and some from
+   a database or elsewhere (not strings), so for deduping make sure to make them all strings
+ */
+ function makeStrings(o) {
+   return Object.entries(o).reduce((a, [k, v]) => {
+     a[k] = (typeof v === 'object') ? JSON.stringify(v) : String(v);
+     return a;
+   }, {});
+ }

  module.exports = {
    bool,
+   downloadFile,
    getTempFilename,
    getTempDir,
-   downloadFile,
    getBatchTransform,
    getDebatchTransform,
    getFile,
    getManifest,
    getPacketFiles,
+   getStringArray,
+   makeStrings,
    streamPacket,
  };
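`getStringArray` and `makeStrings` are moved verbatim out of index.js (see the removals below) so that FileWorker can share them through file/tools.js. A behavior sketch read directly from the implementations above; the require path follows the package layout shown in this diff:

  const { getStringArray, makeStrings } = require('./file/tools');

  getStringArray('a, b,,c'); // ['a', 'b', 'c'] -- comma-split, trimmed, empties dropped
  getStringArray(7); // ['7'] -- numbers are stringified first
  getStringArray('', true); // [0] -- nonZeroLength pads with the number 0, not the string '0'
  makeStrings({ id: 1, tags: ['a'] }); // { id: '1', tags: '["a"]' } -- objects are JSON-stringified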
package/index.js CHANGED
@@ -25,6 +25,8 @@ const {
    getPacketFiles,
    getBatchTransform,
    getDebatchTransform,
+   getStringArray,
+   makeStrings,
  } = require('./file/tools');

  const ForEachEntry = require('./ForEachEntry');
@@ -47,17 +49,6 @@ handlebars.registerHelper('json', (d) => JSON.stringify(d));

  handlebars.registerHelper('percent', (a, b) => `${((100 * a) / b).toFixed(2)}%`);

- function getStringArray(s, nonZeroLength) {
-   let a = s || [];
-   if (typeof a === 'number') a = String(a);
-   if (typeof a === 'string') a = [a];
-
-   if (typeof s === 'string') a = s.split(',');
-   a = a.map((x) => x.toString().trim()).filter(Boolean);
-   if (nonZeroLength && a.length === 0) a = [0];
-   return a;
- }
-
  function isValidDate(d) {
    // we WANT to use isNaN, not the Number.isNaN -- we're checking the date type
    // eslint-disable-next-line no-restricted-globals
@@ -125,17 +116,6 @@ function relativeDate(s, _initialDate) {
    return r;
  }

- /*
-   When comparing two objects, some may come from a file (thus strings), and some from
-   a database or elsewhere (not strings), so for deduping make sure to make them all strings
- */
- function makeStrings(o) {
-   return Object.entries(o).reduce((a, [k, v]) => {
-     a[k] = (typeof v === 'object') ? JSON.stringify(v) : String(v);
-     return a;
-   }, {});
- }
-
  async function list(_path) {
    const directory = await unzipper.Open.file(_path);

package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@engine9-io/input-tools",
-   "version": "1.5.3",
+   "version": "1.6.1",
    "description": "Tools for dealing with Engine9 inputs",
    "main": "index.js",
    "scripts": {
@@ -1,48 +0,0 @@
- /* eslint-disable no-console */
- const {
-   setTimeout,
- } = require('node:timers/promises');
-
- const { describe } = require('node:test');
- // const assert = require('node:assert');
- const { Readable } = require('node:stream');
- const { createWriteStream } = require('node:fs');
- const { pipeline } = require('node:stream/promises');
- const ParallelStream = require('../ParallelStream');
- const { getTempFilename } = require('../index');
-
- describe('Should process items in parallel:', async () => {
-   const outputFile = await getTempFilename({});
-   const writeStream = createWriteStream(outputFile);
-
-   const CONCURRENCY = 500;
-   await pipeline(
-     Readable.from(
-       [...Array(1000)].map((v, i) => ({ i })),
-     ),
-
-     new ParallelStream(
-       CONCURRENCY,
-       async (obj, enc, push, done) => {
-         let res;
-
-         try {
-           await setTimeout(Math.random() * 1000);
-           if (Math.random() > 0.7) throw new Error('Random error');
-
-           res = `${obj.id} is complete\n`;
-         } catch (err) {
-           await setTimeout(Math.random() * 2000);// longer timeouts for errors
-           res = `${obj.id} is error, ${err.name}\n`;
-         }
-
-         done(null, obj.id); // _onComplete actually
-
-         return res;
-       },
-     ),
-     writeStream,
-   );
-
-   console.log('Wrote responses to ', outputFile);
- });