@engine9-io/input-tools 1.5.3 → 1.6.1
This diff shows the changes between publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.
- package/file/FileUtilities.js +162 -5
- package/file/tools.js +23 -1
- package/index.js +2 -22
- package/package.json +1 -1
- package/test/parallelStream.js +0 -48
package/file/FileUtilities.js
CHANGED
@@ -5,22 +5,23 @@ const fsp = fs.promises;
 const path = require('node:path');
 const zlib = require('node:zlib');
 const {
-  Readable, Transform, PassThrough,
+  Readable, Transform, PassThrough, Writable,
 } = require('node:stream');
 const { pipeline } = require('node:stream/promises');
 const { stringify } = require('csv');
 
 const debug = require('debug')('FileWorker');
-
+
 const csv = require('csv');
-const JSON5 = require('json5')
+const JSON5 = require('json5');
 const languageEncoding = require('detect-file-encoding-and-language');
 const R2Worker = require('./R2');
 const S3Worker = require('./S3');
 const ParquetWorker = require('./Parquet');
-const { streamPacket } = require('./tools');
 
-const {
+const {
+  bool, getStringArray, getTempDir, makeStrings, streamPacket,
+} = require('./tools');
 
 function Worker({ accountId }) { this.accountId = accountId; }
 
@@ -543,6 +544,15 @@ Worker.prototype.sample.metadata = {
 
   },
 };
+Worker.prototype.toArray = async function (opts) {
+  const { stream } = await this.fileToObjectStream(opts);
+  return stream.toArray();
+};
+Worker.prototype.toArray.metadata = {
+  options: {
+    filename: {},
+  },
+};
 
 Worker.prototype.write = async function (opts) {
   const { filename, content } = opts;
@@ -774,4 +784,151 @@ Worker.prototype.count.metadata = {
   },
 };
 
+// Get a set of unique entries from a uniqueFunction
+// This could be large
+Worker.prototype.getUniqueSet = async function (options) {
+  const existingFiles = getStringArray(options.filenames);
+  const sample = {};
+
+  let { uniqueFunction } = options;
+  if (!uniqueFunction) {
+    uniqueFunction = ((o) => JSON.stringify(o));
+  }
+  const uniqueSet = new Set();
+  // eslint-disable-next-line no-restricted-syntax, guard-for-in
+  for (const filename of existingFiles) {
+    const { stream: existsStream } = await this.fileToObjectStream({ filename });
+    await pipeline(
+      existsStream,
+      new Transform({
+        objectMode: true,
+        transform(d, enc, cb) {
+          const v = uniqueFunction(makeStrings(d)) || '';
+          if (uniqueSet.size < 3) {
+            sample[v] = d;
+          }
+          uniqueSet.add(v);
+          cb(null, d);
+        },
+      }),
+      new Writable({
+        objectMode: true,
+        write(d, enc, cb) {
+          cb();
+        },
+      }),
+    );
+    debug(`Finished loading ${filename}`);
+  }
+  return { uniqueFunction, uniqueSet, sample };
+};
+
+Worker.prototype.getUniqueStream = async function (options) {
+  const includeDuplicateSourceRecords = bool(options.includeDuplicateSourceRecords, false);
+
+  const { uniqueSet, uniqueFunction, sample } = await this.getUniqueSet({
+    filenames: options.existingFiles,
+    uniqueFunction: options.uniqueFunction,
+  });
+
+  const { stream: inStream } = await this.fileToObjectStream(options);
+  const uniqueStream = inStream.pipe(
+    new Transform({
+      objectMode: true,
+      transform(d, enc, cb) {
+        const v = uniqueFunction(makeStrings(d)) || '';
+
+        if (!v) {
+          // falsey unique function includes
+          // by default
+          cb(null, d);
+        } else if (uniqueSet.has(v)) {
+          // do nothing
+          cb();
+        } else {
+          if (!includeDuplicateSourceRecords) {
+            // add it to the set for the next time
+            uniqueSet.add(v);
+          }
+          cb(null, d);
+        }
+      },
+    }),
+  );
+  return { stream: uniqueStream, sample };
+};
+
+Worker.prototype.getUniqueStream.metadata = {
+  options: {
+    existingFiles: {},
+    uniqueFunction: {},
+    filename: { description: 'Specify a source filename or a stream' },
+    stream: { description: 'Specify a source filename or a stream' },
+    includeDuplicateSourceRecords: {
+      description: 'Sometimes you want the output to include source dupes, sometimes not, default false',
+    },
+  },
+};
+Worker.prototype.getUniqueFile = async function (options) {
+  const { stream, sample } = await this.getUniqueStream(options);
+  const { filename, records } = await this.objectStreamToFile({ stream });
+  return { filename, records, sample };
+};
+
+Worker.prototype.getUniqueFile.metadata = {
+  options: {
+    existingFiles: {},
+    uniqueFunction: {},
+    filename: { description: 'Specify a source filename or a stream' },
+    stream: { description: 'Specify a source filename or a stream' },
+    includeDuplicateSourceRecords: {
+      description: 'Sometimes you want the output to include source dupes, sometimes not, default false',
+    },
+  },
+};
+
+/*
+  diff that allows for unordered files, and doesn't store full objects in memory.
+  Requires 2 passes of the files,
+  but that's a better tradeoff than trying to store huge files in memory
+*/
+Worker.prototype.diff = async function ({
+  fileA, fileB, uniqueFunction: ufOpt, fields, includeDuplicateSourceRecords,
+}) {
+  if (ufOpt && fields) throw new Error('fields and uniqueFunction cannot both be specified');
+  let uniqueFunction = ufOpt;
+  if (!uniqueFunction && fields) {
+    const farr = getStringArray(fields);
+    uniqueFunction = (o) => farr.map((f) => o[f] || '').join('.');
+  }
+
+  const left = await this.getUniqueFile({
+    existingFiles: [fileB],
+    filename: fileA,
+    uniqueFunction,
+    includeDuplicateSourceRecords,
+  });
+  const right = await this.getUniqueFile({
+    existingFiles: [fileA],
+    filename: fileB,
+    uniqueFunction,
+    includeDuplicateSourceRecords,
+  });
+
+  return {
+    left, right,
+  };
+};
+Worker.prototype.diff.metadata = {
+  options: {
+    fileA: {},
+    fileB: {},
+    fields: { description: 'Fields to use for uniqueness -- aka primary key. Defaults to JSON of line' },
+    uniqueFunction: {},
+    includeDuplicateSourceRecords: {
+      description: 'Sometimes you want the output to include source dupes, sometimes not, default false',
+    },
+  },
+};
+
 module.exports = Worker;
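The FileUtilities changes above add a toArray helper plus a family of uniqueness methods (getUniqueSet, getUniqueStream, getUniqueFile) and a two-pass diff that compares files by a unique key without holding whole files in memory. A minimal usage sketch based only on the code shown in this diff; the require path, accountId, and the before.csv/after.csv filenames are illustrative assumptions, not part of the package's documented API:

// Sketch only: the subpath require and the inputs below are assumptions.
const FileWorker = require('@engine9-io/input-tools/file/FileUtilities');

async function main() {
  const worker = new FileWorker({ accountId: 'example' }); // hypothetical account id
  // fields acts as the primary key; omit it to dedupe on the JSON of each whole line
  const { left, right } = await worker.diff({
    fileA: 'before.csv', // hypothetical input file
    fileB: 'after.csv', // hypothetical input file
    fields: 'id',
  });
  // left: records in fileA whose key does not appear in fileB; right is the reverse
  console.log(left.filename, left.records, right.filename, right.records);
}

main();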
package/file/tools.js
CHANGED
@@ -222,16 +222,38 @@ function bool(x, _defaultVal) {
   const y = x.toLowerCase();
   return !!(y.indexOf('y') + 1) || !!(y.indexOf('t') + 1);
 }
+function getStringArray(s, nonZeroLength) {
+  let a = s || [];
+  if (typeof a === 'number') a = String(a);
+  if (typeof a === 'string') a = [a];
+
+  if (typeof s === 'string') a = s.split(',');
+  a = a.map((x) => x.toString().trim()).filter(Boolean);
+  if (nonZeroLength && a.length === 0) a = [0];
+  return a;
+}
+/*
+  When comparing two objects, some may come from a file (thus strings), and some from
+  a database or elsewhere (not strings), so for deduping make sure to make them all strings
+*/
+function makeStrings(o) {
+  return Object.entries(o).reduce((a, [k, v]) => {
+    a[k] = (typeof v === 'object') ? JSON.stringify(v) : String(v);
+    return a;
+  }, {});
+}
 
 module.exports = {
   bool,
+  downloadFile,
   getTempFilename,
   getTempDir,
-  downloadFile,
   getBatchTransform,
   getDebatchTransform,
   getFile,
   getManifest,
   getPacketFiles,
+  getStringArray,
+  makeStrings,
   streamPacket,
 };
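getStringArray and makeStrings now live in file/tools.js and are exported alongside the existing helpers (index.js drops its local copies in the next section). A short sketch of what they normalize, inferred from the function bodies above; the require path is an assumption about how the package resolves subpaths:

// Sketch only: the subpath require below is an assumption.
const { getStringArray, makeStrings } = require('@engine9-io/input-tools/file/tools');

// getStringArray accepts a comma-delimited string, a single value, or an array
getStringArray('a, b , c'); // ['a', 'b', 'c']
getStringArray(7); // ['7']

// makeStrings stringifies every value so records read from files (already strings)
// and records from a database (typed values) compare consistently when deduping
makeStrings({ id: 42, tags: ['x'] }); // { id: '42', tags: '["x"]' }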
package/index.js
CHANGED
@@ -25,6 +25,8 @@ const {
   getPacketFiles,
   getBatchTransform,
   getDebatchTransform,
+  getStringArray,
+  makeStrings,
 } = require('./file/tools');
 
 const ForEachEntry = require('./ForEachEntry');
@@ -47,17 +49,6 @@ handlebars.registerHelper('json', (d) => JSON.stringify(d));
 
 handlebars.registerHelper('percent', (a, b) => `${((100 * a) / b).toFixed(2)}%`);
 
-function getStringArray(s, nonZeroLength) {
-  let a = s || [];
-  if (typeof a === 'number') a = String(a);
-  if (typeof a === 'string') a = [a];
-
-  if (typeof s === 'string') a = s.split(',');
-  a = a.map((x) => x.toString().trim()).filter(Boolean);
-  if (nonZeroLength && a.length === 0) a = [0];
-  return a;
-}
-
 function isValidDate(d) {
   // we WANT to use isNaN, not the Number.isNaN -- we're checking the date type
   // eslint-disable-next-line no-restricted-globals
@@ -125,17 +116,6 @@ function relativeDate(s, _initialDate) {
   return r;
 }
 
-/*
-  When comparing two objects, some may come from a file (thus strings), and some from
-  a database or elsewhere (not strings), so for deduping make sure to make them all strings
-*/
-function makeStrings(o) {
-  return Object.entries(o).reduce((a, [k, v]) => {
-    a[k] = (typeof v === 'object') ? JSON.stringify(v) : String(v);
-    return a;
-  }, {});
-}
-
 async function list(_path) {
   const directory = await unzipper.Open.file(_path);
 
package/package.json
CHANGED
package/test/parallelStream.js
DELETED
@@ -1,48 +0,0 @@
-/* eslint-disable no-console */
-const {
-  setTimeout,
-} = require('node:timers/promises');
-
-const { describe } = require('node:test');
-// const assert = require('node:assert');
-const { Readable } = require('node:stream');
-const { createWriteStream } = require('node:fs');
-const { pipeline } = require('node:stream/promises');
-const ParallelStream = require('../ParallelStream');
-const { getTempFilename } = require('../index');
-
-describe('Should process items in parallel:', async () => {
-  const outputFile = await getTempFilename({});
-  const writeStream = createWriteStream(outputFile);
-
-  const CONCURRENCY = 500;
-  await pipeline(
-    Readable.from(
-      [...Array(1000)].map((v, i) => ({ i })),
-    ),
-
-    new ParallelStream(
-      CONCURRENCY,
-      async (obj, enc, push, done) => {
-        let res;
-
-        try {
-          await setTimeout(Math.random() * 1000);
-          if (Math.random() > 0.7) throw new Error('Random error');
-
-          res = `${obj.id} is complete\n`;
-        } catch (err) {
-          await setTimeout(Math.random() * 2000); // longer timeouts for errors
-          res = `${obj.id} is error, ${err.name}\n`;
-        }
-
-        done(null, obj.id); // _onComplete actually
-
-        return res;
-      },
-    ),
-    writeStream,
-  );
-
-  console.log('Wrote responses to ', outputFile);
-});