@naturalcycles/nodejs-lib 13.21.0 → 13.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/fs/fs2.d.ts +3 -0
- package/dist/fs/fs2.js +61 -0
- package/dist/index.d.ts +0 -6
- package/dist/index.js +0 -6
- package/dist/stream/ndjson/ndjsonMap.js +4 -13
- package/dist/stream/ndjson/ndjsonStreamForEach.js +1 -9
- package/dist/stream/ndjson/transformJsonParse.d.ts +1 -1
- package/dist/stream/transform/transformSplit.d.ts +13 -3
- package/dist/stream/transform/transformSplit.js +128 -8
- package/package.json +1 -2
- package/src/fs/fs2.ts +74 -0
- package/src/index.ts +0 -6
- package/src/stream/ndjson/ndjsonMap.ts +5 -19
- package/src/stream/ndjson/ndjsonStreamForEach.ts +2 -13
- package/src/stream/ndjson/transformJsonParse.ts +2 -2
- package/src/stream/transform/transformSplit.ts +134 -8
- package/dist/stream/ndjson/ndJsonFileRead.d.ts +0 -5
- package/dist/stream/ndjson/ndJsonFileRead.js +0 -14
- package/dist/stream/ndjson/ndJsonFileWrite.d.ts +0 -5
- package/dist/stream/ndjson/ndJsonFileWrite.js +0 -12
- package/dist/stream/ndjson/pipelineFromNDJsonFile.d.ts +0 -24
- package/dist/stream/ndjson/pipelineFromNDJsonFile.js +0 -37
- package/dist/stream/ndjson/pipelineToNDJsonFile.d.ts +0 -27
- package/dist/stream/ndjson/pipelineToNDJsonFile.js +0 -42
- package/dist/stream/ndjson/streamToNDJsonFile.d.ts +0 -3
- package/dist/stream/ndjson/streamToNDJsonFile.js +0 -8
- package/dist/stream/transform/transformToString.d.ts +0 -12
- package/dist/stream/transform/transformToString.js +0 -24
- package/src/stream/ndjson/ndJsonFileRead.ts +0 -15
- package/src/stream/ndjson/ndJsonFileWrite.ts +0 -12
- package/src/stream/ndjson/pipelineFromNDJsonFile.ts +0 -62
- package/src/stream/ndjson/pipelineToNDJsonFile.ts +0 -70
- package/src/stream/ndjson/streamToNDJsonFile.ts +0 -9
- package/src/stream/transform/transformToString.ts +0 -22
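
The removed files above are the old NDJSON convenience layer: ndJsonFileRead, ndJsonFileWrite, pipelineFromNDJsonFile, pipelineToNDJsonFile, streamToNDJsonFile and transformToString are superseded by two new fs2 methods, createReadStreamAsNDJSON and createWriteStreamAsNDJSON, which bundle the file-existence check, optional (un)gzip, newline splitting and JSON (de)serialization. A minimal migration sketch (hypothetical file paths; _pipeline and transformMap are existing exports of this package):

import { _pipeline, fs2, transformMap } from '@naturalcycles/nodejs-lib'

// Before (removed in this release):
// await pipelineFromNDJsonFile([transformMap(mapper)], { filePath: 'in.ndjson.gz', gzip: true })

// After: a '.gz' suffix toggles gunzip/gzip automatically on both ends
await _pipeline([
  fs2.createReadStreamAsNDJSON('in.ndjson.gz'), // emits already-parsed rows
  transformMap(async row => ({ ...row, migrated: true })),
  fs2.createWriteStreamAsNDJSON('out.ndjson.gz'), // serializes to NDJSON and writes
])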
package/dist/fs/fs2.d.ts
CHANGED
@@ -4,6 +4,7 @@
 import type { RmOptions } from 'node:fs';
 import fs from 'node:fs';
 import { DumpOptions } from 'js-yaml';
+import { ReadableTyped, WritableTyped } from '../stream/stream.model';
 /**
  * fs2 conveniently groups filesystem functions together.
  * Supposed to be almost a drop-in replacement for these things together:
@@ -75,6 +76,8 @@ declare class FS2 {
     readdirAsync: typeof fs.promises.readdir;
     createWriteStream: typeof fs.createWriteStream;
     createReadStream: typeof fs.createReadStream;
+    createReadStreamAsNDJSON<ROW = any>(inputPath: string): ReadableTyped<ROW>;
+    createWriteStreamAsNDJSON(outputPath: string): WritableTyped<any>;
 }
 export declare const fs2: FS2;
 export interface JsonOptions {
package/dist/fs/fs2.js
CHANGED
@@ -20,8 +20,12 @@ const tslib_1 = require("tslib");
 const node_fs_1 = tslib_1.__importDefault(require("node:fs"));
 const promises_1 = tslib_1.__importDefault(require("node:fs/promises"));
 const node_path_1 = tslib_1.__importDefault(require("node:path"));
+const node_zlib_1 = require("node:zlib");
 const js_lib_1 = require("@naturalcycles/js-lib");
 const js_yaml_1 = tslib_1.__importDefault(require("js-yaml"));
+const transformToNDJson_1 = require("../stream/ndjson/transformToNDJson");
+const transformSplit_1 = require("../stream/transform/transformSplit");
+const env_util_1 = require("../util/env.util");
 /**
  * fs2 conveniently groups filesystem functions together.
  * Supposed to be almost a drop-in replacement for these things together:
@@ -271,6 +275,63 @@ class FS2 {
         await this.copyPathAsync(src, dest, opt);
         await this.removePathAsync(src);
     }
+    /*
+    Returns a Readable of [already parsed] NDJSON objects.
+
+    Replaces a list of operations:
+    - requireFileToExist(inputPath)
+    - fs.createReadStream
+    - createUnzip (only if path ends with '.gz')
+    - transformSplitOnNewline
+    - transformJsonParse
+
+    To add a Limit or Offset: just add .take() or .drop(), example:
+
+    _pipeline([
+      fs2.createReadStreamAsNDJSON().take(100),
+      transformX(),
+    ])
+     */
+    createReadStreamAsNDJSON(inputPath) {
+        (0, env_util_1.requireFileToExist)(inputPath);
+        let stream = node_fs_1.default
+            .createReadStream(inputPath, {
+            highWaterMark: 64 * 1024, // no observed speedup
+        })
+            .on('error', err => stream.emit('error', err));
+        if (inputPath.endsWith('.gz')) {
+            stream = stream.pipe((0, node_zlib_1.createUnzip)({
+                chunkSize: 64 * 1024, // speedup from ~3200 to 3800 rps!
+            }));
+        }
+        return stream.pipe((0, transformSplit_1.transformSplitOnNewline)()).map(line => JSON.parse(line));
+        // For some crazy reason .map is much faster than transformJsonParse!
+        // ~5000 vs ~4000 rps !!!
+        // .on('error', err => stream.emit('error', err))
+        // .pipe(transformJsonParse<ROW>())
+    }
+    /*
+    Returns a Writable.
+
+    Replaces a list of operations:
+    - transformToNDJson
+    - createGzip (only if path ends with '.gz')
+    - fs.createWriteStream
+     */
+    createWriteStreamAsNDJSON(outputPath) {
+        this.ensureFile(outputPath);
+        const transform1 = (0, transformToNDJson_1.transformToNDJson)();
+        let transform = transform1;
+        if (outputPath.endsWith('.gz')) {
+            transform = transform.pipe((0, node_zlib_1.createGzip)({
+            // chunkSize: 64 * 1024, // no observed speedup
+            }));
+        }
+        transform.pipe(node_fs_1.default.createWriteStream(outputPath, {
+        // highWaterMark: 64 * 1024, // no observed speedup
+        }));
+        return transform1;
+    }
 }
 exports.fs2 = new FS2();
 function stringify(data, opt) {
package/dist/index.d.ts
CHANGED
@@ -18,13 +18,8 @@ export * from './log/log.util';
 export * from './slack/slack.service';
 export * from './slack/slack.service.model';
 export * from './stream/ndjson/ndjson.model';
-export * from './stream/ndjson/ndJsonFileRead';
-export * from './stream/ndjson/ndJsonFileWrite';
 export * from './stream/ndjson/ndjsonMap';
 export * from './stream/ndjson/ndjsonStreamForEach';
-export * from './stream/ndjson/pipelineFromNDJsonFile';
-export * from './stream/ndjson/pipelineToNDJsonFile';
-export * from './stream/ndjson/streamToNDJsonFile';
 export * from './stream/ndjson/transformJsonParse';
 export * from './stream/ndjson/transformToNDJson';
 export * from './stream/pipeline/pipeline';
@@ -46,7 +41,6 @@ export * from './stream/transform/transformMapSync';
 export * from './stream/transform/transformSplit';
 export * from './stream/transform/transformTap';
 export * from './stream/transform/transformToArray';
-export * from './stream/transform/transformToString';
 export * from './stream/transform/transformTee';
 export * from './stream/transform/worker/baseWorkerClass';
 export * from './stream/transform/worker/transformMultiThreaded';
package/dist/index.js
CHANGED
@@ -22,13 +22,8 @@ tslib_1.__exportStar(require("./log/log.util"), exports);
 tslib_1.__exportStar(require("./slack/slack.service"), exports);
 tslib_1.__exportStar(require("./slack/slack.service.model"), exports);
 tslib_1.__exportStar(require("./stream/ndjson/ndjson.model"), exports);
-tslib_1.__exportStar(require("./stream/ndjson/ndJsonFileRead"), exports);
-tslib_1.__exportStar(require("./stream/ndjson/ndJsonFileWrite"), exports);
 tslib_1.__exportStar(require("./stream/ndjson/ndjsonMap"), exports);
 tslib_1.__exportStar(require("./stream/ndjson/ndjsonStreamForEach"), exports);
-tslib_1.__exportStar(require("./stream/ndjson/pipelineFromNDJsonFile"), exports);
-tslib_1.__exportStar(require("./stream/ndjson/pipelineToNDJsonFile"), exports);
-tslib_1.__exportStar(require("./stream/ndjson/streamToNDJsonFile"), exports);
 tslib_1.__exportStar(require("./stream/ndjson/transformJsonParse"), exports);
 tslib_1.__exportStar(require("./stream/ndjson/transformToNDJson"), exports);
 tslib_1.__exportStar(require("./stream/pipeline/pipeline"), exports);
@@ -50,7 +45,6 @@ tslib_1.__exportStar(require("./stream/transform/transformMapSync"), exports);
 tslib_1.__exportStar(require("./stream/transform/transformSplit"), exports);
 tslib_1.__exportStar(require("./stream/transform/transformTap"), exports);
 tslib_1.__exportStar(require("./stream/transform/transformToArray"), exports);
-tslib_1.__exportStar(require("./stream/transform/transformToString"), exports);
 tslib_1.__exportStar(require("./stream/transform/transformTee"), exports);
 tslib_1.__exportStar(require("./stream/transform/worker/baseWorkerClass"), exports);
 tslib_1.__exportStar(require("./stream/transform/worker/transformMultiThreaded"), exports);
package/dist/stream/ndjson/ndjsonMap.js
CHANGED
@@ -1,8 +1,6 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.ndjsonMap = void 0;
-const node_fs_1 = require("node:fs");
-const node_zlib_1 = require("node:zlib");
 const js_lib_1 = require("@naturalcycles/js-lib");
 const __1 = require("../..");
 /**
@@ -11,20 +9,15 @@ const __1 = require("../..");
  */
 async function ndjsonMap(mapper, opt) {
     const { inputFilePath, outputFilePath, logEveryOutput = 100_000, limitInput, limitOutput } = opt;
-    (0, __1.requireFileToExist)(inputFilePath);
     console.log({
         inputFilePath,
         outputFilePath,
     });
-    const
-
-
+    const readable = __1.fs2
+        .createReadStreamAsNDJSON(inputFilePath)
+        .take(limitInput || Number.POSITIVE_INFINITY);
     await (0, __1._pipeline)([
         readable,
-        ...transformUnzip,
-        (0, __1.transformSplit)(), // splits by \n
-        (0, __1.transformJsonParse)(),
-        (0, __1.transformLimit)({ limit: limitInput, sourceReadable: readable }),
         (0, __1.transformLogProgress)({ metric: 'read', ...opt }),
         (0, __1.transformMap)(mapper, {
             flattenArrayOutput: true,
@@ -33,9 +26,7 @@ async function ndjsonMap(mapper, opt) {
         }),
         (0, __1.transformLimit)({ limit: limitOutput, sourceReadable: readable }),
         (0, __1.transformLogProgress)({ metric: 'saved', logEvery: logEveryOutput }),
-
-        ...transformZip,
-        (0, node_fs_1.createWriteStream)(outputFilePath),
+        __1.fs2.createWriteStreamAsNDJSON(outputFilePath),
     ]);
 }
 exports.ndjsonMap = ndjsonMap;
package/dist/stream/ndjson/ndjsonStreamForEach.js
CHANGED
@@ -1,22 +1,14 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.ndjsonStreamForEach = void 0;
-const tslib_1 = require("tslib");
-const node_fs_1 = tslib_1.__importDefault(require("node:fs"));
-const node_zlib_1 = require("node:zlib");
 const js_lib_1 = require("@naturalcycles/js-lib");
 const __1 = require("../..");
 /**
  * Convenience function to `forEach` through an ndjson file.
  */
 async function ndjsonStreamForEach(mapper, opt) {
-    (0, __1.requireFileToExist)(opt.inputFilePath);
-    const transformUnzip = opt.inputFilePath.endsWith('.gz') ? [(0, node_zlib_1.createUnzip)()] : [];
     await (0, __1._pipeline)([
-
-        ...transformUnzip,
-        (0, __1.transformSplit)(),
-        (0, __1.transformJsonParse)(),
+        __1.fs2.createReadStreamAsNDJSON(opt.inputFilePath),
         (0, __1.transformMap)(mapper, {
             errorMode: js_lib_1.ErrorMode.THROW_AGGREGATED,
             ...opt,
package/dist/stream/ndjson/transformJsonParse.d.ts
CHANGED
@@ -24,5 +24,5 @@ export interface TransformJsonParseOptions {
  * consumeYourStream...
  * [)
  */
-export declare function transformJsonParse<
+export declare function transformJsonParse<ROW = any>(opt?: TransformJsonParseOptions): TransformTyped<string | Buffer, ROW>;
 export declare const bufferReviver: Reviver;
package/dist/stream/transform/transformSplit.d.ts
CHANGED
@@ -1,9 +1,19 @@
 /// <reference types="node" />
 import { TransformTyped } from '../stream.model';
+/**
+ * Transforms input Buffer/string stream into Buffer chunks (objectMode: true) split by newLine.
+ *
+ * Useful for reading NDJSON files from fs.
+ *
+ * Same as binarySplit, but optimized (hard-coded) to split on NEWLINE (aka `\n`).
+ * (+5-10% _pipeline speedup measured, compared to generic `binarySplit` on variable length delimiter)
+ */
+export declare function transformSplitOnNewline(): TransformTyped<Buffer, Buffer>;
 /**
  * Input: stream (objectMode=false) of arbitrary string|Buffer chunks, like when read from fs
- * Output: stream (objectMode=
+ * Output: stream (objectMode=true) or string|Buffer chunks split by `separator` (@default to `\n`)
  *
- *
+ * Please use slightly more optimized `transformSplitOnNewline` for NDJSON file parsing.
+ * (+5-10% _pipeline speedup measured!)
  */
-export declare function transformSplit(separator?: string): TransformTyped<
+export declare function transformSplit(separator?: string): TransformTyped<Buffer, Buffer>;
package/dist/stream/transform/transformSplit.js
CHANGED
@@ -1,17 +1,137 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.transformSplit = void 0;
-
-//
-
-
+exports.transformSplit = exports.transformSplitOnNewline = void 0;
+const node_stream_1 = require("node:stream");
+// The code below is carefully adopted from: https://github.com/max-mapper/binary-split
+/**
+ * Transforms input Buffer/string stream into Buffer chunks (objectMode: true) split by newLine.
+ *
+ * Useful for reading NDJSON files from fs.
+ *
+ * Same as binarySplit, but optimized (hard-coded) to split on NEWLINE (aka `\n`).
+ * (+5-10% _pipeline speedup measured, compared to generic `binarySplit` on variable length delimiter)
+ */
+function transformSplitOnNewline() {
+    let buffered;
+    return new node_stream_1.Transform({
+        readableObjectMode: true,
+        writableHighWaterMark: 64 * 1024,
+        transform(buf, enc, done) {
+            let offset = 0;
+            let lastMatch = 0;
+            if (buffered) {
+                buf = Buffer.concat([buffered, buf]);
+                offset = buffered.length;
+                buffered = undefined;
+            }
+            while (true) {
+                const idx = firstNewlineMatch(buf, offset);
+                if (idx !== -1 && idx < buf.length) {
+                    if (lastMatch !== idx) {
+                        this.push(buf.slice(lastMatch, idx));
+                    }
+                    offset = idx + 1;
+                    lastMatch = offset;
+                }
+                else {
+                    buffered = buf.slice(lastMatch);
+                    break;
+                }
+            }
+            done();
+        },
+        flush(done) {
+            if (buffered && buffered.length > 0)
+                this.push(buffered);
+            done();
+        },
+    });
+}
+exports.transformSplitOnNewline = transformSplitOnNewline;
 /**
  * Input: stream (objectMode=false) of arbitrary string|Buffer chunks, like when read from fs
- * Output: stream (objectMode=
+ * Output: stream (objectMode=true) or string|Buffer chunks split by `separator` (@default to `\n`)
  *
- *
+ * Please use slightly more optimized `transformSplitOnNewline` for NDJSON file parsing.
+ * (+5-10% _pipeline speedup measured!)
  */
 function transformSplit(separator = '\n') {
-
+    const matcher = Buffer.from(separator);
+    let buffered;
+    return new node_stream_1.Transform({
+        readableObjectMode: true,
+        writableHighWaterMark: 64 * 1024,
+        transform(buf, enc, done) {
+            let offset = 0;
+            let lastMatch = 0;
+            if (buffered) {
+                buf = Buffer.concat([buffered, buf]);
+                offset = buffered.length;
+                buffered = undefined;
+            }
+            while (true) {
+                const idx = firstMatch(buf, offset - matcher.length + 1, matcher);
+                if (idx !== -1 && idx < buf.length) {
+                    if (lastMatch !== idx) {
+                        this.push(buf.slice(lastMatch, idx));
+                    }
+                    offset = idx + matcher.length;
+                    lastMatch = offset;
+                }
+                else {
+                    buffered = buf.slice(lastMatch);
+                    break;
+                }
+            }
+            done();
+        },
+        flush(done) {
+            if (buffered && buffered.length > 0)
+                this.push(buffered);
+            done();
+        },
+    });
 }
 exports.transformSplit = transformSplit;
+// const NEWLINE = Buffer.from('\n')
+// const NEWLINE_CODE = NEWLINE[0]! // it is `10`
+const NEWLINE_CODE = 10;
+/**
+ * Same as firstMatch, but optimized (hard-coded) to find NEWLINE (aka `\n`).
+ */
+function firstNewlineMatch(buf, offset) {
+    const bufLength = buf.length;
+    if (offset >= bufLength)
+        return -1;
+    for (let i = offset; i < bufLength; i++) {
+        if (buf[i] === NEWLINE_CODE) {
+            return i;
+        }
+    }
+    return -1; // this code is unreachable, because i is guaranteed to be found in the loop above
+}
+function firstMatch(buf, offset, matcher) {
+    if (offset >= buf.length)
+        return -1;
+    let i;
+    for (i = offset; i < buf.length; i++) {
+        if (buf[i] === matcher[0]) {
+            if (matcher.length > 1) {
+                let fullMatch = true;
+                let j = i;
+                for (let k = 0; j < i + matcher.length; j++, k++) {
+                    if (buf[j] !== matcher[k]) {
+                        fullMatch = false;
+                        break;
+                    }
+                }
+                if (fullMatch)
+                    return j - matcher.length;
+            }
+            else {
+                break;
+            }
+        }
+    }
+    return i + matcher.length - 1;
+}
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@naturalcycles/nodejs-lib",
-  "version": "13.21.0",
+  "version": "13.23.0",
   "scripts": {
     "prepare": "husky",
     "docs-serve": "vuepress dev docs",
@@ -21,7 +21,6 @@
     "ajv": "^8.6.2",
     "ajv-formats": "^3.0.1",
     "ajv-keywords": "^5.0.0",
-    "binary-split": "^1.0.5",
     "chalk": "^4.0.0",
     "debug": "^4.1.1",
     "dotenv": "^16.0.0",
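
The binary-split dependency can be dropped here because its splitting logic is now vendored directly into transformSplit.ts below (the source comments it as "carefully adopted from" that repo), alongside the new newline-specialized variant.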
package/src/fs/fs2.ts
CHANGED
@@ -18,8 +18,13 @@ import type { RmOptions } from 'node:fs'
 import fs from 'node:fs'
 import fsp from 'node:fs/promises'
 import path from 'node:path'
+import { createGzip, createUnzip } from 'node:zlib'
 import { _jsonParse } from '@naturalcycles/js-lib'
 import yaml, { DumpOptions } from 'js-yaml'
+import { transformToNDJson } from '../stream/ndjson/transformToNDJson'
+import { ReadableTyped, WritableTyped } from '../stream/stream.model'
+import { transformSplitOnNewline } from '../stream/transform/transformSplit'
+import { requireFileToExist } from '../util/env.util'
 
 /**
  * fs2 conveniently groups filesystem functions together.
@@ -305,6 +310,75 @@ class FS2 {
   readdirAsync = fsp.readdir
   createWriteStream = fs.createWriteStream
   createReadStream = fs.createReadStream
+
+  /*
+  Returns a Readable of [already parsed] NDJSON objects.
+
+  Replaces a list of operations:
+  - requireFileToExist(inputPath)
+  - fs.createReadStream
+  - createUnzip (only if path ends with '.gz')
+  - transformSplitOnNewline
+  - transformJsonParse
+
+  To add a Limit or Offset: just add .take() or .drop(), example:
+
+  _pipeline([
+    fs2.createReadStreamAsNDJSON().take(100),
+    transformX(),
+  ])
+   */
+  createReadStreamAsNDJSON<ROW = any>(inputPath: string): ReadableTyped<ROW> {
+    requireFileToExist(inputPath)
+
+    let stream: ReadableTyped<ROW> = fs
+      .createReadStream(inputPath, {
+        highWaterMark: 64 * 1024, // no observed speedup
+      })
+      .on('error', err => stream.emit('error', err))
+
+    if (inputPath.endsWith('.gz')) {
+      stream = stream.pipe(
+        createUnzip({
+          chunkSize: 64 * 1024, // speedup from ~3200 to 3800 rps!
+        }),
+      )
+    }
+
+    return stream.pipe(transformSplitOnNewline()).map(line => JSON.parse(line))
+    // For some crazy reason .map is much faster than transformJsonParse!
+    // ~5000 vs ~4000 rps !!!
+    // .on('error', err => stream.emit('error', err))
+    // .pipe(transformJsonParse<ROW>())
+  }
+
+  /*
+  Returns a Writable.
+
+  Replaces a list of operations:
+  - transformToNDJson
+  - createGzip (only if path ends with '.gz')
+  - fs.createWriteStream
+   */
+  createWriteStreamAsNDJSON(outputPath: string): WritableTyped<any> {
+    this.ensureFile(outputPath)
+
+    const transform1 = transformToNDJson()
+    let transform = transform1
+    if (outputPath.endsWith('.gz')) {
+      transform = transform.pipe(
+        createGzip({
+          // chunkSize: 64 * 1024, // no observed speedup
+        }),
+      )
+    }
+    transform.pipe(
+      fs.createWriteStream(outputPath, {
+        // highWaterMark: 64 * 1024, // no observed speedup
+      }),
+    )
+    return transform1
+  }
 }
 
 export const fs2 = new FS2()
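
As the doc comment on createReadStreamAsNDJSON notes, limit and offset are now expressed with the Readable iterator helpers .take() and .drop(). A short sketch (assumes a Node.js version where these experimental Readable helpers are available):

import { _pipeline, fs2 } from '@naturalcycles/nodejs-lib'

// Skip the first 10 rows, then process at most 100:
await _pipeline([
  fs2.createReadStreamAsNDJSON('rows.ndjson').drop(10).take(100),
  fs2.createWriteStreamAsNDJSON('sample.ndjson'),
])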
package/src/index.ts
CHANGED
@@ -28,13 +28,8 @@ export * from './log/log.util'
 export * from './slack/slack.service'
 export * from './slack/slack.service.model'
 export * from './stream/ndjson/ndjson.model'
-export * from './stream/ndjson/ndJsonFileRead'
-export * from './stream/ndjson/ndJsonFileWrite'
 export * from './stream/ndjson/ndjsonMap'
 export * from './stream/ndjson/ndjsonStreamForEach'
-export * from './stream/ndjson/pipelineFromNDJsonFile'
-export * from './stream/ndjson/pipelineToNDJsonFile'
-export * from './stream/ndjson/streamToNDJsonFile'
 export * from './stream/ndjson/transformJsonParse'
 export * from './stream/ndjson/transformToNDJson'
 export * from './stream/pipeline/pipeline'
@@ -56,7 +51,6 @@ export * from './stream/transform/transformMapSync'
 export * from './stream/transform/transformSplit'
 export * from './stream/transform/transformTap'
 export * from './stream/transform/transformToArray'
-export * from './stream/transform/transformToString'
 export * from './stream/transform/transformTee'
 export * from './stream/transform/worker/baseWorkerClass'
 export * from './stream/transform/worker/transformMultiThreaded'
package/src/stream/ndjson/ndjsonMap.ts
CHANGED
@@ -1,17 +1,12 @@
-import { createReadStream, createWriteStream } from 'node:fs'
-import { createGzip, createUnzip } from 'node:zlib'
 import { AbortableAsyncMapper, ErrorMode } from '@naturalcycles/js-lib'
 import {
-  requireFileToExist,
-  transformJsonParse,
   transformLimit,
   transformLogProgress,
   transformMap,
   TransformMapOptions,
-  transformSplit,
-  transformToNDJson,
   _pipeline,
   TransformLogProgressOptions,
+  fs2,
 } from '../..'
 
 export interface NDJSONMapOptions<IN = any, OUT = IN>
@@ -46,24 +41,17 @@ export async function ndjsonMap<IN = any, OUT = any>(
 ): Promise<void> {
   const { inputFilePath, outputFilePath, logEveryOutput = 100_000, limitInput, limitOutput } = opt
 
-  requireFileToExist(inputFilePath)
-
   console.log({
     inputFilePath,
     outputFilePath,
   })
 
-  const
-
-
-  const readable = createReadStream(inputFilePath)
+  const readable = fs2
+    .createReadStreamAsNDJSON(inputFilePath)
+    .take(limitInput || Number.POSITIVE_INFINITY)
 
   await _pipeline([
     readable,
-    ...transformUnzip,
-    transformSplit(), // splits by \n
-    transformJsonParse(),
-    transformLimit({ limit: limitInput, sourceReadable: readable }),
     transformLogProgress({ metric: 'read', ...opt }),
     transformMap(mapper, {
       flattenArrayOutput: true,
@@ -72,8 +60,6 @@ export async function ndjsonMap<IN = any, OUT = any>(
     }),
     transformLimit({ limit: limitOutput, sourceReadable: readable }),
    transformLogProgress({ metric: 'saved', logEvery: logEveryOutput }),
-
-    ...transformZip,
-    createWriteStream(outputFilePath),
+    fs2.createWriteStreamAsNDJSON(outputFilePath),
  ])
 }
package/src/stream/ndjson/ndjsonStreamForEach.ts
CHANGED
@@ -1,16 +1,12 @@
-import fs from 'node:fs'
-import { createUnzip } from 'node:zlib'
 import { AbortableAsyncMapper, ErrorMode } from '@naturalcycles/js-lib'
 import {
-  requireFileToExist,
-  transformJsonParse,
   transformLogProgress,
   TransformLogProgressOptions,
   transformMap,
   TransformMapOptions,
-  transformSplit,
   writableVoid,
   _pipeline,
+  fs2,
 } from '../..'
 
 export interface NDJSONStreamForEachOptions<IN = any>
@@ -26,15 +22,8 @@ export async function ndjsonStreamForEach<T>(
   mapper: AbortableAsyncMapper<T, void>,
   opt: NDJSONStreamForEachOptions<T>,
 ): Promise<void> {
-  requireFileToExist(opt.inputFilePath)
-
-  const transformUnzip = opt.inputFilePath.endsWith('.gz') ? [createUnzip()] : []
-
   await _pipeline([
-
-    ...transformUnzip,
-    transformSplit(),
-    transformJsonParse(),
+    fs2.createReadStreamAsNDJSON(opt.inputFilePath),
     transformMap<T, any>(mapper, {
       errorMode: ErrorMode.THROW_AGGREGATED,
       ...opt,
package/src/stream/ndjson/transformJsonParse.ts
CHANGED
@@ -27,9 +27,9 @@ export interface TransformJsonParseOptions {
  * consumeYourStream...
  * [)
  */
-export function transformJsonParse<
+export function transformJsonParse<ROW = any>(
   opt: TransformJsonParseOptions = {},
-): TransformTyped<string | Buffer,
+): TransformTyped<string | Buffer, ROW> {
   const { strict = true, reviver } = opt
 
   return new Transform({
package/src/stream/transform/transformSplit.ts
CHANGED
@@ -1,16 +1,142 @@
+import { Transform } from 'node:stream'
 import { TransformTyped } from '../stream.model'
 
-// https://github.com/max-mapper/binary-split
-
-
-
+// The code below is carefully adopted from: https://github.com/max-mapper/binary-split
+
+/**
+ * Transforms input Buffer/string stream into Buffer chunks (objectMode: true) split by newLine.
+ *
+ * Useful for reading NDJSON files from fs.
+ *
+ * Same as binarySplit, but optimized (hard-coded) to split on NEWLINE (aka `\n`).
+ * (+5-10% _pipeline speedup measured, compared to generic `binarySplit` on variable length delimiter)
+ */
+export function transformSplitOnNewline(): TransformTyped<Buffer, Buffer> {
+  let buffered: Buffer | undefined
+
+  return new Transform({
+    readableObjectMode: true,
+    writableHighWaterMark: 64 * 1024,
+
+    transform(buf: Buffer, enc, done) {
+      let offset = 0
+      let lastMatch = 0
+      if (buffered) {
+        buf = Buffer.concat([buffered, buf])
+        offset = buffered.length
+        buffered = undefined
+      }
+
+      while (true) {
+        const idx = firstNewlineMatch(buf, offset)
+        if (idx !== -1 && idx < buf.length) {
+          if (lastMatch !== idx) {
+            this.push(buf.slice(lastMatch, idx))
+          }
+          offset = idx + 1
+          lastMatch = offset
+        } else {
+          buffered = buf.slice(lastMatch)
+          break
+        }
+      }
+
+      done()
+    },
+
+    flush(done) {
+      if (buffered && buffered.length > 0) this.push(buffered)
+      done()
+    },
+  })
+}
 
 /**
  * Input: stream (objectMode=false) of arbitrary string|Buffer chunks, like when read from fs
- * Output: stream (objectMode=
+ * Output: stream (objectMode=true) or string|Buffer chunks split by `separator` (@default to `\n`)
  *
- *
+ * Please use slightly more optimized `transformSplitOnNewline` for NDJSON file parsing.
+ * (+5-10% _pipeline speedup measured!)
+ */
+export function transformSplit(separator = '\n'): TransformTyped<Buffer, Buffer> {
+  const matcher = Buffer.from(separator)
+  let buffered: Buffer | undefined
+
+  return new Transform({
+    readableObjectMode: true,
+    writableHighWaterMark: 64 * 1024,
+
+    transform(buf: Buffer, enc, done) {
+      let offset = 0
+      let lastMatch = 0
+      if (buffered) {
+        buf = Buffer.concat([buffered, buf])
+        offset = buffered.length
+        buffered = undefined
+      }
+
+      while (true) {
+        const idx = firstMatch(buf, offset - matcher.length + 1, matcher)
+        if (idx !== -1 && idx < buf.length) {
+          if (lastMatch !== idx) {
+            this.push(buf.slice(lastMatch, idx))
+          }
+          offset = idx + matcher.length
+          lastMatch = offset
+        } else {
+          buffered = buf.slice(lastMatch)
+          break
+        }
+      }
+
+      done()
+    },
+
+    flush(done) {
+      if (buffered && buffered.length > 0) this.push(buffered)
+      done()
+    },
+  })
+}
+
+// const NEWLINE = Buffer.from('\n')
+// const NEWLINE_CODE = NEWLINE[0]! // it is `10`
+const NEWLINE_CODE = 10
+
+/**
+ * Same as firstMatch, but optimized (hard-coded) to find NEWLINE (aka `\n`).
  */
-
-
+function firstNewlineMatch(buf: Buffer, offset: number): number {
+  const bufLength = buf.length
+  if (offset >= bufLength) return -1
+  for (let i = offset; i < bufLength; i++) {
+    if (buf[i] === NEWLINE_CODE) {
+      return i
+    }
+  }
+  return -1 // this code is unreachable, because i is guaranteed to be found in the loop above
+}
+
+function firstMatch(buf: Buffer, offset: number, matcher: Buffer): number {
+  if (offset >= buf.length) return -1
+  let i
+  for (i = offset; i < buf.length; i++) {
+    if (buf[i] === matcher[0]) {
+      if (matcher.length > 1) {
+        let fullMatch = true
+        let j = i
+        for (let k = 0; j < i + matcher.length; j++, k++) {
+          if (buf[j] !== matcher[k]) {
+            fullMatch = false
+            break
+          }
+        }
+        if (fullMatch) return j - matcher.length
+      } else {
+        break
+      }
+    }
+  }
+
+  return i + matcher.length - 1
 }
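
The splitter buffers any partial line across chunk boundaries and emits one Buffer per complete line, so the chunking of the input does not affect the output. A self-contained sketch of the observable behavior (based on the implementation above):

import { Readable } from 'node:stream'
import { transformSplitOnNewline } from '@naturalcycles/nodejs-lib'

const lines: string[] = []
Readable.from([Buffer.from('{"a":1}\n{"b"'), Buffer.from(':2}\n')])
  .pipe(transformSplitOnNewline())
  .on('data', (buf: Buffer) => lines.push(buf.toString()))
  .on('end', () => console.log(lines)) // ['{"a":1}', '{"b":2}']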
package/dist/stream/ndjson/ndJsonFileRead.d.ts
DELETED
@@ -1,5 +0,0 @@
-import { PipelineFromNDJsonFileOptions } from './pipelineFromNDJsonFile';
-/**
- * Read whole NDJSON file into memory, resolve promise with resulting array of items.
- */
-export declare function ndJsonFileRead<OUT = any>(opt: PipelineFromNDJsonFileOptions): Promise<OUT[]>;
package/dist/stream/ndjson/ndJsonFileRead.js
DELETED
@@ -1,14 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.ndJsonFileRead = void 0;
-const __1 = require("../..");
-const pipelineFromNDJsonFile_1 = require("./pipelineFromNDJsonFile");
-/**
- * Read whole NDJSON file into memory, resolve promise with resulting array of items.
- */
-async function ndJsonFileRead(opt) {
-    const res = [];
-    await (0, pipelineFromNDJsonFile_1.pipelineFromNDJsonFile)([(0, __1.writablePushToArray)(res)], opt);
-    return res;
-}
-exports.ndJsonFileRead = ndJsonFileRead;
package/dist/stream/ndjson/ndJsonFileWrite.js
DELETED
@@ -1,12 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.ndJsonFileWrite = void 0;
-const readableFromArray_1 = require("../readable/readableFromArray");
-const pipelineToNDJsonFile_1 = require("./pipelineToNDJsonFile");
-/**
- * Write array of objects (in memory) into NDJSON file. Resolve when done.
- */
-async function ndJsonFileWrite(items, opt) {
-    await (0, pipelineToNDJsonFile_1.pipelineToNDJsonFile)([(0, readableFromArray_1.readableFromArray)(items)], opt);
-}
-exports.ndJsonFileWrite = ndJsonFileWrite;
package/dist/stream/ndjson/pipelineFromNDJsonFile.d.ts
DELETED
@@ -1,24 +0,0 @@
-/// <reference types="node" />
-/// <reference types="node" />
-import { ZlibOptions } from 'node:zlib';
-import { NDJsonStats } from './ndjson.model';
-import { TransformJsonParseOptions } from './transformJsonParse';
-export interface PipelineFromNDJsonFileOptions extends TransformJsonParseOptions {
-    filePath: string;
-    /**
-     * @default `\n`
-     */
-    separator?: string;
-    /**
-     * @default false
-     */
-    gzip?: boolean;
-    /**
-     * Only applicable if `gzip` is enabled
-     */
-    zlibOptions?: ZlibOptions;
-}
-/**
- * Convenience pipeline that starts from reading NDJSON file.
- */
-export declare function pipelineFromNDJsonFile(streams: NodeJS.WritableStream[], opt: PipelineFromNDJsonFileOptions): Promise<NDJsonStats>;
package/dist/stream/ndjson/pipelineFromNDJsonFile.js
DELETED
@@ -1,37 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.pipelineFromNDJsonFile = void 0;
-const tslib_1 = require("tslib");
-const node_fs_1 = tslib_1.__importDefault(require("node:fs"));
-const node_zlib_1 = require("node:zlib");
-const js_lib_1 = require("@naturalcycles/js-lib");
-const __1 = require("../..");
-const colors_1 = require("../../colors/colors");
-const ndjson_model_1 = require("./ndjson.model");
-const transformJsonParse_1 = require("./transformJsonParse");
-/**
- * Convenience pipeline that starts from reading NDJSON file.
- */
-async function pipelineFromNDJsonFile(streams, opt) {
-    const { filePath, gzip, separator } = opt;
-    const started = Date.now();
-    let rows = 0;
-    const { size: sizeBytes } = node_fs_1.default.statSync(filePath);
-    console.log(`<< ${(0, colors_1.grey)(filePath)} ${(0, colors_1.dimWhite)((0, js_lib_1._hb)(sizeBytes))} started...`);
-    await (0, __1._pipeline)([
-        node_fs_1.default.createReadStream(filePath),
-        ...(gzip ? [(0, node_zlib_1.createUnzip)(opt.zlibOptions)] : []),
-        (0, __1.transformSplit)(separator), // splits by separator
-        (0, transformJsonParse_1.transformJsonParse)(opt),
-        (0, __1.transformTap)(() => rows++),
-        ...streams,
-    ]);
-    const stats = ndjson_model_1.NDJsonStats.create({
-        tookMillis: Date.now() - started,
-        rows,
-        sizeBytes,
-    });
-    console.log(`<< ${(0, colors_1.grey)(filePath)}\n` + stats.toPretty());
-    return stats;
-}
-exports.pipelineFromNDJsonFile = pipelineFromNDJsonFile;
package/dist/stream/ndjson/pipelineToNDJsonFile.d.ts
DELETED
@@ -1,27 +0,0 @@
-/// <reference types="node" />
-/// <reference types="node" />
-import { ZlibOptions } from 'node:zlib';
-import { NDJsonStats } from './ndjson.model';
-import { TransformToNDJsonOptions } from './transformToNDJson';
-export interface PipelineToNDJsonFileOptions extends TransformToNDJsonOptions {
-    filePath: string;
-    /**
-     * @default false
-     * If true - will fail if output file already exists.
-     */
-    protectFromOverwrite?: boolean;
-    /**
-     * @default false
-     */
-    gzip?: boolean;
-    /**
-     * Only applicable if `gzip` is enabled
-     */
-    zlibOptions?: ZlibOptions;
-}
-/**
- * Convenience pipeline to transform stream of objects into a file in NDJSON format.
- *
- * Does fs.ensureFile() before starting, which will create all needed directories and truncate the file if it existed.
- */
-export declare function pipelineToNDJsonFile(streams: (NodeJS.ReadableStream | NodeJS.WritableStream)[], opt: PipelineToNDJsonFileOptions): Promise<NDJsonStats>;
package/dist/stream/ndjson/pipelineToNDJsonFile.js
DELETED
@@ -1,42 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.pipelineToNDJsonFile = void 0;
-const tslib_1 = require("tslib");
-const node_fs_1 = tslib_1.__importDefault(require("node:fs"));
-const node_zlib_1 = require("node:zlib");
-const js_lib_1 = require("@naturalcycles/js-lib");
-const __1 = require("../..");
-const colors_1 = require("../../colors/colors");
-const ndjson_model_1 = require("./ndjson.model");
-const transformToNDJson_1 = require("./transformToNDJson");
-/**
- * Convenience pipeline to transform stream of objects into a file in NDJSON format.
- *
- * Does fs.ensureFile() before starting, which will create all needed directories and truncate the file if it existed.
- */
-async function pipelineToNDJsonFile(streams, opt) {
-    const { filePath, gzip, protectFromOverwrite = false } = opt;
-    if (protectFromOverwrite && __1.fs2.pathExists(filePath)) {
-        throw new js_lib_1.AppError(`pipelineToNDJsonFile: output file exists: ${filePath}`);
-    }
-    const started = Date.now();
-    let rows = 0;
-    __1.fs2.ensureFile(filePath);
-    console.log(`>> ${(0, colors_1.grey)(filePath)} started...`);
-    await (0, __1._pipeline)([
-        ...streams,
-        (0, __1.transformTap)(() => rows++),
-        (0, transformToNDJson_1.transformToNDJson)(opt),
-        ...(gzip ? [(0, node_zlib_1.createGzip)(opt.zlibOptions)] : []), // optional gzip
-        node_fs_1.default.createWriteStream(filePath),
-    ]);
-    const { size: sizeBytes } = node_fs_1.default.statSync(filePath);
-    const stats = ndjson_model_1.NDJsonStats.create({
-        tookMillis: Date.now() - started,
-        rows,
-        sizeBytes,
-    });
-    console.log(`>> ${(0, colors_1.grey)(filePath)}\n` + stats.toPretty());
-    return stats;
-}
-exports.pipelineToNDJsonFile = pipelineToNDJsonFile;
package/dist/stream/ndjson/streamToNDJsonFile.js
DELETED
@@ -1,8 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.streamToNDJsonFile = void 0;
-const pipelineToNDJsonFile_1 = require("./pipelineToNDJsonFile");
-async function streamToNDJsonFile(stream, opt) {
-    await (0, pipelineToNDJsonFile_1.pipelineToNDJsonFile)([stream], opt);
-}
-exports.streamToNDJsonFile = streamToNDJsonFile;
package/dist/stream/transform/transformToString.d.ts
DELETED
@@ -1,12 +0,0 @@
-/// <reference types="node" />
-import { TransformTyped } from '../stream.model';
-/**
- * Transforms objectMode=false Buffers/strings into objectMode=true strings.
- *
- * Useful in this _pipeline:
- * fs.createReadStream(inputPath),
- * createUnzip(), // binary
- * transformSplit(), // string chunks, but objectMode==false
- * transformToString(), // string chunks, but objectMode==true
- */
-export declare function transformToString(): TransformTyped<Buffer, string>;
package/dist/stream/transform/transformToString.js
DELETED
@@ -1,24 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.transformToString = void 0;
-const node_stream_1 = require("node:stream");
-/**
- * Transforms objectMode=false Buffers/strings into objectMode=true strings.
- *
- * Useful in this _pipeline:
- * fs.createReadStream(inputPath),
- * createUnzip(), // binary
- * transformSplit(), // string chunks, but objectMode==false
- * transformToString(), // string chunks, but objectMode==true
- */
-function transformToString() {
-    return new node_stream_1.Transform({
-        objectMode: false,
-        readableObjectMode: true,
-        transform(chunk, _, cb) {
-            // console.log(`enc: ${_}`, chunk.toString())
-            cb(null, chunk.toString());
-        },
-    });
-}
-exports.transformToString = transformToString;
package/src/stream/ndjson/ndJsonFileRead.ts
DELETED
@@ -1,15 +0,0 @@
-import { writablePushToArray } from '../..'
-import { pipelineFromNDJsonFile, PipelineFromNDJsonFileOptions } from './pipelineFromNDJsonFile'
-
-/**
- * Read whole NDJSON file into memory, resolve promise with resulting array of items.
- */
-export async function ndJsonFileRead<OUT = any>(
-  opt: PipelineFromNDJsonFileOptions,
-): Promise<OUT[]> {
-  const res: OUT[] = []
-
-  await pipelineFromNDJsonFile([writablePushToArray(res)], opt)
-
-  return res
-}
package/src/stream/ndjson/ndJsonFileWrite.ts
DELETED
@@ -1,12 +0,0 @@
-import { readableFromArray } from '../readable/readableFromArray'
-import { pipelineToNDJsonFile, PipelineToNDJsonFileOptions } from './pipelineToNDJsonFile'
-
-/**
- * Write array of objects (in memory) into NDJSON file. Resolve when done.
- */
-export async function ndJsonFileWrite<IN = any>(
-  items: IN[],
-  opt: PipelineToNDJsonFileOptions,
-): Promise<void> {
-  await pipelineToNDJsonFile([readableFromArray(items)], opt)
-}
package/src/stream/ndjson/pipelineFromNDJsonFile.ts
DELETED
@@ -1,62 +0,0 @@
-import fs from 'node:fs'
-import { createUnzip, ZlibOptions } from 'node:zlib'
-import { _hb } from '@naturalcycles/js-lib'
-import { transformTap, _pipeline, transformSplit } from '../..'
-import { dimWhite, grey } from '../../colors/colors'
-import { NDJsonStats } from './ndjson.model'
-import { transformJsonParse, TransformJsonParseOptions } from './transformJsonParse'
-
-export interface PipelineFromNDJsonFileOptions extends TransformJsonParseOptions {
-  filePath: string
-
-  /**
-   * @default `\n`
-   */
-  separator?: string
-
-  /**
-   * @default false
-   */
-  gzip?: boolean
-
-  /**
-   * Only applicable if `gzip` is enabled
-   */
-  zlibOptions?: ZlibOptions
-}
-
-/**
- * Convenience pipeline that starts from reading NDJSON file.
- */
-export async function pipelineFromNDJsonFile(
-  streams: NodeJS.WritableStream[],
-  opt: PipelineFromNDJsonFileOptions,
-): Promise<NDJsonStats> {
-  const { filePath, gzip, separator } = opt
-
-  const started = Date.now()
-  let rows = 0
-
-  const { size: sizeBytes } = fs.statSync(filePath)
-
-  console.log(`<< ${grey(filePath)} ${dimWhite(_hb(sizeBytes))} started...`)
-
-  await _pipeline([
-    fs.createReadStream(filePath),
-    ...(gzip ? [createUnzip(opt.zlibOptions)] : []),
-    transformSplit(separator), // splits by separator
-    transformJsonParse(opt),
-    transformTap(() => rows++),
-    ...streams,
-  ])
-
-  const stats = NDJsonStats.create({
-    tookMillis: Date.now() - started,
-    rows,
-    sizeBytes,
-  })
-
-  console.log(`<< ${grey(filePath)}\n` + stats.toPretty())
-
-  return stats
-}
package/src/stream/ndjson/pipelineToNDJsonFile.ts
DELETED
@@ -1,70 +0,0 @@
-import fs from 'node:fs'
-import { createGzip, ZlibOptions } from 'node:zlib'
-import { AppError } from '@naturalcycles/js-lib'
-import { transformTap, _pipeline, fs2 } from '../..'
-import { grey } from '../../colors/colors'
-import { NDJsonStats } from './ndjson.model'
-import { transformToNDJson, TransformToNDJsonOptions } from './transformToNDJson'
-
-export interface PipelineToNDJsonFileOptions extends TransformToNDJsonOptions {
-  filePath: string
-
-  /**
-   * @default false
-   * If true - will fail if output file already exists.
-   */
-  protectFromOverwrite?: boolean
-
-  /**
-   * @default false
-   */
-  gzip?: boolean
-
-  /**
-   * Only applicable if `gzip` is enabled
-   */
-  zlibOptions?: ZlibOptions
-}
-
-/**
- * Convenience pipeline to transform stream of objects into a file in NDJSON format.
- *
- * Does fs.ensureFile() before starting, which will create all needed directories and truncate the file if it existed.
- */
-export async function pipelineToNDJsonFile(
-  streams: (NodeJS.ReadableStream | NodeJS.WritableStream)[],
-  opt: PipelineToNDJsonFileOptions,
-): Promise<NDJsonStats> {
-  const { filePath, gzip, protectFromOverwrite = false } = opt
-
-  if (protectFromOverwrite && fs2.pathExists(filePath)) {
-    throw new AppError(`pipelineToNDJsonFile: output file exists: ${filePath}`)
-  }
-
-  const started = Date.now()
-  let rows = 0
-
-  fs2.ensureFile(filePath)
-
-  console.log(`>> ${grey(filePath)} started...`)
-
-  await _pipeline([
-    ...streams,
-    transformTap(() => rows++),
-    transformToNDJson(opt),
-    ...(gzip ? [createGzip(opt.zlibOptions)] : []), // optional gzip
-    fs.createWriteStream(filePath),
-  ])
-
-  const { size: sizeBytes } = fs.statSync(filePath)
-
-  const stats = NDJsonStats.create({
-    tookMillis: Date.now() - started,
-    rows,
-    sizeBytes,
-  })
-
-  console.log(`>> ${grey(filePath)}\n` + stats.toPretty())
-
-  return stats
-}
package/src/stream/ndjson/streamToNDJsonFile.ts
DELETED
@@ -1,9 +0,0 @@
-import { ReadableTyped } from '../stream.model'
-import { pipelineToNDJsonFile, PipelineToNDJsonFileOptions } from './pipelineToNDJsonFile'
-
-export async function streamToNDJsonFile<IN>(
-  stream: ReadableTyped<IN>,
-  opt: PipelineToNDJsonFileOptions,
-): Promise<void> {
-  await pipelineToNDJsonFile([stream], opt)
-}
package/src/stream/transform/transformToString.ts
DELETED
@@ -1,22 +0,0 @@
-import { Transform } from 'node:stream'
-import { TransformTyped } from '../stream.model'
-
-/**
- * Transforms objectMode=false Buffers/strings into objectMode=true strings.
- *
- * Useful in this _pipeline:
- * fs.createReadStream(inputPath),
- * createUnzip(), // binary
- * transformSplit(), // string chunks, but objectMode==false
- * transformToString(), // string chunks, but objectMode==true
- */
-export function transformToString(): TransformTyped<Buffer, string> {
-  return new Transform({
-    objectMode: false,
-    readableObjectMode: true,
-    transform(chunk: Buffer, _, cb) {
-      // console.log(`enc: ${_}`, chunk.toString())
-      cb(null, chunk.toString())
-    },
-  })
-}