@naturalcycles/nodejs-lib 13.21.0 → 13.22.0

This diff shows the published contents of the two package versions as they appear in their public registry, and is provided for informational purposes only.
Files changed (34)
  1. package/dist/fs/fs2.d.ts +3 -0
  2. package/dist/fs/fs2.js +60 -0
  3. package/dist/index.d.ts +0 -6
  4. package/dist/index.js +0 -6
  5. package/dist/stream/ndjson/ndjsonMap.js +4 -13
  6. package/dist/stream/ndjson/ndjsonStreamForEach.js +1 -9
  7. package/dist/stream/ndjson/transformJsonParse.d.ts +1 -1
  8. package/dist/stream/transform/transformSplit.d.ts +13 -3
  9. package/dist/stream/transform/transformSplit.js +128 -8
  10. package/package.json +1 -2
  11. package/src/fs/fs2.ts +72 -0
  12. package/src/index.ts +0 -6
  13. package/src/stream/ndjson/ndjsonMap.ts +5 -19
  14. package/src/stream/ndjson/ndjsonStreamForEach.ts +2 -13
  15. package/src/stream/ndjson/transformJsonParse.ts +2 -2
  16. package/src/stream/transform/transformSplit.ts +134 -8
  17. package/dist/stream/ndjson/ndJsonFileRead.d.ts +0 -5
  18. package/dist/stream/ndjson/ndJsonFileRead.js +0 -14
  19. package/dist/stream/ndjson/ndJsonFileWrite.d.ts +0 -5
  20. package/dist/stream/ndjson/ndJsonFileWrite.js +0 -12
  21. package/dist/stream/ndjson/pipelineFromNDJsonFile.d.ts +0 -24
  22. package/dist/stream/ndjson/pipelineFromNDJsonFile.js +0 -37
  23. package/dist/stream/ndjson/pipelineToNDJsonFile.d.ts +0 -27
  24. package/dist/stream/ndjson/pipelineToNDJsonFile.js +0 -42
  25. package/dist/stream/ndjson/streamToNDJsonFile.d.ts +0 -3
  26. package/dist/stream/ndjson/streamToNDJsonFile.js +0 -8
  27. package/dist/stream/transform/transformToString.d.ts +0 -12
  28. package/dist/stream/transform/transformToString.js +0 -24
  29. package/src/stream/ndjson/ndJsonFileRead.ts +0 -15
  30. package/src/stream/ndjson/ndJsonFileWrite.ts +0 -12
  31. package/src/stream/ndjson/pipelineFromNDJsonFile.ts +0 -62
  32. package/src/stream/ndjson/pipelineToNDJsonFile.ts +0 -70
  33. package/src/stream/ndjson/streamToNDJsonFile.ts +0 -9
  34. package/src/stream/transform/transformToString.ts +0 -22
package/dist/fs/fs2.d.ts CHANGED
@@ -4,6 +4,7 @@
  import type { RmOptions } from 'node:fs';
  import fs from 'node:fs';
  import { DumpOptions } from 'js-yaml';
+ import { ReadableTyped, WritableTyped } from '../stream/stream.model';
  /**
   * fs2 conveniently groups filesystem functions together.
   * Supposed to be almost a drop-in replacement for these things together:
@@ -75,6 +76,8 @@ declare class FS2 {
      readdirAsync: typeof fs.promises.readdir;
      createWriteStream: typeof fs.createWriteStream;
      createReadStream: typeof fs.createReadStream;
+     createReadStreamAsNDJSON<ROW = any>(inputPath: string): ReadableTyped<ROW>;
+     createWriteStreamAsNDJSON(outputPath: string): WritableTyped<any>;
  }
  export declare const fs2: FS2;
  export interface JsonOptions {
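
The two new methods compose directly with _pipeline. A minimal usage sketch (the file paths and the User row shape are hypothetical; .take() for limiting is what the method's own doc comment recommends):

import { _pipeline, fs2, transformMap } from '@naturalcycles/nodejs-lib'

interface User {
  id: string
}

// Read NDJSON rows (gunzipped transparently, because the path ends with '.gz'),
// map each row, then write gzipped NDJSON back out.
await _pipeline([
  fs2.createReadStreamAsNDJSON<User>('./users.ndjson.gz').take(100),
  transformMap<User, User>(async user => user),
  fs2.createWriteStreamAsNDJSON('./users2.ndjson.gz'),
])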
package/dist/fs/fs2.js CHANGED
@@ -20,8 +20,12 @@ const tslib_1 = require("tslib");
  const node_fs_1 = tslib_1.__importDefault(require("node:fs"));
  const promises_1 = tslib_1.__importDefault(require("node:fs/promises"));
  const node_path_1 = tslib_1.__importDefault(require("node:path"));
+ const node_zlib_1 = require("node:zlib");
  const js_lib_1 = require("@naturalcycles/js-lib");
  const js_yaml_1 = tslib_1.__importDefault(require("js-yaml"));
+ const transformToNDJson_1 = require("../stream/ndjson/transformToNDJson");
+ const transformSplit_1 = require("../stream/transform/transformSplit");
+ const env_util_1 = require("../util/env.util");
  /**
   * fs2 conveniently groups filesystem functions together.
   * Supposed to be almost a drop-in replacement for these things together:
@@ -271,6 +275,62 @@ class FS2 {
          await this.copyPathAsync(src, dest, opt);
          await this.removePathAsync(src);
      }
+     /*
+     Returns a Readable of [already parsed] NDJSON objects.
+
+     Replaces a list of operations:
+     - requireFileToExist(inputPath)
+     - fs.createReadStream
+     - createUnzip (only if path ends with '.gz')
+     - transformSplitOnNewline
+     - transformJsonParse
+
+     To add a Limit or Offset: just add .take() or .drop(), example:
+
+     _pipeline([
+       fs2.createReadStreamAsNDJSON().take(100),
+       transformX(),
+     ])
+     */
+     createReadStreamAsNDJSON(inputPath) {
+         (0, env_util_1.requireFileToExist)(inputPath);
+         let stream = node_fs_1.default
+             .createReadStream(inputPath, {
+             highWaterMark: 64 * 1024, // no observed speedup
+         })
+             .on('error', err => stream.emit('error', err));
+         if (inputPath.endsWith('.gz')) {
+             stream = stream.pipe((0, node_zlib_1.createUnzip)({
+                 chunkSize: 64 * 1024, // speedup from ~3200 to 3800 rps!
+             }));
+         }
+         return stream.pipe((0, transformSplit_1.transformSplitOnNewline)()).map(line => JSON.parse(line));
+         // For some crazy reason .map is much faster than transformJsonParse!
+         // ~5000 vs ~4000 rps !!!
+         // .on('error', err => stream.emit('error', err))
+         // .pipe(transformJsonParse<ROW>())
+     }
+     /*
+     Returns a Writable.
+
+     Replaces a list of operations:
+     - transformToNDJson
+     - createGzip (only if path ends with '.gz')
+     - fs.createWriteStream
+     */
+     createWriteStreamAsNDJSON(outputPath) {
+         const transform1 = (0, transformToNDJson_1.transformToNDJson)();
+         let transform = transform1;
+         if (outputPath.endsWith('.gz')) {
+             transform = transform.pipe((0, node_zlib_1.createGzip)({
+                 // chunkSize: 64 * 1024, // no observed speedup
+             }));
+         }
+         transform.pipe(node_fs_1.default.createWriteStream(outputPath, {
+             // highWaterMark: 64 * 1024, // no observed speedup
+         }));
+         return transform1;
+     }
  }
  exports.fs2 = new FS2();
  function stringify(data, opt) {
package/dist/index.d.ts CHANGED
@@ -18,13 +18,8 @@ export * from './log/log.util';
  export * from './slack/slack.service';
  export * from './slack/slack.service.model';
  export * from './stream/ndjson/ndjson.model';
- export * from './stream/ndjson/ndJsonFileRead';
- export * from './stream/ndjson/ndJsonFileWrite';
  export * from './stream/ndjson/ndjsonMap';
  export * from './stream/ndjson/ndjsonStreamForEach';
- export * from './stream/ndjson/pipelineFromNDJsonFile';
- export * from './stream/ndjson/pipelineToNDJsonFile';
- export * from './stream/ndjson/streamToNDJsonFile';
  export * from './stream/ndjson/transformJsonParse';
  export * from './stream/ndjson/transformToNDJson';
  export * from './stream/pipeline/pipeline';
@@ -46,7 +41,6 @@ export * from './stream/transform/transformMapSync';
  export * from './stream/transform/transformSplit';
  export * from './stream/transform/transformTap';
  export * from './stream/transform/transformToArray';
- export * from './stream/transform/transformToString';
  export * from './stream/transform/transformTee';
  export * from './stream/transform/worker/baseWorkerClass';
  export * from './stream/transform/worker/transformMultiThreaded';
package/dist/index.js CHANGED
@@ -22,13 +22,8 @@ tslib_1.__exportStar(require("./log/log.util"), exports);
  tslib_1.__exportStar(require("./slack/slack.service"), exports);
  tslib_1.__exportStar(require("./slack/slack.service.model"), exports);
  tslib_1.__exportStar(require("./stream/ndjson/ndjson.model"), exports);
- tslib_1.__exportStar(require("./stream/ndjson/ndJsonFileRead"), exports);
- tslib_1.__exportStar(require("./stream/ndjson/ndJsonFileWrite"), exports);
  tslib_1.__exportStar(require("./stream/ndjson/ndjsonMap"), exports);
  tslib_1.__exportStar(require("./stream/ndjson/ndjsonStreamForEach"), exports);
- tslib_1.__exportStar(require("./stream/ndjson/pipelineFromNDJsonFile"), exports);
- tslib_1.__exportStar(require("./stream/ndjson/pipelineToNDJsonFile"), exports);
- tslib_1.__exportStar(require("./stream/ndjson/streamToNDJsonFile"), exports);
  tslib_1.__exportStar(require("./stream/ndjson/transformJsonParse"), exports);
  tslib_1.__exportStar(require("./stream/ndjson/transformToNDJson"), exports);
  tslib_1.__exportStar(require("./stream/pipeline/pipeline"), exports);
@@ -50,7 +45,6 @@ tslib_1.__exportStar(require("./stream/transform/transformMapSync"), exports);
  tslib_1.__exportStar(require("./stream/transform/transformSplit"), exports);
  tslib_1.__exportStar(require("./stream/transform/transformTap"), exports);
  tslib_1.__exportStar(require("./stream/transform/transformToArray"), exports);
- tslib_1.__exportStar(require("./stream/transform/transformToString"), exports);
  tslib_1.__exportStar(require("./stream/transform/transformTee"), exports);
  tslib_1.__exportStar(require("./stream/transform/worker/baseWorkerClass"), exports);
  tslib_1.__exportStar(require("./stream/transform/worker/transformMultiThreaded"), exports);
package/dist/stream/ndjson/ndjsonMap.js CHANGED
@@ -1,8 +1,6 @@
  "use strict";
  Object.defineProperty(exports, "__esModule", { value: true });
  exports.ndjsonMap = void 0;
- const node_fs_1 = require("node:fs");
- const node_zlib_1 = require("node:zlib");
  const js_lib_1 = require("@naturalcycles/js-lib");
  const __1 = require("../..");
  /**
@@ -11,20 +9,15 @@ const __1 = require("../..");
   */
  async function ndjsonMap(mapper, opt) {
      const { inputFilePath, outputFilePath, logEveryOutput = 100_000, limitInput, limitOutput } = opt;
-     (0, __1.requireFileToExist)(inputFilePath);
      console.log({
          inputFilePath,
          outputFilePath,
      });
-     const transformUnzip = inputFilePath.endsWith('.gz') ? [(0, node_zlib_1.createUnzip)()] : [];
-     const transformZip = outputFilePath.endsWith('.gz') ? [(0, node_zlib_1.createGzip)()] : [];
-     const readable = (0, node_fs_1.createReadStream)(inputFilePath);
+     const readable = __1.fs2
+         .createReadStreamAsNDJSON(inputFilePath)
+         .take(limitInput || Number.POSITIVE_INFINITY);
      await (0, __1._pipeline)([
          readable,
-         ...transformUnzip,
-         (0, __1.transformSplit)(), // splits by \n
-         (0, __1.transformJsonParse)(),
-         (0, __1.transformLimit)({ limit: limitInput, sourceReadable: readable }),
          (0, __1.transformLogProgress)({ metric: 'read', ...opt }),
          (0, __1.transformMap)(mapper, {
              flattenArrayOutput: true,
@@ -33,9 +26,7 @@ async function ndjsonMap(mapper, opt) {
          }),
          (0, __1.transformLimit)({ limit: limitOutput, sourceReadable: readable }),
          (0, __1.transformLogProgress)({ metric: 'saved', logEvery: logEveryOutput }),
-         (0, __1.transformToNDJson)(),
-         ...transformZip,
-         (0, node_fs_1.createWriteStream)(outputFilePath),
+         __1.fs2.createWriteStreamAsNDJSON(outputFilePath),
      ]);
  }
  exports.ndjsonMap = ndjsonMap;
package/dist/stream/ndjson/ndjsonStreamForEach.js CHANGED
@@ -1,22 +1,14 @@
  "use strict";
  Object.defineProperty(exports, "__esModule", { value: true });
  exports.ndjsonStreamForEach = void 0;
- const tslib_1 = require("tslib");
- const node_fs_1 = tslib_1.__importDefault(require("node:fs"));
- const node_zlib_1 = require("node:zlib");
  const js_lib_1 = require("@naturalcycles/js-lib");
  const __1 = require("../..");
  /**
   * Convenience function to `forEach` through an ndjson file.
   */
  async function ndjsonStreamForEach(mapper, opt) {
-     (0, __1.requireFileToExist)(opt.inputFilePath);
-     const transformUnzip = opt.inputFilePath.endsWith('.gz') ? [(0, node_zlib_1.createUnzip)()] : [];
      await (0, __1._pipeline)([
-         node_fs_1.default.createReadStream(opt.inputFilePath),
-         ...transformUnzip,
-         (0, __1.transformSplit)(),
-         (0, __1.transformJsonParse)(),
+         __1.fs2.createReadStreamAsNDJSON(opt.inputFilePath),
          (0, __1.transformMap)(mapper, {
              errorMode: js_lib_1.ErrorMode.THROW_AGGREGATED,
              ...opt,
package/dist/stream/ndjson/transformJsonParse.d.ts CHANGED
@@ -24,5 +24,5 @@ export interface TransformJsonParseOptions {
   * consumeYourStream...
   * [)
   */
- export declare function transformJsonParse<OUT = any>(opt?: TransformJsonParseOptions): TransformTyped<string | Buffer, OUT>;
+ export declare function transformJsonParse<ROW = any>(opt?: TransformJsonParseOptions): TransformTyped<string | Buffer, ROW>;
  export declare const bufferReviver: Reviver;
package/dist/stream/transform/transformSplit.d.ts CHANGED
@@ -1,9 +1,19 @@
  /// <reference types="node" />
  import { TransformTyped } from '../stream.model';
+ /**
+  * Transforms input Buffer/string stream into Buffer chunks (objectMode: true) split by newLine.
+  *
+  * Useful for reading NDJSON files from fs.
+  *
+  * Same as binarySplit, but optimized (hard-coded) to split on NEWLINE (aka `\n`).
+  * (+5-10% _pipeline speedup measured, compared to generic `binarySplit` on variable length delimiter)
+  */
+ export declare function transformSplitOnNewline(): TransformTyped<Buffer, Buffer>;
  /**
   * Input: stream (objectMode=false) of arbitrary string|Buffer chunks, like when read from fs
-  * Output: stream (objectMode=false) or string|Buffer chunks split by `separator` (@default to `\n`)
+  * Output: stream (objectMode=true) or string|Buffer chunks split by `separator` (@default to `\n`)
   *
-  * Useful to, for example, reading NDJSON files from fs
+  * Please use slightly more optimized `transformSplitOnNewline` for NDJSON file parsing.
+  * (+5-10% _pipeline speedup measured!)
   */
- export declare function transformSplit(separator?: string): TransformTyped<string | Buffer, string | Buffer>;
+ export declare function transformSplit(separator?: string): TransformTyped<Buffer, Buffer>;
package/dist/stream/transform/transformSplit.js CHANGED
@@ -1,17 +1,137 @@
  "use strict";
  Object.defineProperty(exports, "__esModule", { value: true });
- exports.transformSplit = void 0;
- // https://github.com/max-mapper/binary-split
- // todo: test its newer version that doesn't have `through2` dependency
- // todo: test writableHighWaterMark of 64k
- const _binarySplit = require('binary-split');
+ exports.transformSplit = exports.transformSplitOnNewline = void 0;
+ const node_stream_1 = require("node:stream");
+ // The code below is carefully adopted from: https://github.com/max-mapper/binary-split
+ /**
+  * Transforms input Buffer/string stream into Buffer chunks (objectMode: true) split by newLine.
+  *
+  * Useful for reading NDJSON files from fs.
+  *
+  * Same as binarySplit, but optimized (hard-coded) to split on NEWLINE (aka `\n`).
+  * (+5-10% _pipeline speedup measured, compared to generic `binarySplit` on variable length delimiter)
+  */
+ function transformSplitOnNewline() {
+     let buffered;
+     return new node_stream_1.Transform({
+         readableObjectMode: true,
+         writableHighWaterMark: 64 * 1024,
+         transform(buf, enc, done) {
+             let offset = 0;
+             let lastMatch = 0;
+             if (buffered) {
+                 buf = Buffer.concat([buffered, buf]);
+                 offset = buffered.length;
+                 buffered = undefined;
+             }
+             while (true) {
+                 const idx = firstNewlineMatch(buf, offset);
+                 if (idx !== -1 && idx < buf.length) {
+                     if (lastMatch !== idx) {
+                         this.push(buf.slice(lastMatch, idx));
+                     }
+                     offset = idx + 1;
+                     lastMatch = offset;
+                 }
+                 else {
+                     buffered = buf.slice(lastMatch);
+                     break;
+                 }
+             }
+             done();
+         },
+         flush(done) {
+             if (buffered && buffered.length > 0)
+                 this.push(buffered);
+             done();
+         },
+     });
+ }
+ exports.transformSplitOnNewline = transformSplitOnNewline;
  /**
   * Input: stream (objectMode=false) of arbitrary string|Buffer chunks, like when read from fs
-  * Output: stream (objectMode=false) or string|Buffer chunks split by `separator` (@default to `\n`)
+  * Output: stream (objectMode=true) or string|Buffer chunks split by `separator` (@default to `\n`)
   *
-  * Useful to, for example, reading NDJSON files from fs
+  * Please use slightly more optimized `transformSplitOnNewline` for NDJSON file parsing.
+  * (+5-10% _pipeline speedup measured!)
   */
  function transformSplit(separator = '\n') {
-     return _binarySplit(separator);
+     const matcher = Buffer.from(separator);
+     let buffered;
+     return new node_stream_1.Transform({
+         readableObjectMode: true,
+         writableHighWaterMark: 64 * 1024,
+         transform(buf, enc, done) {
+             let offset = 0;
+             let lastMatch = 0;
+             if (buffered) {
+                 buf = Buffer.concat([buffered, buf]);
+                 offset = buffered.length;
+                 buffered = undefined;
+             }
+             while (true) {
+                 const idx = firstMatch(buf, offset - matcher.length + 1, matcher);
+                 if (idx !== -1 && idx < buf.length) {
+                     if (lastMatch !== idx) {
+                         this.push(buf.slice(lastMatch, idx));
+                     }
+                     offset = idx + matcher.length;
+                     lastMatch = offset;
+                 }
+                 else {
+                     buffered = buf.slice(lastMatch);
+                     break;
+                 }
+             }
+             done();
+         },
+         flush(done) {
+             if (buffered && buffered.length > 0)
+                 this.push(buffered);
+             done();
+         },
+     });
  }
  exports.transformSplit = transformSplit;
+ // const NEWLINE = Buffer.from('\n')
+ // const NEWLINE_CODE = NEWLINE[0]! // it is `10`
+ const NEWLINE_CODE = 10;
+ /**
+  * Same as firstMatch, but optimized (hard-coded) to find NEWLINE (aka `\n`).
+  */
+ function firstNewlineMatch(buf, offset) {
+     const bufLength = buf.length;
+     if (offset >= bufLength)
+         return -1;
+     for (let i = offset; i < bufLength; i++) {
+         if (buf[i] === NEWLINE_CODE) {
+             return i;
+         }
+     }
+     return -1; // this code is unreachable, because i is guaranteed to be found in the loop above
+ }
+ function firstMatch(buf, offset, matcher) {
+     if (offset >= buf.length)
+         return -1;
+     let i;
+     for (i = offset; i < buf.length; i++) {
+         if (buf[i] === matcher[0]) {
+             if (matcher.length > 1) {
+                 let fullMatch = true;
+                 let j = i;
+                 for (let k = 0; j < i + matcher.length; j++, k++) {
+                     if (buf[j] !== matcher[k]) {
+                         fullMatch = false;
+                         break;
+                     }
+                 }
+                 if (fullMatch)
+                     return j - matcher.length;
+             }
+             else {
+                 break;
+             }
+         }
+     }
+     return i + matcher.length - 1;
+ }
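
A behavioral sketch of the new splitter: lines are re-assembled across arbitrary chunk boundaries and any trailing remainder is flushed at end-of-stream. The chunk contents below are made up for illustration:

import { Readable } from 'node:stream'
import { transformSplitOnNewline } from '@naturalcycles/nodejs-lib'

// A JSON row deliberately split across two incoming chunks
const chunks = [Buffer.from('{"id":1}\n{"id'), Buffer.from('":2}\n')]

const lines: string[] = []
for await (const line of Readable.from(chunks).pipe(transformSplitOnNewline())) {
  lines.push(String(line))
}
// lines === ['{"id":1}', '{"id":2}'] — the second row was re-buffered before being emitted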
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@naturalcycles/nodejs-lib",
-   "version": "13.21.0",
+   "version": "13.22.0",
    "scripts": {
      "prepare": "husky",
      "docs-serve": "vuepress dev docs",
@@ -21,7 +21,6 @@
    "ajv": "^8.6.2",
    "ajv-formats": "^3.0.1",
    "ajv-keywords": "^5.0.0",
-   "binary-split": "^1.0.5",
    "chalk": "^4.0.0",
    "debug": "^4.1.1",
    "dotenv": "^16.0.0",
package/src/fs/fs2.ts CHANGED
@@ -18,8 +18,13 @@ import type { RmOptions } from 'node:fs'
  import fs from 'node:fs'
  import fsp from 'node:fs/promises'
  import path from 'node:path'
+ import { createGzip, createUnzip } from 'node:zlib'
  import { _jsonParse } from '@naturalcycles/js-lib'
  import yaml, { DumpOptions } from 'js-yaml'
+ import { transformToNDJson } from '../stream/ndjson/transformToNDJson'
+ import { ReadableTyped, WritableTyped } from '../stream/stream.model'
+ import { transformSplitOnNewline } from '../stream/transform/transformSplit'
+ import { requireFileToExist } from '../util/env.util'

  /**
   * fs2 conveniently groups filesystem functions together.
@@ -305,6 +310,73 @@ class FS2 {
    readdirAsync = fsp.readdir
    createWriteStream = fs.createWriteStream
    createReadStream = fs.createReadStream
+
+   /*
+   Returns a Readable of [already parsed] NDJSON objects.
+
+   Replaces a list of operations:
+   - requireFileToExist(inputPath)
+   - fs.createReadStream
+   - createUnzip (only if path ends with '.gz')
+   - transformSplitOnNewline
+   - transformJsonParse
+
+   To add a Limit or Offset: just add .take() or .drop(), example:
+
+   _pipeline([
+     fs2.createReadStreamAsNDJSON().take(100),
+     transformX(),
+   ])
+   */
+   createReadStreamAsNDJSON<ROW = any>(inputPath: string): ReadableTyped<ROW> {
+     requireFileToExist(inputPath)
+
+     let stream: ReadableTyped<ROW> = fs
+       .createReadStream(inputPath, {
+         highWaterMark: 64 * 1024, // no observed speedup
+       })
+       .on('error', err => stream.emit('error', err))
+
+     if (inputPath.endsWith('.gz')) {
+       stream = stream.pipe(
+         createUnzip({
+           chunkSize: 64 * 1024, // speedup from ~3200 to 3800 rps!
+         }),
+       )
+     }
+
+     return stream.pipe(transformSplitOnNewline()).map(line => JSON.parse(line))
+     // For some crazy reason .map is much faster than transformJsonParse!
+     // ~5000 vs ~4000 rps !!!
+     // .on('error', err => stream.emit('error', err))
+     // .pipe(transformJsonParse<ROW>())
+   }
+
+   /*
+   Returns a Writable.
+
+   Replaces a list of operations:
+   - transformToNDJson
+   - createGzip (only if path ends with '.gz')
+   - fs.createWriteStream
+   */
+   createWriteStreamAsNDJSON(outputPath: string): WritableTyped<any> {
+     const transform1 = transformToNDJson()
+     let transform = transform1
+     if (outputPath.endsWith('.gz')) {
+       transform = transform.pipe(
+         createGzip({
+           // chunkSize: 64 * 1024, // no observed speedup
+         }),
+       )
+     }
+     transform.pipe(
+       fs.createWriteStream(outputPath, {
+         // highWaterMark: 64 * 1024, // no observed speedup
+       }),
+     )
+     return transform1
+   }
  }

  export const fs2 = new FS2()
package/src/index.ts CHANGED
@@ -28,13 +28,8 @@ export * from './log/log.util'
  export * from './slack/slack.service'
  export * from './slack/slack.service.model'
  export * from './stream/ndjson/ndjson.model'
- export * from './stream/ndjson/ndJsonFileRead'
- export * from './stream/ndjson/ndJsonFileWrite'
  export * from './stream/ndjson/ndjsonMap'
  export * from './stream/ndjson/ndjsonStreamForEach'
- export * from './stream/ndjson/pipelineFromNDJsonFile'
- export * from './stream/ndjson/pipelineToNDJsonFile'
- export * from './stream/ndjson/streamToNDJsonFile'
  export * from './stream/ndjson/transformJsonParse'
  export * from './stream/ndjson/transformToNDJson'
  export * from './stream/pipeline/pipeline'
@@ -56,7 +51,6 @@ export * from './stream/transform/transformMapSync'
  export * from './stream/transform/transformSplit'
  export * from './stream/transform/transformTap'
  export * from './stream/transform/transformToArray'
- export * from './stream/transform/transformToString'
  export * from './stream/transform/transformTee'
  export * from './stream/transform/worker/baseWorkerClass'
  export * from './stream/transform/worker/transformMultiThreaded'
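
The removed file helpers have no one-to-one replacements. A possible migration sketch using exports that remain in 13.22.0 (the paths are hypothetical, and it assumes readableFromArray and writablePushToArray stay exported from the package root):

import { _pipeline, fs2, readableFromArray, writablePushToArray } from '@naturalcycles/nodejs-lib'

// was: ndJsonFileRead({ filePath }) — read a whole NDJSON file into memory
const rows: any[] = []
await _pipeline([fs2.createReadStreamAsNDJSON('./data.ndjson'), writablePushToArray(rows)])

// was: ndJsonFileWrite(rows, { filePath }) — write an in-memory array out as NDJSON
await _pipeline([readableFromArray(rows), fs2.createWriteStreamAsNDJSON('./data2.ndjson')])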
package/src/stream/ndjson/ndjsonMap.ts CHANGED
@@ -1,17 +1,12 @@
- import { createReadStream, createWriteStream } from 'node:fs'
- import { createGzip, createUnzip } from 'node:zlib'
  import { AbortableAsyncMapper, ErrorMode } from '@naturalcycles/js-lib'
  import {
-   requireFileToExist,
-   transformJsonParse,
    transformLimit,
    transformLogProgress,
    transformMap,
    TransformMapOptions,
-   transformSplit,
-   transformToNDJson,
    _pipeline,
    TransformLogProgressOptions,
+   fs2,
  } from '../..'

  export interface NDJSONMapOptions<IN = any, OUT = IN>
@@ -46,24 +41,17 @@ export async function ndjsonMap<IN = any, OUT = any>(
  ): Promise<void> {
    const { inputFilePath, outputFilePath, logEveryOutput = 100_000, limitInput, limitOutput } = opt

-   requireFileToExist(inputFilePath)
-
    console.log({
      inputFilePath,
      outputFilePath,
    })

-   const transformUnzip = inputFilePath.endsWith('.gz') ? [createUnzip()] : []
-   const transformZip = outputFilePath.endsWith('.gz') ? [createGzip()] : []
-
-   const readable = createReadStream(inputFilePath)
+   const readable = fs2
+     .createReadStreamAsNDJSON(inputFilePath)
+     .take(limitInput || Number.POSITIVE_INFINITY)

    await _pipeline([
      readable,
-     ...transformUnzip,
-     transformSplit(), // splits by \n
-     transformJsonParse(),
-     transformLimit({ limit: limitInput, sourceReadable: readable }),
      transformLogProgress({ metric: 'read', ...opt }),
      transformMap(mapper, {
        flattenArrayOutput: true,
@@ -72,8 +60,6 @@ export async function ndjsonMap<IN = any, OUT = any>(
      }),
      transformLimit({ limit: limitOutput, sourceReadable: readable }),
      transformLogProgress({ metric: 'saved', logEvery: logEveryOutput }),
-     transformToNDJson(),
-     ...transformZip,
-     createWriteStream(outputFilePath),
+     fs2.createWriteStreamAsNDJSON(outputFilePath),
    ])
  }
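
Callers of ndjsonMap keep the same signature; gzip handling and the input limit now come from fs2.createReadStreamAsNDJSON and .take() instead of explicit zlib transforms and transformLimit. A sketch with hypothetical paths:

import { ndjsonMap } from '@naturalcycles/nodejs-lib'

// '.gz' suffixes still trigger gunzip on read and gzip on write
await ndjsonMap(async row => ({ ...row, migrated: true }), {
  inputFilePath: './users.ndjson.gz',
  outputFilePath: './users_migrated.ndjson.gz',
  limitInput: 1_000, // now implemented via readable.take()
})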
package/src/stream/ndjson/ndjsonStreamForEach.ts CHANGED
@@ -1,16 +1,12 @@
- import fs from 'node:fs'
- import { createUnzip } from 'node:zlib'
  import { AbortableAsyncMapper, ErrorMode } from '@naturalcycles/js-lib'
  import {
-   requireFileToExist,
-   transformJsonParse,
    transformLogProgress,
    TransformLogProgressOptions,
    transformMap,
    TransformMapOptions,
-   transformSplit,
    writableVoid,
    _pipeline,
+   fs2,
  } from '../..'

  export interface NDJSONStreamForEachOptions<IN = any>
@@ -26,15 +22,8 @@ export async function ndjsonStreamForEach<T>(
    mapper: AbortableAsyncMapper<T, void>,
    opt: NDJSONStreamForEachOptions<T>,
  ): Promise<void> {
-   requireFileToExist(opt.inputFilePath)
-
-   const transformUnzip = opt.inputFilePath.endsWith('.gz') ? [createUnzip()] : []
-
    await _pipeline([
-     fs.createReadStream(opt.inputFilePath),
-     ...transformUnzip,
-     transformSplit(),
-     transformJsonParse(),
+     fs2.createReadStreamAsNDJSON(opt.inputFilePath),
      transformMap<T, any>(mapper, {
        errorMode: ErrorMode.THROW_AGGREGATED,
        ...opt,
package/src/stream/ndjson/transformJsonParse.ts CHANGED
@@ -27,9 +27,9 @@ export interface TransformJsonParseOptions {
   * consumeYourStream...
   * [)
   */
- export function transformJsonParse<OUT = any>(
+ export function transformJsonParse<ROW = any>(
    opt: TransformJsonParseOptions = {},
- ): TransformTyped<string | Buffer, OUT> {
+ ): TransformTyped<string | Buffer, ROW> {
    const { strict = true, reviver } = opt

    return new Transform({
package/src/stream/transform/transformSplit.ts CHANGED
@@ -1,16 +1,142 @@
+ import { Transform } from 'node:stream'
  import { TransformTyped } from '../stream.model'

- // https://github.com/max-mapper/binary-split
- // todo: test its newer version that doesn't have `through2` dependency
- // todo: test writableHighWaterMark of 64k
- const _binarySplit = require('binary-split')
+ // The code below is carefully adopted from: https://github.com/max-mapper/binary-split
+
+ /**
+  * Transforms input Buffer/string stream into Buffer chunks (objectMode: true) split by newLine.
+  *
+  * Useful for reading NDJSON files from fs.
+  *
+  * Same as binarySplit, but optimized (hard-coded) to split on NEWLINE (aka `\n`).
+  * (+5-10% _pipeline speedup measured, compared to generic `binarySplit` on variable length delimiter)
+  */
+ export function transformSplitOnNewline(): TransformTyped<Buffer, Buffer> {
+   let buffered: Buffer | undefined
+
+   return new Transform({
+     readableObjectMode: true,
+     writableHighWaterMark: 64 * 1024,
+
+     transform(buf: Buffer, enc, done) {
+       let offset = 0
+       let lastMatch = 0
+       if (buffered) {
+         buf = Buffer.concat([buffered, buf])
+         offset = buffered.length
+         buffered = undefined
+       }
+
+       while (true) {
+         const idx = firstNewlineMatch(buf, offset)
+         if (idx !== -1 && idx < buf.length) {
+           if (lastMatch !== idx) {
+             this.push(buf.slice(lastMatch, idx))
+           }
+           offset = idx + 1
+           lastMatch = offset
+         } else {
+           buffered = buf.slice(lastMatch)
+           break
+         }
+       }
+
+       done()
+     },
+
+     flush(done) {
+       if (buffered && buffered.length > 0) this.push(buffered)
+       done()
+     },
+   })
+ }

  /**
   * Input: stream (objectMode=false) of arbitrary string|Buffer chunks, like when read from fs
-  * Output: stream (objectMode=false) or string|Buffer chunks split by `separator` (@default to `\n`)
+  * Output: stream (objectMode=true) or string|Buffer chunks split by `separator` (@default to `\n`)
   *
-  * Useful to, for example, reading NDJSON files from fs
+  * Please use slightly more optimized `transformSplitOnNewline` for NDJSON file parsing.
+  * (+5-10% _pipeline speedup measured!)
+  */
+ export function transformSplit(separator = '\n'): TransformTyped<Buffer, Buffer> {
+   const matcher = Buffer.from(separator)
+   let buffered: Buffer | undefined
+
+   return new Transform({
+     readableObjectMode: true,
+     writableHighWaterMark: 64 * 1024,
+
+     transform(buf: Buffer, enc, done) {
+       let offset = 0
+       let lastMatch = 0
+       if (buffered) {
+         buf = Buffer.concat([buffered, buf])
+         offset = buffered.length
+         buffered = undefined
+       }
+
+       while (true) {
+         const idx = firstMatch(buf, offset - matcher.length + 1, matcher)
+         if (idx !== -1 && idx < buf.length) {
+           if (lastMatch !== idx) {
+             this.push(buf.slice(lastMatch, idx))
+           }
+           offset = idx + matcher.length
+           lastMatch = offset
+         } else {
+           buffered = buf.slice(lastMatch)
+           break
+         }
+       }
+
+       done()
+     },
+
+     flush(done) {
+       if (buffered && buffered.length > 0) this.push(buffered)
+       done()
+     },
+   })
+ }
+
+ // const NEWLINE = Buffer.from('\n')
+ // const NEWLINE_CODE = NEWLINE[0]! // it is `10`
+ const NEWLINE_CODE = 10
+
+ /**
+  * Same as firstMatch, but optimized (hard-coded) to find NEWLINE (aka `\n`).
   */
- export function transformSplit(separator = '\n'): TransformTyped<string | Buffer, string | Buffer> {
-   return _binarySplit(separator)
+ function firstNewlineMatch(buf: Buffer, offset: number): number {
+   const bufLength = buf.length
+   if (offset >= bufLength) return -1
+   for (let i = offset; i < bufLength; i++) {
+     if (buf[i] === NEWLINE_CODE) {
+       return i
+     }
+   }
+   return -1 // this code is unreachable, because i is guaranteed to be found in the loop above
+ }
+
+ function firstMatch(buf: Buffer, offset: number, matcher: Buffer): number {
+   if (offset >= buf.length) return -1
+   let i
+   for (i = offset; i < buf.length; i++) {
+     if (buf[i] === matcher[0]) {
+       if (matcher.length > 1) {
+         let fullMatch = true
+         let j = i
+         for (let k = 0; j < i + matcher.length; j++, k++) {
+           if (buf[j] !== matcher[k]) {
+             fullMatch = false
+             break
+           }
+         }
+         if (fullMatch) return j - matcher.length
+       } else {
+         break
+       }
+     }
+   }
+
+   return i + matcher.length - 1
  }
package/dist/stream/ndjson/ndJsonFileRead.d.ts DELETED
@@ -1,5 +0,0 @@
- import { PipelineFromNDJsonFileOptions } from './pipelineFromNDJsonFile';
- /**
-  * Read whole NDJSON file into memory, resolve promise with resulting array of items.
-  */
- export declare function ndJsonFileRead<OUT = any>(opt: PipelineFromNDJsonFileOptions): Promise<OUT[]>;
package/dist/stream/ndjson/ndJsonFileRead.js DELETED
@@ -1,14 +0,0 @@
- "use strict";
- Object.defineProperty(exports, "__esModule", { value: true });
- exports.ndJsonFileRead = void 0;
- const __1 = require("../..");
- const pipelineFromNDJsonFile_1 = require("./pipelineFromNDJsonFile");
- /**
-  * Read whole NDJSON file into memory, resolve promise with resulting array of items.
-  */
- async function ndJsonFileRead(opt) {
-     const res = [];
-     await (0, pipelineFromNDJsonFile_1.pipelineFromNDJsonFile)([(0, __1.writablePushToArray)(res)], opt);
-     return res;
- }
- exports.ndJsonFileRead = ndJsonFileRead;
package/dist/stream/ndjson/ndJsonFileWrite.d.ts DELETED
@@ -1,5 +0,0 @@
- import { PipelineToNDJsonFileOptions } from './pipelineToNDJsonFile';
- /**
-  * Write array of objects (in memory) into NDJSON file. Resolve when done.
-  */
- export declare function ndJsonFileWrite<IN = any>(items: IN[], opt: PipelineToNDJsonFileOptions): Promise<void>;
package/dist/stream/ndjson/ndJsonFileWrite.js DELETED
@@ -1,12 +0,0 @@
- "use strict";
- Object.defineProperty(exports, "__esModule", { value: true });
- exports.ndJsonFileWrite = void 0;
- const readableFromArray_1 = require("../readable/readableFromArray");
- const pipelineToNDJsonFile_1 = require("./pipelineToNDJsonFile");
- /**
-  * Write array of objects (in memory) into NDJSON file. Resolve when done.
-  */
- async function ndJsonFileWrite(items, opt) {
-     await (0, pipelineToNDJsonFile_1.pipelineToNDJsonFile)([(0, readableFromArray_1.readableFromArray)(items)], opt);
- }
- exports.ndJsonFileWrite = ndJsonFileWrite;
package/dist/stream/ndjson/pipelineFromNDJsonFile.d.ts DELETED
@@ -1,24 +0,0 @@
- /// <reference types="node" />
- /// <reference types="node" />
- import { ZlibOptions } from 'node:zlib';
- import { NDJsonStats } from './ndjson.model';
- import { TransformJsonParseOptions } from './transformJsonParse';
- export interface PipelineFromNDJsonFileOptions extends TransformJsonParseOptions {
-     filePath: string;
-     /**
-      * @default `\n`
-      */
-     separator?: string;
-     /**
-      * @default false
-      */
-     gzip?: boolean;
-     /**
-      * Only applicable if `gzip` is enabled
-      */
-     zlibOptions?: ZlibOptions;
- }
- /**
-  * Convenience pipeline that starts from reading NDJSON file.
-  */
- export declare function pipelineFromNDJsonFile(streams: NodeJS.WritableStream[], opt: PipelineFromNDJsonFileOptions): Promise<NDJsonStats>;
package/dist/stream/ndjson/pipelineFromNDJsonFile.js DELETED
@@ -1,37 +0,0 @@
- "use strict";
- Object.defineProperty(exports, "__esModule", { value: true });
- exports.pipelineFromNDJsonFile = void 0;
- const tslib_1 = require("tslib");
- const node_fs_1 = tslib_1.__importDefault(require("node:fs"));
- const node_zlib_1 = require("node:zlib");
- const js_lib_1 = require("@naturalcycles/js-lib");
- const __1 = require("../..");
- const colors_1 = require("../../colors/colors");
- const ndjson_model_1 = require("./ndjson.model");
- const transformJsonParse_1 = require("./transformJsonParse");
- /**
-  * Convenience pipeline that starts from reading NDJSON file.
-  */
- async function pipelineFromNDJsonFile(streams, opt) {
-     const { filePath, gzip, separator } = opt;
-     const started = Date.now();
-     let rows = 0;
-     const { size: sizeBytes } = node_fs_1.default.statSync(filePath);
-     console.log(`<< ${(0, colors_1.grey)(filePath)} ${(0, colors_1.dimWhite)((0, js_lib_1._hb)(sizeBytes))} started...`);
-     await (0, __1._pipeline)([
-         node_fs_1.default.createReadStream(filePath),
-         ...(gzip ? [(0, node_zlib_1.createUnzip)(opt.zlibOptions)] : []),
-         (0, __1.transformSplit)(separator), // splits by separator
-         (0, transformJsonParse_1.transformJsonParse)(opt),
-         (0, __1.transformTap)(() => rows++),
-         ...streams,
-     ]);
-     const stats = ndjson_model_1.NDJsonStats.create({
-         tookMillis: Date.now() - started,
-         rows,
-         sizeBytes,
-     });
-     console.log(`<< ${(0, colors_1.grey)(filePath)}\n` + stats.toPretty());
-     return stats;
- }
- exports.pipelineFromNDJsonFile = pipelineFromNDJsonFile;
package/dist/stream/ndjson/pipelineToNDJsonFile.d.ts DELETED
@@ -1,27 +0,0 @@
- /// <reference types="node" />
- /// <reference types="node" />
- import { ZlibOptions } from 'node:zlib';
- import { NDJsonStats } from './ndjson.model';
- import { TransformToNDJsonOptions } from './transformToNDJson';
- export interface PipelineToNDJsonFileOptions extends TransformToNDJsonOptions {
-     filePath: string;
-     /**
-      * @default false
-      * If true - will fail if output file already exists.
-      */
-     protectFromOverwrite?: boolean;
-     /**
-      * @default false
-      */
-     gzip?: boolean;
-     /**
-      * Only applicable if `gzip` is enabled
-      */
-     zlibOptions?: ZlibOptions;
- }
- /**
-  * Convenience pipeline to transform stream of objects into a file in NDJSON format.
-  *
-  * Does fs.ensureFile() before starting, which will create all needed directories and truncate the file if it existed.
-  */
- export declare function pipelineToNDJsonFile(streams: (NodeJS.ReadableStream | NodeJS.WritableStream)[], opt: PipelineToNDJsonFileOptions): Promise<NDJsonStats>;
package/dist/stream/ndjson/pipelineToNDJsonFile.js DELETED
@@ -1,42 +0,0 @@
- "use strict";
- Object.defineProperty(exports, "__esModule", { value: true });
- exports.pipelineToNDJsonFile = void 0;
- const tslib_1 = require("tslib");
- const node_fs_1 = tslib_1.__importDefault(require("node:fs"));
- const node_zlib_1 = require("node:zlib");
- const js_lib_1 = require("@naturalcycles/js-lib");
- const __1 = require("../..");
- const colors_1 = require("../../colors/colors");
- const ndjson_model_1 = require("./ndjson.model");
- const transformToNDJson_1 = require("./transformToNDJson");
- /**
-  * Convenience pipeline to transform stream of objects into a file in NDJSON format.
-  *
-  * Does fs.ensureFile() before starting, which will create all needed directories and truncate the file if it existed.
-  */
- async function pipelineToNDJsonFile(streams, opt) {
-     const { filePath, gzip, protectFromOverwrite = false } = opt;
-     if (protectFromOverwrite && __1.fs2.pathExists(filePath)) {
-         throw new js_lib_1.AppError(`pipelineToNDJsonFile: output file exists: ${filePath}`);
-     }
-     const started = Date.now();
-     let rows = 0;
-     __1.fs2.ensureFile(filePath);
-     console.log(`>> ${(0, colors_1.grey)(filePath)} started...`);
-     await (0, __1._pipeline)([
-         ...streams,
-         (0, __1.transformTap)(() => rows++),
-         (0, transformToNDJson_1.transformToNDJson)(opt),
-         ...(gzip ? [(0, node_zlib_1.createGzip)(opt.zlibOptions)] : []), // optional gzip
-         node_fs_1.default.createWriteStream(filePath),
-     ]);
-     const { size: sizeBytes } = node_fs_1.default.statSync(filePath);
-     const stats = ndjson_model_1.NDJsonStats.create({
-         tookMillis: Date.now() - started,
-         rows,
-         sizeBytes,
-     });
-     console.log(`>> ${(0, colors_1.grey)(filePath)}\n` + stats.toPretty());
-     return stats;
- }
- exports.pipelineToNDJsonFile = pipelineToNDJsonFile;
package/dist/stream/ndjson/streamToNDJsonFile.d.ts DELETED
@@ -1,3 +0,0 @@
- import { ReadableTyped } from '../stream.model';
- import { PipelineToNDJsonFileOptions } from './pipelineToNDJsonFile';
- export declare function streamToNDJsonFile<IN>(stream: ReadableTyped<IN>, opt: PipelineToNDJsonFileOptions): Promise<void>;
package/dist/stream/ndjson/streamToNDJsonFile.js DELETED
@@ -1,8 +0,0 @@
- "use strict";
- Object.defineProperty(exports, "__esModule", { value: true });
- exports.streamToNDJsonFile = void 0;
- const pipelineToNDJsonFile_1 = require("./pipelineToNDJsonFile");
- async function streamToNDJsonFile(stream, opt) {
-     await (0, pipelineToNDJsonFile_1.pipelineToNDJsonFile)([stream], opt);
- }
- exports.streamToNDJsonFile = streamToNDJsonFile;
package/dist/stream/transform/transformToString.d.ts DELETED
@@ -1,12 +0,0 @@
- /// <reference types="node" />
- import { TransformTyped } from '../stream.model';
- /**
-  * Transforms objectMode=false Buffers/strings into objectMode=true strings.
-  *
-  * Useful in this _pipeline:
-  * fs.createReadStream(inputPath),
-  * createUnzip(), // binary
-  * transformSplit(), // string chunks, but objectMode==false
-  * transformToString(), // string chunks, but objectMode==true
-  */
- export declare function transformToString(): TransformTyped<Buffer, string>;
package/dist/stream/transform/transformToString.js DELETED
@@ -1,24 +0,0 @@
- "use strict";
- Object.defineProperty(exports, "__esModule", { value: true });
- exports.transformToString = void 0;
- const node_stream_1 = require("node:stream");
- /**
-  * Transforms objectMode=false Buffers/strings into objectMode=true strings.
-  *
-  * Useful in this _pipeline:
-  * fs.createReadStream(inputPath),
-  * createUnzip(), // binary
-  * transformSplit(), // string chunks, but objectMode==false
-  * transformToString(), // string chunks, but objectMode==true
-  */
- function transformToString() {
-     return new node_stream_1.Transform({
-         objectMode: false,
-         readableObjectMode: true,
-         transform(chunk, _, cb) {
-             // console.log(`enc: ${_}`, chunk.toString())
-             cb(null, chunk.toString());
-         },
-     });
- }
- exports.transformToString = transformToString;
package/src/stream/ndjson/ndJsonFileRead.ts DELETED
@@ -1,15 +0,0 @@
- import { writablePushToArray } from '../..'
- import { pipelineFromNDJsonFile, PipelineFromNDJsonFileOptions } from './pipelineFromNDJsonFile'
-
- /**
-  * Read whole NDJSON file into memory, resolve promise with resulting array of items.
-  */
- export async function ndJsonFileRead<OUT = any>(
-   opt: PipelineFromNDJsonFileOptions,
- ): Promise<OUT[]> {
-   const res: OUT[] = []
-
-   await pipelineFromNDJsonFile([writablePushToArray(res)], opt)
-
-   return res
- }
package/src/stream/ndjson/ndJsonFileWrite.ts DELETED
@@ -1,12 +0,0 @@
- import { readableFromArray } from '../readable/readableFromArray'
- import { pipelineToNDJsonFile, PipelineToNDJsonFileOptions } from './pipelineToNDJsonFile'
-
- /**
-  * Write array of objects (in memory) into NDJSON file. Resolve when done.
-  */
- export async function ndJsonFileWrite<IN = any>(
-   items: IN[],
-   opt: PipelineToNDJsonFileOptions,
- ): Promise<void> {
-   await pipelineToNDJsonFile([readableFromArray(items)], opt)
- }
package/src/stream/ndjson/pipelineFromNDJsonFile.ts DELETED
@@ -1,62 +0,0 @@
- import fs from 'node:fs'
- import { createUnzip, ZlibOptions } from 'node:zlib'
- import { _hb } from '@naturalcycles/js-lib'
- import { transformTap, _pipeline, transformSplit } from '../..'
- import { dimWhite, grey } from '../../colors/colors'
- import { NDJsonStats } from './ndjson.model'
- import { transformJsonParse, TransformJsonParseOptions } from './transformJsonParse'
-
- export interface PipelineFromNDJsonFileOptions extends TransformJsonParseOptions {
-   filePath: string
-
-   /**
-    * @default `\n`
-    */
-   separator?: string
-
-   /**
-    * @default false
-    */
-   gzip?: boolean
-
-   /**
-    * Only applicable if `gzip` is enabled
-    */
-   zlibOptions?: ZlibOptions
- }
-
- /**
-  * Convenience pipeline that starts from reading NDJSON file.
-  */
- export async function pipelineFromNDJsonFile(
-   streams: NodeJS.WritableStream[],
-   opt: PipelineFromNDJsonFileOptions,
- ): Promise<NDJsonStats> {
-   const { filePath, gzip, separator } = opt
-
-   const started = Date.now()
-   let rows = 0
-
-   const { size: sizeBytes } = fs.statSync(filePath)
-
-   console.log(`<< ${grey(filePath)} ${dimWhite(_hb(sizeBytes))} started...`)
-
-   await _pipeline([
-     fs.createReadStream(filePath),
-     ...(gzip ? [createUnzip(opt.zlibOptions)] : []),
-     transformSplit(separator), // splits by separator
-     transformJsonParse(opt),
-     transformTap(() => rows++),
-     ...streams,
-   ])
-
-   const stats = NDJsonStats.create({
-     tookMillis: Date.now() - started,
-     rows,
-     sizeBytes,
-   })
-
-   console.log(`<< ${grey(filePath)}\n` + stats.toPretty())
-
-   return stats
- }
package/src/stream/ndjson/pipelineToNDJsonFile.ts DELETED
@@ -1,70 +0,0 @@
- import fs from 'node:fs'
- import { createGzip, ZlibOptions } from 'node:zlib'
- import { AppError } from '@naturalcycles/js-lib'
- import { transformTap, _pipeline, fs2 } from '../..'
- import { grey } from '../../colors/colors'
- import { NDJsonStats } from './ndjson.model'
- import { transformToNDJson, TransformToNDJsonOptions } from './transformToNDJson'
-
- export interface PipelineToNDJsonFileOptions extends TransformToNDJsonOptions {
-   filePath: string
-
-   /**
-    * @default false
-    * If true - will fail if output file already exists.
-    */
-   protectFromOverwrite?: boolean
-
-   /**
-    * @default false
-    */
-   gzip?: boolean
-
-   /**
-    * Only applicable if `gzip` is enabled
-    */
-   zlibOptions?: ZlibOptions
- }
-
- /**
-  * Convenience pipeline to transform stream of objects into a file in NDJSON format.
-  *
-  * Does fs.ensureFile() before starting, which will create all needed directories and truncate the file if it existed.
-  */
- export async function pipelineToNDJsonFile(
-   streams: (NodeJS.ReadableStream | NodeJS.WritableStream)[],
-   opt: PipelineToNDJsonFileOptions,
- ): Promise<NDJsonStats> {
-   const { filePath, gzip, protectFromOverwrite = false } = opt
-
-   if (protectFromOverwrite && fs2.pathExists(filePath)) {
-     throw new AppError(`pipelineToNDJsonFile: output file exists: ${filePath}`)
-   }
-
-   const started = Date.now()
-   let rows = 0
-
-   fs2.ensureFile(filePath)
-
-   console.log(`>> ${grey(filePath)} started...`)
-
-   await _pipeline([
-     ...streams,
-     transformTap(() => rows++),
-     transformToNDJson(opt),
-     ...(gzip ? [createGzip(opt.zlibOptions)] : []), // optional gzip
-     fs.createWriteStream(filePath),
-   ])
-
-   const { size: sizeBytes } = fs.statSync(filePath)
-
-   const stats = NDJsonStats.create({
-     tookMillis: Date.now() - started,
-     rows,
-     sizeBytes,
-   })
-
-   console.log(`>> ${grey(filePath)}\n` + stats.toPretty())
-
-   return stats
- }
package/src/stream/ndjson/streamToNDJsonFile.ts DELETED
@@ -1,9 +0,0 @@
- import { ReadableTyped } from '../stream.model'
- import { pipelineToNDJsonFile, PipelineToNDJsonFileOptions } from './pipelineToNDJsonFile'
-
- export async function streamToNDJsonFile<IN>(
-   stream: ReadableTyped<IN>,
-   opt: PipelineToNDJsonFileOptions,
- ): Promise<void> {
-   await pipelineToNDJsonFile([stream], opt)
- }
package/src/stream/transform/transformToString.ts DELETED
@@ -1,22 +0,0 @@
- import { Transform } from 'node:stream'
- import { TransformTyped } from '../stream.model'
-
- /**
-  * Transforms objectMode=false Buffers/strings into objectMode=true strings.
-  *
-  * Useful in this _pipeline:
-  * fs.createReadStream(inputPath),
-  * createUnzip(), // binary
-  * transformSplit(), // string chunks, but objectMode==false
-  * transformToString(), // string chunks, but objectMode==true
-  */
- export function transformToString(): TransformTyped<Buffer, string> {
-   return new Transform({
-     objectMode: false,
-     readableObjectMode: true,
-     transform(chunk: Buffer, _, cb) {
-       // console.log(`enc: ${_}`, chunk.toString())
-       cb(null, chunk.toString())
-     },
-   })
- }
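
transformToString is removed without a direct replacement. Since transformSplitOnNewline already emits objectMode Buffer chunks, one stand-in is a plain .map to strings — the same Readable.map that createReadStreamAsNDJSON itself relies on (Node 17.4+). A sketch with a hypothetical path:

import fs from 'node:fs'
import { transformSplitOnNewline } from '@naturalcycles/nodejs-lib'

// was: transformSplit() + transformToString()
const stringLines = fs
  .createReadStream('./input.txt') // hypothetical path
  .pipe(transformSplitOnNewline())
  .map(buf => buf.toString())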