@energy8platform/stake-math-tools 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/index.ts +7 -0
- package/src/tiered.ts +512 -108
- package/src/transform-jsonl-zst.ts +285 -0
- package/src/types.ts +23 -0
- package/test/transform-jsonl-zst.test.ts +343 -0
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Streaming `*.jsonl.zst → *.jsonl.zst` transformer.
|
|
3
|
+
*
|
|
4
|
+
* Decompresses the input with zstd, runs each line through an optional user
|
|
5
|
+
* mapper, recompresses the result. Pipes everything — no temp files, memory
|
|
6
|
+
* footprint stays at one line regardless of input size, so gigabyte books
|
|
7
|
+
* files work fine.
|
|
8
|
+
*
|
|
9
|
+
* Why not `readline.createInterface`? readline accumulates each line as a
|
|
10
|
+
* growing JS string (`buffer += chunk`), which hits V8's ~512MB string-length
|
|
11
|
+
* limit on books files that contain very long event arrays — manifests as
|
|
12
|
+
* `RangeError: Invalid string length` deep in node:internal/readline. The
|
|
13
|
+
* Buffer-based splitter here keeps incomplete-line state as raw bytes and
|
|
14
|
+
* only materializes a string at the LF boundary (mapper mode) or never
|
|
15
|
+
* (identity mode), so any line that fits in OS memory is fine.
|
|
16
|
+
*
|
|
17
|
+
* In identity mode (no mapper) we don't split at all — we pipe decompressor
|
|
18
|
+
* output directly into compressor input as raw bytes. That's the fastest
|
|
19
|
+
* path: ~25 MB/s of compressed input on a single core, dominated by the
|
|
20
|
+
* zstd subprocesses, with zero JS string allocation.
|
|
21
|
+
*
|
|
22
|
+
* Requires the `zstd` binary on PATH (same precondition as the rest of the
|
|
23
|
+
* stake-bridge tooling).
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
import { spawn } from 'node:child_process';
|
|
27
|
+
|
|
28
|
+
/**
 * Per-line callback for the string-based mapper mode.
 *
 * Receives the line decoded as UTF-8 text (line terminator already removed)
 * together with its zero-based position in the input. The return value decides
 * what is written: a `string` replaces the line, a `string[]` emits one output
 * line per element, and `null` / `undefined` drops the line.
 */
export type LineMapper = (line: string, index: number) => string | string[] | null | undefined;
|
|
32
|
+
|
|
33
|
+
/**
 * Per-line callback for the binary mapper mode.
 *
 * Receives the raw bytes of one line (line terminator already removed) and the
 * line's zero-based position in the input. Return a `Buffer` or `string` to
 * replace the line, an array mixing both to emit several output lines, or
 * `null` / `undefined` to drop the line.
 */
export type BinaryLineMapper = (
  line: Buffer,
  index: number,
) => Buffer | string | (Buffer | string)[] | null | undefined;
|
|
37
|
+
|
|
38
|
+
/** Options accepted by `transformJsonlZst`. */
export interface TransformJsonlZstParams {
  /** Path to a zstd-compressed `.jsonl.zst` file. */
  inputPath: string;
  /** Path where the transformed `.jsonl.zst` will be written (overwritten). */
  outputPath: string;
  /** Per-line transform with the line decoded as a JS string. Default =
   *  identity passthrough (byte pipe, no per-line allocations).
   *
   *  - Return a `string` to replace the line with that content.
   *  - Return a `string[]` to expand one input line into several output lines.
   *  - Return `null` / `undefined` to drop the line entirely.
   *
   *  Mutually exclusive with `binaryMapper`. Use `binaryMapper` instead when
   *  any single line could exceed V8's ~512 MB string-length cap (e.g. bonus
   *  game books with massive event arrays) — `toString('utf8')` will throw
   *  `ERR_STRING_TOO_LONG` on lines above that limit. */
  mapper?: LineMapper;
  /** Per-line transform with the line passed as a raw `Buffer`. Use this
   *  for any line that may exceed V8's ~512 MB string limit, or when you
   *  only need to peek at a small prefix (`line.subarray(0, 64).toString()`)
   *  and want to pass the rest of the bytes through verbatim.
   *
   *  Return shape:
   *  - `Buffer` or `string` — replace the line with that content.
   *  - array of `Buffer | string` — expand to N output lines.
   *  - `null` / `undefined` — drop.
   *
   *  Mutually exclusive with `mapper`. */
  binaryMapper?: BinaryLineMapper;
  /** zstd compression level for the output. 1 = fastest, 22 = smallest.
   *  Default 9 — same level the kitsune optimize pipeline uses.
   *
   *  The value is handed to the `zstd` CLI verbatim as `-<level>`.
   *  NOTE(review): the CLI documents levels 20–22 as requiring `--ultra`,
   *  which is not passed — confirm whether values above 19 take effect. */
  zstdLevel?: number;
  /** Called every `progressEveryLines` input lines with running counts.
   *  Useful for progress bars on multi-million-row files. A further call is
   *  made once the whole stream has been processed when `mapper` is set.
   *
   *  Never invoked in identity mode (no mapper): that path pipes raw bytes
   *  without splitting lines, so there are no counts to report. */
  onProgress?: (linesRead: number, linesWritten: number) => void;
  /** How often to fire `onProgress`. Default 100_000. */
  progressEveryLines?: number;
}
|
|
77
|
+
|
|
78
|
+
/** Summary returned by `transformJsonlZst` once both zstd processes finish. */
export interface TransformJsonlZstResult {
  /** Number of input lines handed to the mapper. */
  linesRead: number;
  /** Number of lines emitted to the output (drops and expansions included). */
  linesWritten: number;
  /**
   * Set when the identity byte-pipe was used. That path never splits the
   * stream into lines, so both counters above are left at 0.
   */
  identityPassthrough: boolean;
}
|
|
85
|
+
|
|
86
|
+
/** Byte value of the line feed (`\n`) used as the record separator. */
const LF = 10;
/** One-byte buffer holding a single line feed, appended after every emitted line. */
const LF_BUFFER = Buffer.of(LF);
|
|
88
|
+
|
|
89
|
+
/**
 * Streams a `*.jsonl.zst` file through an optional per-line mapper and writes
 * the recompressed result to `outputPath`.
 *
 * Two `zstd` child processes do the decompression and compression; this
 * function only moves bytes between them, honouring back-pressure on the
 * compressor's stdin.
 *
 * - No mapper: decompressed chunks are forwarded verbatim (identity mode) and
 *   the line counters stay at 0.
 * - `mapper` / `binaryMapper`: the stream is split on LF bytes, a trailing CR
 *   is stripped from each line, and the mapper's result is written followed by
 *   a single LF. A final unterminated line is processed like any other.
 *
 * On any failure both child processes are killed and reaped before the error
 * is rethrown. Whatever zstd had already written to `outputPath` is not
 * cleaned up here.
 *
 * @param params - See `TransformJsonlZstParams`.
 * @returns Line counters plus whether the identity byte-pipe was used.
 * @throws Error when both mappers are supplied, when a zstd process fails to
 *   spawn or exits unsuccessfully, when writing to the compressor fails, when
 *   a string-mapper line exceeds V8's string limit, or whatever the mapper or
 *   `onProgress` callback throws.
 */
export async function transformJsonlZst(
  params: TransformJsonlZstParams,
): Promise<TransformJsonlZstResult> {
  const {
    inputPath,
    outputPath,
    mapper,
    binaryMapper,
    zstdLevel = 9,
    onProgress,
    progressEveryLines = 100_000,
  } = params;

  if (mapper && binaryMapper) {
    throw new Error(
      'transformJsonlZst: pass either `mapper` (string) or `binaryMapper` (Buffer), not both',
    );
  }
  const anyMapper = mapper ?? binaryMapper;

  const decompress = spawn('zstd', ['-dc', '-q', inputPath], {
    stdio: ['ignore', 'pipe', 'inherit'],
  });
  // NOTE(review): the zstd CLI documents levels above 19 as requiring
  // `--ultra`, which is not passed here — confirm whether 20–22 should be
  // unlocked.
  const compress = spawn(
    'zstd',
    [`-${zstdLevel}`, '-q', '-f', '-o', outputPath],
    { stdio: ['pipe', 'inherit', 'inherit'] },
  );

  const decompressDone = waitForExit(decompress, 'zstd -d');
  const compressDone = waitForExit(compress, 'zstd -c');
  // A child can fail long before we reach the final `await` below (bad output
  // directory, corrupt input, ...). Deriving these never-rejecting promises
  // right away marks the originals as handled, so an early failure does not
  // surface as an unhandled rejection; they are also what the error path
  // awaits to reap the children.
  const decompressSettled = decompressDone.catch(() => undefined);
  const compressSettled = compressDone.catch(() => undefined);

  /** Builds an Error carrying `cause` without relying on the ES2022 ctor option. */
  const withCause = (message: string, cause: unknown): Error => {
    const wrapped = new Error(message);
    (wrapped as { cause?: unknown }).cause = cause;
    return wrapped;
  };

  // First error reported by the compressor's stdin. The listener is permanent
  // so that an EPIPE arriving between writes is recorded instead of being
  // thrown as an uncaught 'error' event.
  const stdinState: { failure: Error | undefined } = { failure: undefined };
  compress.stdin.on('error', (err: Error) => {
    if (stdinState.failure === undefined) {
      stdinState.failure = withCause(
        `transformJsonlZst: writing to zstd -c failed: ${err.message}`,
        err,
      );
    }
  });
  const stdinGone = (): Error =>
    stdinState.failure ??
    new Error('transformJsonlZst: zstd -c closed its stdin before all data was written');

  /** Writes one chunk to the compressor, waiting for 'drain' when the pipe is full. */
  const writeChunk = (chunk: Buffer | string): Promise<void> => {
    // Writing to a failed or destroyed stream would never produce a 'drain'.
    if (stdinState.failure !== undefined || compress.stdin.destroyed) {
      return Promise.reject(stdinGone());
    }
    if (compress.stdin.write(chunk)) return Promise.resolve();
    return new Promise<void>((resolve, reject) => {
      const cleanup = () => {
        compress.stdin.off('drain', onDrain);
        compress.stdin.off('error', onError);
        compress.stdin.off('close', onClose);
      };
      const onDrain = () => {
        cleanup();
        resolve();
      };
      const onError = (err: Error) => {
        cleanup();
        reject(stdinState.failure ?? err);
      };
      // If the compressor exits while we are back-pressured its stdin is
      // destroyed without ever draining; without this the wait never ends.
      const onClose = () => {
        cleanup();
        reject(stdinGone());
      };
      compress.stdin.once('drain', onDrain);
      compress.stdin.once('error', onError);
      compress.stdin.once('close', onClose);
    });
  };

  let linesRead = 0;
  let linesWritten = 0;

  try {
    if (!anyMapper) {
      // Identity mode: byte-pipe. Chunks are forwarded as they arrive; nothing
      // is split into lines, decoded, or accumulated.
      for await (const chunk of decompress.stdout!) {
        await writeChunk(chunk);
      }
    } else {
      // Mapper mode: split on LF by scanning raw bytes. Bytes of a line that
      // spans several chunks are parked in `pending` and concatenated once,
      // when the terminating LF (or end of stream) is reached.
      let pending: Buffer[] = [];
      let pendingLen = 0;

      /** Emits one output line (content + LF) and counts it. */
      const writeMapperResult = async (out: Buffer | string): Promise<void> => {
        await writeChunk(out);
        await writeChunk(LF_BUFFER);
        linesWritten++;
      };

      /** Runs the mapper on one complete input line and writes its result. */
      const flushLine = async (lineBuf: Buffer): Promise<void> => {
        // Strip trailing CR for CRLF tolerance, matching readline behaviour.
        const trimmed =
          lineBuf.length > 0 && lineBuf[lineBuf.length - 1] === 0x0d
            ? lineBuf.subarray(0, lineBuf.length - 1)
            : lineBuf;

        let result: string | string[] | Buffer | Array<Buffer | string> | null | undefined;
        if (binaryMapper) {
          result = binaryMapper(trimmed, linesRead);
        } else {
          // String mapper: decode the line. Lines above V8's string cap throw
          // ERR_STRING_TOO_LONG here — rethrow with a pointer to
          // `binaryMapper` so the failure mode is obvious.
          let lineStr: string;
          try {
            lineStr = trimmed.toString('utf8');
          } catch (err) {
            if (
              err instanceof Error &&
              (err as NodeJS.ErrnoException).code === 'ERR_STRING_TOO_LONG'
            ) {
              throw withCause(
                `transformJsonlZst: line ${linesRead} is ${trimmed.length} bytes — ` +
                  `exceeds V8 max JS string length (~512 MB). Use the ` +
                  '`binaryMapper` option to receive the line as a Buffer.',
                err,
              );
            }
            throw err;
          }
          result = mapper!(lineStr, linesRead);
        }
        linesRead++;

        if (result === null || result === undefined) {
          // drop
        } else if (Array.isArray(result)) {
          for (const out of result) {
            await writeMapperResult(out);
          }
        } else {
          await writeMapperResult(result);
        }

        if (onProgress && linesRead % progressEveryLines === 0) {
          onProgress(linesRead, linesWritten);
        }
      };

      for await (const chunk of decompress.stdout! as AsyncIterable<Buffer>) {
        let start = 0;
        // Buffer.indexOf does the LF scan natively rather than in a JS loop.
        while (start < chunk.length) {
          const lf = chunk.indexOf(LF, start);
          if (lf < 0) {
            // No LF in the remainder — copy it out of the chunk and park it.
            const owned = Buffer.from(chunk.subarray(start));
            pending.push(owned);
            pendingLen += owned.length;
            break;
          }
          const tail = chunk.subarray(start, lf);
          let lineBuf: Buffer;
          if (pendingLen === 0) {
            lineBuf = tail;
          } else {
            pending.push(tail);
            lineBuf = Buffer.concat(pending, pendingLen + tail.length);
            pending = [];
            pendingLen = 0;
          }
          await flushLine(lineBuf);
          start = lf + 1;
        }
      }

      // Trailing line without a terminating LF — still goes through the
      // mapper so input lacking a final newline does not silently lose data.
      if (pendingLen > 0) {
        await flushLine(Buffer.concat(pending, pendingLen));
      }
    }
  } catch (err) {
    // Tear everything down and wait for both children to be reaped so nothing
    // is left running (or rejecting) after this promise has settled.
    compress.stdin.destroy();
    decompress.kill();
    compress.kill();
    await Promise.all([decompressSettled, compressSettled]);
    throw err;
  }

  compress.stdin.end();

  await Promise.all([decompressDone, compressDone]);

  // A write error recorded after the last successful `writeChunk` must not be
  // swallowed even if zstd still managed to exit with code 0.
  if (stdinState.failure !== undefined) throw stdinState.failure;

  // Keyed on `anyMapper`, not `mapper`: a `binaryMapper` run is mapper mode
  // too, so it gets the final progress tick and is not reported as identity.
  if (anyMapper && onProgress) onProgress(linesRead, linesWritten);

  return { linesRead, linesWritten, identityPassthrough: !anyMapper };
}
|
|
266
|
+
|
|
267
|
+
/**
 * Wraps a child process's termination in a promise.
 *
 * Resolves when the child emits `'close'` with exit code 0. Rejects when it
 * emits `'error'` (reported as a spawn failure) or closes with any other exit
 * code or because of a signal.
 *
 * @param child - Process returned by `spawn`.
 * @param label - Human-readable name used as the prefix of error messages.
 */
function waitForExit(
  child: ReturnType<typeof spawn>,
  label: string,
): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    child.on('close', (code, signal) => {
      if (code === 0) {
        resolve();
        return;
      }
      // A null code means the process was terminated by a signal.
      const how = code === null ? `signal ${signal}` : `code ${code}`;
      reject(new Error(`${label} exited with ${how}`));
    });

    child.on('error', (err) => {
      reject(new Error(`${label} failed to spawn: ${err.message}`));
    });
  });
}
|
package/src/types.ts
CHANGED
|
@@ -82,6 +82,29 @@ export interface OptimizeParams {
|
|
|
82
82
|
* Default true. */
|
|
83
83
|
ensureRangeCoverage?: boolean;
|
|
84
84
|
|
|
85
|
+
/** Tier-based only: reshape the high-tier sampling so the per-bucket row
|
|
86
|
+
* counts follow a log-decay curve across Stake hit-rate ranges — each
|
|
87
|
+
* bucket above the lowest one in the high tier targets `ratio × prev`
|
|
88
|
+
* rows. Turns the typical sparse tail of `…18 → 1 → 1 → 1 → 4` into
|
|
89
|
+
* a smooth `…18 → 9 → 4 → 2 → 1` instead.
|
|
90
|
+
*
|
|
91
|
+
* When true and `largePmThreshold` is unset, auto-sets it to
|
|
92
|
+
* `max(50, capPmThreshold / 20)` so the decay covers multiple Stake
|
|
93
|
+
* buckets, not just the single cap bucket. Default false. */
|
|
94
|
+
shapeDistribution?: boolean;
|
|
95
|
+
|
|
96
|
+
/** Tier-based only: ratio between adjacent Stake-bucket row counts when
|
|
97
|
+
* `shapeDistribution=true`. 0.5 = each higher bucket has half the rows
|
|
98
|
+
* of the one below it. Default 0.5. */
|
|
99
|
+
shapeDecayRatio?: number;
|
|
100
|
+
|
|
101
|
+
/** Tier-based only: auto-pick `shapeDecayRatio` by binary search so the
|
|
102
|
+
* achieved CV lands at `targetCV` within `toleranceCV`. Requires
|
|
103
|
+
* `shapeDistribution=true` and a `targetCV > 0`. Runs the full pipeline
|
|
104
|
+
* up to 6 times (one per bisection step) — expect a 5×-6× wall-clock
|
|
105
|
+
* hit on builds where it triggers. Default false. */
|
|
106
|
+
shapeAutoMatchCV?: boolean;
|
|
107
|
+
|
|
85
108
|
/** Tier-based only: minimum fraction of nRowsOut that must be distinct payoutCents
|
|
86
109
|
* values in the output. Stake Engine rejects "Insufficient Unique Events" when
|
|
87
110
|
* too few distinct outcomes exist (same events repeat in a session). Default 0.01
|
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
|
|
2
|
+
import { execFileSync } from 'node:child_process';
|
|
3
|
+
import { mkdtempSync, rmSync, writeFileSync, readFileSync, existsSync, statSync } from 'node:fs';
|
|
4
|
+
import { tmpdir } from 'node:os';
|
|
5
|
+
import { join } from 'node:path';
|
|
6
|
+
|
|
7
|
+
import { transformJsonlZst } from '../src/transform-jsonl-zst.js';
|
|
8
|
+
|
|
9
|
+
let workDir: string;
|
|
10
|
+
|
|
11
|
+
beforeEach(() => {
|
|
12
|
+
workDir = mkdtempSync(join(tmpdir(), 'transform-jsonl-zst-'));
|
|
13
|
+
});
|
|
14
|
+
|
|
15
|
+
afterEach(() => {
|
|
16
|
+
rmSync(workDir, { recursive: true, force: true });
|
|
17
|
+
});
|
|
18
|
+
|
|
19
|
+
/**
 * Test fixture helper: writes `lines` to `<workDir>/<name>.jsonl` (every line
 * LF-terminated; an empty array yields an empty file), compresses it with the
 * `zstd` CLI and returns the path of the resulting `.jsonl.zst`.
 */
function writeJsonlZst(name: string, lines: string[]): string {
  const plainPath = join(workDir, `${name}.jsonl`);
  const compressedPath = join(workDir, `${name}.jsonl.zst`);
  const content = lines.map((line) => `${line}\n`).join('');
  writeFileSync(plainPath, content);
  execFileSync('zstd', ['-q', '-f', '-o', compressedPath, plainPath]);
  return compressedPath;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Test helper: decompresses `zstPath` with the `zstd` CLI and returns its
 * lines. One trailing LF, if present, is not counted as an extra empty line;
 * an empty file yields an empty array.
 */
function readJsonlZst(zstPath: string): string[] {
  // Raise maxBuffer well above the default: some tests round-trip
  // multi-megabyte payloads through this helper.
  const decoded = execFileSync('zstd', ['-dc', '-q', zstPath], {
    maxBuffer: 64 * 1024 * 1024,
  }).toString('utf8');
  if (decoded.length === 0) {
    return [];
  }
  const withoutFinalLf = decoded.endsWith('\n') ? decoded.slice(0, -1) : decoded;
  return withoutFinalLf.split('\n');
}
|
|
36
|
+
|
|
37
|
+
describe('transformJsonlZst', () => {
|
|
38
|
+
it('round-trips identity as a pure byte passthrough', async () => {
|
|
39
|
+
const lines = [
|
|
40
|
+
'{"id":0,"payoutMultiplier":120}',
|
|
41
|
+
'{"id":1,"payoutMultiplier":0}',
|
|
42
|
+
'{"id":2,"payoutMultiplier":500}',
|
|
43
|
+
];
|
|
44
|
+
const input = writeJsonlZst('in', lines);
|
|
45
|
+
const output = join(workDir, 'out.jsonl.zst');
|
|
46
|
+
|
|
47
|
+
const result = await transformJsonlZst({ inputPath: input, outputPath: output });
|
|
48
|
+
|
|
49
|
+
// Identity mode does not split into lines, so the counters stay at zero —
|
|
50
|
+
// by design, to keep the path allocation-free.
|
|
51
|
+
expect(result.identityPassthrough).toBe(true);
|
|
52
|
+
expect(result.linesRead).toBe(0);
|
|
53
|
+
expect(result.linesWritten).toBe(0);
|
|
54
|
+
expect(readJsonlZst(output)).toEqual(lines);
|
|
55
|
+
});
|
|
56
|
+
|
|
57
|
+
it('applies a line mapper as a 1:1 transform', async () => {
|
|
58
|
+
const input = writeJsonlZst('in', [
|
|
59
|
+
'{"id":0,"v":1}',
|
|
60
|
+
'{"id":1,"v":2}',
|
|
61
|
+
'{"id":2,"v":3}',
|
|
62
|
+
]);
|
|
63
|
+
const output = join(workDir, 'out.jsonl.zst');
|
|
64
|
+
|
|
65
|
+
const result = await transformJsonlZst({
|
|
66
|
+
inputPath: input,
|
|
67
|
+
outputPath: output,
|
|
68
|
+
mapper: (line, i) => {
|
|
69
|
+
const obj = JSON.parse(line);
|
|
70
|
+
return JSON.stringify({ ...obj, idx: i, doubled: obj.v * 2 });
|
|
71
|
+
},
|
|
72
|
+
});
|
|
73
|
+
|
|
74
|
+
expect(result.identityPassthrough).toBe(false);
|
|
75
|
+
expect(result.linesWritten).toBe(3);
|
|
76
|
+
expect(readJsonlZst(output)).toEqual([
|
|
77
|
+
'{"id":0,"v":1,"idx":0,"doubled":2}',
|
|
78
|
+
'{"id":1,"v":2,"idx":1,"doubled":4}',
|
|
79
|
+
'{"id":2,"v":3,"idx":2,"doubled":6}',
|
|
80
|
+
]);
|
|
81
|
+
});
|
|
82
|
+
|
|
83
|
+
it('drops lines when mapper returns null', async () => {
|
|
84
|
+
const input = writeJsonlZst('in', [
|
|
85
|
+
'{"keep":true,"id":0}',
|
|
86
|
+
'{"keep":false,"id":1}',
|
|
87
|
+
'{"keep":true,"id":2}',
|
|
88
|
+
'{"keep":false,"id":3}',
|
|
89
|
+
]);
|
|
90
|
+
const output = join(workDir, 'out.jsonl.zst');
|
|
91
|
+
|
|
92
|
+
const result = await transformJsonlZst({
|
|
93
|
+
inputPath: input,
|
|
94
|
+
outputPath: output,
|
|
95
|
+
mapper: (line) => (JSON.parse(line).keep ? line : null),
|
|
96
|
+
});
|
|
97
|
+
|
|
98
|
+
expect(result.linesRead).toBe(4);
|
|
99
|
+
expect(result.linesWritten).toBe(2);
|
|
100
|
+
expect(readJsonlZst(output)).toEqual([
|
|
101
|
+
'{"keep":true,"id":0}',
|
|
102
|
+
'{"keep":true,"id":2}',
|
|
103
|
+
]);
|
|
104
|
+
});
|
|
105
|
+
|
|
106
|
+
it('expands a single input line into multiple outputs when mapper returns an array', async () => {
|
|
107
|
+
const input = writeJsonlZst('in', ['{"id":0}', '{"id":1}']);
|
|
108
|
+
const output = join(workDir, 'out.jsonl.zst');
|
|
109
|
+
|
|
110
|
+
const result = await transformJsonlZst({
|
|
111
|
+
inputPath: input,
|
|
112
|
+
outputPath: output,
|
|
113
|
+
mapper: (line) => [line, line],
|
|
114
|
+
});
|
|
115
|
+
|
|
116
|
+
expect(result.linesRead).toBe(2);
|
|
117
|
+
expect(result.linesWritten).toBe(4);
|
|
118
|
+
expect(readJsonlZst(output)).toEqual([
|
|
119
|
+
'{"id":0}',
|
|
120
|
+
'{"id":0}',
|
|
121
|
+
'{"id":1}',
|
|
122
|
+
'{"id":1}',
|
|
123
|
+
]);
|
|
124
|
+
});
|
|
125
|
+
|
|
126
|
+
it('handles an empty input file', async () => {
|
|
127
|
+
const input = writeJsonlZst('in', []);
|
|
128
|
+
const output = join(workDir, 'out.jsonl.zst');
|
|
129
|
+
|
|
130
|
+
const result = await transformJsonlZst({ inputPath: input, outputPath: output });
|
|
131
|
+
|
|
132
|
+
expect(result.linesRead).toBe(0);
|
|
133
|
+
expect(result.linesWritten).toBe(0);
|
|
134
|
+
expect(existsSync(output)).toBe(true);
|
|
135
|
+
expect(readJsonlZst(output)).toEqual([]);
|
|
136
|
+
});
|
|
137
|
+
|
|
138
|
+
it('emits a trailing line that lacks a final newline', async () => {
|
|
139
|
+
// Build a raw jsonl with no terminating \n, compress it manually so we
|
|
140
|
+
// exercise the trailing-flush path in the mapper branch.
|
|
141
|
+
const jsonlPath = join(workDir, 'no-final-lf.jsonl');
|
|
142
|
+
writeFileSync(jsonlPath, '{"id":0}\n{"id":1}');
|
|
143
|
+
const input = join(workDir, 'no-final-lf.jsonl.zst');
|
|
144
|
+
execFileSync('zstd', ['-q', '-f', '-o', input, jsonlPath]);
|
|
145
|
+
const output = join(workDir, 'out.jsonl.zst');
|
|
146
|
+
|
|
147
|
+
const result = await transformJsonlZst({
|
|
148
|
+
inputPath: input,
|
|
149
|
+
outputPath: output,
|
|
150
|
+
mapper: (line) => line,
|
|
151
|
+
});
|
|
152
|
+
|
|
153
|
+
expect(result.linesRead).toBe(2);
|
|
154
|
+
expect(result.linesWritten).toBe(2);
|
|
155
|
+
expect(readJsonlZst(output)).toEqual(['{"id":0}', '{"id":1}']);
|
|
156
|
+
});
|
|
157
|
+
|
|
158
|
+
it('rejects when the input file does not exist', async () => {
|
|
159
|
+
const output = join(workDir, 'out.jsonl.zst');
|
|
160
|
+
await expect(
|
|
161
|
+
transformJsonlZst({
|
|
162
|
+
inputPath: join(workDir, 'does-not-exist.jsonl.zst'),
|
|
163
|
+
outputPath: output,
|
|
164
|
+
}),
|
|
165
|
+
).rejects.toThrow(/zstd -d/);
|
|
166
|
+
});
|
|
167
|
+
|
|
168
|
+
it('calls onProgress with the running counts (mapper mode)', async () => {
|
|
169
|
+
const lines = Array.from({ length: 250 }, (_, i) => `{"i":${i}}`);
|
|
170
|
+
const input = writeJsonlZst('in', lines);
|
|
171
|
+
const output = join(workDir, 'out.jsonl.zst');
|
|
172
|
+
const calls: Array<[number, number]> = [];
|
|
173
|
+
|
|
174
|
+
const result = await transformJsonlZst({
|
|
175
|
+
inputPath: input,
|
|
176
|
+
outputPath: output,
|
|
177
|
+
mapper: (line) => line,
|
|
178
|
+
progressEveryLines: 100,
|
|
179
|
+
onProgress: (r, w) => calls.push([r, w]),
|
|
180
|
+
});
|
|
181
|
+
|
|
182
|
+
expect(result.linesRead).toBe(250);
|
|
183
|
+
// Mid-stream ticks at 100 / 200 + a final flush at 250.
|
|
184
|
+
expect(calls).toEqual([
|
|
185
|
+
[100, 100],
|
|
186
|
+
[200, 200],
|
|
187
|
+
[250, 250],
|
|
188
|
+
]);
|
|
189
|
+
});
|
|
190
|
+
|
|
191
|
+
it('supports a large stream (identity byte-pipe) without per-line allocations', async () => {
|
|
192
|
+
const N = 50_000;
|
|
193
|
+
const lines = Array.from({ length: N }, (_, i) =>
|
|
194
|
+
JSON.stringify({ id: i, payload: 'x'.repeat(40) }),
|
|
195
|
+
);
|
|
196
|
+
const input = writeJsonlZst('in', lines);
|
|
197
|
+
const output = join(workDir, 'out.jsonl.zst');
|
|
198
|
+
|
|
199
|
+
const result = await transformJsonlZst({ inputPath: input, outputPath: output });
|
|
200
|
+
|
|
201
|
+
expect(result.identityPassthrough).toBe(true);
|
|
202
|
+
const out = readJsonlZst(output);
|
|
203
|
+
expect(out.length).toBe(N);
|
|
204
|
+
expect(out[0]).toBe(lines[0]);
|
|
205
|
+
expect(out[N - 1]).toBe(lines[N - 1]);
|
|
206
|
+
});
|
|
207
|
+
|
|
208
|
+
it('processes a single line larger than a default stream chunk (128 KiB) via mapper', async () => {
|
|
209
|
+
// Construct a line of ~512 KiB so it spans many decompressor chunks. This
|
|
210
|
+
// is the failure mode that `readline += string` hits at scale; the
|
|
211
|
+
// Buffer-based splitter must concatenate transparently.
|
|
212
|
+
const bigLine = '{"id":0,"payload":"' + 'x'.repeat(500_000) + '"}';
|
|
213
|
+
const input = writeJsonlZst('in', [bigLine, '{"id":1}']);
|
|
214
|
+
const output = join(workDir, 'out.jsonl.zst');
|
|
215
|
+
|
|
216
|
+
const sizes: number[] = [];
|
|
217
|
+
const result = await transformJsonlZst({
|
|
218
|
+
inputPath: input,
|
|
219
|
+
outputPath: output,
|
|
220
|
+
mapper: (line) => {
|
|
221
|
+
sizes.push(line.length);
|
|
222
|
+
return line;
|
|
223
|
+
},
|
|
224
|
+
});
|
|
225
|
+
|
|
226
|
+
expect(result.linesRead).toBe(2);
|
|
227
|
+
expect(result.linesWritten).toBe(2);
|
|
228
|
+
expect(sizes[0]).toBe(bigLine.length);
|
|
229
|
+
expect(sizes[1]).toBe('{"id":1}'.length);
|
|
230
|
+
const out = readJsonlZst(output);
|
|
231
|
+
expect(out[0].length).toBe(bigLine.length);
|
|
232
|
+
expect(out[1]).toBe('{"id":1}');
|
|
233
|
+
});
|
|
234
|
+
|
|
235
|
+
it('passes raw Buffer to binaryMapper and lets it emit Buffer or string', async () => {
|
|
236
|
+
const input = writeJsonlZst('in', [
|
|
237
|
+
'{"id":0,"v":1}',
|
|
238
|
+
'{"id":1,"v":2}',
|
|
239
|
+
'{"id":2,"v":3}',
|
|
240
|
+
]);
|
|
241
|
+
const output = join(workDir, 'out.jsonl.zst');
|
|
242
|
+
|
|
243
|
+
const seen: Array<{ isBuffer: boolean; byteLength: number; firstByte: number }> = [];
|
|
244
|
+
const result = await transformJsonlZst({
|
|
245
|
+
inputPath: input,
|
|
246
|
+
outputPath: output,
|
|
247
|
+
binaryMapper: (lineBuf, i) => {
|
|
248
|
+
seen.push({
|
|
249
|
+
isBuffer: Buffer.isBuffer(lineBuf),
|
|
250
|
+
byteLength: lineBuf.length,
|
|
251
|
+
firstByte: lineBuf[0],
|
|
252
|
+
});
|
|
253
|
+
// Mix Buffer + string returns: even indices stay as Buffer, odd as string.
|
|
254
|
+
return i % 2 === 0 ? lineBuf : lineBuf.toString('utf8');
|
|
255
|
+
},
|
|
256
|
+
});
|
|
257
|
+
|
|
258
|
+
expect(result.linesRead).toBe(3);
|
|
259
|
+
expect(result.linesWritten).toBe(3);
|
|
260
|
+
expect(seen.every((s) => s.isBuffer)).toBe(true);
|
|
261
|
+
expect(seen[0].firstByte).toBe('{'.charCodeAt(0));
|
|
262
|
+
expect(readJsonlZst(output)).toEqual([
|
|
263
|
+
'{"id":0,"v":1}',
|
|
264
|
+
'{"id":1,"v":2}',
|
|
265
|
+
'{"id":2,"v":3}',
|
|
266
|
+
]);
|
|
267
|
+
});
|
|
268
|
+
|
|
269
|
+
it('binaryMapper rewrites a multi-megabyte line via prefix-only string conversion', async () => {
|
|
270
|
+
// Mimic the curate use case: id-prefix lookup + verbatim tail. Build a
|
|
271
|
+
// ~3 MB line so the test stays fast but the path is identical to what a
|
|
272
|
+
// 1 GB book line would exercise — only the prefix becomes a string.
|
|
273
|
+
const bigTail = '"events":[' + '0,'.repeat(1_500_000) + '0]';
|
|
274
|
+
const bigLine = `{"id":42,${bigTail}}`;
|
|
275
|
+
const input = writeJsonlZst('in', [
|
|
276
|
+
`{"id":1,"keep":false}`,
|
|
277
|
+
bigLine,
|
|
278
|
+
`{"id":99,"keep":true}`,
|
|
279
|
+
]);
|
|
280
|
+
const output = join(workDir, 'out.jsonl.zst');
|
|
281
|
+
|
|
282
|
+
const selected = new Map<number, number>([
|
|
283
|
+
[42, 0],
|
|
284
|
+
[99, 1],
|
|
285
|
+
]);
|
|
286
|
+
const idPrefix = /^\{"id":(\d+),/;
|
|
287
|
+
|
|
288
|
+
const result = await transformJsonlZst({
|
|
289
|
+
inputPath: input,
|
|
290
|
+
outputPath: output,
|
|
291
|
+
binaryMapper: (lineBuf) => {
|
|
292
|
+
// Peek only the first 32 bytes — works regardless of full line size.
|
|
293
|
+
const head = lineBuf.subarray(0, 32).toString('utf8');
|
|
294
|
+
const m = idPrefix.exec(head);
|
|
295
|
+
if (!m) return null;
|
|
296
|
+
const newId = selected.get(Number(m[1]));
|
|
297
|
+
if (newId === undefined) return null;
|
|
298
|
+
const prefix = Buffer.from(`{"id":${newId},`);
|
|
299
|
+
const tail = lineBuf.subarray(m[0].length);
|
|
300
|
+
return Buffer.concat([prefix, tail], prefix.length + tail.length);
|
|
301
|
+
},
|
|
302
|
+
});
|
|
303
|
+
|
|
304
|
+
expect(result.linesRead).toBe(3);
|
|
305
|
+
expect(result.linesWritten).toBe(2);
|
|
306
|
+
const out = readJsonlZst(output);
|
|
307
|
+
expect(out.length).toBe(2);
|
|
308
|
+
// First written line is the rewritten big one (id 42 → 0).
|
|
309
|
+
expect(out[0].startsWith('{"id":0,"events":[')).toBe(true);
|
|
310
|
+
expect(out[0].length).toBe(bigLine.length - `{"id":42,`.length + `{"id":0,`.length);
|
|
311
|
+
expect(out[1]).toBe('{"id":1,"keep":true}');
|
|
312
|
+
});
|
|
313
|
+
|
|
314
|
+
it('rejects when both mapper and binaryMapper are provided', async () => {
|
|
315
|
+
const input = writeJsonlZst('in', ['{"id":0}']);
|
|
316
|
+
const output = join(workDir, 'out.jsonl.zst');
|
|
317
|
+
await expect(
|
|
318
|
+
transformJsonlZst({
|
|
319
|
+
inputPath: input,
|
|
320
|
+
outputPath: output,
|
|
321
|
+
mapper: (l) => l,
|
|
322
|
+
binaryMapper: (b) => b,
|
|
323
|
+
}),
|
|
324
|
+
).rejects.toThrow(/either.*mapper.*binaryMapper/);
|
|
325
|
+
});
|
|
326
|
+
|
|
327
|
+
it('honors the zstdLevel parameter', async () => {
|
|
328
|
+
const lines = Array.from({ length: 1000 }, (_, i) =>
|
|
329
|
+
JSON.stringify({ id: i, lots: 'of repeating text '.repeat(5) }),
|
|
330
|
+
);
|
|
331
|
+
const input = writeJsonlZst('in', lines);
|
|
332
|
+
const outFast = join(workDir, 'fast.jsonl.zst');
|
|
333
|
+
const outSmall = join(workDir, 'small.jsonl.zst');
|
|
334
|
+
|
|
335
|
+
await transformJsonlZst({ inputPath: input, outputPath: outFast, zstdLevel: 1 });
|
|
336
|
+
await transformJsonlZst({ inputPath: input, outputPath: outSmall, zstdLevel: 19 });
|
|
337
|
+
|
|
338
|
+
expect(readJsonlZst(outFast)).toEqual(readJsonlZst(outSmall));
|
|
339
|
+
const fastSize = statSync(outFast).size;
|
|
340
|
+
const smallSize = statSync(outSmall).size;
|
|
341
|
+
expect(smallSize).toBeLessThanOrEqual(fastSize);
|
|
342
|
+
});
|
|
343
|
+
});
|