memchunk 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -8
- package/index.js +103 -22
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -40,33 +40,43 @@ import { init, chunk } from 'memchunk';
|
|
|
40
40
|
// initialize wasm (required once)
|
|
41
41
|
await init();
|
|
42
42
|
|
|
43
|
-
const text =
|
|
43
|
+
const text = "Hello world. How are you? I'm fine.\nThanks for asking.";
|
|
44
44
|
|
|
45
45
|
// with defaults (4KB chunks, split at \n . ?)
|
|
46
46
|
for (const slice of chunk(text)) {
|
|
47
|
-
console.log(
|
|
47
|
+
console.log(slice);
|
|
48
48
|
}
|
|
49
49
|
|
|
50
50
|
// with custom size
|
|
51
51
|
for (const slice of chunk(text, { size: 1024 })) {
|
|
52
|
-
console.log(
|
|
52
|
+
console.log(slice);
|
|
53
53
|
}
|
|
54
54
|
|
|
55
55
|
// with custom delimiters
|
|
56
56
|
for (const slice of chunk(text, { delimiters: ".?!\n" })) {
|
|
57
|
-
console.log(
|
|
57
|
+
console.log(slice);
|
|
58
58
|
}
|
|
59
59
|
|
|
60
|
-
// with
|
|
61
|
-
for (const slice of chunk(text, {
|
|
62
|
-
console.log(
|
|
60
|
+
// with multi-byte pattern (e.g., metaspace ▁ for SentencePiece tokenizers)
|
|
61
|
+
for (const slice of chunk(text, { pattern: "▁", prefix: true })) {
|
|
62
|
+
console.log(slice);
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
// with consecutive pattern handling (split at START of runs, not middle)
|
|
66
|
+
for (const slice of chunk("word next", { pattern: " ", consecutive: true })) {
|
|
67
|
+
console.log(slice);
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
// with forward fallback (search forward if no pattern is found in the backward window)
|
|
71
|
+
for (const slice of chunk(text, { pattern: " ", forwardFallback: true })) {
|
|
72
|
+
console.log(slice);
|
|
63
73
|
}
|
|
64
74
|
|
|
65
75
|
// collect all chunks
|
|
66
76
|
const chunks = [...chunk(text)];
|
|
67
77
|
```
|
|
68
78
|
|
|
69
|
-
|
|
79
|
+
pass strings and get strings back. for zero-copy performance with binary data, pass `Uint8Array` and you'll get `Uint8Array` views back.
|
|
70
80
|
|
|
71
81
|
## 📝 citation
|
|
72
82
|
|
package/index.js
CHANGED
|
@@ -7,9 +7,15 @@
|
|
|
7
7
|
*
|
|
8
8
|
* await init();
|
|
9
9
|
*
|
|
10
|
-
*
|
|
11
|
-
* for (const slice of chunk(
|
|
12
|
-
* console.log(
|
|
10
|
+
* // Simple string API - strings in, strings out
|
|
11
|
+
* for (const slice of chunk("Hello. World. Test.", { size: 10 })) {
|
|
12
|
+
* console.log(slice);
|
|
13
|
+
* }
|
|
14
|
+
*
|
|
15
|
+
* // Or use bytes for zero-copy performance
|
|
16
|
+
* const bytes = new TextEncoder().encode("Hello. World.");
|
|
17
|
+
* for (const slice of chunk(bytes, { size: 10 })) {
|
|
18
|
+
* console.log(slice); // Uint8Array
|
|
13
19
|
* }
|
|
14
20
|
* ```
|
|
15
21
|
*/
|
|
@@ -19,31 +25,65 @@ import initWasm, {
|
|
|
19
25
|
default_target_size,
|
|
20
26
|
default_delimiters,
|
|
21
27
|
chunk_offsets as wasmChunkOffsets,
|
|
28
|
+
chunk_offsets_pattern as wasmChunkOffsetsPattern,
|
|
22
29
|
} from './pkg/memchunk_wasm.js';
|
|
23
30
|
|
|
24
31
|
export { default_target_size, default_delimiters };
|
|
25
32
|
|
|
33
|
+
const encoder = new TextEncoder();
|
|
34
|
+
const decoder = new TextDecoder();
|
|
35
|
+
|
|
36
|
+
/**
|
|
37
|
+
* Convert input to bytes if it's a string.
|
|
38
|
+
* @param {string | Uint8Array} input
|
|
39
|
+
* @returns {Uint8Array}
|
|
40
|
+
*/
|
|
41
|
+
function toBytes(input) {
|
|
42
|
+
return typeof input === 'string' ? encoder.encode(input) : input;
|
|
43
|
+
}
|
|
44
|
+
|
|
26
45
|
/**
|
|
27
46
|
* Split text into chunks at delimiter boundaries.
|
|
28
|
-
*
|
|
47
|
+
* Accepts strings or Uint8Array. Returns the same type as input.
|
|
29
48
|
*
|
|
30
|
-
* @param {Uint8Array} text - The text to chunk
|
|
49
|
+
* @param {string | Uint8Array} text - The text to chunk
|
|
31
50
|
* @param {Object} [options] - Options
|
|
32
51
|
* @param {number} [options.size=4096] - Target chunk size in bytes
|
|
33
52
|
* @param {string} [options.delimiters="\n.?"] - Delimiter characters
|
|
34
|
-
* @
|
|
53
|
+
* @param {string | Uint8Array} [options.pattern] - Multi-byte pattern to split on (takes precedence over `delimiters`)
|
|
54
|
+
* @param {boolean} [options.prefix=false] - Put delimiter/pattern at start of next chunk
|
|
55
|
+
* @param {boolean} [options.consecutive=false] - Split at START of consecutive runs
|
|
56
|
+
* @param {boolean} [options.forwardFallback=false] - Search forward if no pattern is found in the backward window
|
|
57
|
+
* @yields {string | Uint8Array} Chunks (same type as input)
|
|
58
|
+
*
|
|
59
|
+
* @example
|
|
60
|
+
* // String input returns strings
|
|
61
|
+
* for (const slice of chunk("Hello. World.", { size: 10 })) {
|
|
62
|
+
* console.log(slice);
|
|
63
|
+
* }
|
|
35
64
|
*
|
|
36
65
|
* @example
|
|
37
|
-
*
|
|
38
|
-
* for (const slice of chunk(
|
|
39
|
-
* console.log(
|
|
66
|
+
* // With pattern (e.g., metaspace for SentencePiece)
|
|
67
|
+
* for (const slice of chunk("Hello▁World▁Test", { pattern: "▁", prefix: true })) {
|
|
68
|
+
* console.log(slice);
|
|
40
69
|
* }
|
|
41
70
|
*/
|
|
42
71
|
export function* chunk(text, options = {}) {
|
|
43
|
-
const
|
|
44
|
-
const
|
|
72
|
+
const isString = typeof text === 'string';
|
|
73
|
+
const bytes = toBytes(text);
|
|
74
|
+
const { size, delimiters, pattern, prefix, consecutive, forwardFallback } = options;
|
|
75
|
+
|
|
76
|
+
let flat;
|
|
77
|
+
if (pattern) {
|
|
78
|
+
const patternBytes = toBytes(pattern);
|
|
79
|
+
flat = wasmChunkOffsetsPattern(bytes, size ?? 4096, patternBytes, prefix, consecutive, forwardFallback);
|
|
80
|
+
} else {
|
|
81
|
+
flat = wasmChunkOffsets(bytes, size, delimiters, prefix);
|
|
82
|
+
}
|
|
83
|
+
|
|
45
84
|
for (let i = 0; i < flat.length; i += 2) {
|
|
46
|
-
|
|
85
|
+
const slice = bytes.subarray(flat[i], flat[i + 1]);
|
|
86
|
+
yield isString ? decoder.decode(slice) : slice;
|
|
47
87
|
}
|
|
48
88
|
}
|
|
49
89
|
|
|
@@ -51,15 +91,28 @@ export function* chunk(text, options = {}) {
|
|
|
51
91
|
* Get chunk offsets without creating views.
|
|
52
92
|
* Returns an array of [start, end] offset pairs.
|
|
53
93
|
*
|
|
54
|
-
* @param {Uint8Array} text - The text to chunk
|
|
94
|
+
* @param {string | Uint8Array} text - The text to chunk
|
|
55
95
|
* @param {Object} [options] - Options
|
|
56
96
|
* @param {number} [options.size=4096] - Target chunk size in bytes
|
|
57
97
|
* @param {string} [options.delimiters="\n.?"] - Delimiter characters
|
|
58
|
-
* @
|
|
98
|
+
* @param {string | Uint8Array} [options.pattern] - Multi-byte pattern to split on (takes precedence over `delimiters`)
|
|
99
|
+
* @param {boolean} [options.prefix=false] - Put delimiter/pattern at start of next chunk
|
|
100
|
+
* @param {boolean} [options.consecutive=false] - Split at START of consecutive runs
|
|
101
|
+
* @param {boolean} [options.forwardFallback=false] - Search forward if no pattern is found in the backward window
|
|
102
|
+
* @returns {Array<[number, number]>} Array of [start, end] byte offset pairs
|
|
59
103
|
*/
|
|
60
104
|
export function chunk_offsets(text, options = {}) {
|
|
61
|
-
const
|
|
62
|
-
const
|
|
105
|
+
const bytes = toBytes(text);
|
|
106
|
+
const { size, delimiters, pattern, prefix, consecutive, forwardFallback } = options;
|
|
107
|
+
|
|
108
|
+
let flat;
|
|
109
|
+
if (pattern) {
|
|
110
|
+
const patternBytes = toBytes(pattern);
|
|
111
|
+
flat = wasmChunkOffsetsPattern(bytes, size ?? 4096, patternBytes, prefix, consecutive, forwardFallback);
|
|
112
|
+
} else {
|
|
113
|
+
flat = wasmChunkOffsets(bytes, size, delimiters, prefix);
|
|
114
|
+
}
|
|
115
|
+
|
|
63
116
|
const pairs = [];
|
|
64
117
|
for (let i = 0; i < flat.length; i += 2) {
|
|
65
118
|
pairs.push([flat[i], flat[i + 1]]);
|
|
@@ -82,26 +135,54 @@ export async function init() {
|
|
|
82
135
|
/**
|
|
83
136
|
* Chunker splits text at delimiter boundaries.
|
|
84
137
|
* Implements Symbol.iterator for use in for...of loops.
|
|
138
|
+
*
|
|
139
|
+
* @example
|
|
140
|
+
* // String input
|
|
141
|
+
* const chunker = new Chunker("Hello. World. Test.", { size: 10 });
|
|
142
|
+
* for (const slice of chunker) {
|
|
143
|
+
* console.log(slice); // strings
|
|
144
|
+
* }
|
|
145
|
+
*
|
|
146
|
+
* @example
|
|
147
|
+
* // With pattern
|
|
148
|
+
* const chunker = new Chunker("Hello▁World", { pattern: "▁", prefix: true });
|
|
149
|
+
* for (const slice of chunker) {
|
|
150
|
+
* console.log(slice);
|
|
151
|
+
* }
|
|
85
152
|
*/
|
|
86
153
|
export class Chunker {
|
|
87
154
|
/**
|
|
88
155
|
* Create a new Chunker.
|
|
89
|
-
* @param {Uint8Array} text - The text to chunk
|
|
156
|
+
* @param {string | Uint8Array} text - The text to chunk
|
|
90
157
|
* @param {Object} [options] - Options
|
|
91
158
|
* @param {number} [options.size=4096] - Target chunk size in bytes
|
|
92
159
|
* @param {string} [options.delimiters="\n.?"] - Delimiter characters
|
|
160
|
+
* @param {string | Uint8Array} [options.pattern] - Multi-byte pattern to split on (takes precedence over `delimiters`)
|
|
161
|
+
* @param {boolean} [options.prefix=false] - Put delimiter/pattern at start of next chunk
|
|
162
|
+
* @param {boolean} [options.consecutive=false] - Split at START of consecutive runs
|
|
163
|
+
* @param {boolean} [options.forwardFallback=false] - Search forward if no pattern is found in the backward window
|
|
93
164
|
*/
|
|
94
165
|
constructor(text, options = {}) {
|
|
95
|
-
|
|
96
|
-
|
|
166
|
+
this._isString = typeof text === 'string';
|
|
167
|
+
const bytes = toBytes(text);
|
|
168
|
+
const { size, delimiters, pattern, prefix, consecutive, forwardFallback } = options;
|
|
169
|
+
|
|
170
|
+
if (pattern) {
|
|
171
|
+
const patternBytes = toBytes(pattern);
|
|
172
|
+
this._chunker = WasmChunker.with_pattern(bytes, size ?? 4096, patternBytes, prefix, consecutive, forwardFallback);
|
|
173
|
+
} else {
|
|
174
|
+
this._chunker = new WasmChunker(bytes, size, delimiters, prefix);
|
|
175
|
+
}
|
|
97
176
|
}
|
|
98
177
|
|
|
99
178
|
/**
|
|
100
179
|
* Get the next chunk, or undefined if exhausted.
|
|
101
|
-
* @returns {Uint8Array | undefined}
|
|
180
|
+
* @returns {string | Uint8Array | undefined}
|
|
102
181
|
*/
|
|
103
182
|
next() {
|
|
104
|
-
|
|
183
|
+
const chunk = this._chunker.next();
|
|
184
|
+
if (chunk === undefined) return undefined;
|
|
185
|
+
return this._isString ? decoder.decode(chunk) : chunk;
|
|
105
186
|
}
|
|
106
187
|
|
|
107
188
|
/**
|
|
@@ -138,7 +219,7 @@ export class Chunker {
|
|
|
138
219
|
*[Symbol.iterator]() {
|
|
139
220
|
let chunk;
|
|
140
221
|
while ((chunk = this._chunker.next()) !== undefined) {
|
|
141
|
-
yield chunk;
|
|
222
|
+
yield this._isString ? decoder.decode(chunk) : chunk;
|
|
142
223
|
}
|
|
143
224
|
}
|
|
144
225
|
}
|