memchunk 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +18 -8
  2. package/index.js +103 -22
  3. package/package.json +1 -1
package/README.md CHANGED
@@ -40,33 +40,43 @@ import { init, chunk } from 'memchunk';
40
40
  // initialize wasm (required once)
41
41
  await init();
42
42
 
43
- const text = new TextEncoder().encode("Hello world. How are you? I'm fine.\nThanks for asking.");
43
+ const text = "Hello world. How are you? I'm fine.\nThanks for asking.";
44
44
 
45
45
  // with defaults (4KB chunks, split at \n . ?)
46
46
  for (const slice of chunk(text)) {
47
- console.log(new TextDecoder().decode(slice));
47
+ console.log(slice);
48
48
  }
49
49
 
50
50
  // with custom size
51
51
  for (const slice of chunk(text, { size: 1024 })) {
52
- console.log(new TextDecoder().decode(slice));
52
+ console.log(slice);
53
53
  }
54
54
 
55
55
  // with custom delimiters
56
56
  for (const slice of chunk(text, { delimiters: ".?!\n" })) {
57
- console.log(new TextDecoder().decode(slice));
57
+ console.log(slice);
58
58
  }
59
59
 
60
- // with both
61
- for (const slice of chunk(text, { size: 8192, delimiters: "\n" })) {
62
- console.log(new TextDecoder().decode(slice));
60
+ // with multi-byte pattern (e.g., metaspace ▁ for SentencePiece tokenizers)
61
+ for (const slice of chunk(text, { pattern: "▁", prefix: true })) {
62
+ console.log(slice);
63
+ }
64
+
65
+ // with consecutive pattern handling (split at START of runs, not middle)
66
+ for (const slice of chunk("word   next", { pattern: " ", consecutive: true })) {
67
+ console.log(slice);
68
+ }
69
+
70
+ // with forward fallback (search forward if no pattern in backward window)
71
+ for (const slice of chunk(text, { pattern: " ", forwardFallback: true })) {
72
+ console.log(slice);
63
73
  }
64
74
 
65
75
  // collect all chunks
66
76
  const chunks = [...chunk(text)];
67
77
  ```
68
78
 
69
- chunks are returned as `Uint8Array` subarrays (zero-copy views of the original text).
79
+ pass strings and get strings back. for zero-copy performance with binary data, pass `Uint8Array` and you'll get `Uint8Array` views back.
70
80
 
71
81
  ## 📝 citation
72
82
 
package/index.js CHANGED
@@ -7,9 +7,15 @@
7
7
  *
8
8
  * await init();
9
9
  *
10
- * const text = new TextEncoder().encode("Hello. World. Test.");
11
- * for (const slice of chunk(text, { size: 10, delimiters: "." })) {
12
- * console.log(new TextDecoder().decode(slice));
10
+ * // Simple string API - strings in, strings out
11
+ * for (const slice of chunk("Hello. World. Test.", { size: 10 })) {
12
+ * console.log(slice);
13
+ * }
14
+ *
15
+ * // Or use bytes for zero-copy performance
16
+ * const bytes = new TextEncoder().encode("Hello. World.");
17
+ * for (const slice of chunk(bytes, { size: 10 })) {
18
+ * console.log(slice); // Uint8Array
13
19
  * }
14
20
  * ```
15
21
  */
@@ -19,31 +25,65 @@ import initWasm, {
19
25
  default_target_size,
20
26
  default_delimiters,
21
27
  chunk_offsets as wasmChunkOffsets,
28
+ chunk_offsets_pattern as wasmChunkOffsetsPattern,
22
29
  } from './pkg/memchunk_wasm.js';
23
30
 
24
31
  export { default_target_size, default_delimiters };
25
32
 
33
+ const encoder = new TextEncoder();
34
+ const decoder = new TextDecoder();
35
+
36
+ /**
37
+ * Convert input to bytes if it's a string.
38
+ * @param {string | Uint8Array} input
39
+ * @returns {Uint8Array}
40
+ */
41
+ function toBytes(input) {
42
+ return typeof input === 'string' ? encoder.encode(input) : input;
43
+ }
44
+
26
45
  /**
27
46
  * Split text into chunks at delimiter boundaries.
28
- * Returns an iterator of zero-copy Uint8Array subarray views.
47
+ * Accepts strings or Uint8Array. Returns the same type as input.
29
48
  *
30
- * @param {Uint8Array} text - The text to chunk as bytes
49
+ * @param {string | Uint8Array} text - The text to chunk
31
50
  * @param {Object} [options] - Options
32
51
  * @param {number} [options.size=4096] - Target chunk size in bytes
33
52
  * @param {string} [options.delimiters="\n.?"] - Delimiter characters
34
- * @yields {Uint8Array} Zero-copy subarray views of the original text
53
+ * @param {string | Uint8Array} [options.pattern] - Multi-byte pattern to split on
54
+ * @param {boolean} [options.prefix=false] - Put delimiter/pattern at start of next chunk
55
+ * @param {boolean} [options.consecutive=false] - Split at START of consecutive runs
56
+ * @param {boolean} [options.forwardFallback=false] - Search forward if no pattern in backward window
57
+ * @yields {string | Uint8Array} Chunks (same type as input)
58
+ *
59
+ * @example
60
+ * // String input returns strings
61
+ * for (const slice of chunk("Hello. World.", { size: 10 })) {
62
+ * console.log(slice);
63
+ * }
35
64
  *
36
65
  * @example
37
- * const text = new TextEncoder().encode("Hello. World. Test.");
38
- * for (const slice of chunk(text, { size: 10, delimiters: "." })) {
39
- * console.log(new TextDecoder().decode(slice));
66
+ * // With pattern (e.g., metaspace for SentencePiece)
67
+ * for (const slice of chunk("Hello▁World▁Test", { pattern: "▁", prefix: true })) {
68
+ * console.log(slice);
40
69
  * }
41
70
  */
42
71
  export function* chunk(text, options = {}) {
43
- const { size, delimiters } = options;
44
- const flat = wasmChunkOffsets(text, size, delimiters);
72
+ const isString = typeof text === 'string';
73
+ const bytes = toBytes(text);
74
+ const { size, delimiters, pattern, prefix, consecutive, forwardFallback } = options;
75
+
76
+ let flat;
77
+ if (pattern) {
78
+ const patternBytes = toBytes(pattern);
79
+ flat = wasmChunkOffsetsPattern(bytes, size ?? 4096, patternBytes, prefix, consecutive, forwardFallback);
80
+ } else {
81
+ flat = wasmChunkOffsets(bytes, size, delimiters, prefix);
82
+ }
83
+
45
84
  for (let i = 0; i < flat.length; i += 2) {
46
- yield text.subarray(flat[i], flat[i + 1]);
85
+ const slice = bytes.subarray(flat[i], flat[i + 1]);
86
+ yield isString ? decoder.decode(slice) : slice;
47
87
  }
48
88
  }
49
89
 
@@ -51,15 +91,28 @@ export function* chunk(text, options = {}) {
51
91
  * Get chunk offsets without creating views.
52
92
  * Returns an array of [start, end] offset pairs.
53
93
  *
54
- * @param {Uint8Array} text - The text to chunk as bytes
94
+ * @param {string | Uint8Array} text - The text to chunk
55
95
  * @param {Object} [options] - Options
56
96
  * @param {number} [options.size=4096] - Target chunk size in bytes
57
97
  * @param {string} [options.delimiters="\n.?"] - Delimiter characters
58
- * @returns {Array<[number, number]>} Array of [start, end] offset pairs
98
+ * @param {string | Uint8Array} [options.pattern] - Multi-byte pattern to split on
99
+ * @param {boolean} [options.prefix=false] - Put delimiter/pattern at start of next chunk
100
+ * @param {boolean} [options.consecutive=false] - Split at START of consecutive runs
101
+ * @param {boolean} [options.forwardFallback=false] - Search forward if no pattern in backward window
102
+ * @returns {Array<[number, number]>} Array of [start, end] byte offset pairs
59
103
  */
60
104
  export function chunk_offsets(text, options = {}) {
61
- const { size, delimiters } = options;
62
- const flat = wasmChunkOffsets(text, size, delimiters);
105
+ const bytes = toBytes(text);
106
+ const { size, delimiters, pattern, prefix, consecutive, forwardFallback } = options;
107
+
108
+ let flat;
109
+ if (pattern) {
110
+ const patternBytes = toBytes(pattern);
111
+ flat = wasmChunkOffsetsPattern(bytes, size ?? 4096, patternBytes, prefix, consecutive, forwardFallback);
112
+ } else {
113
+ flat = wasmChunkOffsets(bytes, size, delimiters, prefix);
114
+ }
115
+
63
116
  const pairs = [];
64
117
  for (let i = 0; i < flat.length; i += 2) {
65
118
  pairs.push([flat[i], flat[i + 1]]);
@@ -82,26 +135,54 @@ export async function init() {
82
135
  /**
83
136
  * Chunker splits text at delimiter boundaries.
84
137
  * Implements Symbol.iterator for use in for...of loops.
138
+ *
139
+ * @example
140
+ * // String input
141
+ * const chunker = new Chunker("Hello. World. Test.", { size: 10 });
142
+ * for (const slice of chunker) {
143
+ * console.log(slice); // strings
144
+ * }
145
+ *
146
+ * @example
147
+ * // With pattern
148
+ * const chunker = new Chunker("Hello▁World", { pattern: "▁", prefix: true });
149
+ * for (const slice of chunker) {
150
+ * console.log(slice);
151
+ * }
85
152
  */
86
153
  export class Chunker {
87
154
  /**
88
155
  * Create a new Chunker.
89
- * @param {Uint8Array} text - The text to chunk as bytes
156
+ * @param {string | Uint8Array} text - The text to chunk
90
157
  * @param {Object} [options] - Options
91
158
  * @param {number} [options.size=4096] - Target chunk size in bytes
92
159
  * @param {string} [options.delimiters="\n.?"] - Delimiter characters
160
+ * @param {string | Uint8Array} [options.pattern] - Multi-byte pattern to split on
161
+ * @param {boolean} [options.prefix=false] - Put delimiter/pattern at start of next chunk
162
+ * @param {boolean} [options.consecutive=false] - Split at START of consecutive runs
163
+ * @param {boolean} [options.forwardFallback=false] - Search forward if no pattern in backward window
93
164
  */
94
165
  constructor(text, options = {}) {
95
- const { size, delimiters } = options;
96
- this._chunker = new WasmChunker(text, size, delimiters);
166
+ this._isString = typeof text === 'string';
167
+ const bytes = toBytes(text);
168
+ const { size, delimiters, pattern, prefix, consecutive, forwardFallback } = options;
169
+
170
+ if (pattern) {
171
+ const patternBytes = toBytes(pattern);
172
+ this._chunker = WasmChunker.with_pattern(bytes, size ?? 4096, patternBytes, prefix, consecutive, forwardFallback);
173
+ } else {
174
+ this._chunker = new WasmChunker(bytes, size, delimiters, prefix);
175
+ }
97
176
  }
98
177
 
99
178
  /**
100
179
  * Get the next chunk, or undefined if exhausted.
101
- * @returns {Uint8Array | undefined}
180
+ * @returns {string | Uint8Array | undefined}
102
181
  */
103
182
  next() {
104
- return this._chunker.next();
183
+ const chunk = this._chunker.next();
184
+ if (chunk === undefined) return undefined;
185
+ return this._isString ? decoder.decode(chunk) : chunk;
105
186
  }
106
187
 
107
188
  /**
@@ -138,7 +219,7 @@ export class Chunker {
138
219
  *[Symbol.iterator]() {
139
220
  let chunk;
140
221
  while ((chunk = this._chunker.next()) !== undefined) {
141
- yield chunk;
222
+ yield this._isString ? decoder.decode(chunk) : chunk;
142
223
  }
143
224
  }
144
225
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "memchunk",
3
- "version": "0.2.0",
3
+ "version": "0.3.0",
4
4
  "description": "The fastest semantic text chunking library",
5
5
  "type": "module",
6
6
  "main": "index.js",