@chonkiejs/chunk 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +97 -0
  2. package/index.js +225 -0
  3. package/package.json +32 -0
package/README.md ADDED
@@ -0,0 +1,97 @@
1
+ <p align="center">
2
+ <img src="../../assets/memchunk_wide.png" alt="@chonkiejs/chunk" width="500">
3
+ </p>
4
+
5
+ <h1 align="center">@chonkiejs/chunk</h1>
6
+
7
+ <p align="center">
8
+ <em>the fastest text chunking library — up to 1 TB/s throughput</em>
9
+ </p>
10
+
11
+ <p align="center">
12
+ <a href="https://crates.io/crates/chunk"><img src="https://img.shields.io/crates/v/chunk.svg?color=e74c3c" alt="crates.io"></a>
13
+ <a href="https://pypi.org/project/chonkie-core"><img src="https://img.shields.io/pypi/v/chonkie-core.svg?color=e67e22" alt="PyPI"></a>
14
+ <a href="https://www.npmjs.com/package/@chonkiejs/chunk"><img src="https://img.shields.io/npm/v/@chonkiejs/chunk.svg?color=2ecc71" alt="npm"></a>
15
+ <a href="https://github.com/chonkie-inc/chunk"><img src="https://img.shields.io/badge/github-chunk-3498db" alt="GitHub"></a>
16
+ <a href="LICENSE-MIT"><img src="https://img.shields.io/badge/license-MIT%2FApache--2.0-9b59b6.svg" alt="License"></a>
17
+ </p>
18
+
19
+ ---
20
+
21
+ you know how every chunking library claims to be fast? yeah, we actually meant it.
22
+
23
+ **@chonkiejs/chunk** splits text at semantic boundaries (periods, newlines, the usual suspects) and does it stupid fast. we're talking "chunk the entire english wikipedia in 120ms" fast.
24
+
25
+ want to know how? [read the blog post](https://minha.sh/posts/so,-you-want-to-chunk-really-fast) where we nerd out about SIMD instructions and lookup tables.
26
+
27
+ ## 📦 installation
28
+
29
+ ```bash
30
+ npm install @chonkiejs/chunk
31
+ ```
32
+
33
+ looking for [rust](https://github.com/chonkie-inc/chunk) or [python](https://github.com/chonkie-inc/chunk/tree/main/packages/python)?
34
+
35
+ ## 🚀 usage
36
+
37
+ ```javascript
38
+ import { init, chunk } from '@chonkiejs/chunk';
39
+
40
+ // initialize wasm (required once)
41
+ await init();
42
+
43
+ const text = "Hello world. How are you? I'm fine.\nThanks for asking.";
44
+
45
+ // with defaults (4KB chunks, split at \n . ?)
46
+ for (const slice of chunk(text)) {
47
+ console.log(slice);
48
+ }
49
+
50
+ // with custom size
51
+ for (const slice of chunk(text, { size: 1024 })) {
52
+ console.log(slice);
53
+ }
54
+
55
+ // with custom delimiters
56
+ for (const slice of chunk(text, { delimiters: ".?!\n" })) {
57
+ console.log(slice);
58
+ }
59
+
60
+ // with multi-byte pattern (e.g., metaspace ▁ for SentencePiece tokenizers)
61
+ for (const slice of chunk(text, { pattern: "▁", prefix: true })) {
62
+ console.log(slice);
63
+ }
64
+
65
+ // with consecutive pattern handling (split at START of runs, not middle)
66
+ for (const slice of chunk("word next", { pattern: " ", consecutive: true })) {
67
+ console.log(slice);
68
+ }
69
+
70
+ // with forward fallback (search forward if no pattern in backward window)
71
+ for (const slice of chunk(text, { pattern: " ", forwardFallback: true })) {
72
+ console.log(slice);
73
+ }
74
+
75
+ // collect all chunks
76
+ const chunks = [...chunk(text)];
77
+ ```
78
+
79
+ pass strings and get strings back. for zero-copy performance with binary data, pass `Uint8Array` and you'll get `Uint8Array` views back.
80
+
81
+ ## 📝 citation
82
+
83
+ if you use @chonkiejs/chunk in your research, please cite it as follows:
84
+
85
+ ```bibtex
86
+ @software{chunk2025,
87
+ author = {Minhas, Bhavnick},
88
+ title = {chunk: The fastest text chunking library},
89
+ year = {2025},
90
+ publisher = {GitHub},
91
+ howpublished = {\url{https://github.com/chonkie-inc/chunk}},
92
+ }
93
+ ```
94
+
95
+ ## 📄 license
96
+
97
+ licensed under either of [Apache License, Version 2.0](LICENSE-APACHE) or [MIT license](LICENSE-MIT) at your option.
package/index.js ADDED
@@ -0,0 +1,225 @@
1
+ /**
2
+ * @chonkiejs/chunk - The fastest semantic text chunking library
3
+ *
4
+ * @example
5
+ * ```javascript
6
+ * import { init, chunk } from '@chonkiejs/chunk';
7
+ *
8
+ * await init();
9
+ *
10
+ * // Simple string API - strings in, strings out
11
+ * for (const slice of chunk("Hello. World. Test.", { size: 10 })) {
12
+ * console.log(slice);
13
+ * }
14
+ *
15
+ * // Or use bytes for zero-copy performance
16
+ * const bytes = new TextEncoder().encode("Hello. World.");
17
+ * for (const slice of chunk(bytes, { size: 10 })) {
18
+ * console.log(slice); // Uint8Array
19
+ * }
20
+ * ```
21
+ */
22
+
23
+ import initWasm, {
24
+ Chunker as WasmChunker,
25
+ default_target_size,
26
+ default_delimiters,
27
+ chunk_offsets as wasmChunkOffsets,
28
+ chunk_offsets_pattern as wasmChunkOffsetsPattern,
29
+ } from './pkg/chonkiejs_chunk.js';
30
+
31
+ export { default_target_size, default_delimiters };
32
+
33
// Shared UTF-8 codecs, created once and reused by every call in this module.
const encoder = new TextEncoder();
const decoder = new TextDecoder();

/**
 * Normalise chunker input to a byte array.
 *
 * Strings are UTF-8 encoded with the shared encoder; any other value is
 * handed back untouched (same reference, no copy).
 *
 * @param {string | Uint8Array} input - Text or raw bytes
 * @returns {Uint8Array} Encoded bytes, or `input` itself when it is not a string
 */
function toBytes(input) {
  if (typeof input !== 'string') {
    return input;
  }
  return encoder.encode(input);
}
44
+
45
/**
 * Lazily split text into chunks and yield them one at a time.
 *
 * The input is converted to bytes, the WASM backend computes a flat list of
 * `start, end` byte offsets, and each pair is turned into a chunk. String
 * input yields decoded strings; byte input yields `subarray` views that share
 * memory with the input. Because this is a generator, no work happens until
 * the first value is requested.
 *
 * @param {string | Uint8Array} text - The text to chunk
 * @param {Object} [options] - Options
 * @param {number} [options.size=4096] - Target chunk size in bytes
 * @param {string} [options.delimiters="\n.?"] - Delimiter characters
 * @param {string | Uint8Array} [options.pattern] - Multi-byte pattern to split on;
 *   any truthy value switches from delimiter mode to pattern mode
 * @param {boolean} [options.prefix=false] - Put delimiter/pattern at start of next chunk
 * @param {boolean} [options.consecutive=false] - Split at START of consecutive runs (pattern mode only)
 * @param {boolean} [options.forwardFallback=false] - Search forward if no pattern in backward window (pattern mode only)
 * @yields {string | Uint8Array} Chunks (same type as input)
 *
 * @example
 * for (const slice of chunk("Hello. World.", { size: 10 })) {
 *   console.log(slice);
 * }
 *
 * @example
 * // Pattern mode (e.g., metaspace for SentencePiece)
 * for (const slice of chunk("Hello▁World▁Test", { pattern: "▁", prefix: true })) {
 *   console.log(slice);
 * }
 */
export function* chunk(text, options = {}) {
  const wantsStrings = typeof text === 'string';
  const bytes = toBytes(text);
  const { size, delimiters, pattern, prefix, consecutive, forwardFallback } = options;

  // Pattern mode substitutes 4096 for a missing size here; delimiter mode
  // forwards `size` and `delimiters` as given, even when they are undefined.
  const offsets = pattern
    ? wasmChunkOffsetsPattern(bytes, size ?? 4096, toBytes(pattern), prefix, consecutive, forwardFallback)
    : wasmChunkOffsets(bytes, size, delimiters, prefix);

  // `offsets` is flat: [start0, end0, start1, end1, ...].
  let cursor = 0;
  while (cursor < offsets.length) {
    const view = bytes.subarray(offsets[cursor], offsets[cursor + 1]);
    yield wantsStrings ? decoder.decode(view) : view;
    cursor += 2;
  }
}
89
+
90
/**
 * Compute chunk boundaries only, without materialising any chunk.
 *
 * The input is converted to bytes and handed to the WASM backend, which
 * returns a flat `start, end, start, end, ...` list. That list is regrouped
 * into `[start, end]` pairs. Offsets index into the byte form of the input.
 *
 * @param {string | Uint8Array} text - The text to chunk
 * @param {Object} [options] - Options
 * @param {number} [options.size=4096] - Target chunk size in bytes
 * @param {string} [options.delimiters="\n.?"] - Delimiter characters
 * @param {string | Uint8Array} [options.pattern] - Multi-byte pattern to split on;
 *   any truthy value switches from delimiter mode to pattern mode
 * @param {boolean} [options.prefix=false] - Put delimiter/pattern at start of next chunk
 * @param {boolean} [options.consecutive=false] - Split at START of consecutive runs (pattern mode only)
 * @param {boolean} [options.forwardFallback=false] - Search forward if no pattern in backward window (pattern mode only)
 * @returns {Array<[number, number]>} Array of [start, end] byte offset pairs
 */
export function chunk_offsets(text, options = {}) {
  const bytes = toBytes(text);
  const { size, delimiters, pattern, prefix, consecutive, forwardFallback } = options;

  // Pattern mode substitutes 4096 for a missing size; delimiter mode passes
  // `size` and `delimiters` through unchanged.
  const flat = pattern
    ? wasmChunkOffsetsPattern(bytes, size ?? 4096, toBytes(pattern), prefix, consecutive, forwardFallback)
    : wasmChunkOffsets(bytes, size, delimiters, prefix);

  // One output pair for every two entries of the flat list.
  const pairCount = Math.ceil(flat.length / 2);
  return Array.from({ length: pairCount }, (_, k) => [flat[2 * k], flat[2 * k + 1]]);
}
122
+
123
let initialized = false;
// Promise for the current initialisation attempt, shared by concurrent
// callers. Reset to null when an attempt fails so a later call can retry.
let initPromise = null;

/**
 * Initialize the WASM module. Must be called before using chunk functions.
 *
 * Safe to call repeatedly and concurrently: the underlying initializer runs
 * at most once per attempt, and every caller that arrives while an attempt is
 * in flight awaits that same attempt. After a successful initialisation,
 * further calls return immediately. If an attempt fails, every waiting caller
 * sees the rejection and the next call starts a fresh attempt.
 *
 * @returns {Promise<void>} Resolves once the module is ready
 * @throws Rejects with the error raised by the underlying WASM initializer
 */
export async function init() {
  if (initialized) {
    return;
  }

  if (initPromise === null) {
    const pending = (async () => {
      await initWasm();
      initialized = true;
    })();

    // Drop a failed attempt so it is not cached forever; the identity check
    // avoids clobbering a newer attempt.
    pending.catch(() => {
      if (initPromise === pending) {
        initPromise = null;
      }
    });

    initPromise = pending;
  }

  await initPromise;
}
134
+
135
/**
 * Stateful chunker wrapping the WASM `Chunker`.
 *
 * Chunks are pulled one at a time from the underlying WASM object, either via
 * `next()` or with `for...of`. String input produces strings; byte input
 * produces whatever the WASM chunker returns. Iteration continues from the
 * current position, so call `reset()` before iterating a second time.
 *
 * @example
 * const chunker = new Chunker("Hello. World. Test.", { size: 10 });
 * for (const slice of chunker) {
 *   console.log(slice); // strings
 * }
 *
 * @example
 * // Pattern mode
 * const chunker = new Chunker("Hello▁World", { pattern: "▁", prefix: true });
 * for (const slice of chunker) {
 *   console.log(slice);
 * }
 */
export class Chunker {
  /**
   * Create a new Chunker.
   * @param {string | Uint8Array} text - The text to chunk
   * @param {Object} [options] - Options
   * @param {number} [options.size=4096] - Target chunk size in bytes
   * @param {string} [options.delimiters="\n.?"] - Delimiter characters
   * @param {string | Uint8Array} [options.pattern] - Multi-byte pattern to split on;
   *   any truthy value switches from delimiter mode to pattern mode
   * @param {boolean} [options.prefix=false] - Put delimiter/pattern at start of next chunk
   * @param {boolean} [options.consecutive=false] - Split at START of consecutive runs (pattern mode only)
   * @param {boolean} [options.forwardFallback=false] - Search forward if no pattern in backward window (pattern mode only)
   */
  constructor(text, options = {}) {
    // Remember the input type so results can be decoded back to strings.
    this._isString = typeof text === 'string';
    const bytes = toBytes(text);
    const { size, delimiters, pattern, prefix, consecutive, forwardFallback } = options;

    // Pattern mode substitutes 4096 for a missing size; delimiter mode passes
    // `size` and `delimiters` through unchanged.
    this._chunker = pattern
      ? WasmChunker.with_pattern(bytes, size ?? 4096, toBytes(pattern), prefix, consecutive, forwardFallback)
      : new WasmChunker(bytes, size, delimiters, prefix);
  }

  /**
   * Pull the next chunk.
   * @returns {string | Uint8Array | undefined} The chunk, or undefined once exhausted
   */
  next() {
    const piece = this._chunker.next();
    if (piece !== undefined && this._isString) {
      return decoder.decode(piece);
    }
    return piece;
  }

  /**
   * Reset the chunker to iterate from the beginning.
   */
  reset() {
    this._chunker.reset();
  }

  /**
   * Collect all chunk offsets as an array of [start, end] pairs.
   * Uses a single call into the WASM chunker rather than one per chunk.
   * @returns {Array<[number, number]>}
   */
  collectOffsets() {
    // The WASM side returns a flat list: [start0, end0, start1, end1, ...].
    const flat = this._chunker.collect_offsets();
    const pairCount = Math.ceil(flat.length / 2);
    return Array.from({ length: pairCount }, (_, k) => [flat[2 * k], flat[2 * k + 1]]);
  }

  /**
   * Free the underlying WASM memory.
   */
  free() {
    this._chunker.free();
  }

  /**
   * Iterator protocol - allows use in for...of loops.
   * Yields chunks until the underlying chunker reports exhaustion.
   */
  *[Symbol.iterator]() {
    for (let piece = this._chunker.next(); piece !== undefined; piece = this._chunker.next()) {
      yield this._isString ? decoder.decode(piece) : piece;
    }
  }
}
package/package.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "name": "@chonkiejs/chunk",
3
+ "version": "0.5.0",
4
+ "description": "The fastest semantic text chunking library",
5
+ "type": "module",
6
+ "main": "index.js",
7
+ "files": [
8
+ "index.js",
9
+ "pkg/"
10
+ ],
11
+ "scripts": {
12
+ "build": "wasm-pack build --target web",
13
+ "test": "node --test tests/"
14
+ },
15
+ "keywords": [
16
+ "chunking",
17
+ "text",
18
+ "simd",
19
+ "nlp",
20
+ "tokenization",
21
+ "rag",
22
+ "wasm",
23
+ "webassembly",
24
+ "chonkie"
25
+ ],
26
+ "author": "Bhavnick Minhas",
27
+ "license": "MIT OR Apache-2.0",
28
+ "repository": {
29
+ "type": "git",
30
+ "url": "https://github.com/chonkie-inc/chunk"
31
+ }
32
+ }