@chonkiejs/chunk 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +97 -0
- package/index.js +225 -0
- package/package.json +32 -0
package/README.md
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="../../assets/memchunk_wide.png" alt="@chonkiejs/chunk" width="500">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">@chonkiejs/chunk</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<em>the fastest text chunking library — up to 1 TB/s throughput</em>
|
|
9
|
+
</p>
|
|
10
|
+
|
|
11
|
+
<p align="center">
|
|
12
|
+
<a href="https://crates.io/crates/chunk"><img src="https://img.shields.io/crates/v/chunk.svg?color=e74c3c" alt="crates.io"></a>
|
|
13
|
+
<a href="https://pypi.org/project/chonkie-core"><img src="https://img.shields.io/pypi/v/chonkie-core.svg?color=e67e22" alt="PyPI"></a>
|
|
14
|
+
<a href="https://www.npmjs.com/package/@chonkiejs/chunk"><img src="https://img.shields.io/npm/v/@chonkiejs/chunk.svg?color=2ecc71" alt="npm"></a>
|
|
15
|
+
<a href="https://github.com/chonkie-inc/chunk"><img src="https://img.shields.io/badge/github-chunk-3498db" alt="GitHub"></a>
|
|
16
|
+
<a href="LICENSE-MIT"><img src="https://img.shields.io/badge/license-MIT%2FApache--2.0-9b59b6.svg" alt="License"></a>
|
|
17
|
+
</p>
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
you know how every chunking library claims to be fast? yeah, we actually meant it.
|
|
22
|
+
|
|
23
|
+
**@chonkiejs/chunk** splits text at semantic boundaries (periods, newlines, the usual suspects) and does it stupid fast. we're talking "chunk the entire english wikipedia in 120ms" fast.
|
|
24
|
+
|
|
25
|
+
want to know how? [read the blog post](https://minha.sh/posts/so,-you-want-to-chunk-really-fast) where we nerd out about SIMD instructions and lookup tables.
|
|
26
|
+
|
|
27
|
+
## π¦ installation
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
npm install @chonkiejs/chunk
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
looking for [rust](https://github.com/chonkie-inc/chunk) or [python](https://github.com/chonkie-inc/chunk/tree/main/packages/python)?
|
|
34
|
+
|
|
35
|
+
## π usage
|
|
36
|
+
|
|
37
|
+
```javascript
|
|
38
|
+
import { init, chunk } from '@chonkiejs/chunk';
|
|
39
|
+
|
|
40
|
+
// initialize wasm (required once)
|
|
41
|
+
await init();
|
|
42
|
+
|
|
43
|
+
const text = "Hello world. How are you? I'm fine.\nThanks for asking.";
|
|
44
|
+
|
|
45
|
+
// with defaults (4KB chunks, split at \n . ?)
|
|
46
|
+
for (const slice of chunk(text)) {
|
|
47
|
+
console.log(slice);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
// with custom size
|
|
51
|
+
for (const slice of chunk(text, { size: 1024 })) {
|
|
52
|
+
console.log(slice);
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
// with custom delimiters
|
|
56
|
+
for (const slice of chunk(text, { delimiters: ".?!\n" })) {
|
|
57
|
+
console.log(slice);
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
// with multi-byte pattern (e.g., metaspace ▁ for SentencePiece tokenizers)
|
|
61
|
+
for (const slice of chunk(text, { pattern: "▁", prefix: true })) {
|
|
62
|
+
console.log(slice);
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
// with consecutive pattern handling (split at START of runs, not middle)
|
|
66
|
+
for (const slice of chunk("word next", { pattern: " ", consecutive: true })) {
|
|
67
|
+
console.log(slice);
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
// with forward fallback (search forward if no pattern in backward window)
|
|
71
|
+
for (const slice of chunk(text, { pattern: " ", forwardFallback: true })) {
|
|
72
|
+
console.log(slice);
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
// collect all chunks
|
|
76
|
+
const chunks = [...chunk(text)];
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
pass strings and get strings back. for zero-copy performance with binary data, pass `Uint8Array` and you'll get `Uint8Array` views back.
|
|
80
|
+
|
|
81
|
+
## π citation
|
|
82
|
+
|
|
83
|
+
if you use @chonkiejs/chunk in your research, please cite it as follows:
|
|
84
|
+
|
|
85
|
+
```bibtex
|
|
86
|
+
@software{chunk2025,
|
|
87
|
+
author = {Minhas, Bhavnick},
|
|
88
|
+
title = {chunk: The fastest text chunking library},
|
|
89
|
+
year = {2025},
|
|
90
|
+
publisher = {GitHub},
|
|
91
|
+
howpublished = {\url{https://github.com/chonkie-inc/chunk}},
|
|
92
|
+
}
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## π license
|
|
96
|
+
|
|
97
|
+
licensed under either of [Apache License, Version 2.0](LICENSE-APACHE) or [MIT license](LICENSE-MIT) at your option.
|
package/index.js
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @chonkiejs/chunk - The fastest semantic text chunking library
|
|
3
|
+
*
|
|
4
|
+
* @example
|
|
5
|
+
* ```javascript
|
|
6
|
+
* import { init, chunk } from '@chonkiejs/chunk';
|
|
7
|
+
*
|
|
8
|
+
* await init();
|
|
9
|
+
*
|
|
10
|
+
* // Simple string API - strings in, strings out
|
|
11
|
+
* for (const slice of chunk("Hello. World. Test.", { size: 10 })) {
|
|
12
|
+
* console.log(slice);
|
|
13
|
+
* }
|
|
14
|
+
*
|
|
15
|
+
* // Or use bytes for zero-copy performance
|
|
16
|
+
* const bytes = new TextEncoder().encode("Hello. World.");
|
|
17
|
+
* for (const slice of chunk(bytes, { size: 10 })) {
|
|
18
|
+
* console.log(slice); // Uint8Array
|
|
19
|
+
* }
|
|
20
|
+
* ```
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
import initWasm, {
|
|
24
|
+
Chunker as WasmChunker,
|
|
25
|
+
default_target_size,
|
|
26
|
+
default_delimiters,
|
|
27
|
+
chunk_offsets as wasmChunkOffsets,
|
|
28
|
+
chunk_offsets_pattern as wasmChunkOffsetsPattern,
|
|
29
|
+
} from './pkg/chonkiejs_chunk.js';
|
|
30
|
+
|
|
31
|
+
export { default_target_size, default_delimiters };
|
|
32
|
+
|
|
33
|
+
// Shared codec instances: TextEncoder/TextDecoder carry no per-call state
// for our usage, so a single module-level pair serves every function here.
const encoder = new TextEncoder();
const decoder = new TextDecoder();

/**
 * Normalize input to a byte view.
 * Strings are UTF-8 encoded; Uint8Array inputs pass through unchanged
 * (no copy is made).
 *
 * @param {string | Uint8Array} input
 * @returns {Uint8Array}
 */
function toBytes(input) {
  if (typeof input === 'string') {
    return encoder.encode(input);
  }
  return input;
}
|
|
44
|
+
|
|
45
|
+
/**
 * Split text into chunks at delimiter boundaries.
 * Accepts strings or Uint8Array and yields the same type as the input:
 * string input yields decoded strings, byte input yields zero-copy
 * subarray views into the input buffer.
 *
 * @param {string | Uint8Array} text - The text to chunk
 * @param {Object} [options] - Options
 * @param {number} [options.size=4096] - Target chunk size in bytes
 * @param {string} [options.delimiters="\n.?"] - Delimiter characters
 * @param {string | Uint8Array} [options.pattern] - Multi-byte pattern to split on
 * @param {boolean} [options.prefix=false] - Put delimiter/pattern at start of next chunk
 * @param {boolean} [options.consecutive=false] - Split at START of consecutive runs
 * @param {boolean} [options.forwardFallback=false] - Search forward if no pattern in backward window
 * @yields {string | Uint8Array} Chunks (same type as input)
 *
 * @example
 * // String input yields strings
 * for (const slice of chunk("Hello. World.", { size: 10 })) {
 *   console.log(slice);
 * }
 *
 * @example
 * // With a multi-byte pattern (e.g., metaspace ▁ for SentencePiece)
 * for (const slice of chunk("Hello▁World▁Test", { pattern: "▁", prefix: true })) {
 *   console.log(slice);
 * }
 */
export function* chunk(text, options = {}) {
  const wantStrings = typeof text === 'string';
  const bytes = toBytes(text);
  const { size, delimiters, pattern, prefix, consecutive, forwardFallback } = options;

  // A single WASM call returns a flat typed array of offsets laid out as
  // [start0, end0, start1, end1, ...]; views are materialized JS-side.
  const flat = pattern
    ? wasmChunkOffsetsPattern(bytes, size ?? 4096, toBytes(pattern), prefix, consecutive, forwardFallback)
    : wasmChunkOffsets(bytes, size, delimiters, prefix);

  for (let i = 0; i < flat.length; i += 2) {
    const view = bytes.subarray(flat[i], flat[i + 1]);
    yield wantStrings ? decoder.decode(view) : view;
  }
}
|
|
89
|
+
|
|
90
|
+
/**
 * Get chunk offsets without creating views over the input.
 * Useful when the caller wants to slice the buffer itself.
 *
 * @param {string | Uint8Array} text - The text to chunk
 * @param {Object} [options] - Options
 * @param {number} [options.size=4096] - Target chunk size in bytes
 * @param {string} [options.delimiters="\n.?"] - Delimiter characters
 * @param {string | Uint8Array} [options.pattern] - Multi-byte pattern to split on
 * @param {boolean} [options.prefix=false] - Put delimiter/pattern at start of next chunk
 * @param {boolean} [options.consecutive=false] - Split at START of consecutive runs
 * @param {boolean} [options.forwardFallback=false] - Search forward if no pattern in backward window
 * @returns {Array<[number, number]>} Array of [start, end] byte offset pairs
 */
export function chunk_offsets(text, options = {}) {
  const bytes = toBytes(text);
  const { size, delimiters, pattern, prefix, consecutive, forwardFallback } = options;

  // One WASM call yields a flat [start, end, start, end, ...] offset array.
  const flat = pattern
    ? wasmChunkOffsetsPattern(bytes, size ?? 4096, toBytes(pattern), prefix, consecutive, forwardFallback)
    : wasmChunkOffsets(bytes, size, delimiters, prefix);

  // Re-group the flat array into [start, end] pairs.
  const pairs = [];
  let i = 0;
  while (i < flat.length) {
    pairs.push([flat[i], flat[i + 1]]);
    i += 2;
  }
  return pairs;
}
|
|
122
|
+
|
|
123
|
+
// Cached initialization promise: null until init() is first called, then
// the in-flight (or settled) initWasm() promise. Reset to null on failure
// so a later call can retry.
let initPromise = null;

/**
 * Initialize the WASM module. Must be called (and awaited) before using
 * any chunk functions.
 *
 * Safe to call multiple times, including concurrently: all callers share
 * a single underlying initWasm() invocation. (The previous boolean-flag
 * version only set the flag after the await resolved, so overlapping
 * calls each loaded the module.) If initialization fails, the cache is
 * cleared so the next call retries.
 *
 * @returns {Promise<void>}
 */
export async function init() {
  if (initPromise === null) {
    initPromise = initWasm().catch((err) => {
      initPromise = null; // allow retry after a failed load
      throw err;
    });
  }
  await initPromise;
}
|
|
134
|
+
|
|
135
|
+
/**
 * Chunker splits text at delimiter boundaries, producing one chunk per
 * call to next(). Implements Symbol.iterator so it works directly in
 * for...of loops.
 *
 * @example
 * // String input
 * const chunker = new Chunker("Hello. World. Test.", { size: 10 });
 * for (const slice of chunker) {
 *   console.log(slice); // strings
 * }
 *
 * @example
 * // With a multi-byte pattern
 * const chunker = new Chunker("Hello▁World", { pattern: "▁", prefix: true });
 * for (const slice of chunker) {
 *   console.log(slice);
 * }
 */
export class Chunker {
  /**
   * Create a new Chunker.
   * @param {string | Uint8Array} text - The text to chunk
   * @param {Object} [options] - Options
   * @param {number} [options.size=4096] - Target chunk size in bytes
   * @param {string} [options.delimiters="\n.?"] - Delimiter characters
   * @param {string | Uint8Array} [options.pattern] - Multi-byte pattern to split on
   * @param {boolean} [options.prefix=false] - Put delimiter/pattern at start of next chunk
   * @param {boolean} [options.consecutive=false] - Split at START of consecutive runs
   * @param {boolean} [options.forwardFallback=false] - Search forward if no pattern in backward window
   */
  constructor(text, options = {}) {
    const { size, delimiters, pattern, prefix, consecutive, forwardFallback } = options;
    // Remember the input kind so next() can hand back the same type.
    this._isString = typeof text === 'string';
    const bytes = toBytes(text);

    if (pattern) {
      this._chunker = WasmChunker.with_pattern(
        bytes,
        size ?? 4096,
        toBytes(pattern),
        prefix,
        consecutive,
        forwardFallback,
      );
    } else {
      this._chunker = new WasmChunker(bytes, size, delimiters, prefix);
    }
  }

  /**
   * Get the next chunk, or undefined when exhausted.
   * @returns {string | Uint8Array | undefined}
   */
  next() {
    const raw = this._chunker.next();
    if (raw === undefined) {
      return undefined;
    }
    return this._isString ? decoder.decode(raw) : raw;
  }

  /**
   * Reset the chunker to iterate from the beginning.
   */
  reset() {
    this._chunker.reset();
  }

  /**
   * Collect all chunk offsets as an array of [start, end] pairs.
   * Faster than iterating because it makes a single WASM call.
   * @returns {Array<[number, number]>}
   */
  collectOffsets() {
    const flat = this._chunker.collect_offsets();
    // Re-group the flat [start, end, start, end, ...] array into pairs.
    const pairs = [];
    let i = 0;
    while (i < flat.length) {
      pairs.push([flat[i], flat[i + 1]]);
      i += 2;
    }
    return pairs;
  }

  /**
   * Free the underlying WASM memory.
   */
  free() {
    this._chunker.free();
  }

  /**
   * Iterator protocol - allows use in for...of loops.
   */
  *[Symbol.iterator]() {
    for (;;) {
      const piece = this.next();
      if (piece === undefined) {
        return;
      }
      yield piece;
    }
  }
}
|
package/package.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@chonkiejs/chunk",
|
|
3
|
+
"version": "0.5.0",
|
|
4
|
+
"description": "The fastest semantic text chunking library",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "index.js",
|
|
7
|
+
"files": [
|
|
8
|
+
"index.js",
|
|
9
|
+
"pkg/"
|
|
10
|
+
],
|
|
11
|
+
"scripts": {
|
|
12
|
+
"build": "wasm-pack build --target web",
|
|
13
|
+
"test": "node --test tests/"
|
|
14
|
+
},
|
|
15
|
+
"keywords": [
|
|
16
|
+
"chunking",
|
|
17
|
+
"text",
|
|
18
|
+
"simd",
|
|
19
|
+
"nlp",
|
|
20
|
+
"tokenization",
|
|
21
|
+
"rag",
|
|
22
|
+
"wasm",
|
|
23
|
+
"webassembly",
|
|
24
|
+
"chonkie"
|
|
25
|
+
],
|
|
26
|
+
"author": "Bhavnick Minhas",
|
|
27
|
+
"license": "MIT OR Apache-2.0",
|
|
28
|
+
"repository": {
|
|
29
|
+
"type": "git",
|
|
30
|
+
"url": "https://github.com/chonkie-inc/chunk"
|
|
31
|
+
}
|
|
32
|
+
}
|