@chonkiejs/chunk 0.8.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@chonkiejs/chunk",
3
- "version": "0.8.0",
3
+ "version": "0.9.1",
4
4
  "description": "The fastest semantic text chunking library",
5
5
  "type": "module",
6
6
  "main": "index.js",
package/pkg/README.md ADDED
@@ -0,0 +1,97 @@
1
+ <p align="center">
2
+ <img src="../../assets/memchunk_wide.png" alt="@chonkiejs/chunk" width="500">
3
+ </p>
4
+
5
+ <h1 align="center">@chonkiejs/chunk</h1>
6
+
7
+ <p align="center">
8
+ <em>the fastest text chunking library — up to 1 TB/s throughput</em>
9
+ </p>
10
+
11
+ <p align="center">
12
+ <a href="https://crates.io/crates/chunk"><img src="https://img.shields.io/crates/v/chunk.svg?color=e74c3c" alt="crates.io"></a>
13
+ <a href="https://pypi.org/project/chonkie-core"><img src="https://img.shields.io/pypi/v/chonkie-core.svg?color=e67e22" alt="PyPI"></a>
14
+ <a href="https://www.npmjs.com/package/@chonkiejs/chunk"><img src="https://img.shields.io/npm/v/@chonkiejs/chunk.svg?color=2ecc71" alt="npm"></a>
15
+ <a href="https://github.com/chonkie-inc/chunk"><img src="https://img.shields.io/badge/github-chunk-3498db" alt="GitHub"></a>
16
+ <a href="LICENSE-MIT"><img src="https://img.shields.io/badge/license-MIT%2FApache--2.0-9b59b6.svg" alt="License"></a>
17
+ </p>
18
+
19
+ ---
20
+
21
+ you know how every chunking library claims to be fast? yeah, we actually meant it.
22
+
23
+ **@chonkiejs/chunk** splits text at semantic boundaries (periods, newlines, the usual suspects) and does it stupid fast. we're talking "chunk the entire english wikipedia in 120ms" fast.
24
+
25
+ want to know how? [read the blog post](https://minha.sh/posts/so,-you-want-to-chunk-really-fast) where we nerd out about SIMD instructions and lookup tables.
26
+
27
+ ## 📦 installation
28
+
29
+ ```bash
30
+ npm install @chonkiejs/chunk
31
+ ```
32
+
33
+ looking for [rust](https://github.com/chonkie-inc/chunk) or [python](https://github.com/chonkie-inc/chunk/tree/main/packages/python)?
34
+
35
+ ## 🚀 usage
36
+
37
+ ```javascript
38
+ import { init, chunk } from '@chonkiejs/chunk';
39
+
40
+ // initialize wasm (required once)
41
+ await init();
42
+
43
+ const text = "Hello world. How are you? I'm fine.\nThanks for asking.";
44
+
45
+ // with defaults (4KB chunks, split at \n . ?)
46
+ for (const slice of chunk(text)) {
47
+ console.log(slice);
48
+ }
49
+
50
+ // with custom size
51
+ for (const slice of chunk(text, { size: 1024 })) {
52
+ console.log(slice);
53
+ }
54
+
55
+ // with custom delimiters
56
+ for (const slice of chunk(text, { delimiters: ".?!\n" })) {
57
+ console.log(slice);
58
+ }
59
+
60
+ // with multi-byte pattern (e.g., metaspace ▁ for SentencePiece tokenizers)
61
+ for (const slice of chunk(text, { pattern: "▁", prefix: true })) {
62
+ console.log(slice);
63
+ }
64
+
65
+ // with consecutive pattern handling (split at START of runs, not middle)
66
+ for (const slice of chunk("word   next", { pattern: " ", consecutive: true })) {
67
+ console.log(slice);
68
+ }
69
+
70
+ // with forward fallback (search forward if no pattern in backward window)
71
+ for (const slice of chunk(text, { pattern: " ", forwardFallback: true })) {
72
+ console.log(slice);
73
+ }
74
+
75
+ // collect all chunks
76
+ const chunks = [...chunk(text)];
77
+ ```
78
+
79
+ pass strings and get strings back. for zero-copy performance with binary data, pass `Uint8Array` and you'll get `Uint8Array` views back.
80
+
81
+ ## 📝 citation
82
+
83
+ if you use @chonkiejs/chunk in your research, please cite it as follows:
84
+
85
+ ```bibtex
86
+ @software{chunk2025,
87
+ author = {Minhas, Bhavnick},
88
+ title = {chunk: The fastest text chunking library},
89
+ year = {2025},
90
+ publisher = {GitHub},
91
+ howpublished = {\url{https://github.com/chonkie-inc/chunk}},
92
+ }
93
+ ```
94
+
95
+ ## 📄 license
96
+
97
+ licensed under either of [Apache License, Version 2.0](LICENSE-APACHE) or [MIT license](LICENSE-MIT) at your option.
@@ -0,0 +1,171 @@
1
+ /* tslint:disable */
2
+ /* eslint-disable */
3
+
4
+ export class Chunker {
5
+ free(): void;
6
+ [Symbol.dispose](): void;
7
+ /**
8
+ * Create a new Chunker with a multi-byte pattern.
9
+ *
10
+ * @param text - The text to chunk (as Uint8Array)
11
+ * @param size - Target chunk size in bytes
12
+ * @param pattern - Multi-byte pattern to split on (as Uint8Array)
13
+ * @param prefix - Put pattern at start of next chunk (default: false)
14
+ * @param consecutive - Split at START of consecutive runs (default: false)
15
+ * @param forward_fallback - Search forward if no pattern in backward window (default: false)
16
+ */
17
+ static with_pattern(text: Uint8Array, size: number, pattern: Uint8Array, prefix?: boolean | null, consecutive?: boolean | null, forward_fallback?: boolean | null): Chunker;
18
+ /**
19
+ * Collect all chunk offsets as a flat array [start1, end1, start2, end2, ...].
20
+ * This is faster than iterating as it makes a single WASM call.
21
+ */
22
+ collect_offsets(): Uint32Array;
23
+ /**
24
+ * Create a new Chunker with single-byte delimiters.
25
+ *
26
+ * @param text - The text to chunk (as Uint8Array)
27
+ * @param size - Target chunk size in bytes (default: 4096)
28
+ * @param delimiters - Delimiter characters as string (default: "\n.?")
29
+ * @param prefix - Put delimiter at start of next chunk (default: false)
30
+ * @param consecutive - Split at START of consecutive runs (default: false)
31
+ * @param forward_fallback - Search forward if no delimiter in backward window (default: false)
32
+ */
33
+ constructor(text: Uint8Array, size?: number | null, delimiters?: string | null, prefix?: boolean | null, consecutive?: boolean | null, forward_fallback?: boolean | null);
34
+ /**
35
+ * Get the next chunk, or undefined if exhausted.
36
+ */
37
+ next(): Uint8Array | undefined;
38
+ /**
39
+ * Reset the chunker to iterate from the beginning.
40
+ */
41
+ reset(): void;
42
+ }
43
+
44
+ /**
45
+ * Fast chunking function that returns offsets in a single call.
46
+ * Returns a flat array [start1, end1, start2, end2, ...].
47
+ * Pass each start/end pair to `subarray` (as in the example below) to get views of the chunks without copying.
48
+ *
49
+ * @example Single-byte delimiters
50
+ * ```javascript
51
+ * const offsets = chunk_offsets(textBytes, 4096, ".\n?");
52
+ * const chunks = [];
53
+ * for (let i = 0; i < offsets.length; i += 2) {
54
+ * chunks.push(textBytes.subarray(offsets[i], offsets[i + 1]));
55
+ * }
56
+ * ```
57
+ */
58
+ export function chunk_offsets(text: Uint8Array, size?: number | null, delimiters?: string | null, prefix?: boolean | null, consecutive?: boolean | null, forward_fallback?: boolean | null): Uint32Array;
59
+
60
+ /**
61
+ * Fast chunking function with multi-byte pattern support.
62
+ * Returns a flat array [start1, end1, start2, end2, ...].
63
+ *
64
+ * @example Multi-byte pattern (metaspace)
65
+ * ```javascript
66
+ * const encoder = new TextEncoder();
67
+ * const metaspace = encoder.encode("▁");
68
+ * const offsets = chunk_offsets_pattern(textBytes, 4096, metaspace, true, true, true);
69
+ * ```
70
+ */
71
+ export function chunk_offsets_pattern(text: Uint8Array, size: number, pattern: Uint8Array, prefix?: boolean | null, consecutive?: boolean | null, forward_fallback?: boolean | null): Uint32Array;
72
+
73
+ /**
74
+ * Get the default delimiters ("\n.?").
75
+ */
76
+ export function default_delimiters(): Uint8Array;
77
+
78
+ /**
79
+ * Get the default target size (4096 bytes).
80
+ */
81
+ export function default_target_size(): number;
82
+
83
+ /**
84
+ * Find merge indices for combining segments within token limits.
85
+ *
86
+ * Returns indices marking where to split segments into chunks that
87
+ * respect the token budget. Use this to determine merge boundaries,
88
+ * then join strings in JavaScript.
89
+ *
90
+ * @param token_counts - Array of token counts for each segment
91
+ * @param chunk_size - Maximum tokens per merged chunk
92
+ * @returns Array of end indices (exclusive) for each chunk
93
+ *
94
+ * @example
95
+ * ```javascript
96
+ * const tokenCounts = new Uint32Array([1, 1, 1, 1, 1, 1, 1]);
97
+ * const indices = find_merge_indices(tokenCounts, 3);
98
+ * // indices = [3, 6, 7]
99
+ * // Use to slice: segments.slice(0, 3), segments.slice(3, 6), segments.slice(6, 7)
100
+ * ```
101
+ */
102
+ export function find_merge_indices(token_counts: Uint32Array, chunk_size: number): Uint32Array;
103
+
104
+ /**
105
+ * Split text at every delimiter occurrence, returning offsets.
106
+ * Unlike chunk_offsets, which creates size-based chunks, this splits at
107
+ * **every** delimiter occurrence.
108
+ *
109
+ * Returns a flat array [start1, end1, start2, end2, ...].
110
+ *
111
+ * @param text - The text to split (as Uint8Array)
112
+ * @param delimiters - Delimiter characters as string (default: "\n.?")
113
+ * @param include_delim - Where to attach delimiter: "prev" (default), "next", or "none"
114
+ * @param min_chars - Minimum characters per segment (default: 0). Shorter segments are merged.
115
+ *
116
+ * @example
117
+ * ```javascript
118
+ * const offsets = split_offsets(textBytes, ".", "prev", 0);
119
+ * const segments = [];
120
+ * for (let i = 0; i < offsets.length; i += 2) {
121
+ * segments.push(textBytes.subarray(offsets[i], offsets[i + 1]));
122
+ * }
123
+ * // ["Hello.", " World.", " Test."]
124
+ * ```
125
+ */
126
+ export function split_offsets(text: Uint8Array, delimiters?: string | null, include_delim?: string | null, min_chars?: number | null): Uint32Array;
127
+
128
+ export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module;
129
+
130
+ export interface InitOutput {
131
+ readonly memory: WebAssembly.Memory;
132
+ readonly __wbg_chunker_free: (a: number, b: number) => void;
133
+ readonly chunk_offsets: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => [number, number];
134
+ readonly chunk_offsets_pattern: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => [number, number];
135
+ readonly chunker_collect_offsets: (a: number) => [number, number];
136
+ readonly chunker_new: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => number;
137
+ readonly chunker_next: (a: number) => [number, number];
138
+ readonly chunker_reset: (a: number) => void;
139
+ readonly chunker_with_pattern: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => number;
140
+ readonly default_delimiters: () => [number, number];
141
+ readonly default_target_size: () => number;
142
+ readonly find_merge_indices: (a: number, b: number, c: number) => [number, number];
143
+ readonly split_offsets: (a: number, b: number, c: number, d: number, e: number, f: number, g: number) => [number, number];
144
+ readonly __wbindgen_externrefs: WebAssembly.Table;
145
+ readonly __wbindgen_malloc: (a: number, b: number) => number;
146
+ readonly __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number;
147
+ readonly __wbindgen_free: (a: number, b: number, c: number) => void;
148
+ readonly __wbindgen_start: () => void;
149
+ }
150
+
151
+ export type SyncInitInput = BufferSource | WebAssembly.Module;
152
+
153
+ /**
154
+ * Instantiates the given `module`, which can either be bytes or
155
+ * a precompiled `WebAssembly.Module`.
156
+ *
157
+ * @param {{ module: SyncInitInput }} module - Options object holding the module; passing `SyncInitInput` directly is deprecated.
158
+ *
159
+ * @returns {InitOutput}
160
+ */
161
+ export function initSync(module: { module: SyncInitInput } | SyncInitInput): InitOutput;
162
+
163
+ /**
164
+ * If `module_or_path` is {RequestInfo} or {URL}, makes a request;
165
+ * for everything else, calls `WebAssembly.instantiate` directly.
166
+ *
167
+ * @param {{ module_or_path: InitInput | Promise<InitInput> }} module_or_path - Options object holding the module or the location to load it from; passing `InitInput` directly is deprecated.
168
+ *
169
+ * @returns {Promise<InitOutput>}
170
+ */
171
+ export default function __wbg_init (module_or_path?: { module_or_path: InitInput | Promise<InitInput> } | InitInput | Promise<InitInput>): Promise<InitOutput>;
@@ -0,0 +1,504 @@
1
+ let wasm;
2
+
3
+ function getArrayU32FromWasm0(ptr, len) {
4
+ ptr = ptr >>> 0;
5
+ return getUint32ArrayMemory0().subarray(ptr / 4, ptr / 4 + len);
6
+ }
7
+
8
+ function getArrayU8FromWasm0(ptr, len) {
9
+ ptr = ptr >>> 0;
10
+ return getUint8ArrayMemory0().subarray(ptr / 1, ptr / 1 + len);
11
+ }
12
+
13
+ function getStringFromWasm0(ptr, len) {
14
+ ptr = ptr >>> 0;
15
+ return decodeText(ptr, len);
16
+ }
17
+
18
+ let cachedUint32ArrayMemory0 = null;
19
+ function getUint32ArrayMemory0() {
20
+ if (cachedUint32ArrayMemory0 === null || cachedUint32ArrayMemory0.byteLength === 0) {
21
+ cachedUint32ArrayMemory0 = new Uint32Array(wasm.memory.buffer);
22
+ }
23
+ return cachedUint32ArrayMemory0;
24
+ }
25
+
26
+ let cachedUint8ArrayMemory0 = null;
27
+ function getUint8ArrayMemory0() {
28
+ if (cachedUint8ArrayMemory0 === null || cachedUint8ArrayMemory0.byteLength === 0) {
29
+ cachedUint8ArrayMemory0 = new Uint8Array(wasm.memory.buffer);
30
+ }
31
+ return cachedUint8ArrayMemory0;
32
+ }
33
+
34
+ function isLikeNone(x) {
35
+ return x === undefined || x === null;
36
+ }
37
+
38
+ function passArray32ToWasm0(arg, malloc) {
39
+ const ptr = malloc(arg.length * 4, 4) >>> 0;
40
+ getUint32ArrayMemory0().set(arg, ptr / 4);
41
+ WASM_VECTOR_LEN = arg.length;
42
+ return ptr;
43
+ }
44
+
45
+ function passArray8ToWasm0(arg, malloc) {
46
+ const ptr = malloc(arg.length * 1, 1) >>> 0;
47
+ getUint8ArrayMemory0().set(arg, ptr / 1);
48
+ WASM_VECTOR_LEN = arg.length;
49
+ return ptr;
50
+ }
51
+
52
+ function passStringToWasm0(arg, malloc, realloc) {
53
+ if (realloc === undefined) {
54
+ const buf = cachedTextEncoder.encode(arg);
55
+ const ptr = malloc(buf.length, 1) >>> 0;
56
+ getUint8ArrayMemory0().subarray(ptr, ptr + buf.length).set(buf);
57
+ WASM_VECTOR_LEN = buf.length;
58
+ return ptr;
59
+ }
60
+
61
+ let len = arg.length;
62
+ let ptr = malloc(len, 1) >>> 0;
63
+
64
+ const mem = getUint8ArrayMemory0();
65
+
66
+ let offset = 0;
67
+
68
+ for (; offset < len; offset++) {
69
+ const code = arg.charCodeAt(offset);
70
+ if (code > 0x7F) break;
71
+ mem[ptr + offset] = code;
72
+ }
73
+ if (offset !== len) {
74
+ if (offset !== 0) {
75
+ arg = arg.slice(offset);
76
+ }
77
+ ptr = realloc(ptr, len, len = offset + arg.length * 3, 1) >>> 0;
78
+ const view = getUint8ArrayMemory0().subarray(ptr + offset, ptr + len);
79
+ const ret = cachedTextEncoder.encodeInto(arg, view);
80
+
81
+ offset += ret.written;
82
+ ptr = realloc(ptr, len, offset, 1) >>> 0;
83
+ }
84
+
85
+ WASM_VECTOR_LEN = offset;
86
+ return ptr;
87
+ }
88
+
89
+ let cachedTextDecoder = new TextDecoder('utf-8', { ignoreBOM: true, fatal: true });
90
+ cachedTextDecoder.decode();
91
+ const MAX_SAFARI_DECODE_BYTES = 2146435072;
92
+ let numBytesDecoded = 0;
93
+ function decodeText(ptr, len) {
94
+ numBytesDecoded += len;
95
+ if (numBytesDecoded >= MAX_SAFARI_DECODE_BYTES) {
96
+ cachedTextDecoder = new TextDecoder('utf-8', { ignoreBOM: true, fatal: true });
97
+ cachedTextDecoder.decode();
98
+ numBytesDecoded = len;
99
+ }
100
+ return cachedTextDecoder.decode(getUint8ArrayMemory0().subarray(ptr, ptr + len));
101
+ }
102
+
103
+ const cachedTextEncoder = new TextEncoder();
104
+
105
+ if (!('encodeInto' in cachedTextEncoder)) {
106
+ cachedTextEncoder.encodeInto = function (arg, view) {
107
+ const buf = cachedTextEncoder.encode(arg);
108
+ view.set(buf);
109
+ return {
110
+ read: arg.length,
111
+ written: buf.length
112
+ };
113
+ }
114
+ }
115
+
116
+ let WASM_VECTOR_LEN = 0;
117
+
118
+ const ChunkerFinalization = (typeof FinalizationRegistry === 'undefined')
119
+ ? { register: () => {}, unregister: () => {} }
120
+ : new FinalizationRegistry(ptr => wasm.__wbg_chunker_free(ptr >>> 0, 1));
121
+
122
+ /**
123
+ * Chunker splits text at delimiter boundaries.
124
+ *
125
+ * @example Single-byte delimiters
126
+ * ```javascript
127
+ * const chunker = new Chunker(textBytes, 4096, ".\n?");
128
+ * let chunk;
129
+ * while ((chunk = chunker.next()) !== undefined) {
130
+ * console.log(chunk);
131
+ * }
132
+ * ```
133
+ *
134
+ * @example Multi-byte pattern (e.g., metaspace for SentencePiece)
135
+ * ```javascript
136
+ * const encoder = new TextEncoder();
137
+ * const metaspace = encoder.encode("▁");
138
+ * const chunker = Chunker.with_pattern(textBytes, 4096, metaspace, true);
139
+ * ```
140
+ */
141
+ export class Chunker {
142
+ static __wrap(ptr) {
143
+ ptr = ptr >>> 0;
144
+ const obj = Object.create(Chunker.prototype);
145
+ obj.__wbg_ptr = ptr;
146
+ ChunkerFinalization.register(obj, obj.__wbg_ptr, obj);
147
+ return obj;
148
+ }
149
+ __destroy_into_raw() {
150
+ const ptr = this.__wbg_ptr;
151
+ this.__wbg_ptr = 0;
152
+ ChunkerFinalization.unregister(this);
153
+ return ptr;
154
+ }
155
+ free() {
156
+ const ptr = this.__destroy_into_raw();
157
+ wasm.__wbg_chunker_free(ptr, 0);
158
+ }
159
+ /**
160
+ * Create a new Chunker with a multi-byte pattern.
161
+ *
162
+ * @param text - The text to chunk (as Uint8Array)
163
+ * @param size - Target chunk size in bytes
164
+ * @param pattern - Multi-byte pattern to split on (as Uint8Array)
165
+ * @param prefix - Put pattern at start of next chunk (default: false)
166
+ * @param consecutive - Split at START of consecutive runs (default: false)
167
+ * @param forward_fallback - Search forward if no pattern in backward window (default: false)
168
+ * @param {Uint8Array} text
169
+ * @param {number} size
170
+ * @param {Uint8Array} pattern
171
+ * @param {boolean | null} [prefix]
172
+ * @param {boolean | null} [consecutive]
173
+ * @param {boolean | null} [forward_fallback]
174
+ * @returns {Chunker}
175
+ */
176
+ static with_pattern(text, size, pattern, prefix, consecutive, forward_fallback) {
177
+ const ptr0 = passArray8ToWasm0(text, wasm.__wbindgen_malloc);
178
+ const len0 = WASM_VECTOR_LEN;
179
+ const ptr1 = passArray8ToWasm0(pattern, wasm.__wbindgen_malloc);
180
+ const len1 = WASM_VECTOR_LEN;
181
+ const ret = wasm.chunker_with_pattern(ptr0, len0, size, ptr1, len1, isLikeNone(prefix) ? 0xFFFFFF : prefix ? 1 : 0, isLikeNone(consecutive) ? 0xFFFFFF : consecutive ? 1 : 0, isLikeNone(forward_fallback) ? 0xFFFFFF : forward_fallback ? 1 : 0);
182
+ return Chunker.__wrap(ret);
183
+ }
184
+ /**
185
+ * Collect all chunk offsets as a flat array [start1, end1, start2, end2, ...].
186
+ * This is faster than iterating as it makes a single WASM call.
187
+ * @returns {Uint32Array}
188
+ */
189
+ collect_offsets() {
190
+ const ret = wasm.chunker_collect_offsets(this.__wbg_ptr);
191
+ var v1 = getArrayU32FromWasm0(ret[0], ret[1]).slice();
192
+ wasm.__wbindgen_free(ret[0], ret[1] * 4, 4);
193
+ return v1;
194
+ }
195
+ /**
196
+ * Create a new Chunker with single-byte delimiters.
197
+ *
198
+ * @param text - The text to chunk (as Uint8Array)
199
+ * @param size - Target chunk size in bytes (default: 4096)
200
+ * @param delimiters - Delimiter characters as string (default: "\n.?")
201
+ * @param prefix - Put delimiter at start of next chunk (default: false)
202
+ * @param consecutive - Split at START of consecutive runs (default: false)
203
+ * @param forward_fallback - Search forward if no delimiter in backward window (default: false)
204
+ * @param {Uint8Array} text
205
+ * @param {number | null} [size]
206
+ * @param {string | null} [delimiters]
207
+ * @param {boolean | null} [prefix]
208
+ * @param {boolean | null} [consecutive]
209
+ * @param {boolean | null} [forward_fallback]
210
+ */
211
+ constructor(text, size, delimiters, prefix, consecutive, forward_fallback) {
212
+ const ptr0 = passArray8ToWasm0(text, wasm.__wbindgen_malloc);
213
+ const len0 = WASM_VECTOR_LEN;
214
+ var ptr1 = isLikeNone(delimiters) ? 0 : passStringToWasm0(delimiters, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
215
+ var len1 = WASM_VECTOR_LEN;
216
+ const ret = wasm.chunker_new(ptr0, len0, isLikeNone(size) ? 0x100000001 : (size) >>> 0, ptr1, len1, isLikeNone(prefix) ? 0xFFFFFF : prefix ? 1 : 0, isLikeNone(consecutive) ? 0xFFFFFF : consecutive ? 1 : 0, isLikeNone(forward_fallback) ? 0xFFFFFF : forward_fallback ? 1 : 0);
217
+ this.__wbg_ptr = ret >>> 0;
218
+ ChunkerFinalization.register(this, this.__wbg_ptr, this);
219
+ return this;
220
+ }
221
+ /**
222
+ * Get the next chunk, or undefined if exhausted.
223
+ * @returns {Uint8Array | undefined}
224
+ */
225
+ next() {
226
+ const ret = wasm.chunker_next(this.__wbg_ptr);
227
+ let v1;
228
+ if (ret[0] !== 0) {
229
+ v1 = getArrayU8FromWasm0(ret[0], ret[1]).slice();
230
+ wasm.__wbindgen_free(ret[0], ret[1] * 1, 1);
231
+ }
232
+ return v1;
233
+ }
234
+ /**
235
+ * Reset the chunker to iterate from the beginning.
236
+ */
237
+ reset() {
238
+ wasm.chunker_reset(this.__wbg_ptr);
239
+ }
240
+ }
241
+ if (Symbol.dispose) Chunker.prototype[Symbol.dispose] = Chunker.prototype.free;
242
+
243
+ /**
244
+ * Fast chunking function that returns offsets in a single call.
245
+ * Returns a flat array [start1, end1, start2, end2, ...].
246
+ * Pass each start/end pair to `subarray` (as in the example below) to get views of the chunks without copying.
247
+ *
248
+ * @example Single-byte delimiters
249
+ * ```javascript
250
+ * const offsets = chunk_offsets(textBytes, 4096, ".\n?");
251
+ * const chunks = [];
252
+ * for (let i = 0; i < offsets.length; i += 2) {
253
+ * chunks.push(textBytes.subarray(offsets[i], offsets[i + 1]));
254
+ * }
255
+ * ```
256
+ * @param {Uint8Array} text
257
+ * @param {number | null} [size]
258
+ * @param {string | null} [delimiters]
259
+ * @param {boolean | null} [prefix]
260
+ * @param {boolean | null} [consecutive]
261
+ * @param {boolean | null} [forward_fallback]
262
+ * @returns {Uint32Array}
263
+ */
264
+ export function chunk_offsets(text, size, delimiters, prefix, consecutive, forward_fallback) {
265
+ const ptr0 = passArray8ToWasm0(text, wasm.__wbindgen_malloc);
266
+ const len0 = WASM_VECTOR_LEN;
267
+ var ptr1 = isLikeNone(delimiters) ? 0 : passStringToWasm0(delimiters, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
268
+ var len1 = WASM_VECTOR_LEN;
269
+ const ret = wasm.chunk_offsets(ptr0, len0, isLikeNone(size) ? 0x100000001 : (size) >>> 0, ptr1, len1, isLikeNone(prefix) ? 0xFFFFFF : prefix ? 1 : 0, isLikeNone(consecutive) ? 0xFFFFFF : consecutive ? 1 : 0, isLikeNone(forward_fallback) ? 0xFFFFFF : forward_fallback ? 1 : 0);
270
+ var v3 = getArrayU32FromWasm0(ret[0], ret[1]).slice();
271
+ wasm.__wbindgen_free(ret[0], ret[1] * 4, 4);
272
+ return v3;
273
+ }
274
+
275
+ /**
276
+ * Fast chunking function with multi-byte pattern support.
277
+ * Returns a flat array [start1, end1, start2, end2, ...].
278
+ *
279
+ * @example Multi-byte pattern (metaspace)
280
+ * ```javascript
281
+ * const encoder = new TextEncoder();
282
+ * const metaspace = encoder.encode("▁");
283
+ * const offsets = chunk_offsets_pattern(textBytes, 4096, metaspace, true, true, true);
284
+ * ```
285
+ * @param {Uint8Array} text
286
+ * @param {number} size
287
+ * @param {Uint8Array} pattern
288
+ * @param {boolean | null} [prefix]
289
+ * @param {boolean | null} [consecutive]
290
+ * @param {boolean | null} [forward_fallback]
291
+ * @returns {Uint32Array}
292
+ */
293
+ export function chunk_offsets_pattern(text, size, pattern, prefix, consecutive, forward_fallback) {
294
+ const ptr0 = passArray8ToWasm0(text, wasm.__wbindgen_malloc);
295
+ const len0 = WASM_VECTOR_LEN;
296
+ const ptr1 = passArray8ToWasm0(pattern, wasm.__wbindgen_malloc);
297
+ const len1 = WASM_VECTOR_LEN;
298
+ const ret = wasm.chunk_offsets_pattern(ptr0, len0, size, ptr1, len1, isLikeNone(prefix) ? 0xFFFFFF : prefix ? 1 : 0, isLikeNone(consecutive) ? 0xFFFFFF : consecutive ? 1 : 0, isLikeNone(forward_fallback) ? 0xFFFFFF : forward_fallback ? 1 : 0);
299
+ var v3 = getArrayU32FromWasm0(ret[0], ret[1]).slice();
300
+ wasm.__wbindgen_free(ret[0], ret[1] * 4, 4);
301
+ return v3;
302
+ }
303
+
304
+ /**
305
+ * Get the default delimiters ("\n.?").
306
+ * @returns {Uint8Array}
307
+ */
308
+ export function default_delimiters() {
309
+ const ret = wasm.default_delimiters();
310
+ var v1 = getArrayU8FromWasm0(ret[0], ret[1]).slice();
311
+ wasm.__wbindgen_free(ret[0], ret[1] * 1, 1);
312
+ return v1;
313
+ }
314
+
315
+ /**
316
+ * Get the default target size (4096 bytes).
317
+ * @returns {number}
318
+ */
319
+ export function default_target_size() {
320
+ const ret = wasm.default_target_size();
321
+ return ret >>> 0;
322
+ }
323
+
324
+ /**
325
+ * Find merge indices for combining segments within token limits.
326
+ *
327
+ * Returns indices marking where to split segments into chunks that
328
+ * respect the token budget. Use this to determine merge boundaries,
329
+ * then join strings in JavaScript.
330
+ *
331
+ * @param token_counts - Array of token counts for each segment
332
+ * @param chunk_size - Maximum tokens per merged chunk
333
+ * @returns Array of end indices (exclusive) for each chunk
334
+ *
335
+ * @example
336
+ * ```javascript
337
+ * const tokenCounts = new Uint32Array([1, 1, 1, 1, 1, 1, 1]);
338
+ * const indices = find_merge_indices(tokenCounts, 3);
339
+ * // indices = [3, 6, 7]
340
+ * // Use to slice: segments.slice(0, 3), segments.slice(3, 6), segments.slice(6, 7)
341
+ * ```
342
+ * @param {Uint32Array} token_counts
343
+ * @param {number} chunk_size
344
+ * @returns {Uint32Array}
345
+ */
346
+ export function find_merge_indices(token_counts, chunk_size) {
347
+ const ptr0 = passArray32ToWasm0(token_counts, wasm.__wbindgen_malloc);
348
+ const len0 = WASM_VECTOR_LEN;
349
+ const ret = wasm.find_merge_indices(ptr0, len0, chunk_size);
350
+ var v2 = getArrayU32FromWasm0(ret[0], ret[1]).slice();
351
+ wasm.__wbindgen_free(ret[0], ret[1] * 4, 4);
352
+ return v2;
353
+ }
354
+
355
+ /**
356
+ * Split text at every delimiter occurrence, returning offsets.
357
+ * Unlike chunk_offsets, which creates size-based chunks, this splits at
358
+ * **every** delimiter occurrence.
359
+ *
360
+ * Returns a flat array [start1, end1, start2, end2, ...].
361
+ *
362
+ * @param text - The text to split (as Uint8Array)
363
+ * @param delimiters - Delimiter characters as string (default: "\n.?")
364
+ * @param include_delim - Where to attach delimiter: "prev" (default), "next", or "none"
365
+ * @param min_chars - Minimum characters per segment (default: 0). Shorter segments are merged.
366
+ *
367
+ * @example
368
+ * ```javascript
369
+ * const offsets = split_offsets(textBytes, ".", "prev", 0);
370
+ * const segments = [];
371
+ * for (let i = 0; i < offsets.length; i += 2) {
372
+ * segments.push(textBytes.subarray(offsets[i], offsets[i + 1]));
373
+ * }
374
+ * // ["Hello.", " World.", " Test."]
375
+ * ```
376
+ * @param {Uint8Array} text
377
+ * @param {string | null} [delimiters]
378
+ * @param {string | null} [include_delim]
379
+ * @param {number | null} [min_chars]
380
+ * @returns {Uint32Array}
381
+ */
382
+ export function split_offsets(text, delimiters, include_delim, min_chars) {
383
+ const ptr0 = passArray8ToWasm0(text, wasm.__wbindgen_malloc);
384
+ const len0 = WASM_VECTOR_LEN;
385
+ var ptr1 = isLikeNone(delimiters) ? 0 : passStringToWasm0(delimiters, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
386
+ var len1 = WASM_VECTOR_LEN;
387
+ var ptr2 = isLikeNone(include_delim) ? 0 : passStringToWasm0(include_delim, wasm.__wbindgen_malloc, wasm.__wbindgen_realloc);
388
+ var len2 = WASM_VECTOR_LEN;
389
+ const ret = wasm.split_offsets(ptr0, len0, ptr1, len1, ptr2, len2, isLikeNone(min_chars) ? 0x100000001 : (min_chars) >>> 0);
390
+ var v4 = getArrayU32FromWasm0(ret[0], ret[1]).slice();
391
+ wasm.__wbindgen_free(ret[0], ret[1] * 4, 4);
392
+ return v4;
393
+ }
394
+
395
+ const EXPECTED_RESPONSE_TYPES = new Set(['basic', 'cors', 'default']);
396
+
397
+ async function __wbg_load(module, imports) {
398
+ if (typeof Response === 'function' && module instanceof Response) {
399
+ if (typeof WebAssembly.instantiateStreaming === 'function') {
400
+ try {
401
+ return await WebAssembly.instantiateStreaming(module, imports);
402
+ } catch (e) {
403
+ const validResponse = module.ok && EXPECTED_RESPONSE_TYPES.has(module.type);
404
+
405
+ if (validResponse && module.headers.get('Content-Type') !== 'application/wasm') {
406
+ console.warn("`WebAssembly.instantiateStreaming` failed because your server does not serve Wasm with `application/wasm` MIME type. Falling back to `WebAssembly.instantiate` which is slower. Original error:\n", e);
407
+
408
+ } else {
409
+ throw e;
410
+ }
411
+ }
412
+ }
413
+
414
+ const bytes = await module.arrayBuffer();
415
+ return await WebAssembly.instantiate(bytes, imports);
416
+ } else {
417
+ const instance = await WebAssembly.instantiate(module, imports);
418
+
419
+ if (instance instanceof WebAssembly.Instance) {
420
+ return { instance, module };
421
+ } else {
422
+ return instance;
423
+ }
424
+ }
425
+ }
426
+
427
+ function __wbg_get_imports() {
428
+ const imports = {};
429
+ imports.wbg = {};
430
+ imports.wbg.__wbg___wbindgen_throw_dd24417ed36fc46e = function(arg0, arg1) {
431
+ throw new Error(getStringFromWasm0(arg0, arg1));
432
+ };
433
+ imports.wbg.__wbindgen_init_externref_table = function() {
434
+ const table = wasm.__wbindgen_externrefs;
435
+ const offset = table.grow(4);
436
+ table.set(0, undefined);
437
+ table.set(offset + 0, undefined);
438
+ table.set(offset + 1, null);
439
+ table.set(offset + 2, true);
440
+ table.set(offset + 3, false);
441
+ };
442
+
443
+ return imports;
444
+ }
445
+
446
+ function __wbg_finalize_init(instance, module) {
447
+ wasm = instance.exports;
448
+ __wbg_init.__wbindgen_wasm_module = module;
449
+ cachedUint32ArrayMemory0 = null;
450
+ cachedUint8ArrayMemory0 = null;
451
+
452
+
453
+ wasm.__wbindgen_start();
454
+ return wasm;
455
+ }
456
+
457
+ function initSync(module) {
458
+ if (wasm !== undefined) return wasm;
459
+
460
+
461
+ if (typeof module !== 'undefined') {
462
+ if (Object.getPrototypeOf(module) === Object.prototype) {
463
+ ({module} = module)
464
+ } else {
465
+ console.warn('using deprecated parameters for `initSync()`; pass a single object instead')
466
+ }
467
+ }
468
+
469
+ const imports = __wbg_get_imports();
470
+ if (!(module instanceof WebAssembly.Module)) {
471
+ module = new WebAssembly.Module(module);
472
+ }
473
+ const instance = new WebAssembly.Instance(module, imports);
474
+ return __wbg_finalize_init(instance, module);
475
+ }
476
+
477
+ async function __wbg_init(module_or_path) {
478
+ if (wasm !== undefined) return wasm;
479
+
480
+
481
+ if (typeof module_or_path !== 'undefined') {
482
+ if (Object.getPrototypeOf(module_or_path) === Object.prototype) {
483
+ ({module_or_path} = module_or_path)
484
+ } else {
485
+ console.warn('using deprecated parameters for the initialization function; pass a single object instead')
486
+ }
487
+ }
488
+
489
+ if (typeof module_or_path === 'undefined') {
490
+ module_or_path = new URL('chonkiejs_chunk_bg.wasm', import.meta.url);
491
+ }
492
+ const imports = __wbg_get_imports();
493
+
494
+ if (typeof module_or_path === 'string' || (typeof Request === 'function' && module_or_path instanceof Request) || (typeof URL === 'function' && module_or_path instanceof URL)) {
495
+ module_or_path = fetch(module_or_path);
496
+ }
497
+
498
+ const { instance, module } = await __wbg_load(await module_or_path, imports);
499
+
500
+ return __wbg_finalize_init(instance, module);
501
+ }
502
+
503
+ export { initSync };
504
+ export default __wbg_init;
Binary file
@@ -0,0 +1,20 @@
1
+ /* tslint:disable */
2
+ /* eslint-disable */
3
+ export const memory: WebAssembly.Memory;
4
+ export const __wbg_chunker_free: (a: number, b: number) => void;
5
+ export const chunk_offsets: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => [number, number];
6
+ export const chunk_offsets_pattern: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => [number, number];
7
+ export const chunker_collect_offsets: (a: number) => [number, number];
8
+ export const chunker_new: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => number;
9
+ export const chunker_next: (a: number) => [number, number];
10
+ export const chunker_reset: (a: number) => void;
11
+ export const chunker_with_pattern: (a: number, b: number, c: number, d: number, e: number, f: number, g: number, h: number) => number;
12
+ export const default_delimiters: () => [number, number];
13
+ export const default_target_size: () => number;
14
+ export const find_merge_indices: (a: number, b: number, c: number) => [number, number];
15
+ export const split_offsets: (a: number, b: number, c: number, d: number, e: number, f: number, g: number) => [number, number];
16
+ export const __wbindgen_externrefs: WebAssembly.Table;
17
+ export const __wbindgen_malloc: (a: number, b: number) => number;
18
+ export const __wbindgen_realloc: (a: number, b: number, c: number, d: number) => number;
19
+ export const __wbindgen_free: (a: number, b: number, c: number) => void;
20
+ export const __wbindgen_start: () => void;
@@ -0,0 +1,16 @@
1
+ {
2
+ "name": "chonkiejs-chunk",
3
+ "type": "module",
4
+ "version": "0.9.1",
5
+ "license": "MIT OR Apache-2.0",
6
+ "files": [
7
+ "chonkiejs_chunk_bg.wasm",
8
+ "chonkiejs_chunk.js",
9
+ "chonkiejs_chunk.d.ts"
10
+ ],
11
+ "main": "chonkiejs_chunk.js",
12
+ "types": "chonkiejs_chunk.d.ts",
13
+ "sideEffects": [
14
+ "./snippets/*"
15
+ ]
16
+ }