memchunk 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +83 -0
  2. package/index.js +144 -0
  3. package/package.json +31 -0
package/README.md ADDED
@@ -0,0 +1,83 @@
1
+ <p align="center">
2
+ <img src="../../assets/memchunk_wide.png" alt="memchunk" width="500">
3
+ </p>
4
+
5
+ <h1 align="center">memchunk</h1>
6
+
7
+ <p align="center">
8
+ <em>the fastest text chunking library — up to 1 TB/s throughput</em>
9
+ </p>
10
+
11
+ <p align="center">
12
+ <a href="https://www.npmjs.com/package/memchunk"><img src="https://img.shields.io/npm/v/memchunk.svg" alt="npm"></a>
13
+ <a href="https://github.com/chonkie-inc/memchunk"><img src="https://img.shields.io/badge/github-memchunk-blue" alt="GitHub"></a>
14
+ <a href="LICENSE-MIT"><img src="https://img.shields.io/badge/license-MIT%2FApache--2.0-blue.svg" alt="License"></a>
15
+ </p>
16
+
17
+ ---
18
+
19
+ you know how every chunking library claims to be fast? yeah, we actually meant it.
20
+
21
+ **memchunk** splits text at semantic boundaries (periods, newlines, the usual suspects) and does it stupid fast. we're talking "chunk the entire english wikipedia in 120ms" fast.
22
+
23
+ want to know how? [read the blog post](https://minha.sh/posts/so-you-want-to-chunk-really-fast) where we nerd out about SIMD instructions and lookup tables.
24
+
25
+ ## installation
26
+
27
+ ```bash
28
+ npm install memchunk
29
+ ```
30
+
31
+ ## usage
32
+
33
+ ```javascript
34
+ import { init, chunk } from 'memchunk';
35
+
36
+ // initialize wasm (required once)
37
+ await init();
38
+
39
+ const text = new TextEncoder().encode("Hello world. How are you? I'm fine.\nThanks for asking.");
40
+
41
+ // with defaults (4KB chunks, split at \n . ?)
42
+ for (const slice of chunk(text)) {
43
+ console.log(new TextDecoder().decode(slice));
44
+ }
45
+
46
+ // with custom size
47
+ for (const slice of chunk(text, { size: 1024 })) {
48
+ console.log(new TextDecoder().decode(slice));
49
+ }
50
+
51
+ // with custom delimiters
52
+ for (const slice of chunk(text, { delimiters: ".?!\n" })) {
53
+ console.log(new TextDecoder().decode(slice));
54
+ }
55
+
56
+ // with both
57
+ for (const slice of chunk(text, { size: 8192, delimiters: "\n" })) {
58
+ console.log(new TextDecoder().decode(slice));
59
+ }
60
+
61
+ // collect all chunks
62
+ const chunks = [...chunk(text)];
63
+ ```
64
+
65
+ chunks are returned as `Uint8Array` subarrays (zero-copy views of the original text).
66
+
67
+ ## citation
68
+
69
+ if you use memchunk in your research, please cite it as follows:
70
+
71
+ ```bibtex
72
+ @software{memchunk2025,
73
+ author = {Minhas, Bhavnick},
74
+ title = {memchunk: The fastest text chunking library},
75
+ year = {2025},
76
+ publisher = {GitHub},
77
+ howpublished = {\url{https://github.com/chonkie-inc/memchunk}},
78
+ }
79
+ ```
80
+
81
+ ## license
82
+
83
+ licensed under either of [Apache License, Version 2.0](LICENSE-APACHE) or [MIT license](LICENSE-MIT) at your option.
package/index.js ADDED
@@ -0,0 +1,144 @@
1
+ /**
2
+ * memchunk - The fastest semantic text chunking library
3
+ *
4
+ * @example
5
+ * ```javascript
6
+ * import { init, chunk } from 'memchunk';
7
+ *
8
+ * await init();
9
+ *
10
+ * const text = new TextEncoder().encode("Hello. World. Test.");
11
+ * for (const slice of chunk(text, { size: 10, delimiters: "." })) {
12
+ * console.log(new TextDecoder().decode(slice));
13
+ * }
14
+ * ```
15
+ */
16
+
17
+ import initWasm, {
18
+ Chunker as WasmChunker,
19
+ default_target_size,
20
+ default_delimiters,
21
+ chunk_offsets as wasmChunkOffsets,
22
+ } from './pkg/memchunk_wasm.js';
23
+
24
+ export { default_target_size, default_delimiters };
25
+
/**
 * Lazily split a byte buffer into chunks at delimiter boundaries.
 *
 * The chunk boundaries are obtained from one call into the WASM module,
 * made when the generator is first advanced. Each chunk is then produced
 * with `text.subarray(start, end)`, so every yielded view shares memory
 * with `text` instead of copying it.
 *
 * @param {Uint8Array} text - The text to chunk as bytes
 * @param {Object} [options] - Options
 * @param {number} [options.size=4096] - Target chunk size in bytes
 * @param {string} [options.delimiters="\n.?"] - Delimiter characters
 * @yields {Uint8Array} Zero-copy subarray views of the original text
 *
 * @example
 * const text = new TextEncoder().encode("Hello. World. Test.");
 * for (const slice of chunk(text, { size: 10, delimiters: "." })) {
 *   console.log(new TextDecoder().decode(slice));
 * }
 */
export function* chunk(text, options = {}) {
  // The WASM call returns a flat sequence: start0, end0, start1, end1, ...
  const offsets = wasmChunkOffsets(text, options.size, options.delimiters);
  let cursor = 0;
  while (cursor < offsets.length) {
    const start = offsets[cursor];
    const end = offsets[cursor + 1];
    yield text.subarray(start, end);
    cursor += 2;
  }
}
49
+
/**
 * Compute chunk boundaries without creating any views.
 *
 * Makes one call into the WASM module and regroups its flat result
 * (start0, end0, start1, end1, ...) into [start, end] pairs.
 *
 * @param {Uint8Array} text - The text to chunk as bytes
 * @param {Object} [options] - Options
 * @param {number} [options.size=4096] - Target chunk size in bytes
 * @param {string} [options.delimiters="\n.?"] - Delimiter characters
 * @returns {Array<[number, number]>} Array of [start, end] offset pairs
 */
export function chunk_offsets(text, options = {}) {
  const flat = wasmChunkOffsets(text, options.size, options.delimiters);
  // One pair per two flat entries; ceil keeps a trailing unpaired entry, if any.
  const pairCount = Math.ceil(flat.length / 2);
  return Array.from({ length: pairCount }, (_, k) => [flat[2 * k], flat[2 * k + 1]]);
}
69
+
// True once the WASM module has finished loading successfully.
let initialized = false;

// In-flight (or completed) load promise, shared by every caller of init() so
// the module is only loaded once even when init() is called concurrently.
let initPromise = null;

/**
 * Initialize the WASM module. Must be called before using chunk functions.
 *
 * Safe to call multiple times and from concurrent callers: every call made
 * while loading is in progress awaits the same underlying load. If loading
 * fails, the error is propagated to all waiting callers and a later call to
 * `init()` starts a fresh attempt.
 *
 * @returns {Promise<void>} Resolves once the WASM module is ready
 */
export async function init() {
  if (initialized) {
    return;
  }
  if (initPromise === null) {
    initPromise = initWasm();
  }
  const pending = initPromise;
  try {
    await pending;
    initialized = true;
  } catch (err) {
    // Only clear the cached promise if it is still the one that failed, so a
    // retry started by another caller in the meantime is not discarded.
    if (initPromise === pending) {
      initPromise = null;
    }
    throw err;
  }
}
81
+
/**
 * Thin wrapper around the WASM-backed chunker that splits text at
 * delimiter boundaries.
 *
 * Instances are iterable, so they can be consumed with for...of.
 */
export class Chunker {
  /**
   * Build a chunker over the given bytes.
   * @param {Uint8Array} text - The text to chunk as bytes
   * @param {Object} [options] - Options
   * @param {number} [options.size=4096] - Target chunk size in bytes
   * @param {string} [options.delimiters="\n.?"] - Delimiter characters
   */
  constructor(text, options = {}) {
    this._chunker = new WasmChunker(text, options.size, options.delimiters);
  }

  /**
   * Fetch the next chunk from the wrapped chunker.
   * @returns {Uint8Array | undefined} The next chunk, or undefined if exhausted
   */
  next() {
    return this._chunker.next();
  }

  /**
   * Rewind the wrapped chunker so iteration starts from the beginning.
   */
  reset() {
    this._chunker.reset();
  }

  /**
   * Gather every chunk offset as [start, end] pairs.
   * Uses one WASM call and regroups its flat result
   * (start0, end0, start1, end1, ...) into pairs.
   * @returns {Array<[number, number]>}
   */
  collectOffsets() {
    const flat = this._chunker.collect_offsets();
    const pairs = [];
    let cursor = 0;
    while (cursor < flat.length) {
      pairs.push([flat[cursor], flat[cursor + 1]]);
      cursor += 2;
    }
    return pairs;
  }

  /**
   * Release the underlying WASM memory by calling free() on the wrapped chunker.
   */
  free() {
    this._chunker.free();
  }

  /**
   * Iterator protocol - allows use in for...of loops.
   * Yields chunks from the wrapped chunker until it returns undefined.
   * This method does not call reset() itself.
   */
  *[Symbol.iterator]() {
    for (
      let piece = this._chunker.next();
      piece !== undefined;
      piece = this._chunker.next()
    ) {
      yield piece;
    }
  }
}
package/package.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "name": "memchunk",
3
+ "version": "0.1.0",
4
+ "description": "The fastest semantic text chunking library",
5
+ "type": "module",
6
+ "main": "index.js",
7
+ "files": [
8
+ "index.js",
9
+ "pkg/"
10
+ ],
11
+ "scripts": {
12
+ "build": "wasm-pack build --target web",
13
+ "test": "node --test tests/"
14
+ },
15
+ "keywords": [
16
+ "chunking",
17
+ "text",
18
+ "simd",
19
+ "nlp",
20
+ "tokenization",
21
+ "rag",
22
+ "wasm",
23
+ "webassembly"
24
+ ],
25
+ "author": "Bhavnick Minhas",
26
+ "license": "MIT OR Apache-2.0",
27
+ "repository": {
28
+ "type": "git",
29
+ "url": "https://github.com/chonkie-inc/memchunk"
30
+ }
31
+ }