memchunk 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +83 -0
- package/index.js +144 -0
- package/package.json +31 -0
package/README.md
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://raw.githubusercontent.com/chonkie-inc/memchunk/main/assets/memchunk_wide.png" alt="memchunk" width="500">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">memchunk</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<em>the fastest text chunking library — up to 1 TB/s throughput</em>
|
|
9
|
+
</p>
|
|
10
|
+
|
|
11
|
+
<p align="center">
|
|
12
|
+
<a href="https://www.npmjs.com/package/memchunk"><img src="https://img.shields.io/npm/v/memchunk.svg" alt="npm"></a>
|
|
13
|
+
<a href="https://github.com/chonkie-inc/memchunk"><img src="https://img.shields.io/badge/github-memchunk-blue" alt="GitHub"></a>
|
|
14
|
+
<a href="LICENSE-MIT"><img src="https://img.shields.io/badge/license-MIT%2FApache--2.0-blue.svg" alt="License"></a>
|
|
15
|
+
</p>
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
you know how every chunking library claims to be fast? yeah, we actually meant it.
|
|
20
|
+
|
|
21
|
+
**memchunk** splits text at semantic boundaries (periods, newlines, the usual suspects) and does it stupid fast. we're talking "chunk the entire english wikipedia in 120ms" fast.
|
|
22
|
+
|
|
23
|
+
want to know how? [read the blog post](https://minha.sh/posts/so-you-want-to-chunk-really-fast) where we nerd out about SIMD instructions and lookup tables.
|
|
24
|
+
|
|
25
|
+
## installation
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
npm install memchunk
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## usage
|
|
32
|
+
|
|
33
|
+
```javascript
|
|
34
|
+
import { init, chunk } from 'memchunk';
|
|
35
|
+
|
|
36
|
+
// initialize wasm (required once)
|
|
37
|
+
await init();
|
|
38
|
+
|
|
39
|
+
const text = new TextEncoder().encode("Hello world. How are you? I'm fine.\nThanks for asking.");
|
|
40
|
+
|
|
41
|
+
// with defaults (4KB chunks, split at \n . ?)
|
|
42
|
+
for (const slice of chunk(text)) {
|
|
43
|
+
console.log(new TextDecoder().decode(slice));
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
// with custom size
|
|
47
|
+
for (const slice of chunk(text, { size: 1024 })) {
|
|
48
|
+
console.log(new TextDecoder().decode(slice));
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
// with custom delimiters
|
|
52
|
+
for (const slice of chunk(text, { delimiters: ".?!\n" })) {
|
|
53
|
+
console.log(new TextDecoder().decode(slice));
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
// with both
|
|
57
|
+
for (const slice of chunk(text, { size: 8192, delimiters: "\n" })) {
|
|
58
|
+
console.log(new TextDecoder().decode(slice));
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
// collect all chunks
|
|
62
|
+
const chunks = [...chunk(text)];
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
chunks are returned as `Uint8Array` subarrays (zero-copy views of the original text).
|
|
66
|
+
|
|
67
|
+
## citation
|
|
68
|
+
|
|
69
|
+
if you use memchunk in your research, please cite it as follows:
|
|
70
|
+
|
|
71
|
+
```bibtex
|
|
72
|
+
@software{memchunk2025,
|
|
73
|
+
author = {Minhas, Bhavnick},
|
|
74
|
+
title = {memchunk: The fastest text chunking library},
|
|
75
|
+
year = {2025},
|
|
76
|
+
publisher = {GitHub},
|
|
77
|
+
howpublished = {\url{https://github.com/chonkie-inc/memchunk}},
|
|
78
|
+
}
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## license
|
|
82
|
+
|
|
83
|
+
licensed under either of [Apache License, Version 2.0](LICENSE-APACHE) or [MIT license](LICENSE-MIT) at your option.
|
package/index.js
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* memchunk - The fastest semantic text chunking library
|
|
3
|
+
*
|
|
4
|
+
* @example
|
|
5
|
+
* ```javascript
|
|
6
|
+
* import { init, chunk } from 'memchunk';
|
|
7
|
+
*
|
|
8
|
+
* await init();
|
|
9
|
+
*
|
|
10
|
+
* const text = new TextEncoder().encode("Hello. World. Test.");
|
|
11
|
+
* for (const slice of chunk(text, { size: 10, delimiters: "." })) {
|
|
12
|
+
* console.log(new TextDecoder().decode(slice));
|
|
13
|
+
* }
|
|
14
|
+
* ```
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import initWasm, {
|
|
18
|
+
Chunker as WasmChunker,
|
|
19
|
+
default_target_size,
|
|
20
|
+
default_delimiters,
|
|
21
|
+
chunk_offsets as wasmChunkOffsets,
|
|
22
|
+
} from './pkg/memchunk_wasm.js';
|
|
23
|
+
|
|
24
|
+
export { default_target_size, default_delimiters };
|
|
25
|
+
|
|
26
|
+
/**
 * Split text into chunks at delimiter boundaries.
 * Yields zero-copy Uint8Array subarray views of the input buffer.
 *
 * @param {Uint8Array} text - The text to chunk as bytes
 * @param {Object} [options] - Options
 * @param {number} [options.size=4096] - Target chunk size in bytes
 * @param {string} [options.delimiters="\n.?"] - Delimiter characters
 * @yields {Uint8Array} Zero-copy subarray views of the original text
 *
 * @example
 * const text = new TextEncoder().encode("Hello. World. Test.");
 * for (const slice of chunk(text, { size: 10, delimiters: "." })) {
 *   console.log(new TextDecoder().decode(slice));
 * }
 */
export function* chunk(text, options = {}) {
  // Single WASM call returns a flat [start0, end0, start1, end1, ...] array;
  // walk it two entries at a time, yielding a view per (start, end) pair.
  const offsets = wasmChunkOffsets(text, options.size, options.delimiters);
  let cursor = 0;
  while (cursor < offsets.length) {
    const start = offsets[cursor];
    const end = offsets[cursor + 1];
    cursor += 2;
    yield text.subarray(start, end);
  }
}
|
|
49
|
+
|
|
50
|
+
/**
 * Get chunk offsets without creating views.
 * Performs a single WASM call and unpacks the flat result into pairs.
 *
 * @param {Uint8Array} text - The text to chunk as bytes
 * @param {Object} [options] - Options
 * @param {number} [options.size=4096] - Target chunk size in bytes
 * @param {string} [options.delimiters="\n.?"] - Delimiter characters
 * @returns {Array<[number, number]>} Array of [start, end] offset pairs
 */
export function chunk_offsets(text, options = {}) {
  // WASM hands back [start0, end0, start1, end1, ...]; regroup into pairs.
  const flat = wasmChunkOffsets(text, options.size, options.delimiters);
  const pairs = [];
  for (let cursor = 0; cursor < flat.length; ) {
    pairs.push([flat[cursor++], flat[cursor++]]);
  }
  return pairs;
}
|
|
69
|
+
|
|
70
|
+
/**
 * Lazily-created promise for the one-time WASM initialization.
 * Caching the promise (rather than a completed-flag boolean) means
 * concurrent init() callers share a single initWasm() invocation instead
 * of racing past the flag and instantiating the module twice.
 */
let initPromise = null;

/**
 * Initialize the WASM module. Must be called (and awaited) before using
 * the chunk functions.
 *
 * Safe to call multiple times, including concurrently: initWasm() runs at
 * most once, and every caller awaits the same completion. If initialization
 * fails, the cached promise is cleared so a later call can retry (matching
 * the retry-after-failure behavior of the previous boolean flag).
 *
 * @returns {Promise<void>} Resolves once the WASM module is ready.
 */
export async function init() {
  if (initPromise === null) {
    initPromise = initWasm().catch((err) => {
      initPromise = null; // allow retry after a failed initialization
      throw err;
    });
  }
  await initPromise;
}
|
|
81
|
+
|
|
82
|
+
/**
 * Chunker splits text at delimiter boundaries.
 * Thin wrapper over the WASM-side chunker; implements the iterable
 * protocol so instances work directly in for...of loops.
 */
export class Chunker {
  /**
   * Create a new Chunker.
   * @param {Uint8Array} text - The text to chunk as bytes
   * @param {Object} [options] - Options
   * @param {number} [options.size=4096] - Target chunk size in bytes
   * @param {string} [options.delimiters="\n.?"] - Delimiter characters
   */
  constructor(text, options = {}) {
    this._chunker = new WasmChunker(text, options.size, options.delimiters);
  }

  /**
   * Get the next chunk, or undefined once iteration is exhausted.
   * @returns {Uint8Array | undefined}
   */
  next() {
    return this._chunker.next();
  }

  /**
   * Rewind the chunker so iteration restarts from the first chunk.
   */
  reset() {
    this._chunker.reset();
  }

  /**
   * Collect all chunk offsets as [start, end] pairs.
   * Faster than iterating chunk-by-chunk: a single WASM call returns the
   * flat [start0, end0, start1, end1, ...] array, regrouped here.
   * @returns {Array<[number, number]>}
   */
  collectOffsets() {
    const flat = this._chunker.collect_offsets();
    const pairs = [];
    for (let cursor = 0; cursor < flat.length; cursor += 2) {
      pairs.push([flat[cursor], flat[cursor + 1]]);
    }
    return pairs;
  }

  /**
   * Release the underlying WASM memory.
   */
  free() {
    this._chunker.free();
  }

  /**
   * Iterable protocol: yield chunks until the WASM chunker is exhausted
   * (signalled by next() returning undefined).
   */
  *[Symbol.iterator]() {
    for (;;) {
      const piece = this._chunker.next();
      if (piece === undefined) {
        return;
      }
      yield piece;
    }
  }
}
|
package/package.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "memchunk",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "The fastest semantic text chunking library",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "index.js",
|
|
7
|
+
"files": [
|
|
8
|
+
"index.js",
|
|
9
|
+
"pkg/"
|
|
10
|
+
],
|
|
11
|
+
"scripts": {
|
|
12
|
+
"build": "wasm-pack build --target web",
|
|
13
|
+
"test": "node --test tests/"
|
|
14
|
+
},
|
|
15
|
+
"keywords": [
|
|
16
|
+
"chunking",
|
|
17
|
+
"text",
|
|
18
|
+
"simd",
|
|
19
|
+
"nlp",
|
|
20
|
+
"tokenization",
|
|
21
|
+
"rag",
|
|
22
|
+
"wasm",
|
|
23
|
+
"webassembly"
|
|
24
|
+
],
|
|
25
|
+
"author": "Bhavnick Minhas",
|
|
26
|
+
"license": "MIT OR Apache-2.0",
|
|
27
|
+
"repository": {
|
|
28
|
+
"type": "git",
|
|
29
|
+
"url": "https://github.com/chonkie-inc/memchunk"
|
|
30
|
+
}
|
|
31
|
+
}
|