chonkie-core 0.5.0__cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chonkie_core/__init__.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""chonkie-core - The fastest semantic text chunking library."""
|
|
2
|
+
|
|
3
|
+
from chonkie_core._chunk import (
|
|
4
|
+
Chunker,
|
|
5
|
+
chunk_offsets,
|
|
6
|
+
DEFAULT_TARGET_SIZE,
|
|
7
|
+
DEFAULT_DELIMITERS,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
# Public API: names re-exported from the Rust extension (`chonkie_core._chunk`)
# plus the pure-Python `chunk` convenience wrapper defined in this module.
__all__ = [
    "chunk",
    "Chunker",
    "chunk_offsets",
    "DEFAULT_TARGET_SIZE",
    "DEFAULT_DELIMITERS",
]

# NOTE(review): keep in lockstep with the wheel version — presumably the build
# tooling does not inject this automatically; verify against pyproject.
__version__ = "0.5.0"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def chunk(text, *, size=DEFAULT_TARGET_SIZE, delimiters=None):
    """
    Split text into chunks at delimiter boundaries.

    Returns a generator of zero-copy memoryview slices over the (encoded)
    input, so no chunk data is copied until the caller materializes it
    (e.g. with ``bytes(...)``).

    Args:
        text: bytes or str to chunk. A str is encoded to UTF-8 first;
            offsets therefore refer to the encoded bytes, not characters.
        size: Target chunk size in bytes (default: 4096).
        delimiters: bytes or str of delimiter characters. ``None`` defers
            to the extension's default ("\\n.?").

    Yields:
        memoryview slices of the (encoded) input text.

    Example:
        >>> data = b"Hello. World. Test."
        >>> for piece in chunk(data, size=10, delimiters=b"."):
        ...     print(bytes(piece))
        b'Hello.'
        b' World.'
        b' Test.'
    """
    # Normalize to bytes so the Rust side always sees a byte buffer.
    data = text.encode("utf-8") if isinstance(text, str) else text

    # One FFI round-trip: the extension returns all (start, end) boundary
    # pairs at once instead of being called per chunk.
    boundaries = chunk_offsets(data, size, delimiters)

    # Slicing a memoryview is zero-copy; each yielded chunk aliases `data`.
    view = memoryview(data)
    for begin, end in boundaries:
        yield view[begin:end]
|
|
Binary file
|
chonkie_core/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chonkie-core
|
|
3
|
+
Version: 0.5.0
|
|
4
|
+
Classifier: Development Status :: 4 - Beta
|
|
5
|
+
Classifier: Intended Audience :: Developers
|
|
6
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
7
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
8
|
+
Classifier: Programming Language :: Rust
|
|
9
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Topic :: Text Processing
|
|
17
|
+
Summary: The fastest semantic text chunking library
|
|
18
|
+
Keywords: chunking,text,simd,nlp,tokenization,rag,chonkie
|
|
19
|
+
Author: Bhavnick Minhas
|
|
20
|
+
License: MIT OR Apache-2.0
|
|
21
|
+
Requires-Python: >=3.8
|
|
22
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
23
|
+
Project-URL: Homepage, https://github.com/chonkie-inc/chunk
|
|
24
|
+
Project-URL: Repository, https://github.com/chonkie-inc/chunk
|
|
25
|
+
|
|
26
|
+
<p align="center">
|
|
27
|
+
<img src="../../assets/memchunk_wide.png" alt="chonkie-core" width="500">
|
|
28
|
+
</p>
|
|
29
|
+
|
|
30
|
+
<h1 align="center">chonkie-core</h1>
|
|
31
|
+
|
|
32
|
+
<p align="center">
|
|
33
|
+
<em>the fastest text chunking library — up to 1 TB/s throughput</em>
|
|
34
|
+
</p>
|
|
35
|
+
|
|
36
|
+
<p align="center">
|
|
37
|
+
<a href="https://crates.io/crates/chunk"><img src="https://img.shields.io/crates/v/chunk.svg?color=e74c3c" alt="crates.io"></a>
|
|
38
|
+
<a href="https://pypi.org/project/chonkie-core"><img src="https://img.shields.io/pypi/v/chonkie-core.svg?color=e67e22" alt="PyPI"></a>
|
|
39
|
+
<a href="https://www.npmjs.com/package/@chonkiejs/chunk"><img src="https://img.shields.io/npm/v/@chonkiejs/chunk.svg?color=2ecc71" alt="npm"></a>
|
|
40
|
+
<a href="https://github.com/chonkie-inc/chunk"><img src="https://img.shields.io/badge/github-chunk-3498db" alt="GitHub"></a>
|
|
41
|
+
<a href="LICENSE-MIT"><img src="https://img.shields.io/badge/license-MIT%2FApache--2.0-9b59b6.svg" alt="License"></a>
|
|
42
|
+
</p>
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
you know how every chunking library claims to be fast? yeah, we actually meant it.
|
|
47
|
+
|
|
48
|
+
**chonkie-core** splits text at semantic boundaries (periods, newlines, the usual suspects) and does it stupid fast. we're talking "chunk the entire english wikipedia in 120ms" fast.
|
|
49
|
+
|
|
50
|
+
want to know how? [read the blog post](https://minha.sh/posts/so,-you-want-to-chunk-really-fast) where we nerd out about SIMD instructions and lookup tables.
|
|
51
|
+
|
|
52
|
+
## 📦 installation
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install chonkie-core
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
looking for [rust](https://github.com/chonkie-inc/chunk) or [javascript](https://github.com/chonkie-inc/chunk/tree/main/packages/wasm)?
|
|
59
|
+
|
|
60
|
+
## 🚀 usage
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from chonkie_core import Chunker
|
|
64
|
+
|
|
65
|
+
text = "Hello world. How are you? I'm fine.\nThanks for asking."
|
|
66
|
+
|
|
67
|
+
# with defaults (4KB chunks, split at \n . ?)
|
|
68
|
+
for chunk in Chunker(text):
|
|
69
|
+
print(bytes(chunk))
|
|
70
|
+
|
|
71
|
+
# with custom size
|
|
72
|
+
for chunk in Chunker(text, size=1024):
|
|
73
|
+
print(bytes(chunk))
|
|
74
|
+
|
|
75
|
+
# with custom delimiters
|
|
76
|
+
for chunk in Chunker(text, delimiters=".?!\n"):
|
|
77
|
+
print(bytes(chunk))
|
|
78
|
+
|
|
79
|
+
# with multi-byte pattern (e.g., metaspace ▁ for SentencePiece tokenizers)
|
|
80
|
+
for chunk in Chunker(text, pattern="▁", prefix=True):
|
|
81
|
+
print(bytes(chunk))
|
|
82
|
+
|
|
83
|
+
# with consecutive pattern handling (split at START of runs, not middle)
|
|
84
|
+
for chunk in Chunker("word next", pattern=" ", consecutive=True):
|
|
85
|
+
print(bytes(chunk))
|
|
86
|
+
|
|
87
|
+
# with forward fallback (search forward if no pattern in backward window)
|
|
88
|
+
for chunk in Chunker(text, pattern=" ", forward_fallback=True):
|
|
89
|
+
print(bytes(chunk))
|
|
90
|
+
|
|
91
|
+
# collect all chunks
|
|
92
|
+
chunks = list(Chunker(text))
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
chunks are returned as `memoryview` objects (zero-copy slices of the original text).
|
|
96
|
+
|
|
97
|
+
## 📝 citation
|
|
98
|
+
|
|
99
|
+
if you use chonkie-core in your research, please cite it as follows:
|
|
100
|
+
|
|
101
|
+
```bibtex
|
|
102
|
+
@software{chunk2025,
|
|
103
|
+
author = {Minhas, Bhavnick},
|
|
104
|
+
title = {chunk: The fastest text chunking library},
|
|
105
|
+
year = {2025},
|
|
106
|
+
publisher = {GitHub},
|
|
107
|
+
howpublished = {\url{https://github.com/chonkie-inc/chunk}},
|
|
108
|
+
}
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## 📄 license
|
|
112
|
+
|
|
113
|
+
licensed under either of [Apache License, Version 2.0](LICENSE-APACHE) or [MIT license](LICENSE-MIT) at your option.
|
|
114
|
+
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
chonkie_core/__init__.py,sha256=yYwnoosAaDnd8Pwdsb9O0F2oKhjduXE-VlyE0kR6dLI,1270
|
|
2
|
+
chonkie_core/_chunk.cpython-313-aarch64-linux-gnu.so,sha256=WYl3C_6HU562VgQK2PscemIUY_x6IrzUT5_BBFV7VPk,694848
|
|
3
|
+
chonkie_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
chonkie_core-0.5.0.dist-info/METADATA,sha256=P3M8Pqfsf-XItoCtmWMIOikb1d6jExcUh1rorISRIs0,4119
|
|
5
|
+
chonkie_core-0.5.0.dist-info/WHEEL,sha256=8v2ZPaqWlxLt72MoP9HmdukfAIPmanayQxGa1NmZv6I,149
|
|
6
|
+
chonkie_core-0.5.0.dist-info/RECORD,,
|