chonkie-core 0.7.0__cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,53 @@
1
+ """chonkie-core - The fastest semantic text chunking library."""
2
+
3
+ from chonkie_core._chunk import (
4
+ Chunker,
5
+ chunk_offsets,
6
+ split_offsets,
7
+ DEFAULT_TARGET_SIZE,
8
+ DEFAULT_DELIMITERS,
9
+ )
10
+
11
+ __all__ = [
12
+ "chunk",
13
+ "Chunker",
14
+ "chunk_offsets",
15
+ "split_offsets",
16
+ "DEFAULT_TARGET_SIZE",
17
+ "DEFAULT_DELIMITERS",
18
+ ]
19
+ __version__ = "0.7.0"
20
+
21
+
22
+ def chunk(text, *, size=DEFAULT_TARGET_SIZE, delimiters=None):
23
+ """
24
+ Split text into chunks at delimiter boundaries.
25
+ Returns an iterator of zero-copy memoryview slices.
26
+
27
+ Args:
28
+ text: bytes or str to chunk
29
+ size: Target chunk size in bytes (default: 4096)
30
+ delimiters: bytes or str of delimiter characters (default: "\\n.?")
31
+
32
+ Yields:
33
+ memoryview slices of the original text
34
+
35
+ Example:
36
+ >>> text = b"Hello. World. Test."
37
+ >>> for chunk in chunk(text, size=10, delimiters=b"."):
38
+ ... print(bytes(chunk))
39
+ b'Hello.'
40
+ b' World.'
41
+ b' Test.'
42
+ """
43
+ # Convert str to bytes if needed
44
+ if isinstance(text, str):
45
+ text = text.encode("utf-8")
46
+
47
+ # Get offsets from Rust (single FFI call)
48
+ offsets = chunk_offsets(text, size, delimiters)
49
+
50
+ # Return memoryview slices (zero-copy)
51
+ mv = memoryview(text)
52
+ for start, end in offsets:
53
+ yield mv[start:end]
chonkie_core/py.typed ADDED
File without changes
@@ -0,0 +1,114 @@
1
+ Metadata-Version: 2.4
2
+ Name: chonkie-core
3
+ Version: 0.7.0
4
+ Classifier: Development Status :: 4 - Beta
5
+ Classifier: Intended Audience :: Developers
6
+ Classifier: License :: OSI Approved :: MIT License
7
+ Classifier: License :: OSI Approved :: Apache Software License
8
+ Classifier: Programming Language :: Rust
9
+ Classifier: Programming Language :: Python :: Implementation :: CPython
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.8
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Text Processing
17
+ Summary: The fastest semantic text chunking library
18
+ Keywords: chunking,text,simd,nlp,tokenization,rag,chonkie
19
+ Author: Bhavnick Minhas
20
+ License: MIT OR Apache-2.0
21
+ Requires-Python: >=3.8
22
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
23
+ Project-URL: Homepage, https://github.com/chonkie-inc/chunk
24
+ Project-URL: Repository, https://github.com/chonkie-inc/chunk
25
+
26
+ <p align="center">
27
+ <img src="../../assets/memchunk_wide.png" alt="chonkie-core" width="500">
28
+ </p>
29
+
30
+ <h1 align="center">chonkie-core</h1>
31
+
32
+ <p align="center">
33
+ <em>the fastest text chunking library — up to 1 TB/s throughput</em>
34
+ </p>
35
+
36
+ <p align="center">
37
+ <a href="https://crates.io/crates/chunk"><img src="https://img.shields.io/crates/v/chunk.svg?color=e74c3c" alt="crates.io"></a>
38
+ <a href="https://pypi.org/project/chonkie-core"><img src="https://img.shields.io/pypi/v/chonkie-core.svg?color=e67e22" alt="PyPI"></a>
39
+ <a href="https://www.npmjs.com/package/@chonkiejs/chunk"><img src="https://img.shields.io/npm/v/@chonkiejs/chunk.svg?color=2ecc71" alt="npm"></a>
40
+ <a href="https://github.com/chonkie-inc/chunk"><img src="https://img.shields.io/badge/github-chunk-3498db" alt="GitHub"></a>
41
+ <a href="LICENSE-MIT"><img src="https://img.shields.io/badge/license-MIT%2FApache--2.0-9b59b6.svg" alt="License"></a>
42
+ </p>
43
+
44
+ ---
45
+
46
+ you know how every chunking library claims to be fast? yeah, we actually mean it.
47
+
48
+ **chonkie-core** splits text at semantic boundaries (periods, newlines, the usual suspects) and does it stupid fast. we're talking "chunk the entire english wikipedia in 120ms" fast.
49
+
50
+ want to know how? [read the blog post](https://minha.sh/posts/so,-you-want-to-chunk-really-fast) where we nerd out about SIMD instructions and lookup tables.
51
+
52
+ ## 📦 installation
53
+
54
+ ```bash
55
+ pip install chonkie-core
56
+ ```
57
+
58
+ looking for [rust](https://github.com/chonkie-inc/chunk) or [javascript](https://github.com/chonkie-inc/chunk/tree/main/packages/wasm)?
59
+
60
+ ## 🚀 usage
61
+
62
+ ```python
63
+ from chonkie_core import Chunker
64
+
65
+ text = "Hello world. How are you? I'm fine.\nThanks for asking."
66
+
67
+ # with defaults (4KB chunks, split at \n . ?)
68
+ for chunk in Chunker(text):
69
+ print(bytes(chunk))
70
+
71
+ # with custom size
72
+ for chunk in Chunker(text, size=1024):
73
+ print(bytes(chunk))
74
+
75
+ # with custom delimiters
76
+ for chunk in Chunker(text, delimiters=".?!\n"):
77
+ print(bytes(chunk))
78
+
79
+ # with multi-byte pattern (e.g., metaspace ▁ for SentencePiece tokenizers)
80
+ for chunk in Chunker(text, pattern="▁", prefix=True):
81
+ print(bytes(chunk))
82
+
83
+ # with consecutive pattern handling (split at START of runs, not middle)
84
+ for chunk in Chunker("word next", pattern=" ", consecutive=True):
85
+ print(bytes(chunk))
86
+
87
+ # with forward fallback (search forward if no pattern in backward window)
88
+ for chunk in Chunker(text, pattern=" ", forward_fallback=True):
89
+ print(bytes(chunk))
90
+
91
+ # collect all chunks
92
+ chunks = list(Chunker(text))
93
+ ```
94
+
95
+ chunks are returned as `memoryview` objects (zero-copy slices of the original text).
96
+
97
+ ## 📝 citation
98
+
99
+ if you use chonkie-core in your research, please cite it as follows:
100
+
101
+ ```bibtex
102
+ @software{chunk2025,
103
+ author = {Minhas, Bhavnick},
104
+ title = {chunk: The fastest text chunking library},
105
+ year = {2025},
106
+ publisher = {GitHub},
107
+ howpublished = {\url{https://github.com/chonkie-inc/chunk}},
108
+ }
109
+ ```
110
+
111
+ ## 📄 license
112
+
113
+ licensed under either of [Apache License, Version 2.0](LICENSE-APACHE) or [MIT license](LICENSE-MIT) at your option.
114
+
@@ -0,0 +1,6 @@
1
+ chonkie_core/__init__.py,sha256=-NZBMrJk0zd5q8SVdvR_zZu9NMZ69x1z6m5DrogFeLo,1333
2
+ chonkie_core/_chunk.cpython-311-x86_64-linux-gnu.so,sha256=JB3NoPf8PGIacijfwUuEPnMqSbj7-LOsoybHRys7oyQ,723152
3
+ chonkie_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ chonkie_core-0.7.0.dist-info/METADATA,sha256=AUZwf5FNCQQevO_jNZk2MR_yvQsS_SbCBArbOc7FoDk,4119
5
+ chonkie_core-0.7.0.dist-info/WHEEL,sha256=KmtbzEMhBG7ILlpCgdxkDv7AlFCmdxefhRI54YAwnLk,147
6
+ chonkie_core-0.7.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: maturin (1.11.5)
3
+ Root-Is-Purelib: false
4
+ Tag: cp311-cp311-manylinux_2_17_x86_64
5
+ Tag: cp311-cp311-manylinux2014_x86_64