chonkie-core 0.9.1__cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,73 @@
1
+ """chonkie-core - The fastest semantic text chunking library."""
2
+
3
+ from chonkie_core._chunk import (
4
+ Chunker,
5
+ MergeResult,
6
+ PatternSplitter,
7
+ chunk_offsets,
8
+ find_merge_indices,
9
+ merge_splits,
10
+ split_offsets,
11
+ split_pattern_offsets,
12
+ # Savitzky-Golay filter functions
13
+ savgol_filter,
14
+ find_local_minima_interpolated,
15
+ windowed_cross_similarity,
16
+ filter_split_indices,
17
+ DEFAULT_TARGET_SIZE,
18
+ DEFAULT_DELIMITERS,
19
+ )
20
+
21
+ __all__ = [
22
+ "chunk",
23
+ "Chunker",
24
+ "MergeResult",
25
+ "PatternSplitter",
26
+ "chunk_offsets",
27
+ "find_merge_indices",
28
+ "merge_splits",
29
+ "split_offsets",
30
+ "split_pattern_offsets",
31
+ # Savitzky-Golay filter functions
32
+ "savgol_filter",
33
+ "find_local_minima_interpolated",
34
+ "windowed_cross_similarity",
35
+ "filter_split_indices",
36
+ "DEFAULT_TARGET_SIZE",
37
+ "DEFAULT_DELIMITERS",
38
+ ]
39
+ __version__ = "0.9.1"
40
+
41
+
42
def chunk(text, *, size=DEFAULT_TARGET_SIZE, delimiters=None):
    """
    Split text into chunks at delimiter boundaries.

    The heavy lifting happens in a single call into the Rust extension,
    which returns all (start, end) boundary pairs at once; the Python side
    then hands back zero-copy ``memoryview`` slices of the original buffer.

    Args:
        text: bytes or str to chunk
        size: Target chunk size in bytes (default: 4096)
        delimiters: bytes or str of delimiter characters (default: "\\n.?")

    Yields:
        memoryview slices of the original text

    Example:
        >>> data = b"Hello. World. Test."
        >>> for piece in chunk(data, size=10, delimiters=b"."):
        ...     print(bytes(piece))
        b'Hello.'
        b' World.'
        b' Test.'
    """
    # Normalize str input to UTF-8 bytes so the FFI layer always sees raw bytes.
    data = text.encode("utf-8") if isinstance(text, str) else text

    # One FFI round-trip: the Rust side computes every chunk boundary.
    boundaries = chunk_offsets(data, size, delimiters)

    # Slice through a memoryview so each yielded chunk is zero-copy.
    view = memoryview(data)
    for begin, stop in boundaries:
        yield view[begin:stop]
chonkie_core/py.typed ADDED
File without changes
@@ -0,0 +1,115 @@
1
+ Metadata-Version: 2.4
2
+ Name: chonkie-core
3
+ Version: 0.9.1
4
+ Classifier: Development Status :: 4 - Beta
5
+ Classifier: Intended Audience :: Developers
6
+ Classifier: License :: OSI Approved :: MIT License
7
+ Classifier: License :: OSI Approved :: Apache Software License
8
+ Classifier: Programming Language :: Rust
9
+ Classifier: Programming Language :: Python :: Implementation :: CPython
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.8
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Text Processing
17
+ Requires-Dist: numpy>=1.20
18
+ Summary: The fastest semantic text chunking library
19
+ Keywords: chunking,text,simd,nlp,tokenization,rag,chonkie
20
+ Author: Bhavnick Minhas
21
+ License: MIT OR Apache-2.0
22
+ Requires-Python: >=3.8
23
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
24
+ Project-URL: Homepage, https://github.com/chonkie-inc/chunk
25
+ Project-URL: Repository, https://github.com/chonkie-inc/chunk
26
+
27
+ <p align="center">
28
+ <img src="../../assets/memchunk_wide.png" alt="chonkie-core" width="500">
29
+ </p>
30
+
31
+ <h1 align="center">chonkie-core</h1>
32
+
33
+ <p align="center">
34
+ <em>the fastest text chunking library — up to 1 TB/s throughput</em>
35
+ </p>
36
+
37
+ <p align="center">
38
+ <a href="https://crates.io/crates/chunk"><img src="https://img.shields.io/crates/v/chunk.svg?color=e74c3c" alt="crates.io"></a>
39
+ <a href="https://pypi.org/project/chonkie-core"><img src="https://img.shields.io/pypi/v/chonkie-core.svg?color=e67e22" alt="PyPI"></a>
40
+ <a href="https://www.npmjs.com/package/@chonkiejs/chunk"><img src="https://img.shields.io/npm/v/@chonkiejs/chunk.svg?color=2ecc71" alt="npm"></a>
41
+ <a href="https://github.com/chonkie-inc/chunk"><img src="https://img.shields.io/badge/github-chunk-3498db" alt="GitHub"></a>
42
+ <a href="LICENSE-MIT"><img src="https://img.shields.io/badge/license-MIT%2FApache--2.0-9b59b6.svg" alt="License"></a>
43
+ </p>
44
+
45
+ ---
46
+
47
+ you know how every chunking library claims to be fast? yeah, we actually meant it.
48
+
49
+ **chonkie-core** splits text at semantic boundaries (periods, newlines, the usual suspects) and does it stupid fast. we're talking "chunk the entire english wikipedia in 120ms" fast.
50
+
51
+ want to know how? [read the blog post](https://minha.sh/posts/so,-you-want-to-chunk-really-fast) where we nerd out about SIMD instructions and lookup tables.
52
+
53
+ ## 📦 installation
54
+
55
+ ```bash
56
+ pip install chonkie-core
57
+ ```
58
+
59
+ looking for [rust](https://github.com/chonkie-inc/chunk) or [javascript](https://github.com/chonkie-inc/chunk/tree/main/packages/wasm)?
60
+
61
+ ## 🚀 usage
62
+
63
+ ```python
64
+ from chonkie_core import Chunker
65
+
66
+ text = "Hello world. How are you? I'm fine.\nThanks for asking."
67
+
68
+ # with defaults (4KB chunks, split at \n . ?)
69
+ for chunk in Chunker(text):
70
+ print(bytes(chunk))
71
+
72
+ # with custom size
73
+ for chunk in Chunker(text, size=1024):
74
+ print(bytes(chunk))
75
+
76
+ # with custom delimiters
77
+ for chunk in Chunker(text, delimiters=".?!\n"):
78
+ print(bytes(chunk))
79
+
80
+ # with multi-byte pattern (e.g., metaspace ▁ for SentencePiece tokenizers)
81
+ for chunk in Chunker(text, pattern="▁", prefix=True):
82
+ print(bytes(chunk))
83
+
84
+ # with consecutive pattern handling (split at START of runs, not middle)
85
+ for chunk in Chunker("word next", pattern=" ", consecutive=True):
86
+ print(bytes(chunk))
87
+
88
+ # with forward fallback (search forward if no pattern in backward window)
89
+ for chunk in Chunker(text, pattern=" ", forward_fallback=True):
90
+ print(bytes(chunk))
91
+
92
+ # collect all chunks
93
+ chunks = list(Chunker(text))
94
+ ```
95
+
96
+ chunks are returned as `memoryview` objects (zero-copy slices of the original text).
97
+
98
+ ## 📝 citation
99
+
100
+ if you use chonkie-core in your research, please cite it as follows:
101
+
102
+ ```bibtex
103
+ @software{chunk2025,
104
+ author = {Minhas, Bhavnick},
105
+ title = {chunk: The fastest text chunking library},
106
+ year = {2025},
107
+ publisher = {GitHub},
108
+ howpublished = {\url{https://github.com/chonkie-inc/chunk}},
109
+ }
110
+ ```
111
+
112
+ ## 📄 license
113
+
114
+ licensed under either of [Apache License, Version 2.0](LICENSE-APACHE) or [MIT license](LICENSE-MIT) at your option.
115
+
@@ -0,0 +1,6 @@
1
+ chonkie_core/__init__.py,sha256=DNpYt5zYmK3a4VRAZmVp-F2Utgi3TYom7Nic63Veuxs,1865
2
+ chonkie_core/_chunk.cpython-314-x86_64-linux-gnu.so,sha256=VtKuZziX_UhAfjAIm4h4VUXjiW8OMaoWqmrdkAYrENM,949992
3
+ chonkie_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ chonkie_core-0.9.1.dist-info/METADATA,sha256=vGIgFz9F-xj_WwtJDfeT37t_sooUJJNDkWM4g2RPIRI,4146
5
+ chonkie_core-0.9.1.dist-info/WHEEL,sha256=1GO8NDKTfrlRkgVXFQ5KJDUbdgIfejZHnbKJV8XeeSw,147
6
+ chonkie_core-0.9.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: maturin (1.11.5)
3
+ Root-Is-Purelib: false
4
+ Tag: cp314-cp314-manylinux_2_17_x86_64
5
+ Tag: cp314-cp314-manylinux2014_x86_64