c-bpe 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- c_bpe-0.1.0/LICENSE +21 -0
- c_bpe-0.1.0/MANIFEST.in +11 -0
- c_bpe-0.1.0/PKG-INFO +481 -0
- c_bpe-0.1.0/README.md +440 -0
- c_bpe-0.1.0/codegen/gen_dict.py +115 -0
- c_bpe-0.1.0/codegen/gen_precomputed.py +190 -0
- c_bpe-0.1.0/data/cl100k_base.tiktoken.gz +0 -0
- c_bpe-0.1.0/data/o200k_base.tiktoken.gz +0 -0
- c_bpe-0.1.0/data/precomputed_cl100k.bin.gz +0 -0
- c_bpe-0.1.0/data/precomputed_o200k.bin.gz +0 -0
- c_bpe-0.1.0/include/ac_bpe.h +146 -0
- c_bpe-0.1.0/include/appendable_encoder.h +43 -0
- c_bpe-0.1.0/include/bitfield.h +105 -0
- c_bpe-0.1.0/include/bpe.h +239 -0
- c_bpe-0.1.0/include/c11_threads.h +76 -0
- c_bpe-0.1.0/include/fnv_hash.h +186 -0
- c_bpe-0.1.0/include/interval_encoding.h +41 -0
- c_bpe-0.1.0/include/lru_cache.h +223 -0
- c_bpe-0.1.0/include/prependable_encoder.h +44 -0
- c_bpe-0.1.0/include/pretok.h +105 -0
- c_bpe-0.1.0/include/pretok_ucd.h +128 -0
- c_bpe-0.1.0/include/threadpool.h +36 -0
- c_bpe-0.1.0/include/tokenizer.h +84 -0
- c_bpe-0.1.0/pyproject.toml +101 -0
- c_bpe-0.1.0/python/c_bpe/__init__.py +20 -0
- c_bpe-0.1.0/python/c_bpe/openai.py +64 -0
- c_bpe-0.1.0/python/c_bpe/py.typed +0 -0
- c_bpe-0.1.0/python/c_bpe.egg-info/PKG-INFO +481 -0
- c_bpe-0.1.0/python/c_bpe.egg-info/SOURCES.txt +60 -0
- c_bpe-0.1.0/python/c_bpe.egg-info/dependency_links.txt +1 -0
- c_bpe-0.1.0/python/c_bpe.egg-info/requires.txt +14 -0
- c_bpe-0.1.0/python/c_bpe.egg-info/top_level.txt +1 -0
- c_bpe-0.1.0/setup.cfg +4 -0
- c_bpe-0.1.0/setup.py +204 -0
- c_bpe-0.1.0/src/ac_bpe.c +439 -0
- c_bpe-0.1.0/src/ac_bpe_old.c +493 -0
- c_bpe-0.1.0/src/appendable_encoder.c +81 -0
- c_bpe-0.1.0/src/bpe_core.c +793 -0
- c_bpe-0.1.0/src/dict_cl100k.h +52785 -0
- c_bpe-0.1.0/src/dict_o200k.h +112367 -0
- c_bpe-0.1.0/src/interval_encoding.c +194 -0
- c_bpe-0.1.0/src/parallel.c +70 -0
- c_bpe-0.1.0/src/prependable_encoder.c +85 -0
- c_bpe-0.1.0/src/pretok_cl100k.c +369 -0
- c_bpe-0.1.0/src/pretok_o200k.c +385 -0
- c_bpe-0.1.0/src/pymodule.c +647 -0
- c_bpe-0.1.0/src/threadpool.c +262 -0
- c_bpe-0.1.0/src/tokenizer.c +206 -0
- c_bpe-0.1.0/tests/conftest.py +183 -0
- c_bpe-0.1.0/tests/test.py +47 -0
- c_bpe-0.1.0/tests/test_basic.py +97 -0
- c_bpe-0.1.0/tests/test_benchmarks.py +281 -0
- c_bpe-0.1.0/tests/test_import.py +55 -0
- c_bpe-0.1.0/third_party/pcre2/config.h +22 -0
- c_bpe-0.1.0/third_party/pcre2/pcre2.h +1069 -0
- c_bpe-0.1.0/third_party/pcre2/src/pcre2_internal.h +2235 -0
- c_bpe-0.1.0/third_party/pcre2/src/pcre2_intmodedep.h +973 -0
- c_bpe-0.1.0/third_party/pcre2/src/pcre2_tables.c +234 -0
- c_bpe-0.1.0/third_party/pcre2/src/pcre2_ucd.c +5804 -0
- c_bpe-0.1.0/third_party/pcre2/src/pcre2_ucp.h +408 -0
- c_bpe-0.1.0/third_party/pcre2/src/pcre2_ucptables.c +1596 -0
- c_bpe-0.1.0/third_party/pcre2/src/pcre2_util.h +132 -0
c_bpe-0.1.0/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 gweidart

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

c_bpe-0.1.0/MANIFEST.in
ADDED
@@ -0,0 +1,11 @@
include LICENSE
include README.md
include pyproject.toml
include setup.py
recursive-include src *.c *.h
recursive-include include *.h
recursive-include third_party *.c *.h
recursive-include data *.gz
recursive-include codegen *.py
recursive-include python *.py *.pyi py.typed
recursive-include tests *.py

c_bpe-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,481 @@
Metadata-Version: 2.4
Name: c-bpe
Version: 0.1.0
Summary: High-performance C implementation of BPE tokenizer
Author: andrey-savov
License: MIT
Project-URL: Homepage, https://github.com/andrey-savov/c-bpe
Project-URL: Repository, https://github.com/andrey-savov/c-bpe
Project-URL: Documentation, https://github.com/andrey-savov/c-bpe#readme
Project-URL: Bug Tracker, https://github.com/andrey-savov/c-bpe/issues
Keywords: byte-pair-encoding,tokenization,tokenizer,bpe,tiktoken,tiktoken-alternative,tiktoken-compatible,nlp,natural-language-processing,llm,large-language-models,openai,text-processing,high-performance,c,cpython-extension
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: C
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Text Processing
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Provides-Extra: dev
Requires-Dist: pytest>=8.3.5; extra == "dev"
Requires-Dist: pytest-benchmark>=5.2.3; extra == "dev"
Requires-Dist: mypy>=1.15.0; extra == "dev"
Requires-Dist: ruff>=0.11.0; extra == "dev"
Provides-Extra: bench
Requires-Dist: numpy>=2.2.4; extra == "bench"
Requires-Dist: matplotlib>=3.10.1; extra == "bench"
Requires-Dist: tiktoken>=0.9.0; extra == "bench"
Provides-Extra: rust
Requires-Dist: maturin>=1.8.3; extra == "rust"
Dynamic: license-file

[CI](https://github.com/andrey-savov/c-bpe/actions/workflows/workflow.yml)

# c-bpe

High-performance C implementation of a BPE (Byte Pair Encoding) tokenizer with Python bindings.

This library provides fast, correct token counting for chunking algorithms. It implements novel BPE tokenization algorithms that preserve exact correctness while running significantly faster than existing solutions.

## Attribution

This project is based on [rs-bpe](https://github.com/gweidart/rs-bpe) by [gweidart](https://github.com/gweidart), a Rust implementation of BPE tokenization. The C implementation ports the same novel algorithms to pure C for maximum portability and performance. The original Rust implementation is included in the [`rust/`](rust/) directory for comparison benchmarking.

## Installation

```
pip install c-bpe
```

*c_bpe consistently outperforms tiktoken (March 7, 2026)*

## Key Features

* Efficient token counting with linear time complexity, even for adversarial inputs
* Split text at exact token boundaries while respecting UTF-8 character boundaries
* Incrementally count tokens while appending text to a chunk
* Calculate token counts for sub-ranges of text in (typically) constant time
* Python bindings with an OpenAI-compatible interface

These operations are particularly important for LLM applications but are challenging to implement efficiently for BPE tokenization.

## Motivation *(problems this library aims to solve)*

Existing BPE tokenizers often face performance and correctness issues when used for chunking operations:

### Split-at-N-Tokens Problem

Naively splitting text after N tokens by first encoding the entire text and then selecting a boundary often produces suboptimal results:

* The split point might not align with a UTF-8 character boundary
* Dropping tokens until a character boundary is reached can yield chunks much shorter than desired
* The algorithm wastes resources by encoding more text than necessary

### Incremental Counting Problem

Incrementally counting tokens as text is appended is challenging with traditional implementations:

* Recomputing the encoding after every append leads to quadratic complexity
* Approximating counts by aggregating per-piece counts gives incorrect results, because BPE is non-monotonic: appending text can change tokens earlier in the sequence
* Incorrect counting can cause problems when staying within the token limits of LLM APIs

### Interval Counting Problem

Counting tokens for arbitrary subranges traditionally requires reprocessing the entire substring:

* This leads to poor performance for applications that need to count many subranges
* It makes operations like binary search for token boundaries inefficient

Our library provides novel algorithms that solve these problems with superior performance characteristics.

## Implementation

### Core Algorithm

The novel O(n) algorithm preserves the exact output of the original BPE algorithm by tracking the encodings of all text prefixes, using mathematical properties of valid BPE encodings.

Instead of storing the full token sequence for each prefix, only the last token of each prefix needs to be remembered. This works because:

1. There exists exactly one valid encoding sequence for any input text
2. Any substring of a valid encoding sequence is itself a valid encoding sequence
3. Knowing the last token of a valid encoding sequence uniquely determines the full sequence

The algorithm determines the correct last token for each prefix by checking token compatibility with the preceding token, yielding a linear-time solution; the sketch below illustrates the idea.

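To make this concrete, here is a minimal, self-contained Python sketch on a toy merge table. Everything in it is illustrative rather than taken from the library: `encode_ref` is a plain tiktoken-style reference encoder, and the pair test `encode_ref(prev + tok) == [prev, tok]` stands in for the table-driven compatibility check the C code performs.

```python
def encode_ref(data, ranks):
    """Reference byte-level BPE: repeatedly merge the lowest-ranked pair."""
    parts = [bytes([b]) for b in data]
    while True:
        cand = [(ranks[parts[i] + parts[i + 1]], i)
                for i in range(len(parts) - 1)
                if parts[i] + parts[i + 1] in ranks]
        if not cand:
            return parts
        _, i = min(cand)
        parts[i:i + 2] = [parts[i] + parts[i + 1]]

def prefix_counts(data, ranks):
    """Token count for every prefix, remembering only each prefix's last token."""
    vocab = {bytes([b]) for b in range(256)} | set(ranks)
    last = [None] * (len(data) + 1)   # last[i]: last token of encoding of data[:i]
    count = [0] * (len(data) + 1)
    for i in range(1, len(data) + 1):
        # candidate tokens ending at position i, longest first
        for tok in sorted((t for t in vocab
                           if len(t) <= i and data[i - len(t):i] == t),
                          key=len, reverse=True):
            prev = last[i - len(tok)]
            # tok may follow prev iff encoding their concatenation keeps both intact
            if prev is None or encode_ref(prev + tok, ranks) == [prev, tok]:
                last[i] = tok
                count[i] = count[i - len(tok)] + 1
                break
    return count

ranks = {b"ab": 0, b"bc": 1, b"abc": 2}   # toy merge rules, lower rank merges first
data = b"abcbcab"
counts = prefix_counts(data, ranks)
for i in range(1, len(data) + 1):
    assert counts[i] == len(encode_ref(data[:i], ranks))   # matches the reference
```
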
### Backtracking Optimization

To improve the average case, the library also uses a backtracking-based encoder (see the sketch after this list) that:

1. Tries the greedy approach first, using the longest matching token at each step
2. Backtracks when necessary to produce a valid BPE encoding
3. Uses a bitfield to record failed positions, so the worst-case runtime stays linear in the input length

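A minimal Python rendition of this control flow, reusing `encode_ref` and the toy `ranks` from the previous sketch; the `dead` array plays the role of the bitfield, and the real encoder's bookkeeping differs in detail:

```python
def encode_backtracking(data, ranks):
    """Greedy longest-match BPE with backtracking (toy illustration)."""
    vocab = {bytes([b]) for b in range(256)} | set(ranks)
    max_len = max(len(t) for t in vocab)
    dead = bytearray(len(data) + 1)   # the "bitfield": boundaries proven invalid
    tokens, pos, cap = [], 0, max_len
    while pos < len(data):
        tok = None
        for L in range(min(cap, len(data) - pos), 0, -1):
            cand = data[pos:pos + L]
            prev = tokens[-1] if tokens else None
            if (cand in vocab and not dead[pos + L]
                    and (prev is None
                         or encode_ref(prev + cand, ranks) == [prev, cand])):
                tok = cand
                break
        if tok is not None:            # greedy step succeeded
            tokens.append(tok)
            pos += len(tok)
            cap = max_len
        else:                          # stuck: this boundary cannot work
            dead[pos] = 1
            last = tokens.pop()
            pos -= len(last)
            cap = len(last) - 1        # retry the previous position, shorter
    return tokens

ranks2 = {b"bc": 0, b"ab": 1}          # "bc" merges before "ab": greedy must backtrack
assert encode_backtracking(b"abc", ranks2) == encode_ref(b"abc", ranks2) == [b"a", b"bc"]
```

Here the greedy first step takes `ab`, discovers that no token starting at the next byte is compatible, marks that boundary dead, and retries with the shorter `a`.
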
### Data Structures

* **`BytePairEncoding` struct**: Stores the concatenated token byte array, per-token start offsets, a `BytesMap` (bytes → token id), `split_left`/`split_right` arrays for token decomposition, a `PairMap` (pair → merged token), three Aho-Corasick automatons, and a `next_prefix_match` table.

* **`PairMap`**: Open-addressing hash table (linear probing, 50% maximum load) for `(token1, token2) → merged_id` lookups. Uses a splitmix64 finaliser instead of byte-by-byte FNV-1a for the fixed 8-byte key, keeping the merge step cache-friendly; a sketch of the hashing scheme follows this list.

* **`BytesMap`**: Open-addressing hash table for `bytes → token_id` lookups. Uses FNV-1a hashing identical to Rust's `fnv` crate, ensuring consistent hash values across both implementations.

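A small Python model of the `PairMap` scheme may help. The class below is illustrative (the real structure is a flat C array of 8-byte keys), but the splitmix64 finaliser constants and the probing discipline match the description above:

```python
MASK64 = (1 << 64) - 1

def mix(x):
    """splitmix64 finaliser: a multiply-xorshift cascade over the 64-bit key."""
    x = ((x ^ (x >> 30)) * 0xBF58476D1CE4E5B9) & MASK64
    x = ((x ^ (x >> 27)) * 0x94D049BB133111EB) & MASK64
    return x ^ (x >> 31)

class PairMap:
    def __init__(self, capacity=1 << 16):   # power of two, kept <= 50% full
        self.slots = [None] * capacity       # each slot: (key, merged_id)
        self.mask = capacity - 1

    def _probe(self, key):
        i = mix(key) & self.mask
        while self.slots[i] is not None and self.slots[i][0] != key:
            i = (i + 1) & self.mask          # linear probing
        return i

    def insert(self, t1, t2, merged_id):
        key = (t1 << 32) | t2                # two u32 ids packed into 8 bytes
        self.slots[self._probe(key)] = (key, merged_id)

    def lookup(self, t1, t2):
        slot = self.slots[self._probe((t1 << 32) | t2)]
        return slot[1] if slot is not None else None

pm = PairMap()
pm.insert(17, 42, 1000)                      # merge rule: (17, 42) -> 1000
assert pm.lookup(17, 42) == 1000
assert pm.lookup(42, 17) is None
```
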
### Aho-Corasick Automatons

Three Double-Array Aho-Corasick automatons are built over the token vocabulary at initialisation time:

* **`longest_searcher`** (`AC_KIND_LEFTMOST_LONGEST`): leftmost-longest token match at each position — used by the backtracking encoder.
* **`overlapping_searcher`** (`AC_KIND_OVERLAPPING_FWD`): all overlapping forward matches — used by `AppendableEncoder` to maintain per-byte AC state.
* **`overlapping_searcher_rev`** (`AC_KIND_OVERLAPPING_REV`): all overlapping reverse matches — used by `PrependableEncoder`.

The Double-Array layout gives O(1) state transitions per input byte, making the automaton traversal extremely cache-friendly.

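The dict-based Python sketch below shows what the overlapping searcher computes: every vocabulary token ending at each byte, which is exactly the candidate set the prefix-tracking algorithm consumes. The library stores the same automaton as two flat arrays (the double-array `base`/`check` layout) rather than dicts; that layout, not the algorithm, is what makes transitions O(1) per byte.

```python
from collections import deque

def build_ac(patterns):
    """Aho-Corasick with dict transitions (stand-in for the double-array tables)."""
    goto, fail, out = [{}], [0], [[]]
    for pat in patterns:
        s = 0
        for ch in pat:
            if ch not in goto[s]:
                goto.append({}); fail.append(0); out.append([])
                goto[s][ch] = len(goto) - 1
            s = goto[s][ch]
        out[s].append(pat)
    queue = deque(goto[0].values())          # depth-1 states fail to the root
    while queue:
        s = queue.popleft()
        for ch, t in goto[s].items():
            queue.append(t)
            f = fail[s]
            while f and ch not in goto[f]:
                f = fail[f]
            fail[t] = goto[f].get(ch, 0)
            out[t] = out[t] + out[fail[t]]   # inherit shorter suffix matches
    return goto, fail, out

def overlapping_matches(text, goto, fail, out):
    """Yield (start, token) for every token ending at each byte of `text`."""
    s = 0
    for i, ch in enumerate(text):
        while s and ch not in goto[s]:
            s = fail[s]
        s = goto[s].get(ch, 0)
        for pat in out[s]:
            yield (i - len(pat) + 1, pat)

goto, fail, out = build_ac([b"a", b"ab", b"abc", b"bc"])
print(list(overlapping_matches(b"abc", goto, fail, out)))
# [(0, b'a'), (0, b'ab'), (0, b'abc'), (1, b'bc')]
```
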
### Special Purpose Encoders

* **`AppendableEncoder`**: Stores one `AppState` per appended byte (`ac_state`, `last_token`, running count), allowing O(1) amortised count queries via the forward AC automaton; a toy model of this bookkeeping follows this list.
* **`PrependableEncoder`**: Mirror of `AppendableEncoder` using the reverse AC automaton — supports O(1) amortised queries while prepending.
* **`IntervalEncoding`**: Precomputes `last_token`, `tree_id`, `tree_end`, and `tree_depth` arrays per byte position, enabling typically-O(1) `count(start, end)` queries.
* **OpenAI-compatible Tokenizer**: Hand-coded pre-tokenisation with PCRE2 UCD tables (regex splitting identical to tiktoken) feeding into the shared BPE encode/decode logic.

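As a toy model of the `AppendableEncoder` bookkeeping, the class below reuses `encode_ref` and `ranks` from the core-algorithm sketch. It keeps one `(last_token, count)` record per appended byte, so queries never re-encode the buffer; unlike the real encoder, it scans the vocabulary for candidates instead of stepping a stored Aho-Corasick state, so only the storage pattern is faithful:

```python
class AppendableCounter:
    """One record per appended byte, so count() never re-tokenizes the text."""
    def __init__(self, ranks):
        self.ranks = ranks
        self.vocab = {bytes([b]) for b in range(256)} | set(ranks)
        self.data = bytearray()
        self.last = [None]     # last token of each prefix (None for the empty one)
        self.counts = [0]      # token count of each prefix

    def push(self, byte):
        self.data.append(byte)
        i = len(self.data)
        for tok in sorted((t for t in self.vocab
                           if len(t) <= i and self.data[i - len(t):i] == t),
                          key=len, reverse=True):
            prev = self.last[i - len(tok)]
            if prev is None or encode_ref(prev + tok, self.ranks) == [prev, tok]:
                self.last.append(tok)
                self.counts.append(self.counts[i - len(tok)] + 1)
                return

    def count(self):
        return self.counts[-1]   # O(1): just read the newest record

enc = AppendableCounter(ranks)
for b in b"abcbc":
    enc.push(b)
print(enc.count())               # 2, i.e. len(encode_ref(b"abcbc", ranks))
```
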
## Performance

Our benchmarks show significant performance improvements over existing implementations:

> **Note**: All benchmark results shown here were measured through the Python bindings, not the native C API directly. This gives a more realistic picture of the performance users will see in Python applications.

### Single-Text Tokenization

| Text Size | c\_bpe vs tiktoken | rs\_bpe vs tiktoken |
| --------- | ------------------ | ------------------- |
| Small     | 2.9× faster        | 3.0× faster         |
| Medium    | 1.7× faster        | 1.6× faster         |
| Large     | 4.4× faster        | 2.3× faster         |

_Encoding speed (benchmark.py results):_

```
SMALL TEXT:
tiktoken: 0.000102s
c_bpe: 0.000035s
rs_bpe: 0.000034s

MEDIUM TEXT:
tiktoken: 0.001735s
c_bpe: 0.001007s
rs_bpe: 0.001092s

LARGE TEXT:
tiktoken: 0.068093s
c_bpe: 0.015330s
rs_bpe: 0.029147s
```

Both libraries also provide significantly faster decoding and roundtrip operations:

_Decoding speed:_

```
SMALL TEXT:
tiktoken: 0.000027s
c_bpe: 0.000011s
rs_bpe: 0.000018s

MEDIUM TEXT:
tiktoken: 0.000200s
c_bpe: 0.000076s
rs_bpe: 0.000105s

LARGE TEXT:
tiktoken: 0.003799s
c_bpe: 0.001709s
rs_bpe: 0.002504s
```

### Batch Processing Performance

| Batch Size | c\_bpe encode | c\_bpe decode | rs\_bpe encode | rs\_bpe decode |
| ---------- | ------------- | ------------- | -------------- | -------------- |
| 1          | 35× faster    | 165× faster   | 79× faster     | 94× faster     |
| 10         | 32× faster    | 92× faster    | 43× faster     | 100× faster    |
| 100        | 5× faster     | 94× faster    | 17× faster     | 52× faster     |
| 1000       | 22× faster    | 57× faster    | 13× faster     | 31× faster     |

_Encode speedup vs tiktoken (all sizes):_

### Worst-Case Performance

While tiktoken shows quadratic growth for certain adversarial inputs, c_bpe maintains linear scaling even in worst-case scenarios. This is critical for production systems that need consistent performance guarantees; the snippet below shows a quick way to observe the scaling behaviour.

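An informal way to check the scaling yourself (not a rigorous benchmark; `count` is the counting API shown in the usage examples below):

```python
import time
from c_bpe.bpe import openai

tokenizer = openai.cl100k_base()

# A single long "word" keeps the pre-tokenizer from splitting the input,
# a commonly cited stressor for quadratic merge loops.
for size in (1 << 14, 1 << 15, 1 << 16):
    text = "ab" * (size // 2)
    t0 = time.perf_counter()
    tokenizer.count(text)
    print(f"{size:>6} chars: {time.perf_counter() - t0:.4f}s")
    # with linear scaling, each timing should roughly double per row
```
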
### Key Performance Advantages

1. **Memory Efficiency**: Compact data structures (tightly-packed token byte arrays, power-of-2 hash tables at ≤50% load) and no redundant token storage
2. **Cache-Friendly Hash Tables**: `PairMap` uses a splitmix64 finaliser for fixed 8-byte keys; `BytesMap` uses FNV-1a — both with linear probing for sequential memory access
3. **O(1) State Transitions**: Double-Array Aho-Corasick automatons enable single-byte-per-step token matching without backtracking through the vocabulary
4. **Full LTO**: Compiled with full Link-Time Optimisation (MSVC `/GL`+`/LTCG` / GCC `-flto`)
5. **No Correctness Trade-offs**: Verified to produce token-for-token identical output to tiktoken

All benchmarks were run on standard hardware and results may vary based on your specific environment.

## Python Usage Examples

### Basic Tokenization

```python
from c_bpe.bpe import openai

# Load OpenAI tokenizers (automatically cached for reuse)
cl100k_tokenizer = openai.cl100k_base()  # GPT-3.5/4 tokenizer
o200k_tokenizer = openai.o200k_base()    # o200k tokenizer

# Basic encoding
text = "Hello, world! This is an example."
tokens = cl100k_tokenizer.encode(text)
print(f"Encoded tokens: {tokens}")

# Basic decoding
decoded_text = cl100k_tokenizer.decode(tokens)
print(f"Decoded text: {decoded_text}")

# Simple token counting
token_count = cl100k_tokenizer.count(text)
print(f"Token count: {token_count}")
```

### Efficient Token Limiting

One of the key features is the ability to efficiently count tokens up to a limit, which is useful when you need to stay within token constraints:

```python
from c_bpe.bpe import openai

tokenizer = openai.cl100k_base()
max_tokens = 50

# Count tokens until the limit is reached
text = "This is a long text that might exceed our token limit... " * 20
char_position = tokenizer.count_till_limit(text, max_tokens)

if char_position is not None:
    # We reached the limit before the end of the text
    truncated_text = text[:char_position]
    print(f"Truncated to {tokenizer.count(truncated_text)} tokens")
    print(f"Truncated text: {truncated_text}")
else:
    # The entire text is within our token limit
    print(f"Text is within token limit: {tokenizer.count(text)} tokens")
```

### Batch Processing

c_bpe excels at batch processing, which is perfect for large datasets:

```python
from c_bpe.bpe import openai

# Load the tokenizer
tokenizer = openai.cl100k_base()

# Create a batch of texts
texts = [
    "This is the first document to encode.",
    "Here's another one with different content.",
    "A third document with some more text to process.",
    # Add more as needed...
]

# Configure parallel processing options (optional)
parallel_options = openai.ParallelOptions(
    min_batch_size=20,    # Minimum batch size to engage parallel processing
    chunk_size=100,       # Number of texts to process in each thread
    max_threads=0,        # 0 means use optimal thread count (based on CPU cores)
    use_thread_pool=True  # Reuse thread pool for better performance
)

# Encode batch with performance metrics
result = tokenizer.encode_batch(texts, parallel_options)

print(f"Processed {len(texts)} texts in {result.time_taken:.6f}s")
print(f"Total tokens: {result.total_tokens}")
print(f"Throughput: {result.total_tokens / result.time_taken:.1f} tokens/second")

# Access individual token lists
for i, tokens in enumerate(result.tokens):
    print(f"Text {i} has {len(tokens)} tokens")
```

### Text Chunking

c_bpe can be used to efficiently chunk text based on token counts:

```python
from c_bpe.bpe import openai

tokenizer = openai.cl100k_base()

def chunk_text(text, max_chunk_tokens=1024, overlap_tokens=50):
    """Split text into chunks of approximately max_chunk_tokens."""
    chunks = []

    # Get the full text token count
    total_tokens = tokenizer.count(text)

    if total_tokens <= max_chunk_tokens:
        return [text]

    # Keep track of where we are in the text
    start_pos = 0

    while start_pos < len(text):
        # Find where to end this chunk
        char_position = tokenizer.count_till_limit(text[start_pos:], max_chunk_tokens)

        if char_position is None:
            # The rest of the text fits within our limit
            chunks.append(text[start_pos:])
            break

        # Add the chunk
        end_pos = start_pos + char_position
        chunks.append(text[start_pos:end_pos])

        # Move to the next chunk, considering overlap
        if overlap_tokens > 0 and end_pos < len(text):
            # Start the next chunk overlap_tokens before the end of this one
            overlap_char_position = tokenizer.count_till_limit(
                text[start_pos:end_pos], max_chunk_tokens - overlap_tokens
            )
            if overlap_char_position is not None:
                start_pos += overlap_char_position
            else:
                start_pos = end_pos
        else:
            start_pos = end_pos

    return chunks

# Example usage
long_text = "This is a long document that needs to be split into chunks. " * 100
chunks = chunk_text(long_text, max_chunk_tokens=100, overlap_tokens=10)

print(f"Split text into {len(chunks)} chunks:")
for i, chunk in enumerate(chunks):
    token_count = tokenizer.count(chunk)
    print(f"Chunk {i}: {token_count} tokens, {len(chunk)} chars")
```

## Building from Source

### Prerequisites

| Requirement | c_bpe | rs_bpe (companion) |
|-------------|-------|--------------------|
| Python ≥ 3.9 | ✅ | ✅ |
| C11 compiler (GCC, Clang, or MSVC) | ✅ | — |
| Rust toolchain (stable) | — | ✅ |
| setuptools (`pip install setuptools`) | ✅ | — |
| maturin (`pip install maturin`) | — | ✅ |

### c_bpe (C extension)

```bash
# Clone the repository
git clone https://github.com/andrey-savov/c-bpe.git
cd c-bpe

# Create and activate a virtual environment (recommended)
python -m venv .venv
source .venv/bin/activate  # Linux/macOS
# .venv\Scripts\activate   # Windows

# Install in development mode (editable)
pip install -e .

# Or build the extension in-place without installing
python setup.py build_ext --inplace
```

The build auto-detects the compiler and applies platform-appropriate optimisations:
- **GCC/Clang**: `-O3 -march=native -flto -DNDEBUG`
- **MSVC**: `/O2 /Ox /GL /DNDEBUG` with `/LTCG` at link time

PCRE2 Unicode tables are bundled in `third_party/`; no external PCRE2 installation is required.

### rs_bpe (Rust companion — for benchmarking)

The `rust/` directory contains the original Rust implementation ([rs-bpe](https://github.com/gweidart/rs-bpe)) for comparison benchmarking:

```bash
cd rust

# Install maturin if not already present
pip install maturin

# Build and install in development mode
maturin develop --release
```

The Rust toolchain can be installed from [rustup.rs](https://rustup.rs/).

### Installing both side by side

Both packages can be installed in the same environment — they use separate namespaces (`c_bpe` and `rs_bpe`):

```bash
cd c-bpe
pip install -e .  # c_bpe
cd rust && pip install maturin && maturin develop --release  # rs_bpe
```

Verify:

```python
from c_bpe.bpe import openai as c_openai
from rs_bpe.bpe import openai as rs_openai

c_tok = c_openai.cl100k_base()
rs_tok = rs_openai.cl100k_base()

text = "Hello, world!"
assert c_tok.encode(text) == rs_tok.encode(text)
print("Both implementations installed and producing identical output.")
```

### Running tests

```bash
# From the repository root, with both implementations installed:
pip install pytest pytest-benchmark

# Correctness tests
pytest tests/test_basic.py -v

# Benchmarks
pytest tests/test_benchmarks.py --benchmark-only -v
```

## Acknowledgements

This project was developed with the assistance of [Claude Code](https://claude.ai/claude-code), using Claude Opus 4.6 and Claude Sonnet 4.6 models by [Anthropic](https://www.anthropic.com).

## License

[MIT License](LICENSE)