c-bpe 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. c_bpe-0.1.0/LICENSE +21 -0
  2. c_bpe-0.1.0/MANIFEST.in +11 -0
  3. c_bpe-0.1.0/PKG-INFO +481 -0
  4. c_bpe-0.1.0/README.md +440 -0
  5. c_bpe-0.1.0/codegen/gen_dict.py +115 -0
  6. c_bpe-0.1.0/codegen/gen_precomputed.py +190 -0
  7. c_bpe-0.1.0/data/cl100k_base.tiktoken.gz +0 -0
  8. c_bpe-0.1.0/data/o200k_base.tiktoken.gz +0 -0
  9. c_bpe-0.1.0/data/precomputed_cl100k.bin.gz +0 -0
  10. c_bpe-0.1.0/data/precomputed_o200k.bin.gz +0 -0
  11. c_bpe-0.1.0/include/ac_bpe.h +146 -0
  12. c_bpe-0.1.0/include/appendable_encoder.h +43 -0
  13. c_bpe-0.1.0/include/bitfield.h +105 -0
  14. c_bpe-0.1.0/include/bpe.h +239 -0
  15. c_bpe-0.1.0/include/c11_threads.h +76 -0
  16. c_bpe-0.1.0/include/fnv_hash.h +186 -0
  17. c_bpe-0.1.0/include/interval_encoding.h +41 -0
  18. c_bpe-0.1.0/include/lru_cache.h +223 -0
  19. c_bpe-0.1.0/include/prependable_encoder.h +44 -0
  20. c_bpe-0.1.0/include/pretok.h +105 -0
  21. c_bpe-0.1.0/include/pretok_ucd.h +128 -0
  22. c_bpe-0.1.0/include/threadpool.h +36 -0
  23. c_bpe-0.1.0/include/tokenizer.h +84 -0
  24. c_bpe-0.1.0/pyproject.toml +101 -0
  25. c_bpe-0.1.0/python/c_bpe/__init__.py +20 -0
  26. c_bpe-0.1.0/python/c_bpe/openai.py +64 -0
  27. c_bpe-0.1.0/python/c_bpe/py.typed +0 -0
  28. c_bpe-0.1.0/python/c_bpe.egg-info/PKG-INFO +481 -0
  29. c_bpe-0.1.0/python/c_bpe.egg-info/SOURCES.txt +60 -0
  30. c_bpe-0.1.0/python/c_bpe.egg-info/dependency_links.txt +1 -0
  31. c_bpe-0.1.0/python/c_bpe.egg-info/requires.txt +14 -0
  32. c_bpe-0.1.0/python/c_bpe.egg-info/top_level.txt +1 -0
  33. c_bpe-0.1.0/setup.cfg +4 -0
  34. c_bpe-0.1.0/setup.py +204 -0
  35. c_bpe-0.1.0/src/ac_bpe.c +439 -0
  36. c_bpe-0.1.0/src/ac_bpe_old.c +493 -0
  37. c_bpe-0.1.0/src/appendable_encoder.c +81 -0
  38. c_bpe-0.1.0/src/bpe_core.c +793 -0
  39. c_bpe-0.1.0/src/dict_cl100k.h +52785 -0
  40. c_bpe-0.1.0/src/dict_o200k.h +112367 -0
  41. c_bpe-0.1.0/src/interval_encoding.c +194 -0
  42. c_bpe-0.1.0/src/parallel.c +70 -0
  43. c_bpe-0.1.0/src/prependable_encoder.c +85 -0
  44. c_bpe-0.1.0/src/pretok_cl100k.c +369 -0
  45. c_bpe-0.1.0/src/pretok_o200k.c +385 -0
  46. c_bpe-0.1.0/src/pymodule.c +647 -0
  47. c_bpe-0.1.0/src/threadpool.c +262 -0
  48. c_bpe-0.1.0/src/tokenizer.c +206 -0
  49. c_bpe-0.1.0/tests/conftest.py +183 -0
  50. c_bpe-0.1.0/tests/test.py +47 -0
  51. c_bpe-0.1.0/tests/test_basic.py +97 -0
  52. c_bpe-0.1.0/tests/test_benchmarks.py +281 -0
  53. c_bpe-0.1.0/tests/test_import.py +55 -0
  54. c_bpe-0.1.0/third_party/pcre2/config.h +22 -0
  55. c_bpe-0.1.0/third_party/pcre2/pcre2.h +1069 -0
  56. c_bpe-0.1.0/third_party/pcre2/src/pcre2_internal.h +2235 -0
  57. c_bpe-0.1.0/third_party/pcre2/src/pcre2_intmodedep.h +973 -0
  58. c_bpe-0.1.0/third_party/pcre2/src/pcre2_tables.c +234 -0
  59. c_bpe-0.1.0/third_party/pcre2/src/pcre2_ucd.c +5804 -0
  60. c_bpe-0.1.0/third_party/pcre2/src/pcre2_ucp.h +408 -0
  61. c_bpe-0.1.0/third_party/pcre2/src/pcre2_ucptables.c +1596 -0
  62. c_bpe-0.1.0/third_party/pcre2/src/pcre2_util.h +132 -0
c_bpe-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 gweidart
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
c_bpe-0.1.0/MANIFEST.in ADDED
@@ -0,0 +1,11 @@
+ include LICENSE
+ include README.md
+ include pyproject.toml
+ include setup.py
+ recursive-include src *.c *.h
+ recursive-include include *.h
+ recursive-include third_party *.c *.h
+ recursive-include data *.gz
+ recursive-include codegen *.py
+ recursive-include python *.py *.pyi py.typed
+ recursive-include tests *.py
c_bpe-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,481 @@
+ Metadata-Version: 2.4
+ Name: c-bpe
+ Version: 0.1.0
+ Summary: High-performance C implementation of BPE tokenizer
+ Author: andrey-savov
+ License: MIT
+ Project-URL: Homepage, https://github.com/andrey-savov/c-bpe
+ Project-URL: Repository, https://github.com/andrey-savov/c-bpe
+ Project-URL: Documentation, https://github.com/andrey-savov/c-bpe#readme
+ Project-URL: Bug Tracker, https://github.com/andrey-savov/c-bpe/issues
+ Keywords: byte-pair-encoding,tokenization,tokenizer,bpe,tiktoken,tiktoken-alternative,tiktoken-compatible,nlp,natural-language-processing,llm,large-language-models,openai,text-processing,high-performance,c,cpython-extension
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: C
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Topic :: Text Processing
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Provides-Extra: dev
+ Requires-Dist: pytest>=8.3.5; extra == "dev"
+ Requires-Dist: pytest-benchmark>=5.2.3; extra == "dev"
+ Requires-Dist: mypy>=1.15.0; extra == "dev"
+ Requires-Dist: ruff>=0.11.0; extra == "dev"
+ Provides-Extra: bench
+ Requires-Dist: numpy>=2.2.4; extra == "bench"
+ Requires-Dist: matplotlib>=3.10.1; extra == "bench"
+ Requires-Dist: tiktoken>=0.9.0; extra == "bench"
+ Provides-Extra: rust
+ Requires-Dist: maturin>=1.8.3; extra == "rust"
+ Dynamic: license-file
+
+ [![Build](https://github.com/andrey-savov/c-bpe/actions/workflows/workflow.yml/badge.svg?branch=main)](https://github.com/andrey-savov/c-bpe/actions/workflows/workflow.yml)
+
+ # c-bpe
+
+ High-performance C implementation of a BPE (Byte Pair Encoding) tokenizer with Python bindings.
+
+ This library provides fast, correct token counting for chunking algorithms. It implements novel BPE tokenization algorithms that are both correct and significantly faster than existing solutions.
+
+ ## Attribution
+
+ This project is based on [rs-bpe](https://github.com/gweidart/rs-bpe) by [gweidart](https://github.com/gweidart), a Rust implementation of BPE tokenization. The C implementation ports the same novel algorithms to pure C for maximum portability and performance. The original Rust implementation is included in the [`rust/`](rust/) directory for comparison benchmarking.
+
+ ## Installation
+
+ ```bash
+ pip install c-bpe
+ ```
+
+ *c_bpe consistently outperforms tiktoken (March 7, 2026)*
+
+ ![c_bpe throughput vs tiktoken](benchmark/tokenizer_benchmark_results_throughput.svg)
+
+ ## Key Features
+
+ * Efficient token counting with linear time complexity, even for adversarial inputs
+ * Splitting text at exact token boundaries while respecting UTF-8 character boundaries
+ * Incremental token counting while appending text to a chunk
+ * Token counts for arbitrary sub-ranges of text in constant time
+ * Python bindings with an OpenAI-compatible interface
+
+ These operations are particularly important for LLM applications but are challenging to implement efficiently for BPE tokenization.
+
+ ## Motivation *(problems this library aims to solve)*
+
+ Existing BPE tokenizers often suffer from performance and correctness issues when used for chunking operations:
+
+ ### Split-at-N-Tokens Problem
+
+ Naively splitting text after N tokens by first encoding the entire text and then selecting a boundary often produces suboptimal results:
+
+ * The split point might not align with a UTF-8 character boundary
+ * Dropping tokens until a character boundary is reached might result in chunks much shorter than desired
+ * The algorithm wastes resources by encoding more text than necessary
+
+ ### Incremental Counting Problem
+
+ Incrementally counting tokens as text is appended is challenging with traditional implementations:
+
+ * Recomputing the encoding after every append leads to quadratic complexity
+ * Approximating counts by aggregating piece counts leads to incorrect results due to BPE's non-monotonic nature
+ * Incorrect counting can cause problems when staying within token limits for LLM APIs
+
+ ### Interval Counting Problem
+
+ Counting tokens for arbitrary subranges traditionally requires reprocessing the entire substring:
+
+ * This leads to poor performance for applications that need to count many subranges
+ * It makes operations like binary search for token boundaries inefficient
+
+ Our library provides novel algorithms that solve these problems with superior performance characteristics.
+
+ ## Implementation
+
+ ### Core Algorithm
+
+ The novel O(n) algorithm preserves the exact output of the original BPE algorithm by tracking encodings of all text prefixes, exploiting mathematical properties of valid BPE encodings.
+
+ Instead of storing full token sequences for each prefix, only the last token of each prefix needs to be remembered. This is possible because:
+
+ 1. There exists exactly one valid encoding sequence for any input text
+ 2. Any substring of a valid encoding sequence is itself a valid encoding sequence
+ 3. Knowing the last token of a valid encoding sequence uniquely determines the full sequence
+
+ The algorithm determines the correct last token for each prefix by checking token compatibility with the preceding token, yielding a linear-time solution, as in the sketch below.
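+
+ A minimal Python sketch of the last-token idea (the shipped implementation is pure C in `src/bpe_core.c`; `is_valid_pair` here is a hypothetical oracle for "token `a` may immediately precede token `b` in a valid encoding", and the inner scan over start positions stands in for the automaton-driven matching the real code uses to reach O(n)):
+
+ ```python
+ def encode_via_last_tokens(text: bytes, vocab: dict, is_valid_pair) -> list:
+     """Sketch: track only the last token of each prefix's valid encoding.
+     Assumes vocab maps bytes -> token id and contains every single byte."""
+     id_to_bytes = {tid: b for b, tid in vocab.items()}
+     max_len = max(len(b) for b in vocab)
+     # last[i] = last token of the unique valid encoding of text[:i]
+     last = [None] * (len(text) + 1)
+     for end in range(1, len(text) + 1):
+         # Try candidate last tokens ending at `end`, longest first.
+         for start in range(max(0, end - max_len), end):
+             tok = vocab.get(text[start:end])
+             if tok is None:
+                 continue
+             prev = last[start]
+             if start == 0 or (prev is not None and is_valid_pair(prev, tok)):
+                 last[end] = tok
+                 break
+     # Property 3: the last token of every prefix determines the whole
+     # sequence, so walk backwards from the end of the text.
+     out, pos = [], len(text)
+     while pos > 0:
+         tok = last[pos]
+         out.append(tok)
+         pos -= len(id_to_bytes[tok])
+     return out[::-1]
+ ```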
+
+ ### Backtracking Optimization
+
+ To improve the average case, a backtracking-based encoder:
+
+ 1. Tries the greedy approach first, using the longest matching token at each step
+ 2. Backtracks when necessary to produce a valid BPE encoding
+ 3. Uses a bitfield to remember dead-end positions, so the worst-case runtime stays linear in the input length (see the sketch below)
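+
+ A hedged Python sketch of this loop (illustrative only; `matches_at` and `is_valid_pair` are hypothetical helpers standing in for the leftmost-longest Aho-Corasick searcher and the pair-compatibility check, and the `failed` array plays the role of the bitfield):
+
+ ```python
+ def encode_backtracking(text: bytes, matches_at, is_valid_pair) -> list:
+     # matches_at(text, pos) yields (token_id, byte_len), longest first.
+     failed = [False] * (len(text) + 1)  # "bitfield": dead-end boundaries
+     stack = []                          # (start_pos, token, byte_len)
+     pos = 0
+     while pos < len(text):
+         chosen = None
+         for tok, tok_len in matches_at(text, pos):
+             if failed[pos + tok_len]:
+                 continue  # this boundary already proved to dead-end
+             if not stack or is_valid_pair(stack[-1][1], tok):
+                 chosen = (pos, tok, tok_len)
+                 break
+         if chosen is None:
+             # Nothing works here: mark the boundary and undo one step.
+             # Each position is marked at most once, which bounds the
+             # total work by the input length.
+             failed[pos] = True
+             pos = stack.pop()[0]
+         else:
+             stack.append(chosen)
+             pos += chosen[2]
+     return [tok for _, tok, _ in stack]
+ ```
+
+ In practice the greedy path succeeds at almost every position, so the backtracking branch is rarely taken.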
+
+ ### Data Structures
+
+ * **`BytePairEncoding` struct**: Stores the concatenated token byte array, per-token start offsets, a `BytesMap` (bytes → token id), `split_left`/`split_right` arrays for token decomposition, a `PairMap` (pair → merged token), three Aho-Corasick automatons, and a `next_prefix_match` table.
+
+ * **`PairMap`**: Open-addressing hash table (linear probing, 50% max load) for `(token1, token2) → merged_id` lookups. Uses a splitmix64 finaliser instead of byte-by-byte FNV-1a for the fixed 8-byte key, keeping the merge step cache-friendly.
+
+ * **`BytesMap`**: Open-addressing hash table for `bytes → token_id` lookups. Uses FNV-1a hashing identical to Rust's `fnv` crate, ensuring consistent hash values across both implementations.
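+
+ For reference, a small Python rendering of the two hash schemes (the splitmix64 and FNV-1a constants are the standard published ones; packing the pair key into a single 64-bit word is an assumption about the C layout):
+
+ ```python
+ MASK64 = (1 << 64) - 1
+
+ def pair_hash(token1: int, token2: int) -> int:
+     """splitmix64 finaliser over the packed 8-byte (token1, token2) key."""
+     z = ((token1 << 32) | token2) & MASK64
+     z = ((z ^ (z >> 30)) * 0xBF58476D1CE4E5B9) & MASK64
+     z = ((z ^ (z >> 27)) * 0x94D049BB133111EB) & MASK64
+     return z ^ (z >> 31)
+
+ def fnv1a_64(data: bytes) -> int:
+     """FNV-1a over a byte string, as in Rust's `fnv` crate (64-bit)."""
+     h = 0xCBF29CE484222325
+     for byte in data:
+         h = ((h ^ byte) * 0x100000001B3) & MASK64
+     return h
+
+ # With power-of-2 table sizes kept at <=50% load, the slot index is just
+ # `hash & (size - 1)`, and linear probing scans adjacent slots on collision.
+ slot = pair_hash(100, 200) & (1024 - 1)
+ ```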
+
+ ### Aho-Corasick Automatons
+
+ Three Double-Array Aho-Corasick automatons are built over the token vocabulary at initialisation time:
+
+ * **`longest_searcher`** (`AC_KIND_LEFTMOST_LONGEST`): leftmost-longest token match at each position — used by the backtracking encoder.
+ * **`overlapping_searcher`** (`AC_KIND_OVERLAPPING_FWD`): all overlapping forward matches — used by `AppendableEncoder` to maintain per-byte AC state.
+ * **`overlapping_searcher_rev`** (`AC_KIND_OVERLAPPING_REV`): all overlapping reverse matches — used by `PrependableEncoder`.
+
+ The Double-Array layout gives O(1) state transitions per input byte, making the automaton traversal extremely cache-friendly.
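+
+ The transition step can be pictured with the classic `base`/`check` double-array scheme (a sketch assuming the textbook layout, not necessarily the exact field names in `src/ac_bpe.c`):
+
+ ```python
+ def ac_step(base, check, fail, state: int, byte: int) -> int:
+     """Advance the automaton by one input byte."""
+     while True:
+         nxt = base[state] + byte
+         if 0 <= nxt < len(check) and check[nxt] == state:
+             return nxt        # goto transition exists: one array probe
+         if state == 0:
+             return 0          # no transition from the root: stay put
+         state = fail[state]   # follow the failure link and retry
+ ```
+
+ Because `base[state] + byte` indexes directly into two flat arrays, each probe is a single cache-friendly lookup; failure links make the traversal amortised O(1) per byte.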
+
+ ### Special Purpose Encoders
+
+ * **`AppendableEncoder`**: Stores one `AppState` per appended byte (`ac_state`, `last_token`, running count), allowing O(1) amortised count queries via the forward AC automaton.
+ * **`PrependableEncoder`**: Mirror of `AppendableEncoder` using the reverse AC automaton — supports O(1) amortised queries while prepending.
+ * **`IntervalEncoding`**: Precomputes `last_token`, `tree_id`, `tree_end`, and `tree_depth` arrays per byte position, enabling typically-O(1) `count(start, end)` queries.
+ * **OpenAI-compatible Tokenizer**: Hand-coded pre-tokenisation with PCRE2 UCD tables (regex splitting identical to tiktoken) feeding into the shared BPE encode/decode logic.
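+
+ The per-byte state idea behind `AppendableEncoder` can be sketched as follows (conceptual Python; `ac_step`, `resolve_last_token`, and `token_len` are hypothetical stand-ins for the forward AC automaton, the last-token compatibility check, and the token byte-length lookup):
+
+ ```python
+ class AppendableSketch:
+     """One (ac_state, last_token, count) entry per appended byte."""
+
+     def __init__(self):
+         self.states = [(0, None, 0)]  # state for the empty prefix
+
+     def push(self, byte: int) -> None:
+         ac_state = ac_step(self.states[-1][0], byte)      # O(1) transition
+         last = resolve_last_token(ac_state, self.states)  # compatible last token
+         n = len(self.states)                              # new prefix length
+         # Count of the new prefix = count of the prefix that ends where
+         # `last` begins, plus one for `last` itself.
+         count = self.states[n - token_len(last)][2] + 1
+         self.states.append((ac_state, last, count))
+
+     def count(self) -> int:
+         return self.states[-1][2]  # O(1) query at any time
+ ```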
+
+ ## Performance
+
+ Our benchmarks show significant performance improvements over existing implementations:
+
+ > **Note**: All benchmark results shown here were achieved using the Python bindings, not the direct native implementation. This provides a more realistic representation of the performance users will experience in Python applications.
+
+ ### Single-Text Tokenization
+
+ | Text Size | c\_bpe vs tiktoken | rs\_bpe vs tiktoken |
+ | --------- | ------------------ | ------------------- |
+ | Small | 2.9× faster | 3.0× faster |
+ | Medium | 1.7× faster | 1.6× faster |
+ | Large | 4.4× faster | 2.3× faster |
+
+ _Encoding speed (benchmark.py results):_
+
+ ![Encoding throughput](assets/20260307_tokenizer_benchmark_results_throughput.svg)
+
+ ```
+ SMALL TEXT:
+ tiktoken: 0.000102s
+ c_bpe: 0.000035s
+ rs_bpe: 0.000034s
+
+ MEDIUM TEXT:
+ tiktoken: 0.001735s
+ c_bpe: 0.001007s
+ rs_bpe: 0.001092s
+
+ LARGE TEXT:
+ tiktoken: 0.068093s
+ c_bpe: 0.015330s
+ rs_bpe: 0.029147s
+ ```
+
+ Both libraries also provide significantly faster decoding and roundtrip operations:
+
+ _Decoding speed:_
+
+ ![Tokenizer timing comparison](assets/20260307_tokenizer_benchmark_results_time.svg)
+
+ ```
+ SMALL TEXT:
+ tiktoken: 0.000027s
+ c_bpe: 0.000011s
+ rs_bpe: 0.000018s
+
+ MEDIUM TEXT:
+ tiktoken: 0.000200s
+ c_bpe: 0.000076s
+ rs_bpe: 0.000105s
+
+ LARGE TEXT:
+ tiktoken: 0.003799s
+ c_bpe: 0.001709s
+ rs_bpe: 0.002504s
+ ```
+
+ ### Batch Processing Performance
+
+ | Batch Size | c\_bpe encode | c\_bpe decode | rs\_bpe encode | rs\_bpe decode |
+ | ---------- | ------------- | ------------- | -------------- | -------------- |
+ | 1 | 35× faster | 165× faster | 79× faster | 94× faster |
+ | 10 | 32× faster | 92× faster | 43× faster | 100× faster |
+ | 100 | 5× faster | 94× faster | 17× faster | 52× faster |
+ | 1000 | 22× faster | 57× faster | 13× faster | 31× faster |
+
+ _Encode speedup vs tiktoken (all sizes):_
+
+ ![Encode speedup vs tiktoken](assets/20260307_tokenizer_benchmark_results_speedup.svg)
+
+ ### Worst-Case Performance
+
+ While tiktoken shows quadratic growth for certain adversarial inputs, c_bpe maintains linear scaling even in worst-case scenarios. This is critical for production systems that need consistent performance guarantees.
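+
+ A quick way to probe this yourself (a hedged sketch; the repeated-character input below is one common adversarial pattern, not necessarily the one used in the project's benchmarks):
+
+ ```python
+ import time
+ from c_bpe.bpe import openai
+
+ tokenizer = openai.cl100k_base()
+
+ for n in (1_000, 10_000, 100_000):
+     text = "a" * n  # a single long piece with no whitespace splits
+     t0 = time.perf_counter()
+     tokenizer.count(text)
+     dt = time.perf_counter() - t0
+     print(f"n={n:>7}: {dt:.6f}s ({n / dt:,.0f} bytes/s)")
+
+ # With linear scaling, bytes/s should stay roughly flat as n grows.
+ ```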
+
+ ### Key Performance Advantages
+
+ 1. **Memory Efficiency**: Compact data structures (tightly-packed token byte arrays, power-of-2 hash tables at ≤50% load) and no redundant token storage
+ 2. **Cache-Friendly Hash Tables**: `PairMap` uses a splitmix64 finaliser for fixed 8-byte keys; `BytesMap` uses FNV-1a — both with linear probing for sequential memory access
+ 3. **O(1) State Transitions**: Double-Array Aho-Corasick automatons enable single-byte-per-step token matching without backtracking through the vocabulary
+ 4. **Full LTO**: Compiled with full Link-Time Optimisation (MSVC `/GL` + `/LTCG`, GCC `-flto`)
+ 5. **No Correctness Trade-offs**: Verified to produce token-for-token identical output to tiktoken
+
+ All benchmarks were run on standard hardware; results may vary with your specific environment.
+
+ ## Python Usage Examples
+
+ ### Basic Tokenization
+
+ ```python
+ from c_bpe.bpe import openai
+
+ # Load OpenAI tokenizers (automatically cached for reuse)
+ cl100k_tokenizer = openai.cl100k_base()  # GPT-3.5/4 tokenizer
+ o200k_tokenizer = openai.o200k_base()    # o200k tokenizer
+
+ # Basic encoding
+ text = "Hello, world! This is an example."
+ tokens = cl100k_tokenizer.encode(text)
+ print(f"Encoded tokens: {tokens}")
+
+ # Basic decoding
+ decoded_text = cl100k_tokenizer.decode(tokens)
+ print(f"Decoded text: {decoded_text}")
+
+ # Simple token counting
+ token_count = cl100k_tokenizer.count(text)
+ print(f"Token count: {token_count}")
+ ```
+
+ ### Efficient Token Limiting
+
+ One of the key features is the ability to efficiently count tokens up to a limit, which is useful when you need to stay within token constraints:
+
+ ```python
+ from c_bpe.bpe import openai
+
+ tokenizer = openai.cl100k_base()
+ max_tokens = 50
+
+ # Count tokens until the limit is reached
+ text = "This is a long text that might exceed our token limit... " * 20
+ char_position = tokenizer.count_till_limit(text, max_tokens)
+
+ if char_position is not None:
+     # We reached the limit before the end of the text
+     truncated_text = text[:char_position]
+     print(f"Truncated to {tokenizer.count(truncated_text)} tokens")
+     print(f"Truncated text: {truncated_text}")
+ else:
+     # The entire text is within our token limit
+     print(f"Text is within token limit: {tokenizer.count(text)} tokens")
+ ```
+
+ ### Batch Processing
+
+ c_bpe excels at batch processing, making it well suited to large datasets:
+
+ ```python
+ from c_bpe.bpe import openai
+ import time
+
+ # Load the tokenizer
+ tokenizer = openai.cl100k_base()
+
+ # Create a batch of texts
+ texts = [
+     "This is the first document to encode.",
+     "Here's another one with different content.",
+     "A third document with some more text to process.",
+     # Add more as needed...
+ ]
+
+ # Configure parallel processing options (optional)
+ parallel_options = openai.ParallelOptions(
+     min_batch_size=20,     # Minimum batch size to engage parallel processing
+     chunk_size=100,        # Number of texts to process in each thread
+     max_threads=0,         # 0 means use optimal thread count (based on CPU cores)
+     use_thread_pool=True,  # Reuse thread pool for better performance
+ )
+
+ # Encode the batch, measuring wall-clock time as well
+ start_time = time.time()
+ result = tokenizer.encode_batch(texts, parallel_options)
+ end_time = time.time()
+
+ print(f"Processed {len(texts)} texts in {result.time_taken:.6f}s "
+       f"(wall clock: {end_time - start_time:.6f}s)")
+ print(f"Total tokens: {result.total_tokens}")
+ print(f"Throughput: {result.total_tokens / result.time_taken:.1f} tokens/second")
+
+ # Access individual token lists
+ for i, tokens in enumerate(result.tokens):
+     print(f"Text {i} has {len(tokens)} tokens")
+ ```
+
+ ### Text Chunking
+
+ c_bpe can efficiently chunk text based on token counts:
+
+ ```python
+ from c_bpe.bpe import openai
+
+ tokenizer = openai.cl100k_base()
+
+ def chunk_text(text, max_chunk_tokens=1024, overlap_tokens=50):
+     """Split text into chunks of approximately max_chunk_tokens."""
+     chunks = []
+
+     # Get the full text token count
+     total_tokens = tokenizer.count(text)
+
+     if total_tokens <= max_chunk_tokens:
+         return [text]
+
+     # Keep track of where we are in the text
+     start_pos = 0
+
+     while start_pos < len(text):
+         # Find where to end this chunk
+         char_position = tokenizer.count_till_limit(text[start_pos:], max_chunk_tokens)
+
+         if char_position is None:
+             # The rest of the text fits within our limit
+             chunks.append(text[start_pos:])
+             break
+
+         # Add the chunk
+         end_pos = start_pos + char_position
+         chunks.append(text[start_pos:end_pos])
+
+         # Move to the next chunk, considering overlap
+         if overlap_tokens > 0 and end_pos < len(text):
+             # Move back by overlap_tokens from the end of this chunk
+             overlap_char_position = tokenizer.count_till_limit(
+                 text[start_pos:end_pos], max_chunk_tokens - overlap_tokens
+             )
+             # Guard against a zero advance, which would loop forever
+             if overlap_char_position is not None and overlap_char_position > 0:
+                 start_pos += overlap_char_position
+             else:
+                 start_pos = end_pos
+         else:
+             start_pos = end_pos
+
+     return chunks
+
+ # Example usage
+ long_text = "This is a long document that needs to be split into chunks. " * 100
+ chunks = chunk_text(long_text, max_chunk_tokens=100, overlap_tokens=10)
+
+ print(f"Split text into {len(chunks)} chunks:")
+ for i, chunk in enumerate(chunks):
+     token_count = tokenizer.count(chunk)
+     print(f"Chunk {i}: {token_count} tokens, {len(chunk)} chars")
+ ```
+
+ ## Building from Source
+
+ ### Prerequisites
+
+ | Requirement | c_bpe | rs_bpe (companion) |
+ |-------------|-------|--------------------|
+ | Python ≥ 3.9 | ✅ | ✅ |
+ | C11 compiler (GCC, Clang, or MSVC) | ✅ | — |
+ | Rust toolchain (stable) | — | ✅ |
+ | setuptools (`pip install setuptools`) | ✅ | — |
+ | maturin (`pip install maturin`) | — | ✅ |
+
+ ### c_bpe (C extension)
+
+ ```bash
+ # Clone the repository
+ git clone https://github.com/andrey-savov/c-bpe.git
+ cd c-bpe
+
+ # Create and activate a virtual environment (recommended)
+ python -m venv .venv
+ source .venv/bin/activate  # Linux/macOS
+ # .venv\Scripts\activate   # Windows
+
+ # Install in development mode (editable)
+ pip install -e .
+
+ # Or build the extension in-place without installing
+ python setup.py build_ext --inplace
+ ```
+
+ The build auto-detects the compiler and applies platform-appropriate optimisations:
+ - **GCC/Clang**: `-O3 -march=native -flto -DNDEBUG`
+ - **MSVC**: `/O2 /Ox /GL /DNDEBUG` with `/LTCG` at link time
+
+ PCRE2 Unicode tables are bundled in `third_party/`; no external PCRE2 installation is required.
+
+ ### rs_bpe (Rust companion — for benchmarking)
+
+ The `rust/` directory contains the original Rust implementation ([rs-bpe](https://github.com/gweidart/rs-bpe)) for comparison benchmarking:
+
+ ```bash
+ cd rust
+
+ # Install maturin if not already present
+ pip install maturin
+
+ # Build and install in development mode
+ maturin develop --release
+ ```
+
+ The Rust toolchain can be installed from [rustup.rs](https://rustup.rs/).
+
+ ### Installing both side by side
+
+ Both packages can be installed in the same environment — they use separate namespaces (`c_bpe` and `rs_bpe`):
+
+ ```bash
+ cd c-bpe
+ pip install -e .                                             # c_bpe
+ cd rust && pip install maturin && maturin develop --release  # rs_bpe
+ ```
+
+ Verify:
+
+ ```python
+ from c_bpe.bpe import openai as c_openai
+ from rs_bpe.bpe import openai as rs_openai
+
+ c_tok = c_openai.cl100k_base()
+ rs_tok = rs_openai.cl100k_base()
+
+ text = "Hello, world!"
+ assert c_tok.encode(text) == rs_tok.encode(text)
+ print("Both implementations installed and producing identical output.")
+ ```
+
+ ### Running tests
+
+ ```bash
+ # From the repository root, with both implementations installed:
+ pip install pytest pytest-benchmark
+
+ # Correctness tests
+ pytest tests/test_basic.py -v
+
+ # Benchmarks
+ pytest tests/test_benchmarks.py --benchmark-only -v
+ ```
+
+ ## Acknowledgements
+
+ This project was developed with the assistance of [Claude Code](https://claude.ai/claude-code), using Claude Opus 4.6 and Claude Sonnet 4.6 models by [Anthropic](https://www.anthropic.com).
+
+ ## License
+
+ [MIT License](LICENSE)