tinybpe 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tinybpe-0.1.2/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025-2026 E.T. Romani
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,14 @@
1
+ # C extension source files
2
+ include src/*.c
3
+ include src/*.h
4
+
5
+ # Package metadata
6
+ include LICENSE
7
+ include README.md
8
+ include pyproject.toml
9
+ include setup.py
10
+ include requirements_dev.txt
11
+
12
+ # Type information
13
+ include tinybpe/py.typed
14
+ include tinybpe/bpe.pyi
tinybpe-0.1.2/PKG-INFO ADDED
@@ -0,0 +1,169 @@
1
+ Metadata-Version: 2.4
2
+ Name: tinybpe
3
+ Version: 0.1.2
4
+ Summary: An ultra-fast, lightweight and clean CPython implementation of the Byte Pair Encoding (BPE) algorithm for language model tokenization.
5
+ Author-email: "E.T. Romani" <myneluca@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/neluca/tinybpe
8
+ Project-URL: Repository, https://github.com/neluca/tinybpe
9
+ Project-URL: Issues, https://github.com/neluca/tinybpe/issues
10
+ Project-URL: Documentation, https://github.com/neluca/tinybpe#readme
11
+ Keywords: BPE,Byte Pair Encoding,LLM,Tokenizer,CPython
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Programming Language :: Python :: 3.14
19
+ Classifier: Intended Audience :: Developers
20
+ Classifier: Intended Audience :: Science/Research
21
+ Classifier: Operating System :: MacOS
22
+ Classifier: Operating System :: POSIX :: Linux
23
+ Classifier: Operating System :: Microsoft :: Windows
24
+ Classifier: Topic :: Text Processing :: Linguistic
25
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
26
+ Requires-Python: >=3.9
27
+ Description-Content-Type: text/markdown
28
+ License-File: LICENSE
29
+ Requires-Dist: regex
30
+ Dynamic: license-file
31
+
32
+ [English] | [中文](README_zh.md)
33
+
34
+ # 🚀 TinyBPE
35
+
36
+ [![build](https://github.com/neluca/tinybpe/workflows/build/badge.svg)](https://github.com/neluca/tinybpe/actions/workflows/python-package.yml)
37
+ [![wheels](https://github.com/neluca/tinybpe/workflows/wheels/badge.svg)](https://github.com/neluca/tinybpe/actions/workflows/wheels.yml)
38
+ [![lint](https://github.com/neluca/tinybpe/workflows/lint/badge.svg)](https://github.com/neluca/tinybpe/actions/workflows/lint.yml)
39
+ [![codecov](https://codecov.io/gh/neluca/tinybpe/branch/main/graph/badge.svg)](https://codecov.io/gh/neluca/tinybpe)
40
+ [![PyPI version](https://img.shields.io/pypi/v/tinybpe)](https://pypi.org/project/tinybpe/)
41
+ [![Python versions](https://img.shields.io/pypi/pyversions/tinybpe)](https://pypi.org/project/tinybpe/)
42
+ [![License](https://img.shields.io/github/license/neluca/tinybpe)](https://github.com/neluca/tinybpe/blob/main/LICENSE)
43
+
44
+ **TinyBPE** is an ultra-fast, lightweight, and clean **language model** tokenizer and BPE model trainer implemented as a **CPython** extension.
45
+
46
+ ## 📦 Installation
47
+
48
+ ```bash
49
+ pip install tinybpe
50
+ ```
51
+
52
+ Pre-built wheels are available for Linux (x86_64, aarch64), macOS (x86_64, arm64), and Windows (x86_64), for Python 3.9–3.13.
53
+
54
+ ## 🌟 Features
55
+
56
+ - **C core** — Meticulously designed C implementation using AVL-tree indexing for fast pair lookup.
57
+ - **Clean Python API** — Simple, elegant interface with type hints.
58
+ - **BPE training** — Train from scratch or continue training on imported models.
59
+ - **Byte-level tokenizer** — Fast encode/decode with streaming decode support.
60
+ - **Regex pre-tokenization** — Split text before encoding using regex patterns.
61
+ - **Special tokens** — Support for control tokens like `<|endoftext|>`.
62
+ - **TikToken compatibility** — Convert tiktoken model parameters for use with tinybpe.
63
+ - **Zero core dependencies** — The C extension has zero dependencies; only `regex` is needed for pre-tokenization.
64
+
65
+ ## ⚡️ Quick Start
66
+
67
+ ### 1. Basic Tokenization
68
+
69
+ ```python
70
+ import tiktoken
71
+ from tinybpe import Tokenizer, get_from_tiktoken
72
+
73
+ # Convert a tiktoken model
74
+ tik_tokenizer = tiktoken.get_encoding("cl100k_base")
75
+ model_param = get_from_tiktoken(tik_tokenizer._mergeable_ranks)
76
+ tiny_tokenizer = Tokenizer(model_param)
77
+
78
+ text = "👋 Hello, this is an example. 你好,这是一个例子。😁"
79
+ tik_ids = tik_tokenizer.encode(text)
80
+ tiny_ids = tiny_tokenizer.encode(text)
81
+ assert tik_ids == tiny_ids # Identical output
82
+ ```
83
+
84
+ ### 2. Training a BPE Model
85
+
86
+ ```python
87
+ from tinybpe import SimpleTrainer
88
+
89
+ text = open("corpus.txt", "r", encoding="utf-8").read()
90
+ trainer = SimpleTrainer(text)
91
+ vocab_size = 1000
92
+ for _ in range(vocab_size - 256):
93
+ pair, rank, freq = trainer.step()
94
+ print(f"{pair} -> {rank} ({freq})")
95
+
96
+ print(f"Vocabulary size: {trainer.n_merges + 256}")
97
+ trainer.save("my-model") # Saves my-model.tinymodel
98
+ ```
99
+
100
+ ### 3. Loading a Model
101
+
102
+ ```python
103
+ from tinybpe import Tokenizer, load_bpe_model
104
+
105
+ model = load_bpe_model("my-model.tinymodel")
106
+ tokenizer = Tokenizer(model)
107
+
108
+ ids = tokenizer.encode("hello world")
109
+ print(ids) # [259, 32, 261, 263, 264]
110
+ print(tokenizer.decode(ids)) # hello world
111
+ print(tokenizer.n_vocab) # 1000
112
+ ```
113
+
114
+ ### 4. Streaming Decode
115
+
116
+ ```python
117
+ def on_text(text: str):
118
+ print(text, end="")
119
+
120
+ decode = tokenizer.stream_decode(on_text)
121
+ for token_id in ids:
122
+ decode(token_id) # Prints characters as soon as they're decodable
123
+ ```
124
+
125
+ ### 5. Convert TikToken Models
126
+
127
+ ```python
128
+ import tiktoken
129
+ from tinybpe import save_from_tiktoken
130
+
131
+ enc = tiktoken.get_encoding("cl100k_base")
132
+ save_from_tiktoken("cl100k_base", enc._mergeable_ranks)
133
+ # Creates cl100k_base.tinymodel
134
+ ```
135
+
136
+ **Note:** In commercial settings, be mindful of copyright when converting third-party tokenizer models. Training your own model is recommended.
137
+
138
+ ## 🧪 Development
139
+
140
+ ```bash
141
+ git clone https://github.com/neluca/tinybpe.git
142
+ cd tinybpe
143
+ pip install -r requirements_dev.txt
144
+ pip install -e .
145
+ python -m pytest
146
+ ```
147
+
148
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed development setup and guidelines.
149
+
150
+ ## 📊 Benchmarks
151
+
152
+ Run benchmarks with:
153
+ ```bash
154
+ cd benchmarks
155
+ python bench_encode.py
156
+ python bench_decode.py
157
+ python bench_train.py
158
+ ```
159
+
160
+ TinyBPE's C implementation typically achieves **10–100x faster** encoding than pure-Python BPE implementations.
161
+
162
+ ## 🤝 Acknowledgements
163
+
164
+ - [minbpe](https://github.com/karpathy/minbpe) — Excellent educational resource on BPE algorithm internals.
165
+ - [tiktoken](https://github.com/openai/tiktoken) — Reference tokenizer models for validation and compatibility.
166
+
167
+ ## 📄 License
168
+
169
+ MIT License. See [LICENSE](LICENSE) for details.
@@ -0,0 +1,138 @@
1
+ [English] | [中文](README_zh.md)
2
+
3
+ # 🚀 TinyBPE
4
+
5
+ [![build](https://github.com/neluca/tinybpe/workflows/build/badge.svg)](https://github.com/neluca/tinybpe/actions/workflows/python-package.yml)
6
+ [![wheels](https://github.com/neluca/tinybpe/workflows/wheels/badge.svg)](https://github.com/neluca/tinybpe/actions/workflows/wheels.yml)
7
+ [![lint](https://github.com/neluca/tinybpe/workflows/lint/badge.svg)](https://github.com/neluca/tinybpe/actions/workflows/lint.yml)
8
+ [![codecov](https://codecov.io/gh/neluca/tinybpe/branch/main/graph/badge.svg)](https://codecov.io/gh/neluca/tinybpe)
9
+ [![PyPI version](https://img.shields.io/pypi/v/tinybpe)](https://pypi.org/project/tinybpe/)
10
+ [![Python versions](https://img.shields.io/pypi/pyversions/tinybpe)](https://pypi.org/project/tinybpe/)
11
+ [![License](https://img.shields.io/github/license/neluca/tinybpe)](https://github.com/neluca/tinybpe/blob/main/LICENSE)
12
+
13
+ **TinyBPE** is an ultra-fast, lightweight, and clean **language model** tokenizer and BPE model trainer implemented as a **CPython** extension.
14
+
15
+ ## 📦 Installation
16
+
17
+ ```bash
18
+ pip install tinybpe
19
+ ```
20
+
21
+ Pre-built wheels are available for Linux (x86_64, aarch64), macOS (x86_64, arm64), and Windows (x86_64), for Python 3.9–3.13.
22
+
23
+ ## 🌟 Features
24
+
25
+ - **C core** — Meticulously designed C implementation using AVL-tree indexing for fast pair lookup.
26
+ - **Clean Python API** — Simple, elegant interface with type hints.
27
+ - **BPE training** — Train from scratch or continue training on imported models.
28
+ - **Byte-level tokenizer** — Fast encode/decode with streaming decode support.
29
+ - **Regex pre-tokenization** — Split text before encoding using regex patterns.
30
+ - **Special tokens** — Support for control tokens like `<|endoftext|>`.
31
+ - **TikToken compatibility** — Convert tiktoken model parameters for use with tinybpe.
32
+ - **Zero core dependencies** — The C extension has zero dependencies; only `regex` is needed for pre-tokenization.
33
+
34
+ ## ⚡️ Quick Start
35
+
36
+ ### 1. Basic Tokenization
37
+
38
+ ```python
39
+ import tiktoken
40
+ from tinybpe import Tokenizer, get_from_tiktoken
41
+
42
+ # Convert a tiktoken model
43
+ tik_tokenizer = tiktoken.get_encoding("cl100k_base")
44
+ model_param = get_from_tiktoken(tik_tokenizer._mergeable_ranks)
45
+ tiny_tokenizer = Tokenizer(model_param)
46
+
47
+ text = "👋 Hello, this is an example. 你好,这是一个例子。😁"
48
+ tik_ids = tik_tokenizer.encode(text)
49
+ tiny_ids = tiny_tokenizer.encode(text)
50
+ assert tik_ids == tiny_ids # Identical output
51
+ ```
52
+
53
+ ### 2. Training a BPE Model
54
+
55
+ ```python
56
+ from tinybpe import SimpleTrainer
57
+
58
+ text = open("corpus.txt", "r", encoding="utf-8").read()
59
+ trainer = SimpleTrainer(text)
60
+ vocab_size = 1000
61
+ for _ in range(vocab_size - 256):
62
+ pair, rank, freq = trainer.step()
63
+ print(f"{pair} -> {rank} ({freq})")
64
+
65
+ print(f"Vocabulary size: {trainer.n_merges + 256}")
66
+ trainer.save("my-model") # Saves my-model.tinymodel
67
+ ```
68
+
69
+ ### 3. Loading a Model
70
+
71
+ ```python
72
+ from tinybpe import Tokenizer, load_bpe_model
73
+
74
+ model = load_bpe_model("my-model.tinymodel")
75
+ tokenizer = Tokenizer(model)
76
+
77
+ ids = tokenizer.encode("hello world")
78
+ print(ids) # [259, 32, 261, 263, 264]
79
+ print(tokenizer.decode(ids)) # hello world
80
+ print(tokenizer.n_vocab) # 1000
81
+ ```
82
+
83
+ ### 4. Streaming Decode
84
+
85
+ ```python
86
+ def on_text(text: str):
87
+ print(text, end="")
88
+
89
+ decode = tokenizer.stream_decode(on_text)
90
+ for token_id in ids:
91
+ decode(token_id) # Prints characters as soon as they're decodable
92
+ ```
93
+
94
+ ### 5. Convert TikToken Models
95
+
96
+ ```python
97
+ import tiktoken
98
+ from tinybpe import save_from_tiktoken
99
+
100
+ enc = tiktoken.get_encoding("cl100k_base")
101
+ save_from_tiktoken("cl100k_base", enc._mergeable_ranks)
102
+ # Creates cl100k_base.tinymodel
103
+ ```
104
+
105
+ **Note:** In commercial settings, be mindful of copyright when converting third-party tokenizer models. Training your own model is recommended.
106
+
107
+ ## 🧪 Development
108
+
109
+ ```bash
110
+ git clone https://github.com/neluca/tinybpe.git
111
+ cd tinybpe
112
+ pip install -r requirements_dev.txt
113
+ pip install -e .
114
+ python -m pytest
115
+ ```
116
+
117
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed development setup and guidelines.
118
+
119
+ ## 📊 Benchmarks
120
+
121
+ Run benchmarks with:
122
+ ```bash
123
+ cd benchmarks
124
+ python bench_encode.py
125
+ python bench_decode.py
126
+ python bench_train.py
127
+ ```
128
+
129
+ TinyBPE's C implementation typically achieves **10–100x faster** encoding than pure-Python BPE implementations.
130
+
131
+ ## 🤝 Acknowledgements
132
+
133
+ - [minbpe](https://github.com/karpathy/minbpe) — Excellent educational resource on BPE algorithm internals.
134
+ - [tiktoken](https://github.com/openai/tiktoken) — Reference tokenizer models for validation and compatibility.
135
+
136
+ ## 📄 License
137
+
138
+ MIT License. See [LICENSE](LICENSE) for details.
@@ -0,0 +1,62 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel", "regex"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "tinybpe"
7
+ authors = [{ name = "E.T. Romani", email = "myneluca@gmail.com" }]
8
+ description = "An ultra-fast, lightweight and clean CPython implementation of the Byte Pair Encoding (BPE) algorithm for language model tokenization."
9
+ dependencies = ["regex"]
10
+ readme = "README.md"
11
+ requires-python = ">=3.9"
12
+ license = "MIT"
13
+ license-files = ["LICENSE"]
14
+ dynamic = ["version"]
15
+ keywords = ["BPE", "Byte Pair Encoding", "LLM", "Tokenizer", "CPython"]
16
+ classifiers = [
17
+ "Development Status :: 5 - Production/Stable",
18
+ "Programming Language :: Python :: 3.9",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Programming Language :: Python :: 3.13",
23
+ "Programming Language :: Python :: 3.14",
24
+ "Intended Audience :: Developers",
25
+ "Intended Audience :: Science/Research",
26
+ "Operating System :: MacOS",
27
+ "Operating System :: POSIX :: Linux",
28
+ "Operating System :: Microsoft :: Windows",
29
+ "Topic :: Text Processing :: Linguistic",
30
+ "Topic :: Software Development :: Libraries :: Python Modules",
31
+ ]
32
+
33
+ [project.urls]
34
+ Homepage = "https://github.com/neluca/tinybpe"
35
+ Repository = "https://github.com/neluca/tinybpe"
36
+ Issues = "https://github.com/neluca/tinybpe/issues"
37
+ Documentation = "https://github.com/neluca/tinybpe#readme"
38
+
39
+ [tool.setuptools]
40
+ packages = ["tinybpe"]
41
+
42
+ [tool.setuptools.dynamic]
43
+ version = { attr = "tinybpe._version.__version__" }
44
+
45
+ [tool.ruff]
46
+ line-length = 120
47
+ target-version = "py39"
48
+
49
+ [tool.ruff.lint]
50
+ select = ["E", "F", "W", "I", "N", "UP", "B", "C4"]
51
+
52
+ [tool.ruff.format]
53
+ quote-style = "double"
54
+ indent-style = "space"
55
+
56
+ [tool.mypy]
57
+ strict = true
58
+
59
+ [tool.pytest.ini_options]
60
+ testpaths = ["tests"]
61
+ python_files = ["test_*.py"]
62
+ addopts = ["--strict-markers", "-ra"]
@@ -0,0 +1,20 @@
1
+ # Test
2
+ tiktoken>=0.7.0
3
+ pytest>=8.0
4
+ pytest-cov>=5.0
5
+
6
+ # Runtime (explicit for build requirements)
7
+ regex
8
+
9
+ # Build Packages
10
+ build>=1.0
11
+ setuptools>=68.0
12
+ wheel>=0.40
13
+ twine>=5.0
14
+
15
+ # Lint & Type Checking
16
+ ruff>=0.5.0
17
+ mypy>=1.10
18
+
19
+ # Pre-commit
20
+ pre-commit>=3.5
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
tinybpe-0.1.2/setup.py ADDED
@@ -0,0 +1,33 @@
1
+ """Minimal setup.py — only defines the C extension.
2
+
3
+ All package metadata lives in pyproject.toml.
4
+ """
5
+
6
+ import sys
7
+ from setuptools import Extension, setup
8
+
9
+ ext_modules = [
10
+ Extension(
11
+ "tinybpe.bpe",
12
+ sources=[
13
+ "src/bpe_module.c",
14
+ "src/_tree_core.c",
15
+ "src/bpe_common.c",
16
+ "src/bpe_trainer.c",
17
+ "src/bpe_tokenizer.c",
18
+ ],
19
+ depends=[
20
+ "src/_tree_core.h",
21
+ "src/bpe_common.h",
22
+ "src/bpe_trainer.h",
23
+ "src/bpe_tokenizer.h",
24
+ ],
25
+ extra_compile_args={
26
+ "win32": [],
27
+ }.get(sys.platform, ["-Werror", "-std=c99"]),
28
+ )
29
+ ]
30
+
31
+ setup(
32
+ ext_modules=ext_modules,
33
+ )