PyPI - tinybpe - Versions diffs - 0.1.2__tar.gz - Mend

tinybpe 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)

tinybpe-0.1.2/LICENSE +21 -0
tinybpe-0.1.2/MANIFEST.in +14 -0
tinybpe-0.1.2/PKG-INFO +169 -0
tinybpe-0.1.2/README.md +138 -0
tinybpe-0.1.2/pyproject.toml +62 -0
tinybpe-0.1.2/requirements_dev.txt +20 -0
tinybpe-0.1.2/setup.cfg +4 -0
tinybpe-0.1.2/setup.py +33 -0
tinybpe-0.1.2/src/_tree_core.c +302 -0
tinybpe-0.1.2/src/_tree_core.h +77 -0
tinybpe-0.1.2/src/bpe_common.c +97 -0
tinybpe-0.1.2/src/bpe_common.h +113 -0
tinybpe-0.1.2/src/bpe_module.c +671 -0
tinybpe-0.1.2/src/bpe_tokenizer.c +355 -0
tinybpe-0.1.2/src/bpe_tokenizer.h +118 -0
tinybpe-0.1.2/src/bpe_trainer.c +199 -0
tinybpe-0.1.2/src/bpe_trainer.h +88 -0
tinybpe-0.1.2/tests/test_cpy_bpe.py +100 -0
tinybpe-0.1.2/tests/test_edge_cases.py +174 -0
tinybpe-0.1.2/tests/test_fuzz.py +173 -0
tinybpe-0.1.2/tests/test_tinybpe.py +222 -0
tinybpe-0.1.2/tinybpe/__init__.py +34 -0
tinybpe-0.1.2/tinybpe/_model_io.py +167 -0
tinybpe-0.1.2/tinybpe/_tiktoken.py +95 -0
tinybpe-0.1.2/tinybpe/_utils.py +26 -0
tinybpe-0.1.2/tinybpe/_version.py +1 -0
tinybpe-0.1.2/tinybpe/bpe.pyi +41 -0
tinybpe-0.1.2/tinybpe/core.py +350 -0
tinybpe-0.1.2/tinybpe/py.typed +0 -0
tinybpe-0.1.2/tinybpe/simple.py +126 -0
tinybpe-0.1.2/tinybpe.egg-info/PKG-INFO +169 -0
tinybpe-0.1.2/tinybpe.egg-info/SOURCES.txt +33 -0
tinybpe-0.1.2/tinybpe.egg-info/dependency_links.txt +1 -0
tinybpe-0.1.2/tinybpe.egg-info/requires.txt +1 -0
tinybpe-0.1.2/tinybpe.egg-info/top_level.txt +1 -0

tinybpe-0.1.2/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2025-2026 E.T. Romani
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

tinybpe-0.1.2/MANIFEST.in ADDED Viewed

@@ -0,0 +1,14 @@
+# C extension source files
+include src/*.c
+include src/*.h
+# Package metadata
+include LICENSE
+include README.md
+include pyproject.toml
+include setup.py
+include requirements_dev.txt
+# Type information
+include tinybpe/py.typed
+include tinybpe/bpe.pyi

tinybpe-0.1.2/PKG-INFO ADDED Viewed

@@ -0,0 +1,169 @@
+Metadata-Version: 2.4
+Name: tinybpe
+Version: 0.1.2
+Summary: An ultra-fast, lightweight and clean CPython implementation of the Byte Pair Encoding (BPE) algorithm for language model tokenization.
+Author-email: "E.T. Romani" <myneluca@gmail.com>
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/neluca/tinybpe
+Project-URL: Repository, https://github.com/neluca/tinybpe
+Project-URL: Issues, https://github.com/neluca/tinybpe/issues
+Project-URL: Documentation, https://github.com/neluca/tinybpe#readme
+Keywords: BPE,Byte Pair Encoding,LLM,Tokenizer,CPython
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: Operating System :: MacOS
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Operating System :: Microsoft :: Windows
+Classifier: Topic :: Text Processing :: Linguistic
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: regex
+Dynamic: license-file
+[English] | [中文](README_zh.md)
+# 🚀 TinyBPE
+[![build](https://github.com/neluca/tinybpe/workflows/build/badge.svg)](https://github.com/neluca/tinybpe/actions/workflows/python-package.yml)
+[![wheels](https://github.com/neluca/tinybpe/workflows/wheels/badge.svg)](https://github.com/neluca/tinybpe/actions/workflows/wheels.yml)
+[![lint](https://github.com/neluca/tinybpe/workflows/lint/badge.svg)](https://github.com/neluca/tinybpe/actions/workflows/lint.yml)
+[![codecov](https://codecov.io/gh/neluca/tinybpe/branch/main/graph/badge.svg)](https://codecov.io/gh/neluca/tinybpe)
+[![PyPI version](https://img.shields.io/pypi/v/tinybpe)](https://pypi.org/project/tinybpe/)
+[![Python versions](https://img.shields.io/pypi/pyversions/tinybpe)](https://pypi.org/project/tinybpe/)
+[![License](https://img.shields.io/github/license/neluca/tinybpe)](https://github.com/neluca/tinybpe/blob/main/LICENSE)
+**TinyBPE** is an ultra-fast, lightweight, and clean **language model** tokenizer and BPE model trainer implemented as a **CPython** extension.
+## 📦 Installation
+```bash
+pip install tinybpe
+```
+Pre-built wheels are available for Linux (x86_64, aarch64), macOS (x86_64, arm64), and Windows (x86_64), for Python 3.9–3.13.
+## 🌟 Features
+- **C core** — Meticulously designed C implementation using AVL-tree indexing for fast pair lookup.
+- **Clean Python API** — Simple, elegant interface with type hints.
+- **BPE training** — Train from scratch or continue training on imported models.
+- **Byte-level tokenizer** — Fast encode/decode with streaming decode support.
+- **Regex pre-tokenization** — Split text before encoding using regex patterns.
+- **Special tokens** — Support for control tokens like `<|endoftext|>`.
+- **TikToken compatibility** — Convert tiktoken model parameters for use with tinybpe.
+- **Zero core dependencies** — The C extension has zero dependencies; only `regex` is needed for pre-tokenization.
+## ⚡️ Quick Start
+### 1. Basic Tokenization
+```python
+import tiktoken
+from tinybpe import Tokenizer, get_from_tiktoken
+# Convert a tiktoken model
+tik_tokenizer = tiktoken.get_encoding("cl100k_base")
+model_param = get_from_tiktoken(tik_tokenizer._mergeable_ranks)
+tiny_tokenizer = Tokenizer(model_param)
+text = "👋 Hello, this is an example. 你好，这是一个例子。😁"
+tik_ids = tik_tokenizer.encode(text)
+tiny_ids = tiny_tokenizer.encode(text)
+assert tik_ids == tiny_ids  # Identical output
+```
+### 2. Training a BPE Model
+```python
+from tinybpe import SimpleTrainer
+text = open("corpus.txt", "r", encoding="utf-8").read()
+trainer = SimpleTrainer(text)
+vocab_size = 1000
+for _ in range(vocab_size - 256):
+    pair, rank, freq = trainer.step()
+    print(f"{pair} -> {rank} ({freq})")
+print(f"Vocabulary size: {trainer.n_merges + 256}")
+trainer.save("my-model")  # Saves my-model.tinymodel
+```
+### 3. Loading a Model
+```python
+from tinybpe import Tokenizer, load_bpe_model
+model = load_bpe_model("my-model.tinymodel")
+tokenizer = Tokenizer(model)
+ids = tokenizer.encode("hello world")
+print(ids)                      # [259, 32, 261, 263, 264]
+print(tokenizer.decode(ids))    # hello world
+print(tokenizer.n_vocab)        # 1000
+```
+### 4. Streaming Decode
+```python
+def on_text(text: str):
+    print(text, end="")
+decode = tokenizer.stream_decode(on_text)
+for token_id in ids:
+    decode(token_id)  # Prints characters as soon as they're decodable
+```
+### 5. Convert TikToken Models
+```python
+import tiktoken
+from tinybpe import save_from_tiktoken
+enc = tiktoken.get_encoding("cl100k_base")
+save_from_tiktoken("cl100k_base", enc._mergeable_ranks)
+# Creates cl100k_base.tinymodel
+```
+**Note:** In commercial settings, be mindful of copyright when converting third-party tokenizer models. Training your own model is recommended.
+## 🧪 Development
+```bash
+git clone https://github.com/neluca/tinybpe.git
+cd tinybpe
+pip install -r requirements_dev.txt
+pip install -e .
+python -m pytest
+```
+See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed development setup and guidelines.
+## 📊 Benchmarks
+Run benchmarks with:
+```bash
+cd benchmarks
+python bench_encode.py
+python bench_decode.py
+python bench_train.py
+```
+TinyBPE's C implementation typically achieves **10–100x faster** encoding than pure-Python BPE implementations.
+## 🤝 Acknowledgements
+- [minbpe](https://github.com/karpathy/minbpe) — Excellent educational resource on BPE algorithm internals.
+- [tiktoken](https://github.com/openai/tiktoken) — Reference tokenizer models for validation and compatibility.
+## 📄 License
+MIT License. See [LICENSE](LICENSE) for details.

tinybpe-0.1.2/README.md ADDED Viewed

@@ -0,0 +1,138 @@
+[English] | [中文](README_zh.md)
+# 🚀 TinyBPE
+[![build](https://github.com/neluca/tinybpe/workflows/build/badge.svg)](https://github.com/neluca/tinybpe/actions/workflows/python-package.yml)
+[![wheels](https://github.com/neluca/tinybpe/workflows/wheels/badge.svg)](https://github.com/neluca/tinybpe/actions/workflows/wheels.yml)
+[![lint](https://github.com/neluca/tinybpe/workflows/lint/badge.svg)](https://github.com/neluca/tinybpe/actions/workflows/lint.yml)
+[![codecov](https://codecov.io/gh/neluca/tinybpe/branch/main/graph/badge.svg)](https://codecov.io/gh/neluca/tinybpe)
+[![PyPI version](https://img.shields.io/pypi/v/tinybpe)](https://pypi.org/project/tinybpe/)
+[![Python versions](https://img.shields.io/pypi/pyversions/tinybpe)](https://pypi.org/project/tinybpe/)
+[![License](https://img.shields.io/github/license/neluca/tinybpe)](https://github.com/neluca/tinybpe/blob/main/LICENSE)
+**TinyBPE** is an ultra-fast, lightweight, and clean **language model** tokenizer and BPE model trainer implemented as a **CPython** extension.
+## 📦 Installation
+```bash
+pip install tinybpe
+```
+Pre-built wheels are available for Linux (x86_64, aarch64), macOS (x86_64, arm64), and Windows (x86_64), for Python 3.9–3.13.
+## 🌟 Features
+- **C core** — Meticulously designed C implementation using AVL-tree indexing for fast pair lookup.
+- **Clean Python API** — Simple, elegant interface with type hints.
+- **BPE training** — Train from scratch or continue training on imported models.
+- **Byte-level tokenizer** — Fast encode/decode with streaming decode support.
+- **Regex pre-tokenization** — Split text before encoding using regex patterns.
+- **Special tokens** — Support for control tokens like `<|endoftext|>`.
+- **TikToken compatibility** — Convert tiktoken model parameters for use with tinybpe.
+- **Zero core dependencies** — The C extension has zero dependencies; only `regex` is needed for pre-tokenization.
+## ⚡️ Quick Start
+### 1. Basic Tokenization
+```python
+import tiktoken
+from tinybpe import Tokenizer, get_from_tiktoken
+# Convert a tiktoken model
+tik_tokenizer = tiktoken.get_encoding("cl100k_base")
+model_param = get_from_tiktoken(tik_tokenizer._mergeable_ranks)
+tiny_tokenizer = Tokenizer(model_param)
+text = "👋 Hello, this is an example. 你好，这是一个例子。😁"
+tik_ids = tik_tokenizer.encode(text)
+tiny_ids = tiny_tokenizer.encode(text)
+assert tik_ids == tiny_ids  # Identical output
+```
+### 2. Training a BPE Model
+```python
+from tinybpe import SimpleTrainer
+text = open("corpus.txt", "r", encoding="utf-8").read()
+trainer = SimpleTrainer(text)
+vocab_size = 1000
+for _ in range(vocab_size - 256):
+    pair, rank, freq = trainer.step()
+    print(f"{pair} -> {rank} ({freq})")
+print(f"Vocabulary size: {trainer.n_merges + 256}")
+trainer.save("my-model")  # Saves my-model.tinymodel
+```
+### 3. Loading a Model
+```python
+from tinybpe import Tokenizer, load_bpe_model
+model = load_bpe_model("my-model.tinymodel")
+tokenizer = Tokenizer(model)
+ids = tokenizer.encode("hello world")
+print(ids)                      # [259, 32, 261, 263, 264]
+print(tokenizer.decode(ids))    # hello world
+print(tokenizer.n_vocab)        # 1000
+```
+### 4. Streaming Decode
+```python
+def on_text(text: str):
+    print(text, end="")
+decode = tokenizer.stream_decode(on_text)
+for token_id in ids:
+    decode(token_id)  # Prints characters as soon as they're decodable
+```
+### 5. Convert TikToken Models
+```python
+import tiktoken
+from tinybpe import save_from_tiktoken
+enc = tiktoken.get_encoding("cl100k_base")
+save_from_tiktoken("cl100k_base", enc._mergeable_ranks)
+# Creates cl100k_base.tinymodel
+```
+**Note:** In commercial settings, be mindful of copyright when converting third-party tokenizer models. Training your own model is recommended.
+## 🧪 Development
+```bash
+git clone https://github.com/neluca/tinybpe.git
+cd tinybpe
+pip install -r requirements_dev.txt
+pip install -e .
+python -m pytest
+```
+See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed development setup and guidelines.
+## 📊 Benchmarks
+Run benchmarks with:
+```bash
+cd benchmarks
+python bench_encode.py
+python bench_decode.py
+python bench_train.py
+```
+TinyBPE's C implementation typically achieves **10–100x faster** encoding than pure-Python BPE implementations.
+## 🤝 Acknowledgements
+- [minbpe](https://github.com/karpathy/minbpe) — Excellent educational resource on BPE algorithm internals.
+- [tiktoken](https://github.com/openai/tiktoken) — Reference tokenizer models for validation and compatibility.
+## 📄 License
+MIT License. See [LICENSE](LICENSE) for details.

tinybpe-0.1.2/pyproject.toml ADDED Viewed

@@ -0,0 +1,62 @@
+[build-system]
+requires = ["setuptools", "wheel", "regex"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "tinybpe"
+authors = [{ name = "E.T. Romani", email = "myneluca@gmail.com" }]
+description = "An ultra-fast, lightweight and clean CPython implementation of the Byte Pair Encoding (BPE) algorithm for language model tokenization."
+dependencies = ["regex"]
+readme = "README.md"
+requires-python = ">=3.9"
+license = "MIT"
+license-files = ["LICENSE"]
+dynamic = ["version"]
+keywords = ["BPE", "Byte Pair Encoding", "LLM", "Tokenizer", "CPython"]
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "Operating System :: MacOS",
+    "Operating System :: POSIX :: Linux",
+    "Operating System :: Microsoft :: Windows",
+    "Topic :: Text Processing :: Linguistic",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+[project.urls]
+Homepage = "https://github.com/neluca/tinybpe"
+Repository = "https://github.com/neluca/tinybpe"
+Issues = "https://github.com/neluca/tinybpe/issues"
+Documentation = "https://github.com/neluca/tinybpe#readme"
+[tool.setuptools]
+packages = ["tinybpe"]
+[tool.setuptools.dynamic]
+version = { attr = "tinybpe._version.__version__" }
+[tool.ruff]
+line-length = 120
+target-version = "py39"
+[tool.ruff.lint]
+select = ["E", "F", "W", "I", "N", "UP", "B", "C4"]
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+[tool.mypy]
+strict = true
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py"]
+addopts = ["--strict-markers", "-ra"]

tinybpe-0.1.2/requirements_dev.txt ADDED Viewed

@@ -0,0 +1,20 @@
+# Test
+tiktoken>=0.7.0
+pytest>=8.0
+pytest-cov>=5.0
+# Runtime dependency (listed explicitly because the build also requires it)
+regex
+# Build Packages
+build>=1.0
+setuptools>=68.0
+wheel>=0.40
+twine>=5.0
+# Lint & Type Checking
+ruff>=0.5.0
+mypy>=1.10
+# Pre-commit
+pre-commit>=3.5

tinybpe-0.1.2/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

tinybpe-0.1.2/setup.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""Minimal setup.py — only defines the C extension.
+All package metadata lives in pyproject.toml.
+"""
+import sys
+from setuptools import Extension, setup
+ext_modules = [
+    Extension(
+        "tinybpe.bpe",
+        sources=[
+            "src/bpe_module.c",
+            "src/_tree_core.c",
+            "src/bpe_common.c",
+            "src/bpe_trainer.c",
+            "src/bpe_tokenizer.c",
+        ],
+        depends=[
+            "src/_tree_core.h",
+            "src/bpe_common.h",
+            "src/bpe_trainer.h",
+            "src/bpe_tokenizer.h",
+        ],
+        extra_compile_args={
+            "win32": [],
+        }.get(sys.platform, ["-Werror", "-std=c99"]),
+    )
+]
+setup(
+    ext_modules=ext_modules,
+)