tokenizerbench 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,70 @@
1
+ # This workflow will upload a Python Package to PyPI when a release is created
2
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3
+
4
+ # This workflow uses actions that are not certified by GitHub.
5
+ # They are provided by a third-party and are governed by
6
+ # separate terms of service, privacy policy, and support
7
+ # documentation.
8
+
9
+ name: Upload Python Package
10
+
11
+ on:
12
+ release:
13
+ types: [published]
14
+
15
+ permissions:
16
+ contents: read
17
+
18
+ jobs:
19
+ release-build:
20
+ runs-on: ubuntu-latest
21
+
22
+ steps:
23
+ - uses: actions/checkout@v4
24
+
25
+ - uses: actions/setup-python@v5
26
+ with:
27
+ python-version: "3.x"
28
+
29
+ - name: Build release distributions
30
+ run: |
31
+ # NOTE: put your own distribution build steps here.
32
+ python -m pip install build
33
+ python -m build
34
+
35
+ - name: Upload distributions
36
+ uses: actions/upload-artifact@v4
37
+ with:
38
+ name: release-dists
39
+ path: dist/
40
+
41
+ pypi-publish:
42
+ runs-on: ubuntu-latest
43
+ needs:
44
+ - release-build
45
+ permissions:
46
+ # IMPORTANT: this permission is mandatory for trusted publishing
47
+ id-token: write
48
+
49
+ # Dedicated environments with protections for publishing are strongly recommended.
50
+ # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules
51
+ environment:
52
+ name: pypi
53
+ # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status:
54
+ # url: https://pypi.org/p/YOURPROJECT
55
+ #
56
+ # ALTERNATIVE: if your GitHub Release name is the PyPI project version string
57
+ # ALTERNATIVE: exactly, uncomment the following line instead:
58
+ # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }}
59
+
60
+ steps:
61
+ - name: Retrieve release distributions
62
+ uses: actions/download-artifact@v4
63
+ with:
64
+ name: release-dists
65
+ path: dist/
66
+
67
+ - name: Publish release distributions to PyPI
68
+ uses: pypa/gh-action-pypi-publish@release/v1
69
+ with:
70
+ packages-dir: dist/
@@ -0,0 +1,212 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
208
+
209
+
210
+ # Code files
211
+ rough.py
212
+ test.py
@@ -0,0 +1,254 @@
1
+ # TokenizerBench — Release Notes
2
+
3
+ ## v0.2.0
4
+
5
+ **The "Actually Useful" Release** — this version transforms TokenizerBench from a thin proof-of-concept into a production-ready benchmark you can run before committing to a tokenizer for pre-training or fine-tuning.
6
+
7
+ ---
8
+
9
+ ### What's new at a glance
10
+
11
+ | | v0.1.0 | v0.2.0 |
12
+ |---|---|---|
13
+ | Human languages | ~13 | **84** |
14
+ | Samples per language | 4 | **20** |
15
+ | Total samples | ~200 | **1,967** |
16
+ | Metrics | none | **8 metrics** |
17
+ | CLI runner | none | ✅ |
18
+ | Tests | none | **101 passing** |
19
+ | CI | none | ✅ GitHub Actions |
20
+ | Analysis / plotting | none | ✅ |
21
+
22
+ ---
23
+
24
+ ### Breaking changes
25
+
26
+ **Import paths have changed.** All data modules now export by their semantic name instead of a generic `dataset` variable:
27
+
28
+ ```python
29
+ # v0.1.0 — broken
30
+ from data.human_languages import dataset
31
+
32
+ # v0.2.0 — correct
33
+ from data import human_languages, programming_languages, scientific_formulas, edge_cases
34
+ from data import ALL_DATA # combined dict of all four
35
+ ```
36
+
37
+ This matches the examples in the README and is required for the new evaluation pipeline to work.
38
+
39
+ ---
40
+
41
+ ### Dataset
42
+
43
+ #### Human languages — 84 languages, 1,193 samples
44
+
45
+ The human language dataset has grown from a handful of major languages to **84 languages** covering every major script and language family. Each language now has **20 samples** instead of 4, specifically designed to stress-test tokenizers:
46
+
47
+ - Standard sentences
48
+ - Long sequences (50–100 words)
49
+ - Repetition stress tests (`word ` × 40)
50
+ - Mixed-script / code-switching samples
51
+ - Punctuation and numeral variants
52
+ - Uppercase-only sentences
53
+ - Script alphabet listings
54
+
55
+ **New languages added in this release:**
56
+
57
+ *Indic scripts:* Punjabi, Gujarati, Marathi, Telugu, Kannada, Malayalam, Odia
58
+
59
+ *East Asian:* Chinese (Traditional)
60
+
61
+ *Slavic & Turkic:* Bulgarian, Belarusian, Azerbaijani, Kazakh, Uzbek
62
+
63
+ *African:* Hausa, Yoruba, Igbo, Somali, Xhosa, Shona
64
+
65
+ *Southeast Asian:* Tagalog, Cebuano, Javanese
66
+
67
+ *European minority / constructed:* Catalan, Latvian, Estonian, Slovenian, Luxembourgish, Maltese, Occitan, Scots Gaelic, Esperanto, Latin
68
+
69
+ *Middle Eastern:* Pashto, Sindhi, Uyghur
70
+
71
+ #### Programming languages — 17 subcategories, 245 samples
72
+
73
+ All existing subcategories expanded from 4 to 15–20 samples. Two new subcategories:
74
+
75
+ - **`typescript`** — generics, decorators, mapped types, `satisfies`, conditional types, `declare module`
76
+ - **`python_ml`** — HuggingFace Transformers, PyTorch, `accelerate`, `datasets`, mixed-precision training code
77
+
78
+ #### Scientific formulas — 13 subcategories, 237 samples
79
+
80
+ Three new subcategories:
81
+
82
+ - **`maxwell_equations`** — all four Maxwell equations in differential form, wave equations, Poynting vector, field energy density
83
+ - **`machine_learning`** — cross-entropy loss, gradient descent, backprop, attention, softmax, BatchNorm, Adam, LoRA, REINFORCE, Bellman, actor-critic, Transformer FFN
84
+ - **`latex_notation`** — raw LaTeX strings with backslash commands, fractions, integrals, matrix notation, `\mathcal`, `\mathbf`, `\nabla`, `\text{}`
85
+
86
+ #### Edge cases — NEW, 16 subcategories, 292 samples
87
+
88
+ An entirely new dataset category targeting known tokenizer failure modes:
89
+
90
+ | Subcategory | What it catches |
91
+ |---|---|
92
+ | `homoglyphs` | Cyrillic/Greek/Latin lookalikes silently mixed into text |
93
+ | `zero_width_characters` | ZWS, ZWNJ, ZWJ, BOM, soft hyphens changing token boundaries |
94
+ | `rtl_ltr_mixing` | Arabic/Hebrew + Latin bidirectional text |
95
+ | `diacritics_and_special_latin` | Combining vs precomposed forms (NFC/NFD equivalence) |
96
+ | `whitespace_variants` | All 15 Unicode space types, tabs, various newlines |
97
+ | `long_tokens` | URLs, base64, UUIDs, SHA hashes, long identifiers |
98
+ | `repeated_characters` | Single-char spam, emoji runs, repeated n-grams |
99
+ | `emojis_and_unicode` | Flag sequences, ZWJ family emoji, skin tone modifiers, keycaps |
100
+ | `code_switching` | Natural language sentences with embedded code keywords |
101
+ | `noisy_text` | OCR noise, missing spaces, typos, leetspeak, alternating case |
102
+ | `mixed_scripts_single_token` | Diacritic words that should tokenize as a single unit |
103
+ | `numerical_edge_cases` | Hex, binary, fractions, Roman numerals, Unicode digits |
104
+ | `special_punctuation` | 6 quote styles, 5 dash types, arrows, all currency symbols |
105
+ | `control_characters` | Null bytes, ANSI escape codes, Unicode control chars |
106
+ | `fertility_test` | Morphologically complex words (Turkish, Finnish, Polish, Tamil…) |
107
+ | `segmentation_boundaries` | Contractions, possessives, hyphenation, abbreviations |
108
+
109
+ ---
110
+
111
+ ### Metrics (`metrics.py`)
112
+
113
+ A new module implementing 8 evaluation metrics that work with any tokenizer implementing `.encode(text) → list[int]` and `.decode(ids) → str`:
114
+
115
+ | Function | Description |
116
+ |---|---|
117
+ | `token_count` | Raw token count |
118
+ | `fertility_score` | Tokens per word — the standard multilingual quality metric |
119
+ | `compression_ratio` | Tokens per character |
120
+ | `byte_compression_ratio` | Tokens per UTF-8 byte — language-agnostic |
121
+ | `roundtrip_fidelity` | Whether `decode(encode(text)) == text` |
122
+ | `vocabulary_coverage` | Fraction of IDs within the known vocabulary range |
123
+ | `subword_consistency` | Consistency of morphological root segmentation |
124
+ | `segmentation_stats` | Min/max/mean/std/p90/p95/p99 of token counts |
125
+
126
+ High-level pipelines:
127
+
128
+ - `evaluate_tokenizer(tokenizer, dataset)` — runs all metrics across every category and subcategory, returns nested results dict with a `__summary__` block
129
+ - `compare_tokenizers(tokenizers_dict, dataset)` — evaluates multiple tokenizers in one call
130
+ - `make_leaderboard(comparison)` — returns a sorted ranking list
131
+
132
+ #### Fertility score interpretation
133
+
134
+ ```
135
+ ≈ 1.0 ideal — one token per word
136
+ 1 – 2 good — well-trained BPE on a covered language
137
+ 2 – 4 acceptable — less common scripts
138
+ ≥ 4 poor — language likely under-represented in training data
139
+ ```
140
+
141
+ #### Roundtrip fidelity
142
+
143
+ Any fidelity failure is a **bug** in the tokenizer. The metric returns the exact character position of the first divergence, making it easy to bisect which input triggered the failure. Run the `edge_cases` dataset to catch these before training.
144
+
145
+ ---
146
+
147
+ ### CLI runner (`evaluate.py`)
148
+
149
+ ```bash
150
+ # Single tokenizer
151
+ python evaluate.py --tokenizer tiktoken --model cl100k_base
152
+
153
+ # HuggingFace
154
+ python evaluate.py --tokenizer hf --model xlm-roberta-base
155
+
156
+ # SentencePiece
157
+ python evaluate.py --tokenizer sentencepiece --model spm.model
158
+
159
+ # Subset of categories
160
+ python evaluate.py --tokenizer tiktoken --categories human_languages edge_cases
161
+
162
+ # Save to JSON
163
+ python evaluate.py --tokenizer tiktoken --output results/cl100k.json
164
+
165
+ # Side-by-side comparison + leaderboard
166
+ python evaluate.py \
167
+ --tokenizer tiktoken tiktoken hf \
168
+ --model cl100k_base o200k_base xlm-roberta-base
169
+ ```
170
+
171
+ Supported backends: `tiktoken`, `hf` / `huggingface`, `sentencepiece` / `sp`.
172
+
173
+ ---
174
+
175
+ ### Tests (`tests/`)
176
+
177
+ 101 tests, zero external dependencies required.
178
+
179
+ - **`tests/conftest.py`** — `MockTokenizer` (char-level, pure stdlib), `BrokenDecodeTokenizer` (always fails fidelity), `SpaceTokenizer` (word-level baseline)
180
+ - **`tests/test_metrics.py`** — 56 unit tests, one per metric function and edge case
181
+ - **`tests/test_data.py`** — 45 dataset integrity tests: structure, encoding validity, no surrogates, minimum sample counts, content spot-checks
182
+
183
+ ```
184
+ 101 passed in 0.21s
185
+ ```
186
+
187
+ ---
188
+
189
+ ### CI (`.github/workflows/ci.yml`)
190
+
191
+ Three jobs on every push and pull request to `main`:
192
+
193
+ 1. **`test`** — pytest on Python 3.10, 3.11, 3.12 (matrix)
194
+ 2. **`test-tiktoken`** — integration test with real tiktoken installed
195
+ 3. **`lint`** — ruff with E, W, F rules
196
+
197
+ ---
198
+
199
+ ### Analysis (`analysis/plot.py`)
200
+
201
+ ```bash
202
+ # Single tokenizer — generates heatmap, bar chart, scatter plot
203
+ python analysis/plot.py results/cl100k.json --out figures/
204
+
205
+ # Multi-tokenizer comparison
206
+ python analysis/plot.py results/cl100k.json results/xlm.json --compare
207
+ ```
208
+
209
+ Plots generated:
210
+
211
+ - **Fertility heatmap** — category × subcategory grid, colour-coded by quality
212
+ - **Per-language bar chart** — all 84 languages sorted by fertility, green/orange/red thresholds
213
+ - **Fertility vs compression scatter** — spot outliers by category
214
+ - **Grouped bar comparison** — side-by-side metric comparison across tokenizers
215
+ - **Fidelity failure bar** — which tokenizers have roundtrip bugs
216
+
217
+ Requires: `pip install matplotlib seaborn pandas`
218
+
219
+ ---
220
+
221
+ ### Bug fixes
222
+
223
+ - Fixed `SyntaxError` in `edge_cases.py` caused by unescaped low-high quotation marks (`„“`) inside a double-quoted string literal
224
+ - Fixed `SyntaxWarning` for invalid escape sequence `\/` in leetspeak sample — replaced with a raw string
225
+
226
+ ---
227
+
228
+ ### Upgrade guide
229
+
230
+ ```bash
231
+ pip install -r requirements.txt
232
+ ```
233
+
234
+ Update any imports from:
235
+ ```python
236
+ from data.human_languages import dataset
237
+ ```
238
+ to:
239
+ ```python
240
+ from data import human_languages
241
+ # or
242
+ from data import ALL_DATA
243
+ ```
244
+
245
+ ---
246
+
247
+ ### What's next (v0.3.0 ideas)
248
+
249
+ - Expand to 100 languages (16 remaining)
250
+ - Longer sequences: 2K–10K character samples for context-length stress tests
251
+ - Baseline JSON files for `cl100k_base`, `o200k_base`, `xlm-roberta-base`, `bert-base-multilingual-cased`
252
+ - Moses and NLTK word-tokenize as word-level fertility baselines
253
+ - Plotting notebook (`analysis/explore.ipynb`)
254
+ - PyPI package (`pip install tokenizerbench`)
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 KiteFishAI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.