tokenizerbench 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokenizerbench-0.2.0/.github/workflows/python-publish.yml +70 -0
- tokenizerbench-0.2.0/.gitignore +212 -0
- tokenizerbench-0.2.0/CHANGELOG.md +254 -0
- tokenizerbench-0.2.0/LICENSE +21 -0
- tokenizerbench-0.2.0/PKG-INFO +285 -0
- tokenizerbench-0.2.0/README.md +253 -0
- tokenizerbench-0.2.0/pyproject.toml +27 -0
- tokenizerbench-0.2.0/tokenizerbench/__init__.py +23 -0
- tokenizerbench-0.2.0/tokenizerbench/analysis/__init__.py +0 -0
- tokenizerbench-0.2.0/tokenizerbench/analysis/plot.py +306 -0
- tokenizerbench-0.2.0/tokenizerbench/data/__init__.py +18 -0
- tokenizerbench-0.2.0/tokenizerbench/data/edge_cases.py +381 -0
- tokenizerbench-0.2.0/tokenizerbench/data/human_languages.py +1864 -0
- tokenizerbench-0.2.0/tokenizerbench/data/programming_languages.py +421 -0
- tokenizerbench-0.2.0/tokenizerbench/data/scientific_formulas.py +372 -0
- tokenizerbench-0.2.0/tokenizerbench/evaluate.py +323 -0
- tokenizerbench-0.2.0/tokenizerbench/metrics.py +366 -0
- tokenizerbench-0.2.0/tokenizerbench/results/README.md +82 -0
- tokenizerbench-0.2.0/tokenizerbench/spaces/README.md +45 -0
- tokenizerbench-0.2.0/tokenizerbench/spaces/__init__.py +0 -0
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# This workflow will upload a Python Package to PyPI when a release is created
|
|
2
|
+
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
|
|
3
|
+
|
|
4
|
+
# This workflow uses actions that are not certified by GitHub.
|
|
5
|
+
# They are provided by a third-party and are governed by
|
|
6
|
+
# separate terms of service, privacy policy, and support
|
|
7
|
+
# documentation.
|
|
8
|
+
|
|
9
|
+
name: Upload Python Package
|
|
10
|
+
|
|
11
|
+
on:
|
|
12
|
+
release:
|
|
13
|
+
types: [published]
|
|
14
|
+
|
|
15
|
+
permissions:
|
|
16
|
+
contents: read
|
|
17
|
+
|
|
18
|
+
jobs:
|
|
19
|
+
release-build:
|
|
20
|
+
runs-on: ubuntu-latest
|
|
21
|
+
|
|
22
|
+
steps:
|
|
23
|
+
- uses: actions/checkout@v4
|
|
24
|
+
|
|
25
|
+
- uses: actions/setup-python@v5
|
|
26
|
+
with:
|
|
27
|
+
python-version: "3.x"
|
|
28
|
+
|
|
29
|
+
- name: Build release distributions
|
|
30
|
+
run: |
|
|
31
|
+
# NOTE: put your own distribution build steps here.
|
|
32
|
+
python -m pip install build
|
|
33
|
+
python -m build
|
|
34
|
+
|
|
35
|
+
- name: Upload distributions
|
|
36
|
+
uses: actions/upload-artifact@v4
|
|
37
|
+
with:
|
|
38
|
+
name: release-dists
|
|
39
|
+
path: dist/
|
|
40
|
+
|
|
41
|
+
pypi-publish:
|
|
42
|
+
runs-on: ubuntu-latest
|
|
43
|
+
needs:
|
|
44
|
+
- release-build
|
|
45
|
+
permissions:
|
|
46
|
+
# IMPORTANT: this permission is mandatory for trusted publishing
|
|
47
|
+
id-token: write
|
|
48
|
+
|
|
49
|
+
# Dedicated environments with protections for publishing are strongly recommended.
|
|
50
|
+
# For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules
|
|
51
|
+
environment:
|
|
52
|
+
name: pypi
|
|
53
|
+
# OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status:
|
|
54
|
+
# url: https://pypi.org/p/YOURPROJECT
|
|
55
|
+
#
|
|
56
|
+
# ALTERNATIVE: if your GitHub Release name is the PyPI project version string
|
|
57
|
+
# ALTERNATIVE: exactly, uncomment the following line instead:
|
|
58
|
+
# url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }}
|
|
59
|
+
|
|
60
|
+
steps:
|
|
61
|
+
- name: Retrieve release distributions
|
|
62
|
+
uses: actions/download-artifact@v4
|
|
63
|
+
with:
|
|
64
|
+
name: release-dists
|
|
65
|
+
path: dist/
|
|
66
|
+
|
|
67
|
+
- name: Publish release distributions to PyPI
|
|
68
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
69
|
+
with:
|
|
70
|
+
packages-dir: dist/
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
#Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
#uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
#poetry.lock
|
|
109
|
+
#poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
#pdm.lock
|
|
116
|
+
#pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
#pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# SageMath parsed files
|
|
135
|
+
*.sage.py
|
|
136
|
+
|
|
137
|
+
# Environments
|
|
138
|
+
.env
|
|
139
|
+
.envrc
|
|
140
|
+
.venv
|
|
141
|
+
env/
|
|
142
|
+
venv/
|
|
143
|
+
ENV/
|
|
144
|
+
env.bak/
|
|
145
|
+
venv.bak/
|
|
146
|
+
|
|
147
|
+
# Spyder project settings
|
|
148
|
+
.spyderproject
|
|
149
|
+
.spyproject
|
|
150
|
+
|
|
151
|
+
# Rope project settings
|
|
152
|
+
.ropeproject
|
|
153
|
+
|
|
154
|
+
# mkdocs documentation
|
|
155
|
+
/site
|
|
156
|
+
|
|
157
|
+
# mypy
|
|
158
|
+
.mypy_cache/
|
|
159
|
+
.dmypy.json
|
|
160
|
+
dmypy.json
|
|
161
|
+
|
|
162
|
+
# Pyre type checker
|
|
163
|
+
.pyre/
|
|
164
|
+
|
|
165
|
+
# pytype static type analyzer
|
|
166
|
+
.pytype/
|
|
167
|
+
|
|
168
|
+
# Cython debug symbols
|
|
169
|
+
cython_debug/
|
|
170
|
+
|
|
171
|
+
# PyCharm
|
|
172
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
173
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
174
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
175
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
176
|
+
#.idea/
|
|
177
|
+
|
|
178
|
+
# Abstra
|
|
179
|
+
# Abstra is an AI-powered process automation framework.
|
|
180
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
181
|
+
# Learn more at https://abstra.io/docs
|
|
182
|
+
.abstra/
|
|
183
|
+
|
|
184
|
+
# Visual Studio Code
|
|
185
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
186
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
188
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
189
|
+
# .vscode/
|
|
190
|
+
|
|
191
|
+
# Ruff stuff:
|
|
192
|
+
.ruff_cache/
|
|
193
|
+
|
|
194
|
+
# PyPI configuration file
|
|
195
|
+
.pypirc
|
|
196
|
+
|
|
197
|
+
# Cursor
|
|
198
|
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
|
199
|
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
|
200
|
+
# refer to https://docs.cursor.com/context/ignore-files
|
|
201
|
+
.cursorignore
|
|
202
|
+
.cursorindexingignore
|
|
203
|
+
|
|
204
|
+
# Marimo
|
|
205
|
+
marimo/_static/
|
|
206
|
+
marimo/_lsp/
|
|
207
|
+
__marimo__/
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
# Code files
|
|
211
|
+
rough.py
|
|
212
|
+
test.py
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
# TokenizerBench — Release Notes
|
|
2
|
+
|
|
3
|
+
## v0.2.0
|
|
4
|
+
|
|
5
|
+
**The "Actually Useful" Release** — this version transforms TokenizerBench from a thin proof-of-concept into a production-ready benchmark you can run before committing to a tokenizer for pre-training or fine-tuning.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
### What's new at a glance
|
|
10
|
+
|
|
11
|
+
| | v0.1.0 | v0.2.0 |
|
|
12
|
+
|---|---|---|
|
|
13
|
+
| Human languages | ~13 | **84** |
|
|
14
|
+
| Samples per language | 4 | **20** |
|
|
15
|
+
| Total samples | ~200 | **1,967** |
|
|
16
|
+
| Metrics | none | **8 metrics** |
|
|
17
|
+
| CLI runner | none | ✅ |
|
|
18
|
+
| Tests | none | **101 passing** |
|
|
19
|
+
| CI | none | ✅ GitHub Actions |
|
|
20
|
+
| Analysis / plotting | none | ✅ |
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
### Breaking changes
|
|
25
|
+
|
|
26
|
+
**Import paths have changed.** All data modules now export by their semantic name instead of a generic `dataset` variable:
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
# v0.1.0 — broken
|
|
30
|
+
from data.human_languages import dataset
|
|
31
|
+
|
|
32
|
+
# v0.2.0 — correct
|
|
33
|
+
from data import human_languages, programming_languages, scientific_formulas, edge_cases
|
|
34
|
+
from data import ALL_DATA # combined dict of all four
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
This matches the examples in the README and is required for the new evaluation pipeline to work.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
### Dataset
|
|
42
|
+
|
|
43
|
+
#### Human languages — 84 languages, 1,193 samples
|
|
44
|
+
|
|
45
|
+
The human language dataset has grown from a handful of major languages to **84 languages** covering every major script and language family. Each language now has **20 samples** instead of 4, specifically designed to stress-test tokenizers:
|
|
46
|
+
|
|
47
|
+
- Standard sentences
|
|
48
|
+
- Long sequences (50–100 words)
|
|
49
|
+
- Repetition stress tests (`word ` × 40)
|
|
50
|
+
- Mixed-script / code-switching samples
|
|
51
|
+
- Punctuation and numeral variants
|
|
52
|
+
- Uppercase-only sentences
|
|
53
|
+
- Script alphabet listings
|
|
54
|
+
|
|
55
|
+
**New languages added in this release:**
|
|
56
|
+
|
|
57
|
+
*Indic scripts:* Punjabi, Gujarati, Marathi, Telugu, Kannada, Malayalam, Odia
|
|
58
|
+
|
|
59
|
+
*East Asian:* Chinese (Traditional)
|
|
60
|
+
|
|
61
|
+
*Slavic & Turkic:* Bulgarian, Belarusian, Azerbaijani, Kazakh, Uzbek
|
|
62
|
+
|
|
63
|
+
*African:* Hausa, Yoruba, Igbo, Somali, Xhosa, Shona
|
|
64
|
+
|
|
65
|
+
*Southeast Asian:* Tagalog, Cebuano, Javanese
|
|
66
|
+
|
|
67
|
+
*European minority / constructed:* Catalan, Latvian, Estonian, Slovenian, Luxembourgish, Maltese, Occitan, Scots Gaelic, Esperanto, Latin
|
|
68
|
+
|
|
69
|
+
*Middle Eastern:* Pashto, Sindhi, Uyghur
|
|
70
|
+
|
|
71
|
+
#### Programming languages — 17 subcategories, 245 samples
|
|
72
|
+
|
|
73
|
+
All existing subcategories expanded from 4 to 15–20 samples. Two new subcategories:
|
|
74
|
+
|
|
75
|
+
- **`typescript`** — generics, decorators, mapped types, `satisfies`, conditional types, `declare module`
|
|
76
|
+
- **`python_ml`** — HuggingFace Transformers, PyTorch, `accelerate`, `datasets`, mixed-precision training code
|
|
77
|
+
|
|
78
|
+
#### Scientific formulas — 13 subcategories, 237 samples
|
|
79
|
+
|
|
80
|
+
Three new subcategories:
|
|
81
|
+
|
|
82
|
+
- **`maxwell_equations`** — all four Maxwell equations in differential form, wave equations, Poynting vector, field energy density
|
|
83
|
+
- **`machine_learning`** — cross-entropy loss, gradient descent, backprop, attention, softmax, BatchNorm, Adam, LoRA, REINFORCE, Bellman, actor-critic, Transformer FFN
|
|
84
|
+
- **`latex_notation`** — raw LaTeX strings with backslash commands, fractions, integrals, matrix notation, `\mathcal`, `\mathbf`, `\nabla`, `\text{}`
|
|
85
|
+
|
|
86
|
+
#### Edge cases — NEW, 16 subcategories, 292 samples
|
|
87
|
+
|
|
88
|
+
An entirely new dataset category targeting known tokenizer failure modes:
|
|
89
|
+
|
|
90
|
+
| Subcategory | What it catches |
|
|
91
|
+
|---|---|
|
|
92
|
+
| `homoglyphs` | Cyrillic/Greek/Latin lookalikes silently mixed into text |
|
|
93
|
+
| `zero_width_characters` | ZWS, ZWNJ, ZWJ, BOM, soft hyphens changing token boundaries |
|
|
94
|
+
| `rtl_ltr_mixing` | Arabic/Hebrew + Latin bidirectional text |
|
|
95
|
+
| `diacritics_and_special_latin` | Combining vs precomposed forms (NFC/NFD equivalence) |
|
|
96
|
+
| `whitespace_variants` | All 15 Unicode space types, tabs, various newlines |
|
|
97
|
+
| `long_tokens` | URLs, base64, UUIDs, SHA hashes, long identifiers |
|
|
98
|
+
| `repeated_characters` | Single-char spam, emoji runs, repeated n-grams |
|
|
99
|
+
| `emojis_and_unicode` | Flag sequences, ZWJ family emoji, skin tone modifiers, keycaps |
|
|
100
|
+
| `code_switching` | Natural language sentences with embedded code keywords |
|
|
101
|
+
| `noisy_text` | OCR noise, missing spaces, typos, leetspeak, alternating case |
|
|
102
|
+
| `mixed_scripts_single_token` | Diacritic words that should tokenize as a single unit |
|
|
103
|
+
| `numerical_edge_cases` | Hex, binary, fractions, Roman numerals, Unicode digits |
|
|
104
|
+
| `special_punctuation` | 6 quote styles, 5 dash types, arrows, all currency symbols |
|
|
105
|
+
| `control_characters` | Null bytes, ANSI escape codes, Unicode control chars |
|
|
106
|
+
| `fertility_test` | Morphologically complex words (Turkish, Finnish, Polish, Tamil…) |
|
|
107
|
+
| `segmentation_boundaries` | Contractions, possessives, hyphenation, abbreviations |
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
### Metrics (`metrics.py`)
|
|
112
|
+
|
|
113
|
+
A new module implementing 8 evaluation metrics that work with any tokenizer implementing `.encode(text) → list[int]` and `.decode(ids) → str`:
|
|
114
|
+
|
|
115
|
+
| Function | Description |
|
|
116
|
+
|---|---|
|
|
117
|
+
| `token_count` | Raw token count |
|
|
118
|
+
| `fertility_score` | Tokens per word — the standard multilingual quality metric |
|
|
119
|
+
| `compression_ratio` | Tokens per character |
|
|
120
|
+
| `byte_compression_ratio` | Tokens per UTF-8 byte — language-agnostic |
|
|
121
|
+
| `roundtrip_fidelity` | Whether `decode(encode(text)) == text` |
|
|
122
|
+
| `vocabulary_coverage` | Fraction of IDs within the known vocabulary range |
|
|
123
|
+
| `subword_consistency` | Consistency of morphological root segmentation |
|
|
124
|
+
| `segmentation_stats` | Min/max/mean/std/p90/p95/p99 of token counts |
|
|
125
|
+
|
|
126
|
+
High-level pipelines:
|
|
127
|
+
|
|
128
|
+
- `evaluate_tokenizer(tokenizer, dataset)` — runs all metrics across every category and subcategory, returns nested results dict with a `__summary__` block
|
|
129
|
+
- `compare_tokenizers(tokenizers_dict, dataset)` — evaluates multiple tokenizers in one call
|
|
130
|
+
- `make_leaderboard(comparison)` — returns a sorted ranking list
|
|
131
|
+
|
|
132
|
+
#### Fertility score interpretation
|
|
133
|
+
|
|
134
|
+
```
|
|
135
|
+
≈ 1.0 ideal — one token per word
|
|
136
|
+
1 – 2 good — well-trained BPE on a covered language
|
|
137
|
+
2 – 4 acceptable — less common scripts
|
|
138
|
+
≥ 4 poor — language likely under-represented in training data
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
#### Roundtrip fidelity
|
|
142
|
+
|
|
143
|
+
Any fidelity failure is a **bug** in the tokenizer. The metric returns the exact character position of the first divergence, making it easy to bisect which input triggered the failure. Run the `edge_cases` dataset to catch these before training.
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
### CLI runner (`evaluate.py`)
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
# Single tokenizer
|
|
151
|
+
python evaluate.py --tokenizer tiktoken --model cl100k_base
|
|
152
|
+
|
|
153
|
+
# HuggingFace
|
|
154
|
+
python evaluate.py --tokenizer hf --model xlm-roberta-base
|
|
155
|
+
|
|
156
|
+
# SentencePiece
|
|
157
|
+
python evaluate.py --tokenizer sentencepiece --model spm.model
|
|
158
|
+
|
|
159
|
+
# Subset of categories
|
|
160
|
+
python evaluate.py --tokenizer tiktoken --categories human_languages edge_cases
|
|
161
|
+
|
|
162
|
+
# Save to JSON
|
|
163
|
+
python evaluate.py --tokenizer tiktoken --output results/cl100k.json
|
|
164
|
+
|
|
165
|
+
# Side-by-side comparison + leaderboard
|
|
166
|
+
python evaluate.py \
|
|
167
|
+
--tokenizer tiktoken tiktoken hf \
|
|
168
|
+
--model cl100k_base o200k_base xlm-roberta-base
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
Supported backends: `tiktoken`, `hf` / `huggingface`, `sentencepiece` / `sp`.
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
### Tests (`tests/`)
|
|
176
|
+
|
|
177
|
+
101 tests, zero external dependencies required.
|
|
178
|
+
|
|
179
|
+
- **`tests/conftest.py`** — `MockTokenizer` (char-level, pure stdlib), `BrokenDecodeTokenizer` (always fails fidelity), `SpaceTokenizer` (word-level baseline)
|
|
180
|
+
- **`tests/test_metrics.py`** — 56 unit tests, one per metric function and edge case
|
|
181
|
+
- **`tests/test_data.py`** — 45 dataset integrity tests: structure, encoding validity, no surrogates, minimum sample counts, content spot-checks
|
|
182
|
+
|
|
183
|
+
```
|
|
184
|
+
101 passed in 0.21s
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
### CI (`.github/workflows/ci.yml`)
|
|
190
|
+
|
|
191
|
+
Three jobs on every push and pull request to `main`:
|
|
192
|
+
|
|
193
|
+
1. **`test`** — pytest on Python 3.10, 3.11, 3.12 (matrix)
|
|
194
|
+
2. **`test-tiktoken`** — integration test with real tiktoken installed
|
|
195
|
+
3. **`lint`** — ruff with E, W, F rules
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
### Analysis (`analysis/plot.py`)
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
# Single tokenizer — generates heatmap, bar chart, scatter plot
|
|
203
|
+
python analysis/plot.py results/cl100k.json --out figures/
|
|
204
|
+
|
|
205
|
+
# Multi-tokenizer comparison
|
|
206
|
+
python analysis/plot.py results/cl100k.json results/xlm.json --compare
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
Plots generated:
|
|
210
|
+
|
|
211
|
+
- **Fertility heatmap** — category × subcategory grid, colour-coded by quality
|
|
212
|
+
- **Per-language bar chart** — all 84 languages sorted by fertility, green/orange/red thresholds
|
|
213
|
+
- **Fertility vs compression scatter** — spot outliers by category
|
|
214
|
+
- **Grouped bar comparison** — side-by-side metric comparison across tokenizers
|
|
215
|
+
- **Fidelity failure bar** — which tokenizers have roundtrip bugs
|
|
216
|
+
|
|
217
|
+
Requires: `pip install matplotlib seaborn pandas`
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
### Bug fixes
|
|
222
|
+
|
|
223
|
+
- Fixed `SyntaxError` in `edge_cases.py` caused by unescaped low-high quotation marks (`„“`) inside a double-quoted string literal
|
|
224
|
+
- Fixed `SyntaxWarning` for invalid escape sequence `\/` in leetspeak sample — replaced with a raw string
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
228
|
+
### Upgrade guide
|
|
229
|
+
|
|
230
|
+
```bash
|
|
231
|
+
pip install -r requirements.txt
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
Update any imports from:
|
|
235
|
+
```python
|
|
236
|
+
from data.human_languages import dataset
|
|
237
|
+
```
|
|
238
|
+
to:
|
|
239
|
+
```python
|
|
240
|
+
from data import human_languages
|
|
241
|
+
# or
|
|
242
|
+
from data import ALL_DATA
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
---
|
|
246
|
+
|
|
247
|
+
### What's next (v0.3.0 ideas)
|
|
248
|
+
|
|
249
|
+
- Expand to 100 languages (16 remaining)
|
|
250
|
+
- Longer sequences: 2K–10K character samples for context-length stress tests
|
|
251
|
+
- Baseline JSON files for `cl100k_base`, `o200k_base`, `xlm-roberta-base`, `bert-base-multilingual-cased`
|
|
252
|
+
- Moses and NLTK word-tokenize as word-level fertility baselines
|
|
253
|
+
- Plotting notebook (`analysis/explore.ipynb`)
|
|
254
|
+
- PyPI package (`pip install tokenizerbench`)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 KiteFishAI
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|