tokendrift 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokendrift-0.1.0/LICENSE +21 -0
- tokendrift-0.1.0/PKG-INFO +273 -0
- tokendrift-0.1.0/README.md +232 -0
- tokendrift-0.1.0/pyproject.toml +78 -0
- tokendrift-0.1.0/setup.cfg +4 -0
- tokendrift-0.1.0/tests/test_boundary.py +179 -0
- tokendrift-0.1.0/tests/test_cli.py +122 -0
- tokendrift-0.1.0/tests/test_cost.py +59 -0
- tokendrift-0.1.0/tests/test_differ.py +121 -0
- tokendrift-0.1.0/tests/test_loader.py +164 -0
- tokendrift-0.1.0/tests/test_loaders.py +127 -0
- tokendrift-0.1.0/tests/test_terminal.py +174 -0
- tokendrift-0.1.0/tests/test_vocab.py +70 -0
- tokendrift-0.1.0/tokendrift/__init__.py +79 -0
- tokendrift-0.1.0/tokendrift/__main__.py +9 -0
- tokendrift-0.1.0/tokendrift/cli/__init__.py +0 -0
- tokendrift-0.1.0/tokendrift/cli/main.py +315 -0
- tokendrift-0.1.0/tokendrift/core/__init__.py +0 -0
- tokendrift-0.1.0/tokendrift/core/boundary.py +238 -0
- tokendrift-0.1.0/tokendrift/core/differ.py +165 -0
- tokendrift-0.1.0/tokendrift/core/loader.py +300 -0
- tokendrift-0.1.0/tokendrift/core/vocab.py +75 -0
- tokendrift-0.1.0/tokendrift/corpus/__init__.py +0 -0
- tokendrift-0.1.0/tokendrift/corpus/loaders.py +123 -0
- tokendrift-0.1.0/tokendrift/models.py +189 -0
- tokendrift-0.1.0/tokendrift/py.typed +0 -0
- tokendrift-0.1.0/tokendrift/report/__init__.py +0 -0
- tokendrift-0.1.0/tokendrift/report/cost.py +79 -0
- tokendrift-0.1.0/tokendrift/report/terminal.py +355 -0
- tokendrift-0.1.0/tokendrift.egg-info/PKG-INFO +273 -0
- tokendrift-0.1.0/tokendrift.egg-info/SOURCES.txt +33 -0
- tokendrift-0.1.0/tokendrift.egg-info/dependency_links.txt +1 -0
- tokendrift-0.1.0/tokendrift.egg-info/entry_points.txt +2 -0
- tokendrift-0.1.0/tokendrift.egg-info/requires.txt +17 -0
- tokendrift-0.1.0/tokendrift.egg-info/top_level.txt +1 -0
tokendrift-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Armaan Sandhu
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tokendrift
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Token-count, cost, and vocabulary diffing for LLM tokenizer changes
|
|
5
|
+
Author-email: Armaan Sandhu <asandhu05@wpi.edu>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Ar-maan05/tokendrift#readme
|
|
8
|
+
Project-URL: Documentation, https://github.com/Ar-maan05/tokendrift#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/Ar-maan05/tokendrift
|
|
10
|
+
Project-URL: Issues, https://github.com/Ar-maan05/tokendrift/issues
|
|
11
|
+
Project-URL: Changelog, https://github.com/Ar-maan05/tokendrift/blob/main/CHANGELOG.md
|
|
12
|
+
Keywords: llm,tokenizer,diff,regression,nlp
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Environment :: Console
|
|
22
|
+
Classifier: Typing :: Typed
|
|
23
|
+
Requires-Python: >=3.10
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: tiktoken>=0.7.0
|
|
27
|
+
Requires-Dist: tokenizers>=0.19.0
|
|
28
|
+
Requires-Dist: typer>=0.12.0
|
|
29
|
+
Requires-Dist: rich>=13.0.0
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
32
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == "dev"
|
|
33
|
+
Requires-Dist: hypothesis>=6.0.0; extra == "dev"
|
|
34
|
+
Requires-Dist: ruff>=0.8.5; extra == "dev"
|
|
35
|
+
Requires-Dist: pyright>=1.1.400; extra == "dev"
|
|
36
|
+
Provides-Extra: nltk
|
|
37
|
+
Requires-Dist: nltk>=3.8; extra == "nltk"
|
|
38
|
+
Provides-Extra: sentencepiece
|
|
39
|
+
Requires-Dist: sentencepiece>=0.2.0; extra == "sentencepiece"
|
|
40
|
+
Dynamic: license-file
|
|
41
|
+
|
|
42
|
+
# TokenDrift
|
|
43
|
+
|
|
44
|
+
[![CI](https://github.com/Ar-maan05/tokendrift/actions/workflows/ci.yml/badge.svg)](https://github.com/Ar-maan05/tokendrift/actions/workflows/ci.yml)
|
|
45
|
+
[![PyPI version](https://img.shields.io/pypi/v/tokendrift)](https://pypi.org/project/tokendrift/)
|
|
46
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/tokendrift)](https://pypi.org/project/tokendrift/)
|
|
47
|
+
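[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/Ar-maan05/tokendrift/blob/main/LICENSE)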
|
|
48
|
+
|
|
49
|
+
Token-count, cost, and vocabulary diffing for LLM tokenizer changes.
|
|
50
|
+
|
|
51
|
+
When you upgrade a model, switch providers, or move to a self-hosted checkpoint, the tokenizer changes silently. Token counts shift, moving your API cost and context-window pressure. Token IDs are renumbered, breaking any system that stored raw integer IDs (cache keys, classifier heads, adapter embedding rows). None of this produces an error.
|
|
52
|
+
|
|
53
|
+
TokenDrift measures these changes against your own prompt corpus before they hit production.
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
tokendrift diff cl100k_base o200k_base --corpus prompts.jsonl --price-a 0.03 --price-b 0.03
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
```
|
|
60
|
+
──────────── Vocab Diff cl100k_base → o200k_base ────────────
|
|
61
|
+
|
|
62
|
+
Vocab size A 100,277
|
|
63
|
+
Vocab size B 200,019
|
|
64
|
+
Added +11,997 tokens in B, not in A
|
|
65
|
+
Deleted -47 tokens in A, not in B
|
|
66
|
+
Remapped +19 ⚠ stored token IDs now point elsewhere
|
|
67
|
+
|
|
68
|
+
──────────── Encoding Diff cl100k_base → o200k_base ─────────
|
|
69
|
+
|
|
70
|
+
Corpus entries 1,247
|
|
71
|
+
Entries changed 834 / 1,247 (66.9%)
|
|
72
|
+
Total token Δ +2,841 (+3.2%)
|
|
73
|
+
Max Δ (single) +47
|
|
74
|
+
|
|
75
|
+
Top 5 most-affected entries
|
|
76
|
+
─────────────────────────────────────────────────────────────
|
|
77
|
+
p041 +47 "Summarize the biostatistical significance…"
|
|
78
|
+
p117 +31 "Translate the following JSON payload…"
|
|
79
|
+
|
|
80
|
+
──────────── Cost Report cl100k_base → o200k_base ───────────
|
|
81
|
+
|
|
82
|
+
Cost (A) $1.24
|
|
83
|
+
Cost (B) $1.28
|
|
84
|
+
Cost delta +$0.04 (+3.2%)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Installation
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
pip install tokendrift
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
For development:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
git clone https://github.com/Ar-maan05/tokendrift
|
|
97
|
+
cd tokendrift
|
|
98
|
+
pip install -e ".[dev]"
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Quick start
|
|
102
|
+
|
|
103
|
+
**Single text diff:**
|
|
104
|
+
```bash
|
|
105
|
+
tokendrift diff cl100k_base o200k_base \
|
|
106
|
+
--text "ChatGPT rewrites biostatistical significance tests"
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
**Corpus diff:**
|
|
110
|
+
```bash
|
|
111
|
+
tokendrift diff cl100k_base o200k_base --corpus prompts.jsonl
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
**Vocabulary diff only (no corpus needed):**
|
|
115
|
+
```bash
|
|
116
|
+
tokendrift vocab-diff cl100k_base o200k_base --show remapped
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
**Cost impact:**
|
|
120
|
+
```bash
|
|
121
|
+
tokendrift cost cl100k_base o200k_base \
|
|
122
|
+
--corpus prompts.jsonl \
|
|
123
|
+
--price-a 0.03 \
|
|
124
|
+
--price-b 0.01
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
**Inspect how a single text re-segments** (experimental boundary detection):
|
|
128
|
+
```bash
|
|
129
|
+
tokendrift entry cl100k_base o200k_base \
|
|
130
|
+
--text "ChatGPT rewrites biostatistical significance tests"
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Corpus format
|
|
134
|
+
|
|
135
|
+
TokenDrift accepts JSONL (recommended), CSV, or plain text.
|
|
136
|
+
|
|
137
|
+
**JSONL**: one object per line, must have a `text` key:
|
|
138
|
+
```jsonl
|
|
139
|
+
{"id": "p001", "text": "What is the capital of France?"}
|
|
140
|
+
{"id": "p002", "text": "Summarize the following document:"}
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
`id` and `metadata` are optional. Any other keys in the object (besides `text`) are stored as metadata.
|
|
144
|
+
|
|
145
|
+
## What TokenDrift detects
|
|
146
|
+
|
|
147
|
+
### Vocabulary changes
|
|
148
|
+
|
|
149
|
+
- **Added tokens**: present in B, not in A.
|
|
150
|
+
- **Deleted tokens**: present in A, not in B.
|
|
151
|
+
- **Remapped tokens**: same string, different integer ID. This is the change that breaks silently; any system that stored a raw token ID rather than the string now points to a different entry.
|
|
152
|
+
|
|
153
|
+
### Encoding changes (the core of the tool)
|
|
154
|
+
|
|
155
|
+
- **Token count delta** per prompt and across the corpus. Positive = more tokens = higher cost and more context pressure.
|
|
156
|
+
- **Cost delta**: the count delta priced out, per prompt and corpus-wide.
|
|
157
|
+
- **First divergence position**: the character offset where the two encodings first differ.
|
|
158
|
+
|
|
159
|
+
These are exact, fully supported, and the reason to use TokenDrift.
|
|
160
|
+
|
|
161
|
+
### Boundary changes (experimental)
|
|
162
|
+
|
|
163
|
+
Enabled with `--boundaries` on `diff`, or shown by the `entry` command. This is a **structural** report of how individual words are segmented differently, nothing more:
|
|
164
|
+
|
|
165
|
+
| Type | Meaning |
|
|
166
|
+
|------|---------|
|
|
167
|
+
| SPLIT | a word gains tokens (1 → 2+) |
|
|
168
|
+
| MERGE | a word loses tokens (2+ → 1) |
|
|
169
|
+
| RESEGMENT | same token count, but the segmentation boundaries moved |
|
|
170
|
+
|
|
171
|
+
**This is not a quality judgement.** TokenDrift does not claim a boundary change degrades model behaviour: re-segmentation is a normal consequence of a tokenizer change, and any behavioural effect is task-specific and not measured here. The feature is off by default and reports structure only, without severity ranking. (Pure ID renumbering, a word that encodes to the same strings in both tokenizers but with different IDs, is reported at the vocabulary level, not here, where it would flag almost every word.)
|
|
172
|
+
|
|
173
|
+
## Python API
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
from tokendrift.core.loader import TokenizerLoader
|
|
177
|
+
from tokendrift.core.differ import EncodingDiffer
|
|
178
|
+
from tokendrift.core.vocab import VocabDiffer
|
|
179
|
+
from tokendrift.corpus.loaders import load_corpus
|
|
180
|
+
|
|
181
|
+
# Load tokenizers
|
|
182
|
+
tok_a = TokenizerLoader.load("cl100k_base") # tiktoken
|
|
183
|
+
tok_b = TokenizerLoader.load("o200k_base") # tiktoken
|
|
184
|
+
# tok_b = TokenizerLoader.load("Qwen/Qwen3-4B") # HuggingFace Hub
|
|
185
|
+
|
|
186
|
+
# Vocab diff
|
|
187
|
+
from tokendrift.core.vocab import VocabDiffer
|
|
188
|
+
v_diff = VocabDiffer().diff(tok_a, tok_b)
|
|
189
|
+
print(f"Added: {len(v_diff.added)}, Remapped: {len(v_diff.remapped)}")
|
|
190
|
+
|
|
191
|
+
# Single text diff (count/divergence only: the default, fully-supported path)
|
|
192
|
+
differ = EncodingDiffer()
|
|
193
|
+
d = differ.diff("biostatistical significance", tok_a, tok_b)
|
|
194
|
+
print(f"Token delta: {d.count_delta}, first divergence at char {d.first_divergence_pos}")
|
|
195
|
+
|
|
196
|
+
# Opt into experimental structural boundary detection
|
|
197
|
+
boundary_differ = EncodingDiffer(detect_boundaries=True)
|
|
198
|
+
d = boundary_differ.diff("biostatistical significance", tok_a, tok_b)
|
|
199
|
+
for v in d.boundary_violations: # SPLIT / MERGE / RESEGMENT, structural only
|
|
200
|
+
print(f" {v.word}: {v.tokens_a} → {v.tokens_b} ({v.violation_type.value})")
|
|
201
|
+
|
|
202
|
+
# Corpus diff
|
|
203
|
+
entries = load_corpus("prompts.jsonl")
|
|
204
|
+
pairs = [(e.id, e.text) for e in entries]
|
|
205
|
+
diffs = differ.diff_many(pairs, tok_a, tok_b)
|
|
206
|
+
|
|
207
|
+
# Cost report
|
|
208
|
+
from tokendrift.report.cost import CostCalculator
|
|
209
|
+
report = CostCalculator().compute(diffs, price_a=0.03, price_b=0.01)
|
|
210
|
+
print(f"Cost delta: ${report.cost_delta_usd:.4f}")
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
## Supported tokenizers
|
|
214
|
+
|
|
215
|
+
| Source | Example identifier | Notes |
|
|
216
|
+
|--------|-------------------|-------|
|
|
217
|
+
| tiktoken | `cl100k_base`, `o200k_base`, `p50k_base` | All OpenAI encodings |
|
|
218
|
+
| HuggingFace Hub | `Qwen/Qwen3-4B`, `meta-llama/Llama-3.2-1B` | Any model with `tokenizer.json` |
|
|
219
|
+
| Local directory | `/path/to/tokenizer/` | Loaded via HuggingFace `tokenizers` |
|
|
220
|
+
| Local file | `/path/to/tokenizer.json` | Direct file load |
|
|
221
|
+
|
|
222
|
+
## Running tests
|
|
223
|
+
|
|
224
|
+
```bash
|
|
225
|
+
# Offline tests (no network required)
|
|
226
|
+
pytest
|
|
227
|
+
|
|
228
|
+
# Full suite including real tiktoken / HuggingFace tokenizers
|
|
229
|
+
TOKENDRIFT_NETWORK_TESTS=1 pytest
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
## Project structure
|
|
233
|
+
|
|
234
|
+
```
|
|
235
|
+
tokendrift/
|
|
236
|
+
├── core/
|
|
237
|
+
│ ├── loader.py # UnifiedTokenizer + backends (tiktoken, HuggingFace)
|
|
238
|
+
│ ├── vocab.py # VocabDiffer
|
|
239
|
+
│ ├── differ.py # EncodingDiffer
|
|
240
|
+
│ └── boundary.py # BoundaryDetector
|
|
241
|
+
├── corpus/
|
|
242
|
+
│ └── loaders.py # JSONL / CSV / plain-text corpus loading
|
|
243
|
+
├── report/
|
|
244
|
+
│ ├── terminal.py # Rich terminal renderer
|
|
245
|
+
│ └── cost.py # CostCalculator
|
|
246
|
+
└── cli/
|
|
247
|
+
└── main.py # Typer CLI
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
## Roadmap
|
|
251
|
+
|
|
252
|
+
The next milestone is turning the diff into a gate:
|
|
253
|
+
|
|
254
|
+
- [ ] `ci` command: pin a corpus's token counts in a baseline and exit non-zero when a tokenizer change moves them (the feature that makes this CI infrastructure rather than a one-off diagnostic)
|
|
255
|
+
- [ ] `gen-tests` command: generate a pytest regression suite pinning current behaviour
|
|
256
|
+
|
|
257
|
+
Later:
|
|
258
|
+
|
|
259
|
+
- [ ] DuckDB corpus persistence (`corpus/store.py`)
|
|
260
|
+
- [ ] HTML report output
|
|
261
|
+
- [ ] Validate (or drop) the behavioural significance of boundary changes against a task benchmark; promote out of "experimental" only if it holds up
|
|
262
|
+
- [ ] Rust batch encoder for large corpora (100k+ entries)
|
|
263
|
+
- [ ] SentencePiece backend
|
|
264
|
+
|
|
265
|
+
## Contributing
|
|
266
|
+
|
|
267
|
+
Contributions are welcome. See [CONTRIBUTING.md](https://github.com/Ar-maan05/tokendrift/blob/main/CONTRIBUTING.md) for the dev
|
|
268
|
+
setup and the lint/format/type/test checks CI runs. Notable changes are recorded
|
|
269
|
+
in [CHANGELOG.md](https://github.com/Ar-maan05/tokendrift/blob/main/CHANGELOG.md).
|
|
270
|
+
|
|
271
|
+
## License
|
|
272
|
+
|
|
273
|
+
MIT, see [LICENSE](https://github.com/Ar-maan05/tokendrift/blob/main/LICENSE).
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
# TokenDrift
|
|
2
|
+
|
|
3
|
+
[![CI](https://github.com/Ar-maan05/tokendrift/actions/workflows/ci.yml/badge.svg)](https://github.com/Ar-maan05/tokendrift/actions/workflows/ci.yml)
|
|
4
|
+
[![PyPI version](https://img.shields.io/pypi/v/tokendrift)](https://pypi.org/project/tokendrift/)
|
|
5
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/tokendrift)](https://pypi.org/project/tokendrift/)
|
|
6
|
+
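[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/Ar-maan05/tokendrift/blob/main/LICENSE)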
|
|
7
|
+
|
|
8
|
+
Token-count, cost, and vocabulary diffing for LLM tokenizer changes.
|
|
9
|
+
|
|
10
|
+
When you upgrade a model, switch providers, or move to a self-hosted checkpoint, the tokenizer changes silently. Token counts shift, moving your API cost and context-window pressure. Token IDs are renumbered, breaking any system that stored raw integer IDs (cache keys, classifier heads, adapter embedding rows). None of this produces an error.
|
|
11
|
+
|
|
12
|
+
TokenDrift measures these changes against your own prompt corpus before they hit production.
|
|
13
|
+
|
|
14
|
+
```
|
|
15
|
+
tokendrift diff cl100k_base o200k_base --corpus prompts.jsonl --price-a 0.03 --price-b 0.03
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
──────────── Vocab Diff cl100k_base → o200k_base ────────────
|
|
20
|
+
|
|
21
|
+
Vocab size A 100,277
|
|
22
|
+
Vocab size B 200,019
|
|
23
|
+
Added +11,997 tokens in B, not in A
|
|
24
|
+
Deleted -47 tokens in A, not in B
|
|
25
|
+
Remapped +19 ⚠ stored token IDs now point elsewhere
|
|
26
|
+
|
|
27
|
+
──────────── Encoding Diff cl100k_base → o200k_base ─────────
|
|
28
|
+
|
|
29
|
+
Corpus entries 1,247
|
|
30
|
+
Entries changed 834 / 1,247 (66.9%)
|
|
31
|
+
Total token Δ +2,841 (+3.2%)
|
|
32
|
+
Max Δ (single) +47
|
|
33
|
+
|
|
34
|
+
Top 5 most-affected entries
|
|
35
|
+
─────────────────────────────────────────────────────────────
|
|
36
|
+
p041 +47 "Summarize the biostatistical significance…"
|
|
37
|
+
p117 +31 "Translate the following JSON payload…"
|
|
38
|
+
|
|
39
|
+
──────────── Cost Report cl100k_base → o200k_base ───────────
|
|
40
|
+
|
|
41
|
+
Cost (A) $1.24
|
|
42
|
+
Cost (B) $1.28
|
|
43
|
+
Cost delta +$0.04 (+3.2%)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Installation
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install tokendrift
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
For development:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
git clone https://github.com/Ar-maan05/tokendrift
|
|
56
|
+
cd tokendrift
|
|
57
|
+
pip install -e ".[dev]"
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Quick start
|
|
61
|
+
|
|
62
|
+
**Single text diff:**
|
|
63
|
+
```bash
|
|
64
|
+
tokendrift diff cl100k_base o200k_base \
|
|
65
|
+
--text "ChatGPT rewrites biostatistical significance tests"
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
**Corpus diff:**
|
|
69
|
+
```bash
|
|
70
|
+
tokendrift diff cl100k_base o200k_base --corpus prompts.jsonl
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
**Vocabulary diff only (no corpus needed):**
|
|
74
|
+
```bash
|
|
75
|
+
tokendrift vocab-diff cl100k_base o200k_base --show remapped
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
**Cost impact:**
|
|
79
|
+
```bash
|
|
80
|
+
tokendrift cost cl100k_base o200k_base \
|
|
81
|
+
--corpus prompts.jsonl \
|
|
82
|
+
--price-a 0.03 \
|
|
83
|
+
--price-b 0.01
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
**Inspect how a single text re-segments** (experimental boundary detection):
|
|
87
|
+
```bash
|
|
88
|
+
tokendrift entry cl100k_base o200k_base \
|
|
89
|
+
--text "ChatGPT rewrites biostatistical significance tests"
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Corpus format
|
|
93
|
+
|
|
94
|
+
TokenDrift accepts JSONL (recommended), CSV, or plain text.
|
|
95
|
+
|
|
96
|
+
**JSONL**: one object per line, must have a `text` key:
|
|
97
|
+
```jsonl
|
|
98
|
+
{"id": "p001", "text": "What is the capital of France?"}
|
|
99
|
+
{"id": "p002", "text": "Summarize the following document:"}
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
`id` and `metadata` are optional. Any other keys in the object (besides `text`) are stored as metadata.
|
|
103
|
+
|
|
104
|
+
## What TokenDrift detects
|
|
105
|
+
|
|
106
|
+
### Vocabulary changes
|
|
107
|
+
|
|
108
|
+
- **Added tokens**: present in B, not in A.
|
|
109
|
+
- **Deleted tokens**: present in A, not in B.
|
|
110
|
+
- **Remapped tokens**: same string, different integer ID. This is the change that breaks silently; any system that stored a raw token ID rather than the string now points to a different entry.
|
|
111
|
+
|
|
112
|
+
### Encoding changes (the core of the tool)
|
|
113
|
+
|
|
114
|
+
- **Token count delta** per prompt and across the corpus. Positive = more tokens = higher cost and more context pressure.
|
|
115
|
+
- **Cost delta**: the count delta priced out, per prompt and corpus-wide.
|
|
116
|
+
- **First divergence position**: the character offset where the two encodings first differ.
|
|
117
|
+
|
|
118
|
+
These are exact, fully supported, and the reason to use TokenDrift.
|
|
119
|
+
|
|
120
|
+
### Boundary changes (experimental)
|
|
121
|
+
|
|
122
|
+
Enabled with `--boundaries` on `diff`, or shown by the `entry` command. This is a **structural** report of how individual words are segmented differently, nothing more:
|
|
123
|
+
|
|
124
|
+
| Type | Meaning |
|
|
125
|
+
|------|---------|
|
|
126
|
+
| SPLIT | a word gains tokens (1 → 2+) |
|
|
127
|
+
| MERGE | a word loses tokens (2+ → 1) |
|
|
128
|
+
| RESEGMENT | same token count, but the segmentation boundaries moved |
|
|
129
|
+
|
|
130
|
+
**This is not a quality judgement.** TokenDrift does not claim a boundary change degrades model behaviour: re-segmentation is a normal consequence of a tokenizer change, and any behavioural effect is task-specific and not measured here. The feature is off by default and reports structure only, without severity ranking. (Pure ID renumbering, a word that encodes to the same strings in both tokenizers but with different IDs, is reported at the vocabulary level, not here, where it would flag almost every word.)
|
|
131
|
+
|
|
132
|
+
## Python API
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
from tokendrift.core.loader import TokenizerLoader
|
|
136
|
+
from tokendrift.core.differ import EncodingDiffer
|
|
137
|
+
from tokendrift.core.vocab import VocabDiffer
|
|
138
|
+
from tokendrift.corpus.loaders import load_corpus
|
|
139
|
+
|
|
140
|
+
# Load tokenizers
|
|
141
|
+
tok_a = TokenizerLoader.load("cl100k_base") # tiktoken
|
|
142
|
+
tok_b = TokenizerLoader.load("o200k_base") # tiktoken
|
|
143
|
+
# tok_b = TokenizerLoader.load("Qwen/Qwen3-4B") # HuggingFace Hub
|
|
144
|
+
|
|
145
|
+
# Vocab diff
|
|
146
|
+
from tokendrift.core.vocab import VocabDiffer
|
|
147
|
+
v_diff = VocabDiffer().diff(tok_a, tok_b)
|
|
148
|
+
print(f"Added: {len(v_diff.added)}, Remapped: {len(v_diff.remapped)}")
|
|
149
|
+
|
|
150
|
+
# Single text diff (count/divergence only: the default, fully-supported path)
|
|
151
|
+
differ = EncodingDiffer()
|
|
152
|
+
d = differ.diff("biostatistical significance", tok_a, tok_b)
|
|
153
|
+
print(f"Token delta: {d.count_delta}, first divergence at char {d.first_divergence_pos}")
|
|
154
|
+
|
|
155
|
+
# Opt into experimental structural boundary detection
|
|
156
|
+
boundary_differ = EncodingDiffer(detect_boundaries=True)
|
|
157
|
+
d = boundary_differ.diff("biostatistical significance", tok_a, tok_b)
|
|
158
|
+
for v in d.boundary_violations: # SPLIT / MERGE / RESEGMENT, structural only
|
|
159
|
+
print(f" {v.word}: {v.tokens_a} → {v.tokens_b} ({v.violation_type.value})")
|
|
160
|
+
|
|
161
|
+
# Corpus diff
|
|
162
|
+
entries = load_corpus("prompts.jsonl")
|
|
163
|
+
pairs = [(e.id, e.text) for e in entries]
|
|
164
|
+
diffs = differ.diff_many(pairs, tok_a, tok_b)
|
|
165
|
+
|
|
166
|
+
# Cost report
|
|
167
|
+
from tokendrift.report.cost import CostCalculator
|
|
168
|
+
report = CostCalculator().compute(diffs, price_a=0.03, price_b=0.01)
|
|
169
|
+
print(f"Cost delta: ${report.cost_delta_usd:.4f}")
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## Supported tokenizers
|
|
173
|
+
|
|
174
|
+
| Source | Example identifier | Notes |
|
|
175
|
+
|--------|-------------------|-------|
|
|
176
|
+
| tiktoken | `cl100k_base`, `o200k_base`, `p50k_base` | All OpenAI encodings |
|
|
177
|
+
| HuggingFace Hub | `Qwen/Qwen3-4B`, `meta-llama/Llama-3.2-1B` | Any model with `tokenizer.json` |
|
|
178
|
+
| Local directory | `/path/to/tokenizer/` | Loaded via HuggingFace `tokenizers` |
|
|
179
|
+
| Local file | `/path/to/tokenizer.json` | Direct file load |
|
|
180
|
+
|
|
181
|
+
## Running tests
|
|
182
|
+
|
|
183
|
+
```bash
|
|
184
|
+
# Offline tests (no network required)
|
|
185
|
+
pytest
|
|
186
|
+
|
|
187
|
+
# Full suite including real tiktoken / HuggingFace tokenizers
|
|
188
|
+
TOKENDRIFT_NETWORK_TESTS=1 pytest
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## Project structure
|
|
192
|
+
|
|
193
|
+
```
|
|
194
|
+
tokendrift/
|
|
195
|
+
├── core/
|
|
196
|
+
│ ├── loader.py # UnifiedTokenizer + backends (tiktoken, HuggingFace)
|
|
197
|
+
│ ├── vocab.py # VocabDiffer
|
|
198
|
+
│ ├── differ.py # EncodingDiffer
|
|
199
|
+
│ └── boundary.py # BoundaryDetector
|
|
200
|
+
├── corpus/
|
|
201
|
+
│ └── loaders.py # JSONL / CSV / plain-text corpus loading
|
|
202
|
+
├── report/
|
|
203
|
+
│ ├── terminal.py # Rich terminal renderer
|
|
204
|
+
│ └── cost.py # CostCalculator
|
|
205
|
+
└── cli/
|
|
206
|
+
└── main.py # Typer CLI
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
## Roadmap
|
|
210
|
+
|
|
211
|
+
The next milestone is turning the diff into a gate:
|
|
212
|
+
|
|
213
|
+
- [ ] `ci` command: pin a corpus's token counts in a baseline and exit non-zero when a tokenizer change moves them (the feature that makes this CI infrastructure rather than a one-off diagnostic)
|
|
214
|
+
- [ ] `gen-tests` command: generate a pytest regression suite pinning current behaviour
|
|
215
|
+
|
|
216
|
+
Later:
|
|
217
|
+
|
|
218
|
+
- [ ] DuckDB corpus persistence (`corpus/store.py`)
|
|
219
|
+
- [ ] HTML report output
|
|
220
|
+
- [ ] Validate (or drop) the behavioural significance of boundary changes against a task benchmark; promote out of "experimental" only if it holds up
|
|
221
|
+
- [ ] Rust batch encoder for large corpora (100k+ entries)
|
|
222
|
+
- [ ] SentencePiece backend
|
|
223
|
+
|
|
224
|
+
## Contributing
|
|
225
|
+
|
|
226
|
+
Contributions are welcome. See [CONTRIBUTING.md](https://github.com/Ar-maan05/tokendrift/blob/main/CONTRIBUTING.md) for the dev
|
|
227
|
+
setup and the lint/format/type/test checks CI runs. Notable changes are recorded
|
|
228
|
+
in [CHANGELOG.md](https://github.com/Ar-maan05/tokendrift/blob/main/CHANGELOG.md).
|
|
229
|
+
|
|
230
|
+
## License
|
|
231
|
+
|
|
232
|
+
MIT, see [LICENSE](https://github.com/Ar-maan05/tokendrift/blob/main/LICENSE).
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=77.0.3", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "tokendrift"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Token-count, cost, and vocabulary diffing for LLM tokenizer changes"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [{ name = "Armaan Sandhu", email = "asandhu05@wpi.edu" }]
|
|
13
|
+
keywords = ["llm", "tokenizer", "diff", "regression", "nlp"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.10",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Programming Language :: Python :: 3.13",
|
|
22
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
23
|
+
"Environment :: Console",
|
|
24
|
+
"Typing :: Typed",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"tiktoken>=0.7.0",
|
|
28
|
+
"tokenizers>=0.19.0",
|
|
29
|
+
"typer>=0.12.0",
|
|
30
|
+
"rich>=13.0.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.optional-dependencies]
|
|
34
|
+
dev = [
|
|
35
|
+
"pytest>=8.0.0",
|
|
36
|
+
"pytest-cov>=5.0.0",
|
|
37
|
+
"hypothesis>=6.0.0",
|
|
38
|
+
"ruff>=0.8.5",
|
|
39
|
+
"pyright>=1.1.400",
|
|
40
|
+
]
|
|
41
|
+
nltk = ["nltk>=3.8"]
|
|
42
|
+
sentencepiece = ["sentencepiece>=0.2.0"]
|
|
43
|
+
|
|
44
|
+
[project.scripts]
|
|
45
|
+
tokendrift = "tokendrift.cli.main:app"
|
|
46
|
+
|
|
47
|
+
[project.urls]
|
|
48
|
+
Homepage = "https://github.com/Ar-maan05/tokendrift#readme"
|
|
49
|
+
Documentation = "https://github.com/Ar-maan05/tokendrift#readme"
|
|
50
|
+
Repository = "https://github.com/Ar-maan05/tokendrift"
|
|
51
|
+
Issues = "https://github.com/Ar-maan05/tokendrift/issues"
|
|
52
|
+
Changelog = "https://github.com/Ar-maan05/tokendrift/blob/main/CHANGELOG.md"
|
|
53
|
+
|
|
54
|
+
[tool.setuptools.packages.find]
|
|
55
|
+
where = ["."]
|
|
56
|
+
include = ["tokendrift*"]
|
|
57
|
+
|
|
58
|
+
[tool.setuptools.package-data]
|
|
59
|
+
tokendrift = ["py.typed"]
|
|
60
|
+
|
|
61
|
+
[tool.pytest.ini_options]
|
|
62
|
+
testpaths = ["tests"]
|
|
63
|
+
addopts = "-v --tb=short"
|
|
64
|
+
|
|
65
|
+
[tool.coverage.run]
|
|
66
|
+
source = ["tokendrift"]
|
|
67
|
+
omit = ["tests/*"]
|
|
68
|
+
|
|
69
|
+
[tool.ruff]
|
|
70
|
+
line-length = 120
|
|
71
|
+
target-version = "py310"
|
|
72
|
+
|
|
73
|
+
[tool.ruff.lint]
|
|
74
|
+
select = ["E", "F", "I", "UP"]
|
|
75
|
+
|
|
76
|
+
[tool.pyright]
|
|
77
|
+
include = ["tokendrift"]
|
|
78
|
+
pythonVersion = "3.10"
|