tokendrift 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. tokendrift-0.1.0/LICENSE +21 -0
  2. tokendrift-0.1.0/PKG-INFO +273 -0
  3. tokendrift-0.1.0/README.md +232 -0
  4. tokendrift-0.1.0/pyproject.toml +78 -0
  5. tokendrift-0.1.0/setup.cfg +4 -0
  6. tokendrift-0.1.0/tests/test_boundary.py +179 -0
  7. tokendrift-0.1.0/tests/test_cli.py +122 -0
  8. tokendrift-0.1.0/tests/test_cost.py +59 -0
  9. tokendrift-0.1.0/tests/test_differ.py +121 -0
  10. tokendrift-0.1.0/tests/test_loader.py +164 -0
  11. tokendrift-0.1.0/tests/test_loaders.py +127 -0
  12. tokendrift-0.1.0/tests/test_terminal.py +174 -0
  13. tokendrift-0.1.0/tests/test_vocab.py +70 -0
  14. tokendrift-0.1.0/tokendrift/__init__.py +79 -0
  15. tokendrift-0.1.0/tokendrift/__main__.py +9 -0
  16. tokendrift-0.1.0/tokendrift/cli/__init__.py +0 -0
  17. tokendrift-0.1.0/tokendrift/cli/main.py +315 -0
  18. tokendrift-0.1.0/tokendrift/core/__init__.py +0 -0
  19. tokendrift-0.1.0/tokendrift/core/boundary.py +238 -0
  20. tokendrift-0.1.0/tokendrift/core/differ.py +165 -0
  21. tokendrift-0.1.0/tokendrift/core/loader.py +300 -0
  22. tokendrift-0.1.0/tokendrift/core/vocab.py +75 -0
  23. tokendrift-0.1.0/tokendrift/corpus/__init__.py +0 -0
  24. tokendrift-0.1.0/tokendrift/corpus/loaders.py +123 -0
  25. tokendrift-0.1.0/tokendrift/models.py +189 -0
  26. tokendrift-0.1.0/tokendrift/py.typed +0 -0
  27. tokendrift-0.1.0/tokendrift/report/__init__.py +0 -0
  28. tokendrift-0.1.0/tokendrift/report/cost.py +79 -0
  29. tokendrift-0.1.0/tokendrift/report/terminal.py +355 -0
  30. tokendrift-0.1.0/tokendrift.egg-info/PKG-INFO +273 -0
  31. tokendrift-0.1.0/tokendrift.egg-info/SOURCES.txt +33 -0
  32. tokendrift-0.1.0/tokendrift.egg-info/dependency_links.txt +1 -0
  33. tokendrift-0.1.0/tokendrift.egg-info/entry_points.txt +2 -0
  34. tokendrift-0.1.0/tokendrift.egg-info/requires.txt +17 -0
  35. tokendrift-0.1.0/tokendrift.egg-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Armaan Sandhu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,273 @@
1
+ Metadata-Version: 2.4
2
+ Name: tokendrift
3
+ Version: 0.1.0
4
+ Summary: Token-count, cost, and vocabulary diffing for LLM tokenizer changes
5
+ Author-email: Armaan Sandhu <asandhu05@wpi.edu>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/Ar-maan05/tokendrift#readme
8
+ Project-URL: Documentation, https://github.com/Ar-maan05/tokendrift#readme
9
+ Project-URL: Repository, https://github.com/Ar-maan05/tokendrift
10
+ Project-URL: Issues, https://github.com/Ar-maan05/tokendrift/issues
11
+ Project-URL: Changelog, https://github.com/Ar-maan05/tokendrift/blob/main/CHANGELOG.md
12
+ Keywords: llm,tokenizer,diff,regression,nlp
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Classifier: Environment :: Console
22
+ Classifier: Typing :: Typed
23
+ Requires-Python: >=3.10
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: tiktoken>=0.7.0
27
+ Requires-Dist: tokenizers>=0.19.0
28
+ Requires-Dist: typer>=0.12.0
29
+ Requires-Dist: rich>=13.0.0
30
+ Provides-Extra: dev
31
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
32
+ Requires-Dist: pytest-cov>=5.0.0; extra == "dev"
33
+ Requires-Dist: hypothesis>=6.0.0; extra == "dev"
34
+ Requires-Dist: ruff>=0.8.5; extra == "dev"
35
+ Requires-Dist: pyright>=1.1.400; extra == "dev"
36
+ Provides-Extra: nltk
37
+ Requires-Dist: nltk>=3.8; extra == "nltk"
38
+ Provides-Extra: sentencepiece
39
+ Requires-Dist: sentencepiece>=0.2.0; extra == "sentencepiece"
40
+ Dynamic: license-file
41
+
42
+ # TokenDrift
43
+
44
+ [![CI](https://github.com/Ar-maan05/tokendrift/actions/workflows/ci.yml/badge.svg)](https://github.com/Ar-maan05/tokendrift/actions/workflows/ci.yml)
45
+ [![PyPI](https://img.shields.io/pypi/v/tokendrift.svg)](https://pypi.org/project/tokendrift/)
46
+ [![Python versions](https://img.shields.io/pypi/pyversions/tokendrift.svg)](https://pypi.org/project/tokendrift/)
47
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
48
+
49
+ Token-count, cost, and vocabulary diffing for LLM tokenizer changes.
50
+
51
+ When you upgrade a model, switch providers, or move to a self-hosted checkpoint, the tokenizer changes silently. Token counts shift, moving your API cost and context-window pressure. Token IDs are renumbered, breaking any system that stored raw integer IDs (cache keys, classifier heads, adapter embedding rows). None of this produces an error.
52
+
53
+ TokenDrift measures these changes against your own prompt corpus before they hit production.
54
+
55
+ ```
56
+ tokendrift diff cl100k_base o200k_base --corpus prompts.jsonl --price-a 0.03 --price-b 0.03
57
+ ```
58
+
59
+ ```
60
+ ──────────── Vocab Diff cl100k_base → o200k_base ────────────
61
+
62
+ Vocab size A 100,277
63
+ Vocab size B 200,019
64
+ Added +11,997 tokens in B, not in A
65
+ Deleted -47 tokens in A, not in B
66
+ Remapped +19 ⚠ stored token IDs now point elsewhere
67
+
68
+ ──────────── Encoding Diff cl100k_base → o200k_base ─────────
69
+
70
+ Corpus entries 1,247
71
+ Entries changed 834 / 1,247 (66.9%)
72
+ Total token Δ +2,841 (+3.2%)
73
+ Max Δ (single) +47
74
+
75
+ Top 5 most-affected entries
76
+ ─────────────────────────────────────────────────────────────
77
+ p041 +47 "Summarize the biostatistical significance…"
78
+ p117 +31 "Translate the following JSON payload…"
79
+
80
+ ──────────── Cost Report cl100k_base → o200k_base ───────────
81
+
82
+ Cost (A) $1.24
83
+ Cost (B) $1.28
84
+ Cost delta +$0.04 (+3.2%)
85
+ ```
86
+
87
+ ## Installation
88
+
89
+ ```bash
90
+ pip install tokendrift
91
+ ```
92
+
93
+ For development:
94
+
95
+ ```bash
96
+ git clone https://github.com/Ar-maan05/tokendrift
97
+ cd tokendrift
98
+ pip install -e ".[dev]"
99
+ ```
100
+
101
+ ## Quick start
102
+
103
+ **Single text diff:**
104
+ ```bash
105
+ tokendrift diff cl100k_base o200k_base \
106
+ --text "ChatGPT rewrites biostatistical significance tests"
107
+ ```
108
+
109
+ **Corpus diff:**
110
+ ```bash
111
+ tokendrift diff cl100k_base o200k_base --corpus prompts.jsonl
112
+ ```
113
+
114
+ **Vocabulary diff only (no corpus needed):**
115
+ ```bash
116
+ tokendrift vocab-diff cl100k_base o200k_base --show remapped
117
+ ```
118
+
119
+ **Cost impact:**
120
+ ```bash
121
+ tokendrift cost cl100k_base o200k_base \
122
+ --corpus prompts.jsonl \
123
+ --price-a 0.03 \
124
+ --price-b 0.01
125
+ ```
126
+
127
+ **Inspect how a single text re-segments** (experimental boundary detection):
128
+ ```bash
129
+ tokendrift entry cl100k_base o200k_base \
130
+ --text "ChatGPT rewrites biostatistical significance tests"
131
+ ```
132
+
133
+ ## Corpus format
134
+
135
+ TokenDrift accepts JSONL (recommended), CSV, or plain text.
136
+
137
+ **JSONL**: one object per line, must have a `text` key:
138
+ ```jsonl
139
+ {"id": "p001", "text": "What is the capital of France?"}
140
+ {"id": "p002", "text": "Summarize the following document:"}
141
+ ```
142
+
143
+ Only `text` is required; `id` and `metadata` are optional, and any other keys in the object are stored as metadata.
144
+
145
+ ## What TokenDrift detects
146
+
147
+ ### Vocabulary changes
148
+
149
+ - **Added tokens**: present in B, not in A.
150
+ - **Deleted tokens**: present in A, not in B.
151
+ - **Remapped tokens**: same string, different integer ID. This is the change that breaks silently; any system that stored a raw token ID rather than the string now points to a different entry.
152
+
153
+ ### Encoding changes (the core of the tool)
154
+
155
+ - **Token count delta** per prompt and across the corpus. Positive = more tokens = higher cost and more context pressure.
156
+ - **Cost delta**: the count delta priced out, per prompt and corpus-wide.
157
+ - **First divergence position**: the character offset where the two encodings first differ.
158
+
159
+ These measurements are exact and fully supported; they are the reason to use TokenDrift.
160
+
161
+ ### Boundary changes (experimental)
162
+
163
+ Enabled with `--boundaries` on `diff`, or shown by the `entry` command. This is a **structural** report of how individual words are segmented differently, nothing more:
164
+
165
+ | Type | Meaning |
166
+ |------|---------|
167
+ | SPLIT | a word gains tokens (1 → 2+) |
168
+ | MERGE | a word loses tokens (2+ → 1) |
169
+ | RESEGMENT | same token count, but the segmentation boundaries moved |
170
+
171
+ **This is not a quality judgement.** TokenDrift does not claim a boundary change degrades model behaviour: re-segmentation is a normal consequence of a tokenizer change, and any behavioural effect is task-specific and not measured here. The feature is off by default and reports structure only, without severity ranking. (Pure ID renumbering, a word that encodes to the same strings in both tokenizers but with different IDs, is reported at the vocabulary level, not here, where it would flag almost every word.)
172
+
173
+ ## Python API
174
+
175
+ ```python
176
+ from tokendrift.core.loader import TokenizerLoader
177
+ from tokendrift.core.differ import EncodingDiffer
178
+ from tokendrift.core.vocab import VocabDiffer
179
+ from tokendrift.corpus.loaders import load_corpus
180
+
181
+ # Load tokenizers
182
+ tok_a = TokenizerLoader.load("cl100k_base") # tiktoken
183
+ tok_b = TokenizerLoader.load("o200k_base") # tiktoken
184
+ # tok_b = TokenizerLoader.load("Qwen/Qwen3-4B") # HuggingFace Hub
185
+
186
+ # Vocab diff
187
+ from tokendrift.core.vocab import VocabDiffer
188
+ v_diff = VocabDiffer().diff(tok_a, tok_b)
189
+ print(f"Added: {len(v_diff.added)}, Remapped: {len(v_diff.remapped)}")
190
+
191
+ # Single text diff (count/divergence only: the default, fully-supported path)
192
+ differ = EncodingDiffer()
193
+ d = differ.diff("biostatistical significance", tok_a, tok_b)
194
+ print(f"Token delta: {d.count_delta}, first divergence at char {d.first_divergence_pos}")
195
+
196
+ # Opt into experimental structural boundary detection
197
+ boundary_differ = EncodingDiffer(detect_boundaries=True)
198
+ d = boundary_differ.diff("biostatistical significance", tok_a, tok_b)
199
+ for v in d.boundary_violations: # SPLIT / MERGE / RESEGMENT, structural only
200
+ print(f" {v.word}: {v.tokens_a} → {v.tokens_b} ({v.violation_type.value})")
201
+
202
+ # Corpus diff
203
+ entries = load_corpus("prompts.jsonl")
204
+ pairs = [(e.id, e.text) for e in entries]
205
+ diffs = differ.diff_many(pairs, tok_a, tok_b)
206
+
207
+ # Cost report
208
+ from tokendrift.report.cost import CostCalculator
209
+ report = CostCalculator().compute(diffs, price_a=0.03, price_b=0.01)
210
+ print(f"Cost delta: ${report.cost_delta_usd:.4f}")
211
+ ```
212
+
213
+ ## Supported tokenizers
214
+
215
+ | Source | Example identifier | Notes |
216
+ |--------|-------------------|-------|
217
+ | tiktoken | `cl100k_base`, `o200k_base`, `p50k_base` | All OpenAI encodings |
218
+ | HuggingFace Hub | `Qwen/Qwen3-4B`, `meta-llama/Llama-3.2-1B` | Any model with `tokenizer.json` |
219
+ | Local directory | `/path/to/tokenizer/` | Loaded via HuggingFace `tokenizers` |
220
+ | Local file | `/path/to/tokenizer.json` | Direct file load |
221
+
222
+ ## Running tests
223
+
224
+ ```bash
225
+ # Offline tests (no network required)
226
+ pytest
227
+
228
+ # Full suite including real tiktoken / HuggingFace tokenizers
229
+ TOKENDRIFT_NETWORK_TESTS=1 pytest
230
+ ```
231
+
232
+ ## Project structure
233
+
234
+ ```
235
+ tokendrift/
236
+ ├── core/
237
+ │ ├── loader.py # UnifiedTokenizer + backends (tiktoken, HuggingFace)
238
+ │ ├── vocab.py # VocabDiffer
239
+ │ ├── differ.py # EncodingDiffer
240
+ │ └── boundary.py # BoundaryDetector
241
+ ├── corpus/
242
+ │ └── loaders.py # JSONL / CSV / plain-text corpus loading
243
+ ├── report/
244
+ │ ├── terminal.py # Rich terminal renderer
245
+ │ └── cost.py # CostCalculator
246
+ └── cli/
247
+ └── main.py # Typer CLI
248
+ ```
249
+
250
+ ## Roadmap
251
+
252
+ The next milestone is turning the diff into a gate:
253
+
254
+ - [ ] `ci` command: pin a corpus's token counts in a baseline and exit non-zero when a tokenizer change moves them (the feature that makes this CI infrastructure rather than a one-off diagnostic)
255
+ - [ ] `gen-tests` command: generate a pytest regression suite pinning current behavior
256
+
257
+ Later:
258
+
259
+ - [ ] DuckDB corpus persistence (`corpus/store.py`)
260
+ - [ ] HTML report output
261
+ - [ ] Validate (or drop) the behavioural significance of boundary changes against a task benchmark; promote out of "experimental" only if it holds up
262
+ - [ ] Rust batch encoder for large corpora (100k+ entries)
263
+ - [ ] SentencePiece backend
264
+
265
+ ## Contributing
266
+
267
+ Contributions are welcome. See [CONTRIBUTING.md](CONTRIBUTING.md) for the dev
268
+ setup and the lint/format/type/test checks CI runs. Notable changes are recorded
269
+ in [CHANGELOG.md](CHANGELOG.md).
270
+
271
+ ## License
272
+
273
+ MIT, see [LICENSE](LICENSE).
@@ -0,0 +1,232 @@
1
+ # TokenDrift
2
+
3
+ [![CI](https://github.com/Ar-maan05/tokendrift/actions/workflows/ci.yml/badge.svg)](https://github.com/Ar-maan05/tokendrift/actions/workflows/ci.yml)
4
+ [![PyPI](https://img.shields.io/pypi/v/tokendrift.svg)](https://pypi.org/project/tokendrift/)
5
+ [![Python versions](https://img.shields.io/pypi/pyversions/tokendrift.svg)](https://pypi.org/project/tokendrift/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
7
+
8
+ Token-count, cost, and vocabulary diffing for LLM tokenizer changes.
9
+
10
+ When you upgrade a model, switch providers, or move to a self-hosted checkpoint, the tokenizer changes silently. Token counts shift, moving your API cost and context-window pressure. Token IDs are renumbered, breaking any system that stored raw integer IDs (cache keys, classifier heads, adapter embedding rows). None of this produces an error.
11
+
12
+ TokenDrift measures these changes against your own prompt corpus before they hit production.
13
+
14
+ ```
15
+ tokendrift diff cl100k_base o200k_base --corpus prompts.jsonl --price-a 0.03 --price-b 0.03
16
+ ```
17
+
18
+ ```
19
+ ──────────── Vocab Diff cl100k_base → o200k_base ────────────
20
+
21
+ Vocab size A 100,277
22
+ Vocab size B 200,019
23
+ Added +11,997 tokens in B, not in A
24
+ Deleted -47 tokens in A, not in B
25
+ Remapped +19 ⚠ stored token IDs now point elsewhere
26
+
27
+ ──────────── Encoding Diff cl100k_base → o200k_base ─────────
28
+
29
+ Corpus entries 1,247
30
+ Entries changed 834 / 1,247 (66.9%)
31
+ Total token Δ +2,841 (+3.2%)
32
+ Max Δ (single) +47
33
+
34
+ Top 5 most-affected entries
35
+ ─────────────────────────────────────────────────────────────
36
+ p041 +47 "Summarize the biostatistical significance…"
37
+ p117 +31 "Translate the following JSON payload…"
38
+
39
+ ──────────── Cost Report cl100k_base → o200k_base ───────────
40
+
41
+ Cost (A) $1.24
42
+ Cost (B) $1.28
43
+ Cost delta +$0.04 (+3.2%)
44
+ ```
45
+
46
+ ## Installation
47
+
48
+ ```bash
49
+ pip install tokendrift
50
+ ```
51
+
52
+ For development:
53
+
54
+ ```bash
55
+ git clone https://github.com/Ar-maan05/tokendrift
56
+ cd tokendrift
57
+ pip install -e ".[dev]"
58
+ ```
59
+
60
+ ## Quick start
61
+
62
+ **Single text diff:**
63
+ ```bash
64
+ tokendrift diff cl100k_base o200k_base \
65
+ --text "ChatGPT rewrites biostatistical significance tests"
66
+ ```
67
+
68
+ **Corpus diff:**
69
+ ```bash
70
+ tokendrift diff cl100k_base o200k_base --corpus prompts.jsonl
71
+ ```
72
+
73
+ **Vocabulary diff only (no corpus needed):**
74
+ ```bash
75
+ tokendrift vocab-diff cl100k_base o200k_base --show remapped
76
+ ```
77
+
78
+ **Cost impact:**
79
+ ```bash
80
+ tokendrift cost cl100k_base o200k_base \
81
+ --corpus prompts.jsonl \
82
+ --price-a 0.03 \
83
+ --price-b 0.01
84
+ ```
85
+
86
+ **Inspect how a single text re-segments** (experimental boundary detection):
87
+ ```bash
88
+ tokendrift entry cl100k_base o200k_base \
89
+ --text "ChatGPT rewrites biostatistical significance tests"
90
+ ```
91
+
92
+ ## Corpus format
93
+
94
+ TokenDrift accepts JSONL (recommended), CSV, or plain text.
95
+
96
+ **JSONL**: one object per line, must have a `text` key:
97
+ ```jsonl
98
+ {"id": "p001", "text": "What is the capital of France?"}
99
+ {"id": "p002", "text": "Summarize the following document:"}
100
+ ```
101
+
102
+ Only `text` is required; `id` and `metadata` are optional, and any other keys in the object are stored as metadata.
103
+
104
+ ## What TokenDrift detects
105
+
106
+ ### Vocabulary changes
107
+
108
+ - **Added tokens**: present in B, not in A.
109
+ - **Deleted tokens**: present in A, not in B.
110
+ - **Remapped tokens**: same string, different integer ID. This is the change that breaks silently; any system that stored a raw token ID rather than the string now points to a different entry.
111
+
112
+ ### Encoding changes (the core of the tool)
113
+
114
+ - **Token count delta** per prompt and across the corpus. Positive = more tokens = higher cost and more context pressure.
115
+ - **Cost delta**: the count delta priced out, per prompt and corpus-wide.
116
+ - **First divergence position**: the character offset where the two encodings first differ.
117
+
118
+ These measurements are exact and fully supported; they are the reason to use TokenDrift.
119
+
120
+ ### Boundary changes (experimental)
121
+
122
+ Enabled with `--boundaries` on `diff`, or shown by the `entry` command. This is a **structural** report of how individual words are segmented differently, nothing more:
123
+
124
+ | Type | Meaning |
125
+ |------|---------|
126
+ | SPLIT | a word gains tokens (1 → 2+) |
127
+ | MERGE | a word loses tokens (2+ → 1) |
128
+ | RESEGMENT | same token count, but the segmentation boundaries moved |
129
+
130
+ **This is not a quality judgement.** TokenDrift does not claim a boundary change degrades model behaviour: re-segmentation is a normal consequence of a tokenizer change, and any behavioural effect is task-specific and not measured here. The feature is off by default and reports structure only, without severity ranking. (Pure ID renumbering, a word that encodes to the same strings in both tokenizers but with different IDs, is reported at the vocabulary level, not here, where it would flag almost every word.)
131
+
132
+ ## Python API
133
+
134
+ ```python
135
+ from tokendrift.core.loader import TokenizerLoader
136
+ from tokendrift.core.differ import EncodingDiffer
137
+ from tokendrift.core.vocab import VocabDiffer
138
+ from tokendrift.corpus.loaders import load_corpus
139
+
140
+ # Load tokenizers
141
+ tok_a = TokenizerLoader.load("cl100k_base") # tiktoken
142
+ tok_b = TokenizerLoader.load("o200k_base") # tiktoken
143
+ # tok_b = TokenizerLoader.load("Qwen/Qwen3-4B") # HuggingFace Hub
144
+
145
+ # Vocab diff
146
+ from tokendrift.core.vocab import VocabDiffer
147
+ v_diff = VocabDiffer().diff(tok_a, tok_b)
148
+ print(f"Added: {len(v_diff.added)}, Remapped: {len(v_diff.remapped)}")
149
+
150
+ # Single text diff (count/divergence only: the default, fully-supported path)
151
+ differ = EncodingDiffer()
152
+ d = differ.diff("biostatistical significance", tok_a, tok_b)
153
+ print(f"Token delta: {d.count_delta}, first divergence at char {d.first_divergence_pos}")
154
+
155
+ # Opt into experimental structural boundary detection
156
+ boundary_differ = EncodingDiffer(detect_boundaries=True)
157
+ d = boundary_differ.diff("biostatistical significance", tok_a, tok_b)
158
+ for v in d.boundary_violations: # SPLIT / MERGE / RESEGMENT, structural only
159
+ print(f" {v.word}: {v.tokens_a} → {v.tokens_b} ({v.violation_type.value})")
160
+
161
+ # Corpus diff
162
+ entries = load_corpus("prompts.jsonl")
163
+ pairs = [(e.id, e.text) for e in entries]
164
+ diffs = differ.diff_many(pairs, tok_a, tok_b)
165
+
166
+ # Cost report
167
+ from tokendrift.report.cost import CostCalculator
168
+ report = CostCalculator().compute(diffs, price_a=0.03, price_b=0.01)
169
+ print(f"Cost delta: ${report.cost_delta_usd:.4f}")
170
+ ```
171
+
172
+ ## Supported tokenizers
173
+
174
+ | Source | Example identifier | Notes |
175
+ |--------|-------------------|-------|
176
+ | tiktoken | `cl100k_base`, `o200k_base`, `p50k_base` | All OpenAI encodings |
177
+ | HuggingFace Hub | `Qwen/Qwen3-4B`, `meta-llama/Llama-3.2-1B` | Any model with `tokenizer.json` |
178
+ | Local directory | `/path/to/tokenizer/` | Loaded via HuggingFace `tokenizers` |
179
+ | Local file | `/path/to/tokenizer.json` | Direct file load |
180
+
181
+ ## Running tests
182
+
183
+ ```bash
184
+ # Offline tests (no network required)
185
+ pytest
186
+
187
+ # Full suite including real tiktoken / HuggingFace tokenizers
188
+ TOKENDRIFT_NETWORK_TESTS=1 pytest
189
+ ```
190
+
191
+ ## Project structure
192
+
193
+ ```
194
+ tokendrift/
195
+ ├── core/
196
+ │ ├── loader.py # UnifiedTokenizer + backends (tiktoken, HuggingFace)
197
+ │ ├── vocab.py # VocabDiffer
198
+ │ ├── differ.py # EncodingDiffer
199
+ │ └── boundary.py # BoundaryDetector
200
+ ├── corpus/
201
+ │ └── loaders.py # JSONL / CSV / plain-text corpus loading
202
+ ├── report/
203
+ │ ├── terminal.py # Rich terminal renderer
204
+ │ └── cost.py # CostCalculator
205
+ └── cli/
206
+ └── main.py # Typer CLI
207
+ ```
208
+
209
+ ## Roadmap
210
+
211
+ The next milestone is turning the diff into a gate:
212
+
213
+ - [ ] `ci` command: pin a corpus's token counts in a baseline and exit non-zero when a tokenizer change moves them (the feature that makes this CI infrastructure rather than a one-off diagnostic)
214
+ - [ ] `gen-tests` command: generate a pytest regression suite pinning current behavior
215
+
216
+ Later:
217
+
218
+ - [ ] DuckDB corpus persistence (`corpus/store.py`)
219
+ - [ ] HTML report output
220
+ - [ ] Validate (or drop) the behavioural significance of boundary changes against a task benchmark; promote out of "experimental" only if it holds up
221
+ - [ ] Rust batch encoder for large corpora (100k+ entries)
222
+ - [ ] SentencePiece backend
223
+
224
+ ## Contributing
225
+
226
+ Contributions are welcome. See [CONTRIBUTING.md](CONTRIBUTING.md) for the dev
227
+ setup and the lint/format/type/test checks CI runs. Notable changes are recorded
228
+ in [CHANGELOG.md](CHANGELOG.md).
229
+
230
+ ## License
231
+
232
+ MIT, see [LICENSE](LICENSE).
@@ -0,0 +1,78 @@
1
+ [build-system]
2
+ requires = ["setuptools>=77.0.3", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "tokendrift"
7
+ version = "0.1.0"
8
+ description = "Token-count, cost, and vocabulary diffing for LLM tokenizer changes"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.10"
12
+ authors = [{ name = "Armaan Sandhu", email = "asandhu05@wpi.edu" }]
13
+ keywords = ["llm", "tokenizer", "diff", "regression", "nlp"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Developers",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.10",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Programming Language :: Python :: 3.13",
22
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
23
+ "Environment :: Console",
24
+ "Typing :: Typed",
25
+ ]
26
+ dependencies = [
27
+ "tiktoken>=0.7.0",
28
+ "tokenizers>=0.19.0",
29
+ "typer>=0.12.0",
30
+ "rich>=13.0.0",
31
+ ]
32
+
33
+ [project.optional-dependencies]
34
+ dev = [
35
+ "pytest>=8.0.0",
36
+ "pytest-cov>=5.0.0",
37
+ "hypothesis>=6.0.0",
38
+ "ruff>=0.8.5",
39
+ "pyright>=1.1.400",
40
+ ]
41
+ nltk = ["nltk>=3.8"]
42
+ sentencepiece = ["sentencepiece>=0.2.0"]
43
+
44
+ [project.scripts]
45
+ tokendrift = "tokendrift.cli.main:app"
46
+
47
+ [project.urls]
48
+ Homepage = "https://github.com/Ar-maan05/tokendrift#readme"
49
+ Documentation = "https://github.com/Ar-maan05/tokendrift#readme"
50
+ Repository = "https://github.com/Ar-maan05/tokendrift"
51
+ Issues = "https://github.com/Ar-maan05/tokendrift/issues"
52
+ Changelog = "https://github.com/Ar-maan05/tokendrift/blob/main/CHANGELOG.md"
53
+
54
+ [tool.setuptools.packages.find]
55
+ where = ["."]
56
+ include = ["tokendrift*"]
57
+
58
+ [tool.setuptools.package-data]
59
+ tokendrift = ["py.typed"]
60
+
61
+ [tool.pytest.ini_options]
62
+ testpaths = ["tests"]
63
+ addopts = "-v --tb=short"
64
+
65
+ [tool.coverage.run]
66
+ source = ["tokendrift"]
67
+ omit = ["tests/*"]
68
+
69
+ [tool.ruff]
70
+ line-length = 120
71
+ target-version = "py310"
72
+
73
+ [tool.ruff.lint]
74
+ select = ["E", "F", "I", "UP"]
75
+
76
+ [tool.pyright]
77
+ include = ["tokendrift"]
78
+ pythonVersion = "3.10"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+