toke-tokenizer 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toke_tokenizer-0.1.0/LICENSE +21 -0
- toke_tokenizer-0.1.0/PKG-INFO +87 -0
- toke_tokenizer-0.1.0/README.md +62 -0
- toke_tokenizer-0.1.0/pyproject.toml +43 -0
- toke_tokenizer-0.1.0/setup.cfg +4 -0
- toke_tokenizer-0.1.0/toke_tokenizer/__init__.py +6 -0
- toke_tokenizer-0.1.0/toke_tokenizer/data/tokenizer_v03.json +81628 -0
- toke_tokenizer-0.1.0/toke_tokenizer/tokenizer.py +194 -0
- toke_tokenizer-0.1.0/toke_tokenizer.egg-info/PKG-INFO +87 -0
- toke_tokenizer-0.1.0/toke_tokenizer.egg-info/SOURCES.txt +11 -0
- toke_tokenizer-0.1.0/toke_tokenizer.egg-info/dependency_links.txt +1 -0
- toke_tokenizer-0.1.0/toke_tokenizer.egg-info/requires.txt +4 -0
- toke_tokenizer-0.1.0/toke_tokenizer.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Matt Watt
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: toke-tokenizer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: BPE tokenizer for the toke programming language (16K vocab)
|
|
5
|
+
Author: Matt Watt
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/karwalski/toke-tokenizer
|
|
8
|
+
Project-URL: Repository, https://github.com/karwalski/toke-tokenizer
|
|
9
|
+
Keywords: tokenizer,bpe,toke,programming-language
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Software Development :: Compilers
|
|
17
|
+
Classifier: Topic :: Text Processing
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
23
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
26
|
+
# toke-tokenizer
|
|
27
|
+
|
|
28
|
+
A pure Python BPE tokenizer for the [toke programming language](https://github.com/karwalski/toke). Trained on normalised toke source code with a 16,384-token vocabulary.
|
|
29
|
+
|
|
30
|
+
It achieves an approximately 52% token reduction compared to OpenAI's cl100k_base tokenizer on toke source code.
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install toke-tokenizer
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Usage
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from toke_tokenizer import encode, decode, count_tokens
|
|
42
|
+
|
|
43
|
+
# Tokenize toke source code
|
|
44
|
+
text = "let x:int = 42"
|
|
45
|
+
tokens = encode(text)
|
|
46
|
+
print(tokens) # list of token IDs
|
|
47
|
+
|
|
48
|
+
# Decode back to text
|
|
49
|
+
original = decode(tokens)
|
|
50
|
+
print(original) # "let x:int = 42"
|
|
51
|
+
|
|
52
|
+
# Count tokens
|
|
53
|
+
n = count_tokens(text)
|
|
54
|
+
print(f"{n} tokens")
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### String normalisation
|
|
58
|
+
|
|
59
|
+
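Toke source often contains string literals whose contents are not useful for structural tokenization. You can normalise strings before counting by passing `normalise_strings=True`, which replaces the contents of each string literal with `_`: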
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from toke_tokenizer import count_tokens
|
|
63
|
+
|
|
64
|
+
# Without normalisation
|
|
65
|
+
count_tokens('let msg:str = "hello world"')
|
|
66
|
+
|
|
67
|
+
# With normalisation (replaces string contents with "_")
|
|
68
|
+
count_tokens('let msg:str = "hello world"', normalise_strings=True)
|
|
69
|
+
# Equivalent to counting: let msg:str = "_"
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## API
|
|
73
|
+
|
|
74
|
+
- `encode(text: str) -> list[int]` — Tokenize text into a list of token IDs.
|
|
75
|
+
- `decode(ids: list[int]) -> str` — Convert token IDs back to text.
|
|
76
|
+
- `count_tokens(text: str, normalise_strings: bool = False) -> int` — Count the number of tokens in text, optionally replacing string contents with `_` first.
|
|
77
|
+
|
|
78
|
+
## Details
|
|
79
|
+
|
|
80
|
+
- Vocabulary: 16,384 tokens
|
|
81
|
+
- Algorithm: Byte-Pair Encoding (BPE)
|
|
82
|
+
- Pre-tokenization: splits on newlines (each newline is a separate token)
|
|
83
|
+
- Special tokens: `<|endoftext|>` (0), `<pad>` (1), `<newline>` (2)
|
|
84
|
+
|
|
85
|
+
## License
|
|
86
|
+
|
|
87
|
+
MIT
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# toke-tokenizer
|
|
2
|
+
|
|
3
|
+
A pure Python BPE tokenizer for the [toke programming language](https://github.com/karwalski/toke). Trained on normalised toke source code with a 16,384-token vocabulary.
|
|
4
|
+
|
|
5
|
+
It achieves an approximately 52% token reduction compared to OpenAI's cl100k_base tokenizer on toke source code.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install toke-tokenizer
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Usage
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from toke_tokenizer import encode, decode, count_tokens
|
|
17
|
+
|
|
18
|
+
# Tokenize toke source code
|
|
19
|
+
text = "let x:int = 42"
|
|
20
|
+
tokens = encode(text)
|
|
21
|
+
print(tokens) # list of token IDs
|
|
22
|
+
|
|
23
|
+
# Decode back to text
|
|
24
|
+
original = decode(tokens)
|
|
25
|
+
print(original) # "let x:int = 42"
|
|
26
|
+
|
|
27
|
+
# Count tokens
|
|
28
|
+
n = count_tokens(text)
|
|
29
|
+
print(f"{n} tokens")
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### String normalisation
|
|
33
|
+
|
|
34
|
+
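Toke source often contains string literals whose contents are not useful for structural tokenization. You can normalise strings before counting by passing `normalise_strings=True`, which replaces the contents of each string literal with `_`: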
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from toke_tokenizer import count_tokens
|
|
38
|
+
|
|
39
|
+
# Without normalisation
|
|
40
|
+
count_tokens('let msg:str = "hello world"')
|
|
41
|
+
|
|
42
|
+
# With normalisation (replaces string contents with "_")
|
|
43
|
+
count_tokens('let msg:str = "hello world"', normalise_strings=True)
|
|
44
|
+
# Equivalent to counting: let msg:str = "_"
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## API
|
|
48
|
+
|
|
49
|
+
- `encode(text: str) -> list[int]` — Tokenize text into a list of token IDs.
|
|
50
|
+
- `decode(ids: list[int]) -> str` — Convert token IDs back to text.
|
|
51
|
+
- `count_tokens(text: str, normalise_strings: bool = False) -> int` — Count the number of tokens in text, optionally replacing string contents with `_` first.
|
|
52
|
+
|
|
53
|
+
## Details
|
|
54
|
+
|
|
55
|
+
- Vocabulary: 16,384 tokens
|
|
56
|
+
- Algorithm: Byte-Pair Encoding (BPE)
|
|
57
|
+
- Pre-tokenization: splits on newlines (each newline is a separate token)
|
|
58
|
+
- Special tokens: `<|endoftext|>` (0), `<pad>` (1), `<newline>` (2)
|
|
59
|
+
|
|
60
|
+
## License
|
|
61
|
+
|
|
62
|
+
MIT
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "toke-tokenizer"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "BPE tokenizer for the toke programming language (16K vocab)"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [{name = "Matt Watt"}]
|
|
13
|
+
keywords = ["tokenizer", "bpe", "toke", "programming-language"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.10",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Topic :: Software Development :: Compilers",
|
|
22
|
+
"Topic :: Text Processing",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[project.urls]
|
|
26
|
+
Homepage = "https://github.com/karwalski/toke-tokenizer"
|
|
27
|
+
Repository = "https://github.com/karwalski/toke-tokenizer"
|
|
28
|
+
|
|
29
|
+
[project.optional-dependencies]
|
|
30
|
+
dev = [
|
|
31
|
+
"pytest>=7",
|
|
32
|
+
"ruff>=0.4",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[tool.setuptools.packages.find]
|
|
36
|
+
include = ["toke_tokenizer*"]
|
|
37
|
+
|
|
38
|
+
[tool.setuptools.package-data]
|
|
39
|
+
toke_tokenizer = ["data/*.json"]
|
|
40
|
+
|
|
41
|
+
[tool.ruff]
|
|
42
|
+
line-length = 100
|
|
43
|
+
target-version = "py310"
|