skimtoken 0.1.2__cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skimtoken/__init__.py +6 -0
- skimtoken/__init__.pyi +6 -0
- skimtoken/_skimtoken_core.abi3.so +0 -0
- skimtoken/cli.py +44 -0
- skimtoken-0.1.2.dist-info/METADATA +185 -0
- skimtoken-0.1.2.dist-info/RECORD +9 -0
- skimtoken-0.1.2.dist-info/WHEEL +4 -0
- skimtoken-0.1.2.dist-info/entry_points.txt +2 -0
- skimtoken-0.1.2.dist-info/licenses/LICENSE +21 -0
skimtoken/__init__.py
ADDED
skimtoken/__init__.pyi
ADDED
Binary file
|
skimtoken/cli.py
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
import sys
|
2
|
+
import argparse
|
3
|
+
|
4
|
+
from skimtoken import estimate_tokens
|
5
|
+
|
6
|
+
|
7
|
+
def main() -> None:
    """Command-line entry point: print an estimated token count for some text.

    The text is taken from the positional arguments, joined with single
    spaces. When no arguments are given it is read from standard input
    instead, with surrounding whitespace stripped.

    If there are no arguments and stdin is a terminal, or the text ends up
    empty, the parser reports "No text provided" as a usage error and exits.
    If reading stdin fails, the error is printed to stderr and the process
    exits with status 1.
    """
    cli = argparse.ArgumentParser(
        description="Calculate estimated token count for the given text.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="Example:\n skimtoken 'Hello, world!'\n echo 'Some text' | skimtoken",
    )
    cli.add_argument(
        "text", nargs="*", help="Text to estimate tokens for (reads from stdin if not provided)"
    )
    words = cli.parse_args().text

    if words:
        # Positional arguments win: treat them as one space-separated text.
        text = " ".join(words)
    else:
        if sys.stdin.isatty():
            # Nothing on the command line and nothing piped in.
            # ArgumentParser.error() prints usage and exits; it does not return.
            cli.error("No text provided")
        try:
            text = sys.stdin.read().strip()
        except Exception as exc:
            print(f"Error reading from stdin: {exc}", file=sys.stderr)
            sys.exit(1)

    # Piped input consisting only of whitespace strips down to an empty string.
    if not text:
        cli.error("No text provided")

    print(estimate_tokens(text))
|
41
|
+
|
42
|
+
|
43
|
+
if __name__ == "__main__":
|
44
|
+
main()
|
@@ -0,0 +1,185 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: skimtoken
|
3
|
+
Version: 0.1.2
|
4
|
+
License-File: LICENSE
|
5
|
+
Summary: Fast token count estimation library
|
6
|
+
Home-Page: https://github.com/masaishi/skimtoken
|
7
|
+
Author: masaishi <mwishiha@ucsc.edu>
|
8
|
+
Author-email: masaishi <mwishiha@ucsc.edu>
|
9
|
+
License: MIT
|
10
|
+
Requires-Python: >=3.9
|
11
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
12
|
+
Project-URL: Source Code, https://github.com/masaishi/skimtoken
|
13
|
+
|
14
|
+
# skimtoken (Beta)
|
15
|
+
|
16
|
+
A lightweight, fast token count estimation library written in Rust with Python bindings. Built for applications where approximate token counts work fine and memory/startup time efficiency matters.
|
17
|
+
|
18
|
+
# Why skimtoken?
|
19
|
+
|
20
|
+
[tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but comes with serious overhead for simple token counting - especially **memory usage and initialization time**:
|
21
|
+
|
22
|
+
```bash
|
23
|
+
./scripts/run_benchmark_multiple.sh
|
24
|
+
```
|
25
|
+
|
26
|
+
```
|
27
|
+
╭────────────────── Mean Results After 100 Runs ─────────────────╮
|
28
|
+
│ Mean RMSE: 12.5526 tokens │
|
29
|
+
├─────────────────┬──────────────┬──────────────┬────────────────┤
|
30
|
+
│ Metric │ tiktoken │ skimtoken │ Ratio │
|
31
|
+
├─────────────────┼──────────────┼──────────────┼────────────────┤
|
32
|
+
│ Init Time │ 0.135954 s │ 0.001022 s │ 0.007x │
|
33
|
+
│ Init Memory │ 84.5169 MB│ 0.4292 MB│ 0.005x │
|
34
|
+
│ Exec Time │ 0.002947 s │ 0.113127 s │ 38.387x │
|
35
|
+
│ Exec Memory │ 0.6602 MB│ 0.0485 MB│ 0.073x │
|
36
|
+
├─────────────────┼──────────────┼──────────────┼────────────────┤
|
37
|
+
│ TOTAL Time │ 0.138901 s │ 0.114149 s │ 0.821x │
|
38
|
+
│ TOTAL Memory │ 85.1770 MB│ 0.4777 MB│ 0.005x │
|
39
|
+
╰─────────────────┴──────────────┴──────────────┴────────────────╯
|
40
|
+
```
|
41
|
+
|
42
|
+
## Memory Advantages
|
43
|
+
|
44
|
+
**skimtoken uses >99% less memory** than tiktoken:
|
45
|
+
- **tiktoken**: ~85MB for initialization (loading vocabulary and encoder files)
|
46
|
+
- **skimtoken**: ~0.43MB for initialization, ~0.48MB total peak usage
|
47
|
+
- **178x less memory usage** - perfect for memory-constrained environments
|
48
|
+
|
49
|
+
**Memory-Efficient Design**:
|
50
|
+
- No large vocabulary files to load into memory
|
51
|
+
- Minimal runtime memory footprint
|
52
|
+
- Predictable memory usage patterns
|
53
|
+
|
54
|
+
**Performance Trade-offs**: skimtoken targets **memory-constrained scenarios** and **cold-start environments** where initialization time directly impacts user experience. While tiktoken is faster for individual operations (~38x) and more accurate, skimtoken's minimal initialization overhead (133x faster startup, 178x less memory) makes it **1.22x faster overall** when you need to load fresh each time.
|
55
|
+
|
56
|
+
This makes skimtoken valuable in:
|
57
|
+
- **Serverless functions** with strict memory limits (128MB-512MB)
|
58
|
+
- **Edge computing** environments with limited RAM
|
59
|
+
- **Mobile applications** where memory matters
|
60
|
+
- **Containerized microservices** with tight memory constraints
|
61
|
+
- **Shared hosting environments** where memory usage affects cost
|
62
|
+
|
63
|
+
## Installation
|
64
|
+
|
65
|
+
```bash
|
66
|
+
pip install skimtoken
|
67
|
+
```
|
68
|
+
|
69
|
+
## Usage
|
70
|
+
|
71
|
+
```python
|
72
|
+
from skimtoken import estimate_tokens
|
73
|
+
|
74
|
+
# Basic usage
|
75
|
+
text = "Hello, world! How are you today?"
|
76
|
+
token_count = estimate_tokens(text)
|
77
|
+
print(f"Estimated tokens: {token_count}")
|
78
|
+
|
79
|
+
# Works with any text
|
80
|
+
code = """
|
81
|
+
def hello_world():
|
82
|
+
print("Hello, world!")
|
83
|
+
return True
|
84
|
+
"""
|
85
|
+
tokens = estimate_tokens(code)
|
86
|
+
print(f"Code tokens: {tokens}")
|
87
|
+
```
|
88
|
+
|
89
|
+
## Language Support
|
90
|
+
|
91
|
+
skimtoken uses **language-specific parameters** tailored for different language families to improve estimation accuracy. Each language family has its own optimized coefficients based on tokenization patterns.
|
92
|
+
|
93
|
+
**Supported languages**: English, French, Spanish, German, Russian, Hindi, Arabic, Chinese, Japanese, Korean, etc.
|
94
|
+
|
95
|
+
**Current Accuracy**: RMSE of 12.55 across 146 samples (12,377 characters) with testing across multiple language families and text types
|
96
|
+
|
97
|
+
## When to Use skimtoken vs tiktoken
|
98
|
+
|
99
|
+
**Use skimtoken when:**
|
100
|
+
- Working in **serverless/edge environments** (Cloudflare Workers, AWS Lambda, Vercel Functions) where cold start time and memory usage matter
|
101
|
+
- You need **quick token estimates** for API planning and cost estimation
|
102
|
+
- **Initialization overhead** is a concern (e.g., short-lived processes that can't amortize tiktoken's startup cost)
|
103
|
+
- Approximate counts work for your use case
|
104
|
+
- Memory constraints are tight
|
105
|
+
|
106
|
+
**Use tiktoken when:**
|
107
|
+
- You need **exact token counts** for specific models and tokenization-dependent features
|
108
|
+
- **Processing large batches** of text where you can load the encoder once and reuse it
|
109
|
+
- Building applications that require **precise tokenization** (not just counting)
|
110
|
+
- You have **persistent memory** and can afford tiktoken's initialization cost
|
111
|
+
- **Accuracy is more important** than speed/memory efficiency
|
112
|
+
|
113
|
+
**Key Trade-off**: While tiktoken is faster for individual tokenization operations and more accurate, skimtoken excels in environments where you **can't afford to keep encoders loaded in memory** or where **cold start performance matters more than raw throughput**.
|
114
|
+
|
115
|
+
## Roadmap
|
116
|
+
|
117
|
+
**Automated Parameter Optimization**: Plans to implement hyperparameter tuning using large-scale datasets like CC100 samples to minimize RMSE scores across language families.
|
118
|
+
|
119
|
+
The goal is to achieve sub-10 RMSE for major language families while preserving skimtoken's core advantages of minimal initialization overhead and memory usage.
|
120
|
+
|
121
|
+
## Testing & Development
|
122
|
+
|
123
|
+
```bash
|
124
|
+
# Install dependencies
|
125
|
+
uv sync
|
126
|
+
|
127
|
+
# Build for development
|
128
|
+
uv run maturin dev --features python
|
129
|
+
|
130
|
+
# Run tests
|
131
|
+
cargo test
|
132
|
+
uv run pytest
|
133
|
+
|
134
|
+
# Run specific test with verbose output
|
135
|
+
uv run pytest tests/test_skimtoken_simple.py -s
|
136
|
+
|
137
|
+
# Run performance benchmarks
|
138
|
+
uv run scripts/benchmark.py
|
139
|
+
```
|
140
|
+
|
141
|
+
### Test Results
|
142
|
+
|
143
|
+
Run accuracy testing:
|
144
|
+
```bash
|
145
|
+
uv run pytest tests/test_skimtoken_simple.py -s
|
146
|
+
```
|
147
|
+
|
148
|
+
```
|
149
|
+
RMSE by Category:
|
150
|
+
╭───────────────────────┬───────┬─────────┬────────╮
|
151
|
+
│ Category │ RMSE │ Samples │ Status │
|
152
|
+
├───────────────────────┼───────┼─────────┼────────┤
|
153
|
+
│ ambiguous_punctuation │ 2.88 │ 7 │ ✓ PASS │
|
154
|
+
│ code │ 10.15 │ 14 │ ✓ PASS │
|
155
|
+
│ edge │ 0.00 │ 2 │ ✓ PASS │
|
156
|
+
│ json │ 8.54 │ 3 │ ✓ PASS │
|
157
|
+
│ jsonl │ 15.51 │ 2 │ ✓ PASS │
|
158
|
+
│ mixed │ 4.12 │ 3 │ ✓ PASS │
|
159
|
+
│ noisy_text │ 4.04 │ 7 │ ✓ PASS │
|
160
|
+
│ repetitive │ 7.25 │ 4 │ ✓ PASS │
|
161
|
+
│ rtl │ 3.71 │ 4 │ ✓ PASS │
|
162
|
+
│ special │ 4.69 │ 3 │ ✓ PASS │
|
163
|
+
│ special_encoding │ 10.65 │ 8 │ ✓ PASS │
|
164
|
+
│ structured_format │ 3.82 │ 8 │ ✓ PASS │
|
165
|
+
│ unknown │ 15.43 │ 81 │ ✓ PASS │
|
166
|
+
╰───────────────────────┴───────┴─────────┴────────╯
|
167
|
+
|
168
|
+
Summary Statistics:
|
169
|
+
Overall RMSE: 12.55 tokens
|
170
|
+
Total samples processed: 146
|
171
|
+
Total characters: 12,377
|
172
|
+
Execution time: 0.121 seconds
|
173
|
+
Processing speed: 1204 samples/second
|
174
|
+
Character throughput: 102,110 chars/second
|
175
|
+
Average per character: 9.793μs
|
176
|
+
```
|
177
|
+
|
178
|
+
## Contributing
|
179
|
+
|
180
|
+
Contributions are welcome! Feel free to submit issues or pull requests.
|
181
|
+
|
182
|
+
## License
|
183
|
+
|
184
|
+
MIT License - see [LICENSE](./LICENSE) for details.
|
185
|
+
|
@@ -0,0 +1,9 @@
|
|
1
|
+
skimtoken-0.1.2.dist-info/METADATA,sha256=LJ7lteSgDm9jPDhWnyJdXaq9dZcXOOIjU-wzq5Tuy0Q,8276
|
2
|
+
skimtoken-0.1.2.dist-info/WHEEL,sha256=oEr59ZPc2tpVnNJOPDQDxj0Rjot8eiL_zYcSYzWjqyM,127
|
3
|
+
skimtoken-0.1.2.dist-info/entry_points.txt,sha256=9385lCuBX4dMt4hoptXBmZt-9FxQSqfi0d8JF0B8nHg,47
|
4
|
+
skimtoken-0.1.2.dist-info/licenses/LICENSE,sha256=sskA_2WhrwWmzf0CYVaGiXVWfME-FY2aOLuK8UZA6XQ,1074
|
5
|
+
skimtoken/__init__.py,sha256=hwjHT1PNMelIlVW-uUR9q8zf3R-gSWO6pZgj09FustA,151
|
6
|
+
skimtoken/__init__.pyi,sha256=ILU_bxNUWtVEqpE7uIctLOKZwAB_6kOpGN9QwO0x6KA,113
|
7
|
+
skimtoken/_skimtoken_core.abi3.so,sha256=RDSf_spjkJ3fISFPcjOoSyZNQhVtz6SCn9ViCCRLVis,732880
|
8
|
+
skimtoken/cli.py,sha256=OItmWS0i4bekccdtso-vY4IJQlZEN8KgRbjkDRx9ZXU,1173
|
9
|
+
skimtoken-0.1.2.dist-info/RECORD,,
|
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2025 Masamune Ishihara
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|