skimtoken-0.1.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

skimtoken/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """Fast token count estimation library."""
+
+ from skimtoken._skimtoken_core import estimate_tokens
+
+ __all__ = ["estimate_tokens"]
+ __version__ = "0.1.0"
skimtoken/__init__.pyi ADDED
@@ -0,0 +1,6 @@
+ """Type stubs for skimtoken."""
+
+ def estimate_tokens(text: str) -> int: ...
+
+ __all__: list[str]
+ __version__: str
skimtoken/_skimtoken_core.abi3.so ADDED
Binary file
skimtoken/cli.py ADDED
@@ -0,0 +1,44 @@
+ import sys
+ import argparse
+
+ from skimtoken import estimate_tokens
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(
+         description="Calculate estimated token count for the given text.",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="Example:\n skimtoken 'Hello, world!'\n echo 'Some text' | skimtoken",
+     )
+
+     parser.add_argument(
+         "text", nargs="*", help="Text to estimate tokens for (reads from stdin if not provided)"
+     )
+
+     args = parser.parse_args()
+
+     # Get text from args or stdin
+     if args.text:
+         # Join all arguments as the text
+         text = " ".join(args.text)
+     elif sys.stdin.isatty():
+         # No args and no piped input
+         parser.error("No text provided")
+     else:
+         # Read from stdin
+         try:
+             text = sys.stdin.read().strip()
+         except Exception as e:
+             print(f"Error reading from stdin: {e}", file=sys.stderr)
+             sys.exit(1)
+
+     if not text:
+         parser.error("No text provided")
+
+     # Estimate tokens and print result
+     token_count = estimate_tokens(text)
+     print(token_count)
+
+
+ if __name__ == "__main__":
+     main()
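
The wheel also registers a `skimtoken` console script (see `entry_points.txt` below) that calls this `main()`. A minimal sketch of driving it from Python, assuming the wheel is installed so the script is on `PATH`; the sample texts are arbitrary:

```python
# Exercise the installed `skimtoken` console script in both of its input modes.
import subprocess

# 1) Text passed as arguments (cli.py joins them with spaces)
out = subprocess.run(
    ["skimtoken", "Hello,", "world!"],
    capture_output=True, text=True, check=True,
)
print(out.stdout.strip())  # estimated token count

# 2) Text piped on stdin (the fallback when no arguments are given)
out = subprocess.run(
    ["skimtoken"],
    input="Some longer text to estimate.",
    capture_output=True, text=True, check=True,
)
print(out.stdout.strip())
```
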
skimtoken-0.1.2.dist-info/METADATA ADDED
@@ -0,0 +1,185 @@
+ Metadata-Version: 2.4
+ Name: skimtoken
+ Version: 0.1.2
+ License-File: LICENSE
+ Summary: Fast token count estimation library
+ Home-Page: https://github.com/masaishi/skimtoken
+ Author: masaishi <mwishiha@ucsc.edu>
+ Author-email: masaishi <mwishiha@ucsc.edu>
+ License: MIT
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
+ Project-URL: Source Code, https://github.com/masaishi/skimtoken
+
+ # skimtoken (Beta)
+
+ A lightweight, fast token count estimation library written in Rust with Python bindings. Built for applications where approximate token counts are good enough and memory and startup-time efficiency matter.
+
+ ## Why skimtoken?
+
+ [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but it comes with serious overhead for simple token counting, especially in **memory usage and initialization time**:
+
+ ```bash
+ ./scripts/run_benchmark_multiple.sh
+ ```
+
+ ```
+ ╭───────────────── Mean Results After 100 Runs ───────────────────╮
+ │ Mean RMSE: 12.5526 tokens                                        │
+ ├─────────────────┬──────────────┬──────────────┬────────────────┤
+ │ Metric          │     tiktoken │    skimtoken │          Ratio │
+ ├─────────────────┼──────────────┼──────────────┼────────────────┤
+ │ Init Time       │   0.135954 s │   0.001022 s │         0.007x │
+ │ Init Memory     │   84.5169 MB │    0.4292 MB │         0.005x │
+ │ Exec Time       │   0.002947 s │   0.113127 s │        38.387x │
+ │ Exec Memory     │    0.6602 MB │    0.0485 MB │         0.073x │
+ ├─────────────────┼──────────────┼──────────────┼────────────────┤
+ │ TOTAL Time      │   0.138901 s │   0.114149 s │         0.821x │
+ │ TOTAL Memory    │   85.1770 MB │    0.4777 MB │         0.005x │
+ ╰─────────────────┴──────────────┴──────────────┴────────────────╯
+ ```
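+
+ A minimal sketch of reproducing this comparison by hand, assuming tiktoken is installed. Treating `cl100k_base` as the reference encoding is an assumption (the project's own benchmark lives under `scripts/`), and `tracemalloc` only tracks Python-level allocations, so native memory use is understated:
+
+ ```python
+ # Rough single-run measurement of import/initialization cost for both libraries.
+ # Numbers will not match the table above exactly; this is only a sanity check.
+ import time
+ import tracemalloc
+
+ tracemalloc.start()
+ t0 = time.perf_counter()
+ import tiktoken
+ enc = tiktoken.get_encoding("cl100k_base")  # building the encoder is the expensive step
+ tiktoken_init_s = time.perf_counter() - t0
+ tiktoken_peak_mb = tracemalloc.get_traced_memory()[1] / 1e6
+ tracemalloc.stop()
+
+ tracemalloc.start()
+ t0 = time.perf_counter()
+ from skimtoken import estimate_tokens
+ skimtoken_init_s = time.perf_counter() - t0
+ skimtoken_peak_mb = tracemalloc.get_traced_memory()[1] / 1e6
+ tracemalloc.stop()
+
+ print(f"tiktoken : {tiktoken_init_s:.4f} s, ~{tiktoken_peak_mb:.2f} MB (Python-level only)")
+ print(f"skimtoken: {skimtoken_init_s:.4f} s, ~{skimtoken_peak_mb:.2f} MB (Python-level only)")
+ print(estimate_tokens("Hello, world!"), "estimated tokens (smoke test)")
+ ```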
+
+ ## Memory Advantages
+
+ **skimtoken uses >99% less memory** than tiktoken:
+ - **tiktoken**: ~85MB for initialization (loading vocabulary and encoder files)
+ - **skimtoken**: ~0.43MB for initialization, ~0.48MB total peak usage
+ - **178x less memory usage** - perfect for memory-constrained environments
+
+ **Memory-Efficient Design**:
+ - No large vocabulary files to load into memory
+ - Minimal runtime memory footprint
+ - Predictable memory usage patterns
+
+ **Performance Trade-offs**: skimtoken targets **memory-constrained scenarios** and **cold-start environments** where initialization time directly impacts user experience. While tiktoken is faster for individual operations (~38x) and more accurate, skimtoken's minimal initialization overhead (133x faster startup, 178x less memory) makes it **1.22x faster overall** when you need to load fresh each time (see the sketch below).
+
+ This makes skimtoken valuable in:
+ - **Serverless functions** with strict memory limits (128MB-512MB)
+ - **Edge computing** environments with limited RAM
+ - **Mobile applications** where memory matters
+ - **Containerized microservices** with tight memory constraints
+ - **Shared hosting environments** where memory usage affects cost
+
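+ A minimal sketch of the cold-start arithmetic referenced above, plugging in the mean figures from the benchmark table; the helper functions are illustrative, not part of skimtoken:
+
+ ```python
+ # "exec" is the time to process the whole benchmark workload once (figures from the table above).
+ TIKTOKEN = {"init": 0.135954, "exec": 0.002947}   # seconds
+ SKIMTOKEN = {"init": 0.001022, "exec": 0.113127}  # seconds
+
+ def total_fresh(lib: dict, runs: int = 1) -> float:
+     """Total time when every run pays the initialization cost (e.g. a cold serverless invocation)."""
+     return runs * (lib["init"] + lib["exec"])
+
+ def total_persistent(lib: dict, runs: int = 1) -> float:
+     """Total time when the library stays loaded and only the first run pays initialization."""
+     return lib["init"] + runs * lib["exec"]
+
+ print(total_fresh(TIKTOKEN) / total_fresh(SKIMTOKEN))  # ~1.22: skimtoken wins on cold starts
+ print(total_persistent(TIKTOKEN, 10), total_persistent(SKIMTOKEN, 10))  # tiktoken wins once init is amortized
+ ```
+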
+ ## Installation
+
+ ```bash
+ pip install skimtoken
+ ```
+
+ ## Usage
+
+ ```python
+ from skimtoken import estimate_tokens
+
+ # Basic usage
+ text = "Hello, world! How are you today?"
+ token_count = estimate_tokens(text)
+ print(f"Estimated tokens: {token_count}")
+
+ # Works with any text
+ code = """
+ def hello_world():
+     print("Hello, world!")
+     return True
+ """
+ tokens = estimate_tokens(code)
+ print(f"Code tokens: {tokens}")
+ ```
+
+ ## Language Support
+
+ skimtoken uses **language-specific parameters** tailored for different language families to improve estimation accuracy. Each language family has its own optimized coefficients based on tokenization patterns.
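+
+ The per-family parameters themselves live in the compiled Rust core (`_skimtoken_core.abi3.so`) and are not visible in this wheel's Python source. The sketch below is a purely hypothetical illustration of the general idea, with made-up coefficients; it is not skimtoken's actual model:
+
+ ```python
+ # Hypothetical sketch: estimate tokens as characters divided by an average
+ # chars-per-token value chosen per script family. The coefficients are invented.
+ CHARS_PER_TOKEN = {
+     "latin": 4.0,  # e.g. English, French, Spanish, German (made-up value)
+     "cjk": 1.5,    # e.g. Chinese, Japanese (made-up value)
+ }
+
+ def naive_estimate(text: str) -> int:
+     cjk_chars = sum(1 for ch in text if "\u4e00" <= ch <= "\u9fff")
+     family = "cjk" if cjk_chars > len(text) / 2 else "latin"
+     return max(1, round(len(text) / CHARS_PER_TOKEN[family]))
+
+ print(naive_estimate("Hello, world!"))  # compare with skimtoken.estimate_tokens(...)
+ ```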
+
+ **Supported languages**: English, French, Spanish, German, Russian, Hindi, Arabic, Chinese, Japanese, Korean, and more.
+
+ **Current Accuracy**: RMSE of 12.55 tokens across 146 samples (11,745 characters), tested across multiple language families and text types.
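+
+ To see what this error means for a single text, a quick hedged check against an exact tokenizer (assumes tiktoken is installed; treating `cl100k_base` as the reference encoding is an assumption):
+
+ ```python
+ import tiktoken
+ from skimtoken import estimate_tokens
+
+ enc = tiktoken.get_encoding("cl100k_base")  # assumed reference encoding
+ text = "Hello, world! How are you today?"
+ print(estimate_tokens(text) - len(enc.encode(text)))  # signed error in tokens for this one text
+ ```
+
+ The reported RMSE is the square root of the mean of such squared errors over the whole test set.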
+
+ ## When to Use skimtoken vs tiktoken
+
+ **Use skimtoken when:**
+ - Working in **serverless/edge environments** (Cloudflare Workers, AWS Lambda, Vercel Functions) where cold start time and memory usage matter
+ - You need **quick token estimates** for API planning and cost estimation
+ - **Initialization overhead** is a concern (e.g., short-lived processes that can't amortize tiktoken's startup cost)
+ - Approximate counts work for your use case
+ - Memory constraints are tight
+
+ **Use tiktoken when:**
+ - You need **exact token counts** for specific models and tokenization-dependent features
+ - **Processing large batches** of text where you can load the encoder once and reuse it
+ - Building applications that require **precise tokenization** (not just counting)
+ - You have **persistent memory** and can afford tiktoken's initialization cost
+ - **Accuracy is more important** than speed/memory efficiency
+
+ **Key Trade-off**: While tiktoken is faster for individual tokenization operations and more accurate, skimtoken excels in environments where you **can't afford to keep encoders loaded in memory** or where **cold-start performance matters more than raw throughput**.
+
+ ## Roadmap
+
+ **Automated Parameter Optimization**: we plan to implement hyperparameter tuning on large-scale datasets such as CC100 samples to minimize RMSE across language families.
+
+ The goal is to achieve sub-10 RMSE for major language families while preserving skimtoken's core advantages of minimal initialization overhead and memory usage.
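+
+ A rough sketch of what such tuning could look like: fit a single chars-per-token coefficient against tiktoken counts on a tiny corpus by grid search. The corpus, the `cl100k_base` reference encoding, and the one-coefficient model are assumptions for illustration, not the project's actual tooling:
+
+ ```python
+ # Hypothetical tuning sketch: pick the chars-per-token coefficient that minimizes
+ # RMSE against tiktoken's exact counts on a sample corpus.
+ import math
+ import tiktoken
+
+ enc = tiktoken.get_encoding("cl100k_base")  # assumed reference encoding
+ corpus = ["Hello, world!", "def f(x):\n    return x * 2", "Bonjour tout le monde"]
+ targets = [len(enc.encode(t)) for t in corpus]
+
+ def rmse(coef: float) -> float:
+     errors = [len(t) / coef - y for t, y in zip(corpus, targets)]
+     return math.sqrt(sum(e * e for e in errors) / len(errors))
+
+ best = min((c / 10 for c in range(20, 80)), key=rmse)  # grid search over 2.0 .. 7.9
+ print(f"best chars-per-token: {best:.1f}, RMSE: {rmse(best):.2f}")
+ ```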
+
+ ## Testing & Development
+
+ ```bash
+ # Install dependencies
+ uv sync
+
+ # Build for development
+ uv run maturin dev --features python
+
+ # Run tests
+ cargo test
+ uv run pytest
+
+ # Run a specific test with verbose output
+ uv run pytest tests/test_skimtoken_simple.py -s
+
+ # Run performance benchmarks
+ uv run scripts/benchmark.py
+ ```
+
+ ### Test Results
+
+ Run the accuracy tests:
+ ```bash
+ uv run pytest tests/test_skimtoken_simple.py -s
+ ```
+
+ ```
+ RMSE by Category:
+ ╭───────────────────────┬───────┬─────────┬────────╮
+ │ Category              │  RMSE │ Samples │ Status │
+ ├───────────────────────┼───────┼─────────┼────────┤
+ │ ambiguous_punctuation │  2.88 │       7 │ ✓ PASS │
+ │ code                  │ 10.15 │      14 │ ✓ PASS │
+ │ edge                  │  0.00 │       2 │ ✓ PASS │
+ │ json                  │  8.54 │       3 │ ✓ PASS │
+ │ jsonl                 │ 15.51 │       2 │ ✓ PASS │
+ │ mixed                 │  4.12 │       3 │ ✓ PASS │
+ │ noisy_text            │  4.04 │       7 │ ✓ PASS │
+ │ repetitive            │  7.25 │       4 │ ✓ PASS │
+ │ rtl                   │  3.71 │       4 │ ✓ PASS │
+ │ special               │  4.69 │       3 │ ✓ PASS │
+ │ special_encoding      │ 10.65 │       8 │ ✓ PASS │
+ │ structured_format     │  3.82 │       8 │ ✓ PASS │
+ │ unknown               │ 15.43 │      81 │ ✓ PASS │
+ ╰───────────────────────┴───────┴─────────┴────────╯
+
+ Summary Statistics:
+ Overall RMSE: 12.55 tokens
+ Total samples processed: 146
+ Total characters: 12,377
+ Execution time: 0.121 seconds
+ Processing speed: 1204 samples/second
+ Character throughput: 102,110 chars/second
+ Average per character: 9.793μs
+ ```
+
+ ## Contributing
+
+ Contributions are welcome! Feel free to submit issues or pull requests.
+
+ ## License
+
+ MIT License - see [LICENSE](./LICENSE) for details.
+
skimtoken-0.1.2.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+ skimtoken-0.1.2.dist-info/METADATA,sha256=LJ7lteSgDm9jPDhWnyJdXaq9dZcXOOIjU-wzq5Tuy0Q,8276
+ skimtoken-0.1.2.dist-info/WHEEL,sha256=oEr59ZPc2tpVnNJOPDQDxj0Rjot8eiL_zYcSYzWjqyM,127
+ skimtoken-0.1.2.dist-info/entry_points.txt,sha256=9385lCuBX4dMt4hoptXBmZt-9FxQSqfi0d8JF0B8nHg,47
+ skimtoken-0.1.2.dist-info/licenses/LICENSE,sha256=sskA_2WhrwWmzf0CYVaGiXVWfME-FY2aOLuK8UZA6XQ,1074
+ skimtoken/__init__.py,sha256=hwjHT1PNMelIlVW-uUR9q8zf3R-gSWO6pZgj09FustA,151
+ skimtoken/__init__.pyi,sha256=ILU_bxNUWtVEqpE7uIctLOKZwAB_6kOpGN9QwO0x6KA,113
+ skimtoken/_skimtoken_core.abi3.so,sha256=RDSf_spjkJ3fISFPcjOoSyZNQhVtz6SCn9ViCCRLVis,732880
+ skimtoken/cli.py,sha256=OItmWS0i4bekccdtso-vY4IJQlZEN8KgRbjkDRx9ZXU,1173
+ skimtoken-0.1.2.dist-info/RECORD,,
skimtoken-0.1.2.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: maturin (1.9.0)
+ Root-Is-Purelib: false
+ Tag: cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64
skimtoken-0.1.2.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ skimtoken=skimtoken.cli:main
skimtoken-0.1.2.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Masamune Ishihara
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.