skimtoken 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {skimtoken-0.2.0 → skimtoken-0.2.1}/Cargo.lock +1 -1
  2. {skimtoken-0.2.0 → skimtoken-0.2.1}/Cargo.toml +1 -1
  3. {skimtoken-0.2.0 → skimtoken-0.2.1}/PKG-INFO +19 -18
  4. {skimtoken-0.2.0 → skimtoken-0.2.1}/README.md +18 -18
  5. {skimtoken-0.2.0 → skimtoken-0.2.1}/pyproject.toml +1 -1
  6. {skimtoken-0.2.0 → skimtoken-0.2.1}/.github/workflows/ci.yml +0 -0
  7. {skimtoken-0.2.0 → skimtoken-0.2.1}/.github/workflows/release.yml +0 -0
  8. {skimtoken-0.2.0 → skimtoken-0.2.1}/.gitignore +0 -0
  9. {skimtoken-0.2.0 → skimtoken-0.2.1}/CONTRIBUTING.md +0 -0
  10. {skimtoken-0.2.0 → skimtoken-0.2.1}/LICENSE +0 -0
  11. {skimtoken-0.2.0 → skimtoken-0.2.1}/data/test_dataset.jsonl +0 -0
  12. {skimtoken-0.2.0 → skimtoken-0.2.1}/examples/example.py +0 -0
  13. {skimtoken-0.2.0 → skimtoken-0.2.1}/examples/multilingual_estimate.py +0 -0
  14. {skimtoken-0.2.0 → skimtoken-0.2.1}/params/basic.toml +0 -0
  15. {skimtoken-0.2.0 → skimtoken-0.2.1}/params/multilingual.toml +0 -0
  16. {skimtoken-0.2.0 → skimtoken-0.2.1}/params/multilingual_simple.toml +0 -0
  17. {skimtoken-0.2.0 → skimtoken-0.2.1}/params/simple.toml +0 -0
  18. {skimtoken-0.2.0 → skimtoken-0.2.1}/scripts/benchmark.py +0 -0
  19. {skimtoken-0.2.0 → skimtoken-0.2.1}/scripts/optimize/optimize_basic.py +0 -0
  20. {skimtoken-0.2.0 → skimtoken-0.2.1}/scripts/optimize/optimize_multilingual.py +0 -0
  21. {skimtoken-0.2.0 → skimtoken-0.2.1}/scripts/optimize/optimize_multilingual_simple.py +0 -0
  22. {skimtoken-0.2.0 → skimtoken-0.2.1}/scripts/optimize/optimize_simple.py +0 -0
  23. {skimtoken-0.2.0 → skimtoken-0.2.1}/scripts/optimize/utils.py +0 -0
  24. {skimtoken-0.2.0 → skimtoken-0.2.1}/scripts/optimize_all.py +0 -0
  25. {skimtoken-0.2.0 → skimtoken-0.2.1}/scripts/prepare_cc100_dataset.py +0 -0
  26. {skimtoken-0.2.0 → skimtoken-0.2.1}/scripts/update_rust_params.py +0 -0
  27. {skimtoken-0.2.0 → skimtoken-0.2.1}/scripts/update_token_counts.py +0 -0
  28. {skimtoken-0.2.0 → skimtoken-0.2.1}/skimtoken/__init__.py +0 -0
  29. {skimtoken-0.2.0 → skimtoken-0.2.1}/skimtoken/__init__.pyi +0 -0
  30. {skimtoken-0.2.0 → skimtoken-0.2.1}/skimtoken/basic.py +0 -0
  31. {skimtoken-0.2.0 → skimtoken-0.2.1}/skimtoken/basic.pyi +0 -0
  32. {skimtoken-0.2.0 → skimtoken-0.2.1}/skimtoken/multilingual.py +0 -0
  33. {skimtoken-0.2.0 → skimtoken-0.2.1}/skimtoken/multilingual.pyi +0 -0
  34. {skimtoken-0.2.0 → skimtoken-0.2.1}/skimtoken/multilingual_simple.py +0 -0
  35. {skimtoken-0.2.0 → skimtoken-0.2.1}/skimtoken/multilingual_simple.pyi +0 -0
  36. {skimtoken-0.2.0 → skimtoken-0.2.1}/skimtoken/simple.py +0 -0
  37. {skimtoken-0.2.0 → skimtoken-0.2.1}/skimtoken/simple.pyi +0 -0
  38. {skimtoken-0.2.0 → skimtoken-0.2.1}/src/lib.rs +0 -0
  39. {skimtoken-0.2.0 → skimtoken-0.2.1}/src/main.rs +0 -0
  40. {skimtoken-0.2.0 → skimtoken-0.2.1}/src/methods/method.rs +0 -0
  41. {skimtoken-0.2.0 → skimtoken-0.2.1}/src/methods/method_basic.rs +0 -0
  42. {skimtoken-0.2.0 → skimtoken-0.2.1}/src/methods/method_multilingual.rs +0 -0
  43. {skimtoken-0.2.0 → skimtoken-0.2.1}/src/methods/method_multilingual_simple.rs +0 -0
  44. {skimtoken-0.2.0 → skimtoken-0.2.1}/src/methods/method_simple.rs +0 -0
  45. {skimtoken-0.2.0 → skimtoken-0.2.1}/tests/test_comprehensive.py +0 -0
  46. {skimtoken-0.2.0 → skimtoken-0.2.1}/tests/test_hypothesis.py +0 -0
  47. {skimtoken-0.2.0 → skimtoken-0.2.1}/tests/test_simple.py +0 -0
  48. {skimtoken-0.2.0 → skimtoken-0.2.1}/uv.lock +0 -0
@@ -392,7 +392,7 @@ dependencies = [
392
392
 
393
393
  [[package]]
394
394
  name = "skimtoken"
395
- version = "0.2.0"
395
+ version = "0.2.1"
396
396
  dependencies = [
397
397
  "atty",
398
398
  "clap",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "skimtoken"
3
- version = "0.2.0"
3
+ version = "0.2.1"
4
4
  edition = "2021"
5
5
  authors = ["masaishi <mwishiha@ucsc.edu>"]
6
6
  license = "MIT"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: skimtoken
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  License-File: LICENSE
5
5
  Summary: Fast token count estimation library
6
6
  Home-Page: https://github.com/masaishi/skimtoken
@@ -25,7 +25,7 @@ A lightweight, fast token count estimation library written in Rust with Python b
25
25
 
26
26
  **This library is currently in early beta and has significant accuracy issues:**
27
27
 
28
- - **Multilingual method**: Takes 48.60x longer than tiktoken due to inefficient implementation
28
+ - **Multilingual method**: Takes 1.13x longer than tiktoken due to inefficient implementation
29
29
  - **Overall accuracy**: 15.11% error rate, which is too high for most use cases
30
30
 
31
31
 
@@ -37,7 +37,7 @@ A lightweight, fast token count estimation library written in Rust with Python b
37
37
 
38
38
  - ✅ **64x less memory** (0.92MB vs 60MB)
39
39
  - ✅ **128x faster startup** (4ms vs 485ms)
40
- - ❌ **48.60x slower execution** (0.93s vs 4.59s) for multilingual method
40
+ - ❌ **1.13x slower execution** (5.52s vs 4.86s) for multilingual method
41
41
  - ❌ Trade-off: ~15.11% error rate vs exact counts
42
42
 
43
43
  ## Installation
@@ -133,21 +133,21 @@ Total Characters: 13,062,391
133
133
  Mean RMSE: 21.3034 tokens
134
134
  Mean Error Rate: 15.11%
135
135
 
136
- ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━┓
137
- ┃ Metric ┃ tiktoken ┃ skimtoken ┃ Ratio ┃
138
- ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━┩
139
- │ Init Time │ 0.471222 s │ 0.006207 s │ 0.013x │
140
- ├──────────────┼────────────┼──────────────┼─────────┤
141
- │ Init Memory │ 42.2385 MB │ 0.0283 MB │ 0.001x │
142
- ├──────────────┼────────────┼──────────────┼─────────┤
143
- │ Exec Time │ 4.594160 s │ 246.164618 s │ 53.582x │
144
- ├──────────────┼────────────┼──────────────┼─────────┤
145
- │ Exec Memory │ 17.3251 MB │ 0.8950 MB │ 0.052x │
146
- ├──────────────┼────────────┼──────────────┼─────────┤
147
- │ Total Time │ 5.065382 s │ 246.170825 s │ 48.599x │
148
- ├──────────────┼────────────┼──────────────┼─────────┤
149
- │ Total Memory │ 59.5636 MB │ 0.9233 MB │ 0.016x │
150
- └──────────────┴────────────┴──────────────┴─────────┘
136
+ ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━┓
137
+ ┃ Metric ┃ tiktoken ┃ skimtoken ┃ Ratio ┃
138
+ ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━┩
139
+ │ Init Time │ 0.815441 s │ 0.138714 s │ 0.170x │
140
+ ├──────────────┼────────────┼────────────┼────────┤
141
+ │ Init Memory │ 42.4791 MB │ 0.1613 MB │ 0.004x │
142
+ ├──────────────┼────────────┼────────────┼────────┤
143
+ │ Exec Time │ 4.041857 s │ 5.380782 s │ 1.331x │
144
+ ├──────────────┼────────────┼────────────┼────────┤
145
+ │ Exec Memory │ 17.3227 MB │ 0.8950 MB │ 0.052x │
146
+ ├──────────────┼────────────┼────────────┼────────┤
147
+ │ Total Time │ 4.857297 s │ 5.519496 s │ 1.136x │
148
+ ├──────────────┼────────────┼────────────┼────────┤
149
+ │ Total Memory │ 59.8018 MB │ 1.0563 MB │ 0.018x │
150
+ └──────────────┴────────────┴────────────┴────────┘
151
151
  ```
152
152
 
153
153
  ## Available Methods
@@ -279,3 +279,4 @@ We are actively working to improve skimtoken's accuracy and performance:
279
279
  ## License
280
280
 
281
281
  MIT License - see [LICENSE](./LICENSE) for details.
282
+
@@ -12,7 +12,7 @@ A lightweight, fast token count estimation library written in Rust with Python b
12
12
 
13
13
  **This library is currently in early beta and has significant accuracy issues:**
14
14
 
15
- - **Multilingual method**: Takes 48.60x longer than tiktoken due to inefficient implementation
15
+ - **Multilingual method**: Takes 1.13x longer than tiktoken due to inefficient implementation
16
16
  - **Overall accuracy**: 15.11% error rate, which is too high for most use cases
17
17
 
18
18
 
@@ -24,7 +24,7 @@ A lightweight, fast token count estimation library written in Rust with Python b
24
24
 
25
25
  - ✅ **64x less memory** (0.92MB vs 60MB)
26
26
  - ✅ **128x faster startup** (4ms vs 485ms)
27
- - ❌ **48.60x slower execution** (0.93s vs 4.59s) for multilingual method
27
+ - ❌ **1.13x slower execution** (5.52s vs 4.86s) for multilingual method
28
28
  - ❌ Trade-off: ~15.11% error rate vs exact counts
29
29
 
30
30
  ## Installation
@@ -120,21 +120,21 @@ Total Characters: 13,062,391
120
120
  Mean RMSE: 21.3034 tokens
121
121
  Mean Error Rate: 15.11%
122
122
 
123
- ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━┓
124
- ┃ Metric ┃ tiktoken ┃ skimtoken ┃ Ratio ┃
125
- ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━┩
126
- │ Init Time │ 0.471222 s │ 0.006207 s │ 0.013x │
127
- ├──────────────┼────────────┼──────────────┼─────────┤
128
- │ Init Memory │ 42.2385 MB │ 0.0283 MB │ 0.001x │
129
- ├──────────────┼────────────┼──────────────┼─────────┤
130
- │ Exec Time │ 4.594160 s │ 246.164618 s │ 53.582x │
131
- ├──────────────┼────────────┼──────────────┼─────────┤
132
- │ Exec Memory │ 17.3251 MB │ 0.8950 MB │ 0.052x │
133
- ├──────────────┼────────────┼──────────────┼─────────┤
134
- │ Total Time │ 5.065382 s │ 246.170825 s │ 48.599x │
135
- ├──────────────┼────────────┼──────────────┼─────────┤
136
- │ Total Memory │ 59.5636 MB │ 0.9233 MB │ 0.016x │
137
- └──────────────┴────────────┴──────────────┴─────────┘
123
+ ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━┓
124
+ ┃ Metric ┃ tiktoken ┃ skimtoken ┃ Ratio ┃
125
+ ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━┩
126
+ │ Init Time │ 0.815441 s │ 0.138714 s │ 0.170x │
127
+ ├──────────────┼────────────┼────────────┼────────┤
128
+ │ Init Memory │ 42.4791 MB │ 0.1613 MB │ 0.004x │
129
+ ├──────────────┼────────────┼────────────┼────────┤
130
+ │ Exec Time │ 4.041857 s │ 5.380782 s │ 1.331x │
131
+ ├──────────────┼────────────┼────────────┼────────┤
132
+ │ Exec Memory │ 17.3227 MB │ 0.8950 MB │ 0.052x │
133
+ ├──────────────┼────────────┼────────────┼────────┤
134
+ │ Total Time │ 4.857297 s │ 5.519496 s │ 1.136x │
135
+ ├──────────────┼────────────┼────────────┼────────┤
136
+ │ Total Memory │ 59.8018 MB │ 1.0563 MB │ 0.018x │
137
+ └──────────────┴────────────┴────────────┴────────┘
138
138
  ```
139
139
 
140
140
  ## Available Methods
@@ -265,4 +265,4 @@ We are actively working to improve skimtoken's accuracy and performance:
265
265
 
266
266
  ## License
267
267
 
268
- MIT License - see [LICENSE](./LICENSE) for details.
268
+ MIT License - see [LICENSE](./LICENSE) for details.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "skimtoken"
3
- version = "0.2.0"
3
+ version = "0.2.1"
4
4
  description = "Fast token count estimation library"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.9"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes