skimtoken 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. {skimtoken-0.2.0 → skimtoken-0.2.2}/Cargo.lock +1 -1
  2. {skimtoken-0.2.0 → skimtoken-0.2.2}/Cargo.toml +1 -1
  3. {skimtoken-0.2.0 → skimtoken-0.2.2}/PKG-INFO +25 -48
  4. {skimtoken-0.2.0 → skimtoken-0.2.2}/README.md +24 -48
  5. {skimtoken-0.2.0 → skimtoken-0.2.2}/pyproject.toml +1 -1
  6. {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/update_rust_params.py +47 -19
  7. {skimtoken-0.2.0 → skimtoken-0.2.2}/src/lib.rs +2 -2
  8. {skimtoken-0.2.0 → skimtoken-0.2.2}/src/methods/method_basic.rs +15 -15
  9. skimtoken-0.2.2/src/methods/method_multilingual.rs +965 -0
  10. {skimtoken-0.2.0 → skimtoken-0.2.2}/src/methods/method_multilingual_simple.rs +72 -82
  11. {skimtoken-0.2.0 → skimtoken-0.2.2}/src/methods/method_simple.rs +3 -3
  12. skimtoken-0.2.0/src/methods/method_multilingual.rs +0 -965
  13. {skimtoken-0.2.0 → skimtoken-0.2.2}/.github/workflows/ci.yml +0 -0
  14. {skimtoken-0.2.0 → skimtoken-0.2.2}/.github/workflows/release.yml +0 -0
  15. {skimtoken-0.2.0 → skimtoken-0.2.2}/.gitignore +0 -0
  16. {skimtoken-0.2.0 → skimtoken-0.2.2}/CONTRIBUTING.md +0 -0
  17. {skimtoken-0.2.0 → skimtoken-0.2.2}/LICENSE +0 -0
  18. {skimtoken-0.2.0 → skimtoken-0.2.2}/data/test_dataset.jsonl +0 -0
  19. {skimtoken-0.2.0 → skimtoken-0.2.2}/examples/example.py +0 -0
  20. {skimtoken-0.2.0 → skimtoken-0.2.2}/examples/multilingual_estimate.py +0 -0
  21. {skimtoken-0.2.0 → skimtoken-0.2.2}/params/basic.toml +0 -0
  22. {skimtoken-0.2.0 → skimtoken-0.2.2}/params/multilingual.toml +0 -0
  23. {skimtoken-0.2.0 → skimtoken-0.2.2}/params/multilingual_simple.toml +0 -0
  24. {skimtoken-0.2.0 → skimtoken-0.2.2}/params/simple.toml +0 -0
  25. {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/benchmark.py +0 -0
  26. {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/optimize/optimize_basic.py +0 -0
  27. {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/optimize/optimize_multilingual.py +0 -0
  28. {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/optimize/optimize_multilingual_simple.py +0 -0
  29. {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/optimize/optimize_simple.py +0 -0
  30. {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/optimize/utils.py +0 -0
  31. {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/optimize_all.py +0 -0
  32. {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/prepare_cc100_dataset.py +0 -0
  33. {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/update_token_counts.py +0 -0
  34. {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/__init__.py +0 -0
  35. {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/__init__.pyi +0 -0
  36. {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/basic.py +0 -0
  37. {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/basic.pyi +0 -0
  38. {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/multilingual.py +0 -0
  39. {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/multilingual.pyi +0 -0
  40. {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/multilingual_simple.py +0 -0
  41. {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/multilingual_simple.pyi +0 -0
  42. {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/simple.py +0 -0
  43. {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/simple.pyi +0 -0
  44. {skimtoken-0.2.0 → skimtoken-0.2.2}/src/main.rs +0 -0
  45. {skimtoken-0.2.0 → skimtoken-0.2.2}/src/methods/method.rs +0 -0
  46. {skimtoken-0.2.0 → skimtoken-0.2.2}/tests/test_comprehensive.py +0 -0
  47. {skimtoken-0.2.0 → skimtoken-0.2.2}/tests/test_hypothesis.py +0 -0
  48. {skimtoken-0.2.0 → skimtoken-0.2.2}/tests/test_simple.py +0 -0
  49. {skimtoken-0.2.0 → skimtoken-0.2.2}/uv.lock +0 -0
@@ -392,7 +392,7 @@ dependencies = [
392
392
 
393
393
  [[package]]
394
394
  name = "skimtoken"
395
- version = "0.2.0"
395
+ version = "0.2.2"
396
396
  dependencies = [
397
397
  "atty",
398
398
  "clap",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "skimtoken"
3
- version = "0.2.0"
3
+ version = "0.2.2"
4
4
  edition = "2021"
5
5
  authors = ["masaishi <mwishiha@ucsc.edu>"]
6
6
  license = "MIT"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: skimtoken
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  License-File: LICENSE
5
5
  Summary: Fast token count estimation library
6
6
  Home-Page: https://github.com/masaishi/skimtoken
@@ -21,23 +21,16 @@ A lightweight, fast token count estimation library written in Rust with Python b
21
21
  [![Crates.io](https://img.shields.io/crates/v/skimtoken)](https://crates.io/crates/skimtoken)
22
22
  [![License](https://img.shields.io/github/license/masaishi/skimtoken)](https://github.com/masaishi/skimtoken/blob/main/LICENSE)
23
23
 
24
- ## ⚠️ Current Limitations
25
-
26
- **This library is currently in early beta and has significant accuracy issues:**
27
-
28
- - **Multilingual method**: Takes 48.60x longer than tiktoken due to inefficient implementation
29
- - **Overall accuracy**: 15.11% error rate, which is too high for most use cases
30
-
31
24
 
32
25
  ## Why skimtoken?
33
26
 
34
- **The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~60MB of memory just to count tokens - problematic for memory-constrained environments.
27
+ **The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~59.6MB of memory just to count tokens - problematic for memory-constrained environments.
35
28
 
36
29
  **The Solution**: skimtoken estimates token counts using statistical patterns instead of loading entire vocabularies, achieving:
37
30
 
38
- - ✅ **64x less memory** (0.92MB vs 60MB)
39
- - ✅ **128x faster startup** (4ms vs 485ms)
40
- - ❌ **48.60x slower execution** (0.93s vs 4.59s) for multilingual method
31
+ - ✅ **65x less memory** (0.92MB vs 59.6MB)
32
+ - ✅ **421x faster startup** (2.389ms vs 1,005ms)
33
+ - ❌ **1.03x slower execution time** (6.912s vs 6.689s) for the Multilingual Simple method
41
34
  - ❌ Trade-off: ~15.11% error rate vs exact counts
42
35
 
43
36
  ## Installation
@@ -100,63 +93,46 @@ print(f"Estimated tokens (multilingual): {token_count}")
100
93
 
101
94
  ### Large-Scale Benchmark (100k samples)
102
95
 
103
- Simple method (Just char length x coefficient):
96
+ Multilingual simple method:
104
97
  ```
105
98
  Results:
106
99
  Total Samples: 100,726
107
100
  Total Characters: 13,062,391
108
- Mean RMSE: 38.4863 tokens
109
- Mean Error Rate: 21.63%
101
+ Mean RMSE: 21.3034 tokens
102
+ Mean Error Rate: 15.11%
110
103
 
111
104
  ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━┓
112
105
  ┃ Metric ┃ tiktoken ┃ skimtoken ┃ Ratio ┃
113
106
  ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━┩
114
- │ Init Time │ 0.481672 s │ 0.182308 s │ 0.378x
107
+ │ Init Time │ 1.005490 s │ 0.002389 s │ 0.002x │
115
108
  ├──────────────┼────────────┼────────────┼────────┤
116
- │ Init Memory │ 42.2386 MB │ 0.0291 MB │ 0.001x │
109
+ │ Init Memory │ 42.2310 MB │ 0.0265 MB │ 0.001x │
117
110
  ├──────────────┼────────────┼────────────┼────────┤
118
- │ Exec Time │ 4.710224 s │ 0.805272 s │ 0.171x
111
+ │ Exec Time │ 6.689203 s │ 6.911931 s │ 1.033x │
119
112
  ├──────────────┼────────────┼────────────┼────────┤
120
- │ Exec Memory │ 17.3251 MB │ 0.8849 MB │ 0.051x
113
+ │ Exec Memory │ 17.3251 MB │ 0.8950 MB │ 0.052x │
121
114
  ├──────────────┼────────────┼────────────┼────────┤
122
- │ Total Time │ 5.191896 s │ 0.928758 s │ 0.190x
115
+ │ Total Time │ 7.694694 s │ 6.914320 s │ 0.899x │
123
116
  ├──────────────┼────────────┼────────────┼────────┤
124
- │ Total Memory │ 59.5637 MB │ 0.9214 MB │ 0.015x │
117
+ │ Total Memory │ 59.5561 MB │ 0.9215 MB │ 0.015x │
125
118
  └──────────────┴────────────┴────────────┴────────┘
126
119
  ```
127
120
 
128
- Multilingual simple method:
129
- ```
130
- Results:
131
- Total Samples: 100,726
132
- Total Characters: 13,062,391
133
- Mean RMSE: 21.3034 tokens
134
- Mean Error Rate: 15.11%
121
+ ### Automated Benchmarks
135
122
 
136
- ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━┓
137
- ┃ Metric ┃ tiktoken ┃ skimtoken ┃ Ratio ┃
138
- ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━┩
139
- Init Time │ 0.471222 s │ 0.006207 s │ 0.013x │
140
- ├──────────────┼────────────┼──────────────┼─────────┤
141
- │ Init Memory │ 42.2385 MB │ 0.0283 MB │ 0.001x │
142
- ├──────────────┼────────────┼──────────────┼─────────┤
143
- │ Exec Time │ 4.594160 s │ 246.164618 s │ 53.582x │
144
- ├──────────────┼────────────┼──────────────┼─────────┤
145
- │ Exec Memory │ 17.3251 MB │ 0.8950 MB │ 0.052x │
146
- ├──────────────┼────────────┼──────────────┼─────────┤
147
- │ Total Time │ 5.065382 s │ 246.170825 s │ 48.599x │
148
- ├──────────────┼────────────┼──────────────┼─────────┤
149
- │ Total Memory │ 59.5636 MB │ 0.9233 MB │ 0.016x │
150
- └──────────────┴────────────┴──────────────┴─────────┘
151
- ```
123
+ For up-to-date performance comparisons and detailed accuracy metrics across all methods, visit the [skimtoken_benchmark](https://github.com/masaishi/skimtoken_benchmark) repository. This automated benchmark suite:
124
+
125
+ - Uses the CC-100 multilingual dataset (100k+ samples)
126
+ - Provides language-specific accuracy breakdowns
152
127
 
153
128
  ## Available Methods
154
129
 
155
130
  | Method | Import | Memory | Error | Best For |
156
131
  |--------|---------|--------|-------|----------|
157
- | **Simple** | `from skimtoken.simple import estimate_tokens` | 0.8MB | ~21% | English text, minimum memory |
158
- | **Basic** | `from skimtoken.basic import estimate_tokens` | 0.8MB | ~27% | General use |
159
- | **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15% | Non-English, mixed languages |
132
+ | **Simple** | `from skimtoken.simple import estimate_tokens` | 1.0MB | ~21.63% | English text, minimum memory |
133
+ | **Basic** | `from skimtoken.basic import estimate_tokens` | 0.9MB | ~27.05% | General use |
134
+ | **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15.93% | Non-English, mixed languages |
135
+ | **Multilingual Simple** | `from skimtoken.multilingual_simple import estimate_tokens` | 0.9MB | ~15.11% | Fast multilingual estimation |
160
136
 
161
137
  ```python
162
138
  # Example: Choose method based on your needs
@@ -272,10 +248,11 @@ A: Beta = breaking changes possible.
272
248
  We are actively working to improve skimtoken's accuracy and performance:
273
249
 
274
250
  1. **Better estimation algorithms**: Moving beyond simple character multiplication to more sophisticated statistical models
275
- 2. **Performance optimization**: Fixing the 60x slowdown in multilingual method
251
+ 2. **Performance optimization**: Further improving execution speed
276
252
  3. **Improved language support**: Better handling of non-English languages
277
253
  4. **Higher accuracy**: Targeting <10% error rate while maintaining low memory footprint
278
254
 
279
255
  ## License
280
256
 
281
257
  MIT License - see [LICENSE](./LICENSE) for details.
258
+
@@ -8,23 +8,16 @@ A lightweight, fast token count estimation library written in Rust with Python b
8
8
  [![Crates.io](https://img.shields.io/crates/v/skimtoken)](https://crates.io/crates/skimtoken)
9
9
  [![License](https://img.shields.io/github/license/masaishi/skimtoken)](https://github.com/masaishi/skimtoken/blob/main/LICENSE)
10
10
 
11
- ## ⚠️ Current Limitations
12
-
13
- **This library is currently in early beta and has significant accuracy issues:**
14
-
15
- - **Multilingual method**: Takes 48.60x longer than tiktoken due to inefficient implementation
16
- - **Overall accuracy**: 15.11% error rate, which is too high for most use cases
17
-
18
11
 
19
12
  ## Why skimtoken?
20
13
 
21
- **The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~60MB of memory just to count tokens - problematic for memory-constrained environments.
14
+ **The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~59.6MB of memory just to count tokens - problematic for memory-constrained environments.
22
15
 
23
16
  **The Solution**: skimtoken estimates token counts using statistical patterns instead of loading entire vocabularies, achieving:
24
17
 
25
- - ✅ **64x less memory** (0.92MB vs 60MB)
26
- - ✅ **128x faster startup** (4ms vs 485ms)
27
- - ❌ **48.60x slower execution** (0.93s vs 4.59s) for multilingual method
18
+ - ✅ **65x less memory** (0.92MB vs 59.6MB)
19
+ - ✅ **421x faster startup** (2.389ms vs 1,005ms)
20
+ - ❌ **1.03x slower execution time** (6.912s vs 6.689s) for the Multilingual Simple method
28
21
  - ❌ Trade-off: ~15.11% error rate vs exact counts
29
22
 
30
23
  ## Installation
@@ -87,63 +80,46 @@ print(f"Estimated tokens (multilingual): {token_count}")
87
80
 
88
81
  ### Large-Scale Benchmark (100k samples)
89
82
 
90
- Simple method (Just char length x coefficient):
83
+ Multilingual simple method:
91
84
  ```
92
85
  Results:
93
86
  Total Samples: 100,726
94
87
  Total Characters: 13,062,391
95
- Mean RMSE: 38.4863 tokens
96
- Mean Error Rate: 21.63%
88
+ Mean RMSE: 21.3034 tokens
89
+ Mean Error Rate: 15.11%
97
90
 
98
91
  ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━┓
99
92
  ┃ Metric ┃ tiktoken ┃ skimtoken ┃ Ratio ┃
100
93
  ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━┩
101
- │ Init Time │ 0.481672 s │ 0.182308 s │ 0.378x
94
+ │ Init Time │ 1.005490 s │ 0.002389 s │ 0.002x │
102
95
  ├──────────────┼────────────┼────────────┼────────┤
103
- │ Init Memory │ 42.2386 MB │ 0.0291 MB │ 0.001x │
96
+ │ Init Memory │ 42.2310 MB │ 0.0265 MB │ 0.001x │
104
97
  ├──────────────┼────────────┼────────────┼────────┤
105
- │ Exec Time │ 4.710224 s │ 0.805272 s │ 0.171x
98
+ │ Exec Time │ 6.689203 s │ 6.911931 s │ 1.033x │
106
99
  ├──────────────┼────────────┼────────────┼────────┤
107
- │ Exec Memory │ 17.3251 MB │ 0.8849 MB │ 0.051x
100
+ │ Exec Memory │ 17.3251 MB │ 0.8950 MB │ 0.052x │
108
101
  ├──────────────┼────────────┼────────────┼────────┤
109
- │ Total Time │ 5.191896 s │ 0.928758 s │ 0.190x
102
+ │ Total Time │ 7.694694 s │ 6.914320 s │ 0.899x │
110
103
  ├──────────────┼────────────┼────────────┼────────┤
111
- │ Total Memory │ 59.5637 MB │ 0.9214 MB │ 0.015x │
104
+ │ Total Memory │ 59.5561 MB │ 0.9215 MB │ 0.015x │
112
105
  └──────────────┴────────────┴────────────┴────────┘
113
106
  ```
114
107
 
115
- Multilingual simple method:
116
- ```
117
- Results:
118
- Total Samples: 100,726
119
- Total Characters: 13,062,391
120
- Mean RMSE: 21.3034 tokens
121
- Mean Error Rate: 15.11%
108
+ ### Automated Benchmarks
122
109
 
123
- ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━┓
124
- ┃ Metric ┃ tiktoken ┃ skimtoken ┃ Ratio ┃
125
- ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━┩
126
- Init Time │ 0.471222 s │ 0.006207 s │ 0.013x │
127
- ├──────────────┼────────────┼──────────────┼─────────┤
128
- │ Init Memory │ 42.2385 MB │ 0.0283 MB │ 0.001x │
129
- ├──────────────┼────────────┼──────────────┼─────────┤
130
- │ Exec Time │ 4.594160 s │ 246.164618 s │ 53.582x │
131
- ├──────────────┼────────────┼──────────────┼─────────┤
132
- │ Exec Memory │ 17.3251 MB │ 0.8950 MB │ 0.052x │
133
- ├──────────────┼────────────┼──────────────┼─────────┤
134
- │ Total Time │ 5.065382 s │ 246.170825 s │ 48.599x │
135
- ├──────────────┼────────────┼──────────────┼─────────┤
136
- │ Total Memory │ 59.5636 MB │ 0.9233 MB │ 0.016x │
137
- └──────────────┴────────────┴──────────────┴─────────┘
138
- ```
110
+ For up-to-date performance comparisons and detailed accuracy metrics across all methods, visit the [skimtoken_benchmark](https://github.com/masaishi/skimtoken_benchmark) repository. This automated benchmark suite:
111
+
112
+ - Uses the CC-100 multilingual dataset (100k+ samples)
113
+ - Provides language-specific accuracy breakdowns
139
114
 
140
115
  ## Available Methods
141
116
 
142
117
  | Method | Import | Memory | Error | Best For |
143
118
  |--------|---------|--------|-------|----------|
144
- | **Simple** | `from skimtoken.simple import estimate_tokens` | 0.8MB | ~21% | English text, minimum memory |
145
- | **Basic** | `from skimtoken.basic import estimate_tokens` | 0.8MB | ~27% | General use |
146
- | **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15% | Non-English, mixed languages |
119
+ | **Simple** | `from skimtoken.simple import estimate_tokens` | 1.0MB | ~21.63% | English text, minimum memory |
120
+ | **Basic** | `from skimtoken.basic import estimate_tokens` | 0.9MB | ~27.05% | General use |
121
+ | **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15.93% | Non-English, mixed languages |
122
+ | **Multilingual Simple** | `from skimtoken.multilingual_simple import estimate_tokens` | 0.9MB | ~15.11% | Fast multilingual estimation |
147
123
 
148
124
  ```python
149
125
  # Example: Choose method based on your needs
@@ -259,10 +235,10 @@ A: Beta = breaking changes possible.
259
235
  We are actively working to improve skimtoken's accuracy and performance:
260
236
 
261
237
  1. **Better estimation algorithms**: Moving beyond simple character multiplication to more sophisticated statistical models
262
- 2. **Performance optimization**: Fixing the 60x slowdown in multilingual method
238
+ 2. **Performance optimization**: Further improving execution speed
263
239
  3. **Improved language support**: Better handling of non-English languages
264
240
  4. **Higher accuracy**: Targeting <10% error rate while maintaining low memory footprint
265
241
 
266
242
  ## License
267
243
 
268
- MIT License - see [LICENSE](./LICENSE) for details.
244
+ MIT License - see [LICENSE](./LICENSE) for details.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "skimtoken"
3
- version = "0.2.0"
3
+ version = "0.2.2"
4
4
  description = "Fast token count estimation library"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.9"
@@ -4,10 +4,11 @@ Script to load parameters from TOML files and update the Rust parameter files.
4
4
  Handles all parameter types: simple, basic, multilingual, and multilingual_simple.
5
5
  """
6
6
 
7
- import toml
8
7
  from pathlib import Path
9
8
  from typing import Any, Callable, TypedDict
10
9
 
10
+ import toml
11
+
11
12
 
12
13
  class ParamConfig(TypedDict):
13
14
  name: str
@@ -23,6 +24,29 @@ def load_params_from_toml(toml_path: str) -> dict[str, Any]:
23
24
  return data
24
25
 
25
26
 
27
+ def format_f32(value: float) -> str:
28
+ """Format a float value for f32 with underscores for readability."""
29
+ str_val = f"{value:.7g}" # Use 7 significant digits for f32 precision
30
+
31
+ if "e" in str_val.lower():
32
+ return str_val
33
+
34
+ if "." in str_val:
35
+ integer_part, decimal_part = str_val.split(".")
36
+
37
+ if len(decimal_part) > 3:
38
+ formatted_decimal = ""
39
+ for i, digit in enumerate(decimal_part):
40
+ if i > 0 and i % 3 == 0:
41
+ formatted_decimal += "_"
42
+ formatted_decimal += digit
43
+ return f"{integer_part}.{formatted_decimal}"
44
+ else:
45
+ return str_val
46
+ else:
47
+ return str_val
48
+
49
+
26
50
  def generate_simple_params_rust(params_data: dict[str, Any]) -> str:
27
51
  """Generate Rust code for simple parameters."""
28
52
  rust_code: list[str] = []
@@ -30,7 +54,7 @@ def generate_simple_params_rust(params_data: dict[str, Any]) -> str:
30
54
  rust_code.append("impl Default for SimpleParameters {")
31
55
  rust_code.append(" fn default() -> Self {")
32
56
  rust_code.append(" Self {")
33
- rust_code.append(f" coefficient: {params_data['coefficient']},")
57
+ rust_code.append(f" coefficient: {format_f32(params_data['coefficient'])},")
34
58
  rust_code.append(" }")
35
59
  rust_code.append(" }")
36
60
  rust_code.append("}")
@@ -45,11 +69,13 @@ def generate_basic_params_rust(params_data: dict[str, Any]) -> str:
45
69
  rust_code.append("impl Default for BasicParameters {")
46
70
  rust_code.append(" fn default() -> Self {")
47
71
  rust_code.append(" Self {")
48
- rust_code.append(f" char_coef: {params_data['char_coef']},")
49
- rust_code.append(f" word_coef: {params_data['word_coef']},")
50
- rust_code.append(f" avg_word_length_coef: {params_data['avg_word_length_coef']},")
51
- rust_code.append(f" space_coef: {params_data['space_coef']},")
52
- rust_code.append(f" intercept: {params_data['intercept']},")
72
+ rust_code.append(f" char_coef: {format_f32(params_data['char_coef'])},")
73
+ rust_code.append(f" word_coef: {format_f32(params_data['word_coef'])},")
74
+ rust_code.append(
75
+ f" avg_word_length_coef: {format_f32(params_data['avg_word_length_coef'])},"
76
+ )
77
+ rust_code.append(f" space_coef: {format_f32(params_data['space_coef'])},")
78
+ rust_code.append(f" intercept: {format_f32(params_data['intercept'])},")
53
79
  rust_code.append(" }")
54
80
  rust_code.append(" }")
55
81
  rust_code.append("}")
@@ -66,11 +92,13 @@ def generate_multilingual_params_rust(params_data: dict[str, Any]) -> str:
66
92
  rust_code.append("impl Default for MultilingualParameters {")
67
93
  rust_code.append(" fn default() -> Self {")
68
94
  rust_code.append(" Self {")
69
- rust_code.append(f" char_coef: {default_params['char_coef']},")
70
- rust_code.append(f" word_coef: {default_params['word_coef']},")
71
- rust_code.append(f" avg_word_length_coef: {default_params['avg_word_length_coef']},")
72
- rust_code.append(f" space_coef: {default_params['space_coef']},")
73
- rust_code.append(f" intercept: {default_params['intercept']},")
95
+ rust_code.append(f" char_coef: {format_f32(default_params['char_coef'])},")
96
+ rust_code.append(f" word_coef: {format_f32(default_params['word_coef'])},")
97
+ rust_code.append(
98
+ f" avg_word_length_coef: {format_f32(default_params['avg_word_length_coef'])},"
99
+ )
100
+ rust_code.append(f" space_coef: {format_f32(default_params['space_coef'])},")
101
+ rust_code.append(f" intercept: {format_f32(default_params['intercept'])},")
74
102
  rust_code.append(" }")
75
103
  rust_code.append(" }")
76
104
  rust_code.append("}")
@@ -89,13 +117,13 @@ def generate_multilingual_params_rust(params_data: dict[str, Any]) -> str:
89
117
  rust_code.append(" language_params.insert(")
90
118
  rust_code.append(f' "{lang_key}".to_string(),')
91
119
  rust_code.append(" MultilingualParameters {")
92
- rust_code.append(f" char_coef: {lang_params['char_coef']},")
93
- rust_code.append(f" word_coef: {lang_params['word_coef']},")
120
+ rust_code.append(f" char_coef: {format_f32(lang_params['char_coef'])},")
121
+ rust_code.append(f" word_coef: {format_f32(lang_params['word_coef'])},")
94
122
  rust_code.append(
95
- f" avg_word_length_coef: {lang_params['avg_word_length_coef']},"
123
+ f" avg_word_length_coef: {format_f32(lang_params['avg_word_length_coef'])},"
96
124
  )
97
- rust_code.append(f" space_coef: {lang_params['space_coef']},")
98
- rust_code.append(f" intercept: {lang_params['intercept']},")
125
+ rust_code.append(f" space_coef: {format_f32(lang_params['space_coef'])},")
126
+ rust_code.append(f" intercept: {format_f32(lang_params['intercept'])},")
99
127
  rust_code.append(" },")
100
128
  rust_code.append(" );")
101
129
  rust_code.append("")
@@ -119,7 +147,7 @@ def generate_multilingual_simple_params_rust(params_data: dict[str, Any]) -> str
119
147
  rust_code.append("impl Default for MultilingualSimpleParameters {")
120
148
  rust_code.append(" fn default() -> Self {")
121
149
  rust_code.append(" Self {")
122
- rust_code.append(f" coefficient: {default_params['coefficient']},")
150
+ rust_code.append(f" coefficient: {format_f32(default_params['coefficient'])},")
123
151
  rust_code.append(" }")
124
152
  rust_code.append(" }")
125
153
  rust_code.append("}")
@@ -137,7 +165,7 @@ def generate_multilingual_simple_params_rust(params_data: dict[str, Any]) -> str
137
165
  rust_code.append(" language_params.insert(")
138
166
  rust_code.append(f' "{lang_key}".to_string(),')
139
167
  rust_code.append(" MultilingualSimpleParameters {")
140
- rust_code.append(f" coefficient: {lang_params['coefficient']},")
168
+ rust_code.append(f" coefficient: {format_f32(lang_params['coefficient'])},")
141
169
  rust_code.append(" },")
142
170
  rust_code.append(" );")
143
171
  rust_code.append("")
@@ -133,7 +133,7 @@ fn _skimtoken_core(m: &Bound<'_, PyModule>) -> PyResult<()> {
133
133
 
134
134
  #[pyfn(m)]
135
135
  #[pyo3(name = "count_basic")]
136
- fn count_basic_py(text: &Bound<'_, PyAny>) -> PyResult<(usize, usize, f64, usize)> {
136
+ fn count_basic_py(text: &Bound<'_, PyAny>) -> PyResult<(usize, usize, f32, usize)> {
137
137
  let text_str = if let Ok(s) = text.extract::<String>() {
138
138
  s
139
139
  } else {
@@ -153,7 +153,7 @@ fn _skimtoken_core(m: &Bound<'_, PyModule>) -> PyResult<()> {
153
153
  #[pyo3(name = "count_multilingual")]
154
154
  fn count_multilingual_py(
155
155
  text: &Bound<'_, PyAny>,
156
- ) -> PyResult<(usize, usize, f64, usize, String)> {
156
+ ) -> PyResult<(usize, usize, f32, usize, String)> {
157
157
  let text_str = if let Ok(s) = text.extract::<String>() {
158
158
  s
159
159
  } else {
@@ -5,27 +5,27 @@ use serde::{Deserialize, Serialize};
5
5
  pub struct BasicFeatures {
6
6
  pub char_count: usize,
7
7
  pub word_count: usize,
8
- pub avg_word_length: f64,
8
+ pub avg_word_length: f32,
9
9
  pub space_count: usize,
10
10
  }
11
11
 
12
12
  #[derive(Debug, Clone, Serialize, Deserialize)]
13
13
  pub struct BasicParameters {
14
- pub char_coef: f64,
15
- pub word_coef: f64,
16
- pub avg_word_length_coef: f64,
17
- pub space_coef: f64,
18
- pub intercept: f64,
14
+ pub char_coef: f32,
15
+ pub word_coef: f32,
16
+ pub avg_word_length_coef: f32,
17
+ pub space_coef: f32,
18
+ pub intercept: f32,
19
19
  }
20
20
 
21
21
  impl Default for BasicParameters {
22
22
  fn default() -> Self {
23
23
  Self {
24
- char_coef: 0.3217745347518016,
25
- word_coef: 0.07022881669049061,
26
- avg_word_length_coef: 0.5090982427870748,
27
- space_coef: -0.15831091236345404,
28
- intercept: 1.591021053665763,
24
+ char_coef: 0.321_774_5,
25
+ word_coef: 0.070_228_82,
26
+ avg_word_length_coef: 0.509_098_2,
27
+ space_coef: -0.158_310_9,
28
+ intercept: 1.591_021,
29
29
  }
30
30
  }
31
31
  }
@@ -60,7 +60,7 @@ impl EstimationMethod for BasicMethod {
60
60
 
61
61
  let avg_word_length = if word_count > 0 {
62
62
  let total_word_chars: usize = words.iter().map(|w| w.chars().count()).sum();
63
- total_word_chars as f64 / word_count as f64
63
+ total_word_chars as f32 / word_count as f32
64
64
  } else {
65
65
  0.0
66
66
  };
@@ -75,10 +75,10 @@ impl EstimationMethod for BasicMethod {
75
75
 
76
76
  fn estimate(&self, text: &str) -> usize {
77
77
  let features = self.count(text);
78
- let estimate = self.parameters.char_coef * features.char_count as f64
79
- + self.parameters.word_coef * features.word_count as f64
78
+ let estimate = self.parameters.char_coef * features.char_count as f32
79
+ + self.parameters.word_coef * features.word_count as f32
80
80
  + self.parameters.avg_word_length_coef * features.avg_word_length
81
- + self.parameters.space_coef * features.space_count as f64
81
+ + self.parameters.space_coef * features.space_count as f32
82
82
  + self.parameters.intercept;
83
83
 
84
84
  estimate.round().max(0.0) as usize