skimtoken 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {skimtoken-0.2.0 → skimtoken-0.2.2}/Cargo.lock +1 -1
- {skimtoken-0.2.0 → skimtoken-0.2.2}/Cargo.toml +1 -1
- {skimtoken-0.2.0 → skimtoken-0.2.2}/PKG-INFO +25 -48
- {skimtoken-0.2.0 → skimtoken-0.2.2}/README.md +24 -48
- {skimtoken-0.2.0 → skimtoken-0.2.2}/pyproject.toml +1 -1
- {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/update_rust_params.py +47 -19
- {skimtoken-0.2.0 → skimtoken-0.2.2}/src/lib.rs +2 -2
- {skimtoken-0.2.0 → skimtoken-0.2.2}/src/methods/method_basic.rs +15 -15
- skimtoken-0.2.2/src/methods/method_multilingual.rs +965 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/src/methods/method_multilingual_simple.rs +72 -82
- {skimtoken-0.2.0 → skimtoken-0.2.2}/src/methods/method_simple.rs +3 -3
- skimtoken-0.2.0/src/methods/method_multilingual.rs +0 -965
- {skimtoken-0.2.0 → skimtoken-0.2.2}/.github/workflows/ci.yml +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/.github/workflows/release.yml +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/.gitignore +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/CONTRIBUTING.md +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/LICENSE +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/data/test_dataset.jsonl +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/examples/example.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/examples/multilingual_estimate.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/params/basic.toml +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/params/multilingual.toml +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/params/multilingual_simple.toml +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/params/simple.toml +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/benchmark.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/optimize/optimize_basic.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/optimize/optimize_multilingual.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/optimize/optimize_multilingual_simple.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/optimize/optimize_simple.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/optimize/utils.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/optimize_all.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/prepare_cc100_dataset.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/update_token_counts.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/__init__.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/__init__.pyi +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/basic.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/basic.pyi +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/multilingual.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/multilingual.pyi +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/multilingual_simple.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/multilingual_simple.pyi +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/simple.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/skimtoken/simple.pyi +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/src/main.rs +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/src/methods/method.rs +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/tests/test_comprehensive.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/tests/test_hypothesis.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/tests/test_simple.py +0 -0
- {skimtoken-0.2.0 → skimtoken-0.2.2}/uv.lock +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: skimtoken
|
3
|
-
Version: 0.2.0
|
3
|
+
Version: 0.2.2
|
4
4
|
License-File: LICENSE
|
5
5
|
Summary: Fast token count estimation library
|
6
6
|
Home-Page: https://github.com/masaishi/skimtoken
|
@@ -21,23 +21,16 @@ A lightweight, fast token count estimation library written in Rust with Python b
|
|
21
21
|
[](https://crates.io/crates/skimtoken)
|
22
22
|
[](https://github.com/masaishi/skimtoken/blob/main/LICENSE)
|
23
23
|
|
24
|
-
## ⚠️ Current Limitations
|
25
|
-
|
26
|
-
**This library is currently in early beta and has significant accuracy issues:**
|
27
|
-
|
28
|
-
- **Multilingual method**: Takes 48.60x longer than tiktoken due to inefficient implementation
|
29
|
-
- **Overall accuracy**: 15.11% error rate, which is too high for most use cases
|
30
|
-
|
31
24
|
|
32
25
|
## Why skimtoken?
|
33
26
|
|
34
|
-
**The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~
|
27
|
+
**The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~59.6MB of memory just to count tokens - problematic for memory-constrained environments.
|
35
28
|
|
36
29
|
**The Solution**: skimtoken estimates token counts using statistical patterns instead of loading entire vocabularies, achieving:
|
37
30
|
|
38
|
-
- ✅ **
|
39
|
-
- ✅ **
|
40
|
-
- ❌ **
|
31
|
+
- ✅ **65x less memory** (0.92MB vs 59.6MB)
|
32
|
+
- ✅ **421x faster startup** (2.389ms vs 1,005ms)
|
33
|
+
- ❌ **1.03x slower execution time** (6.912s vs 6.689s) for Multilingual single method
|
41
34
|
- ❌ Trade-off: ~15.11% error rate vs exact counts
|
42
35
|
|
43
36
|
## Installation
|
@@ -100,63 +93,46 @@ print(f"Estimated tokens (multilingual): {token_count}")
|
|
100
93
|
|
101
94
|
### Large-Scale Benchmark (100k samples)
|
102
95
|
|
103
|
-
|
96
|
+
Multilingual single method:
|
104
97
|
```
|
105
98
|
Results:
|
106
99
|
Total Samples: 100,726
|
107
100
|
Total Characters: 13,062,391
|
108
|
-
Mean RMSE:
|
109
|
-
Mean Error Rate:
|
101
|
+
Mean RMSE: 21.3034 tokens
|
102
|
+
Mean Error Rate: 15.11%
|
110
103
|
|
111
104
|
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━┓
|
112
105
|
┃ Metric ┃ tiktoken ┃ skimtoken ┃ Ratio ┃
|
113
106
|
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━┩
|
114
|
-
│ Init Time │
|
107
|
+
│ Init Time │ 1.005490 s │ 0.002389 s │ 0.002x │
|
115
108
|
├──────────────┼────────────┼────────────┼────────┤
|
116
|
-
│ Init Memory │ 42.
|
109
|
+
│ Init Memory │ 42.2310 MB │ 0.0265 MB │ 0.001x │
|
117
110
|
├──────────────┼────────────┼────────────┼────────┤
|
118
|
-
│ Exec Time │
|
111
|
+
│ Exec Time │ 6.689203 s │ 6.911931 s │ 1.033x │
|
119
112
|
├──────────────┼────────────┼────────────┼────────┤
|
120
|
-
│ Exec Memory │ 17.3251 MB │ 0.
|
113
|
+
│ Exec Memory │ 17.3251 MB │ 0.8950 MB │ 0.052x │
|
121
114
|
├──────────────┼────────────┼────────────┼────────┤
|
122
|
-
│ Total Time │
|
115
|
+
│ Total Time │ 7.694694 s │ 6.914320 s │ 0.899x │
|
123
116
|
├──────────────┼────────────┼────────────┼────────┤
|
124
|
-
│ Total Memory │ 59.
|
117
|
+
│ Total Memory │ 59.5561 MB │ 0.9215 MB │ 0.015x │
|
125
118
|
└──────────────┴────────────┴────────────┴────────┘
|
126
119
|
```
|
127
120
|
|
128
|
-
|
129
|
-
```
|
130
|
-
Results:
|
131
|
-
Total Samples: 100,726
|
132
|
-
Total Characters: 13,062,391
|
133
|
-
Mean RMSE: 21.3034 tokens
|
134
|
-
Mean Error Rate: 15.11%
|
121
|
+
### Automated Benchmarks
|
135
122
|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
├──────────────┼────────────┼──────────────┼─────────┤
|
141
|
-
│ Init Memory │ 42.2385 MB │ 0.0283 MB │ 0.001x │
|
142
|
-
├──────────────┼────────────┼──────────────┼─────────┤
|
143
|
-
│ Exec Time │ 4.594160 s │ 246.164618 s │ 53.582x │
|
144
|
-
├──────────────┼────────────┼──────────────┼─────────┤
|
145
|
-
│ Exec Memory │ 17.3251 MB │ 0.8950 MB │ 0.052x │
|
146
|
-
├──────────────┼────────────┼──────────────┼─────────┤
|
147
|
-
│ Total Time │ 5.065382 s │ 246.170825 s │ 48.599x │
|
148
|
-
├──────────────┼────────────┼──────────────┼─────────┤
|
149
|
-
│ Total Memory │ 59.5636 MB │ 0.9233 MB │ 0.016x │
|
150
|
-
└──────────────┴────────────┴──────────────┴─────────┘
|
151
|
-
```
|
123
|
+
For up-to-date performance comparisons and detailed accuracy metrics across all methods, visit the [skimtoken_benchmark](https://github.com/masaishi/skimtoken_benchmark) repository. This automated benchmark suite:
|
124
|
+
|
125
|
+
- Uses the CC-100 multilingual dataset (100k+ samples)
|
126
|
+
- Provides language-specific accuracy breakdowns
|
152
127
|
|
153
128
|
## Available Methods
|
154
129
|
|
155
130
|
| Method | Import | Memory | Error | Best For |
|
156
131
|
|--------|---------|--------|-------|----------|
|
157
|
-
| **Simple** | `from skimtoken.simple import estimate_tokens` |
|
158
|
-
| **Basic** | `from skimtoken.basic import estimate_tokens` | 0.
|
159
|
-
| **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15% | Non-English, mixed languages |
|
132
|
+
| **Simple** | `from skimtoken.simple import estimate_tokens` | 1.0MB | ~21.63% | English text, minimum memory |
|
133
|
+
| **Basic** | `from skimtoken.basic import estimate_tokens` | 0.9MB | ~27.05% | General use |
|
134
|
+
| **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15.93% | Non-English, mixed languages |
|
135
|
+
| **Multilingual Simple** | `from skimtoken.multilingual_simple import estimate_tokens` | 0.9MB | ~15.11% | Fast multilingual estimation |
|
160
136
|
|
161
137
|
```python
|
162
138
|
# Example: Choose method based on your needs
|
@@ -272,10 +248,11 @@ A: Beta = breaking changes possible.
|
|
272
248
|
We are actively working to improve skimtoken's accuracy and performance:
|
273
249
|
|
274
250
|
1. **Better estimation algorithms**: Moving beyond simple character multiplication to more sophisticated statistical models
|
275
|
-
2. **Performance optimization**:
|
251
|
+
2. **Performance optimization**: Further improving execution speed
|
276
252
|
3. **Improved language support**: Better handling of non-English languages
|
277
253
|
4. **Higher accuracy**: Targeting <10% error rate while maintaining low memory footprint
|
278
254
|
|
279
255
|
## License
|
280
256
|
|
281
257
|
MIT License - see [LICENSE](./LICENSE) for details.
|
258
|
+
|
@@ -8,23 +8,16 @@ A lightweight, fast token count estimation library written in Rust with Python b
|
|
8
8
|
[](https://crates.io/crates/skimtoken)
|
9
9
|
[](https://github.com/masaishi/skimtoken/blob/main/LICENSE)
|
10
10
|
|
11
|
-
## ⚠️ Current Limitations
|
12
|
-
|
13
|
-
**This library is currently in early beta and has significant accuracy issues:**
|
14
|
-
|
15
|
-
- **Multilingual method**: Takes 48.60x longer than tiktoken due to inefficient implementation
|
16
|
-
- **Overall accuracy**: 15.11% error rate, which is too high for most use cases
|
17
|
-
|
18
11
|
|
19
12
|
## Why skimtoken?
|
20
13
|
|
21
|
-
**The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~
|
14
|
+
**The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~59.6MB of memory just to count tokens - problematic for memory-constrained environments.
|
22
15
|
|
23
16
|
**The Solution**: skimtoken estimates token counts using statistical patterns instead of loading entire vocabularies, achieving:
|
24
17
|
|
25
|
-
- ✅ **
|
26
|
-
- ✅ **
|
27
|
-
- ❌ **
|
18
|
+
- ✅ **65x less memory** (0.92MB vs 59.6MB)
|
19
|
+
- ✅ **421x faster startup** (2.389ms vs 1,005ms)
|
20
|
+
- ❌ **1.03x slower execution time** (6.912s vs 6.689s) for Multilingual single method
|
28
21
|
- ❌ Trade-off: ~15.11% error rate vs exact counts
|
29
22
|
|
30
23
|
## Installation
|
@@ -87,63 +80,46 @@ print(f"Estimated tokens (multilingual): {token_count}")
|
|
87
80
|
|
88
81
|
### Large-Scale Benchmark (100k samples)
|
89
82
|
|
90
|
-
|
83
|
+
Multilingual single method:
|
91
84
|
```
|
92
85
|
Results:
|
93
86
|
Total Samples: 100,726
|
94
87
|
Total Characters: 13,062,391
|
95
|
-
Mean RMSE:
|
96
|
-
Mean Error Rate:
|
88
|
+
Mean RMSE: 21.3034 tokens
|
89
|
+
Mean Error Rate: 15.11%
|
97
90
|
|
98
91
|
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━┓
|
99
92
|
┃ Metric ┃ tiktoken ┃ skimtoken ┃ Ratio ┃
|
100
93
|
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━┩
|
101
|
-
│ Init Time │
|
94
|
+
│ Init Time │ 1.005490 s │ 0.002389 s │ 0.002x │
|
102
95
|
├──────────────┼────────────┼────────────┼────────┤
|
103
|
-
│ Init Memory │ 42.
|
96
|
+
│ Init Memory │ 42.2310 MB │ 0.0265 MB │ 0.001x │
|
104
97
|
├──────────────┼────────────┼────────────┼────────┤
|
105
|
-
│ Exec Time │
|
98
|
+
│ Exec Time │ 6.689203 s │ 6.911931 s │ 1.033x │
|
106
99
|
├──────────────┼────────────┼────────────┼────────┤
|
107
|
-
│ Exec Memory │ 17.3251 MB │ 0.
|
100
|
+
│ Exec Memory │ 17.3251 MB │ 0.8950 MB │ 0.052x │
|
108
101
|
├──────────────┼────────────┼────────────┼────────┤
|
109
|
-
│ Total Time │
|
102
|
+
│ Total Time │ 7.694694 s │ 6.914320 s │ 0.899x │
|
110
103
|
├──────────────┼────────────┼────────────┼────────┤
|
111
|
-
│ Total Memory │ 59.
|
104
|
+
│ Total Memory │ 59.5561 MB │ 0.9215 MB │ 0.015x │
|
112
105
|
└──────────────┴────────────┴────────────┴────────┘
|
113
106
|
```
|
114
107
|
|
115
|
-
|
116
|
-
```
|
117
|
-
Results:
|
118
|
-
Total Samples: 100,726
|
119
|
-
Total Characters: 13,062,391
|
120
|
-
Mean RMSE: 21.3034 tokens
|
121
|
-
Mean Error Rate: 15.11%
|
108
|
+
### Automated Benchmarks
|
122
109
|
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
├──────────────┼────────────┼──────────────┼─────────┤
|
128
|
-
│ Init Memory │ 42.2385 MB │ 0.0283 MB │ 0.001x │
|
129
|
-
├──────────────┼────────────┼──────────────┼─────────┤
|
130
|
-
│ Exec Time │ 4.594160 s │ 246.164618 s │ 53.582x │
|
131
|
-
├──────────────┼────────────┼──────────────┼─────────┤
|
132
|
-
│ Exec Memory │ 17.3251 MB │ 0.8950 MB │ 0.052x │
|
133
|
-
├──────────────┼────────────┼──────────────┼─────────┤
|
134
|
-
│ Total Time │ 5.065382 s │ 246.170825 s │ 48.599x │
|
135
|
-
├──────────────┼────────────┼──────────────┼─────────┤
|
136
|
-
│ Total Memory │ 59.5636 MB │ 0.9233 MB │ 0.016x │
|
137
|
-
└──────────────┴────────────┴──────────────┴─────────┘
|
138
|
-
```
|
110
|
+
For up-to-date performance comparisons and detailed accuracy metrics across all methods, visit the [skimtoken_benchmark](https://github.com/masaishi/skimtoken_benchmark) repository. This automated benchmark suite:
|
111
|
+
|
112
|
+
- Uses the CC-100 multilingual dataset (100k+ samples)
|
113
|
+
- Provides language-specific accuracy breakdowns
|
139
114
|
|
140
115
|
## Available Methods
|
141
116
|
|
142
117
|
| Method | Import | Memory | Error | Best For |
|
143
118
|
|--------|---------|--------|-------|----------|
|
144
|
-
| **Simple** | `from skimtoken.simple import estimate_tokens` |
|
145
|
-
| **Basic** | `from skimtoken.basic import estimate_tokens` | 0.
|
146
|
-
| **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15% | Non-English, mixed languages |
|
119
|
+
| **Simple** | `from skimtoken.simple import estimate_tokens` | 1.0MB | ~21.63% | English text, minimum memory |
|
120
|
+
| **Basic** | `from skimtoken.basic import estimate_tokens` | 0.9MB | ~27.05% | General use |
|
121
|
+
| **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15.93% | Non-English, mixed languages |
|
122
|
+
| **Multilingual Simple** | `from skimtoken.multilingual_simple import estimate_tokens` | 0.9MB | ~15.11% | Fast multilingual estimation |
|
147
123
|
|
148
124
|
```python
|
149
125
|
# Example: Choose method based on your needs
|
@@ -259,10 +235,10 @@ A: Beta = breaking changes possible.
|
|
259
235
|
We are actively working to improve skimtoken's accuracy and performance:
|
260
236
|
|
261
237
|
1. **Better estimation algorithms**: Moving beyond simple character multiplication to more sophisticated statistical models
|
262
|
-
2. **Performance optimization**:
|
238
|
+
2. **Performance optimization**: Further improving execution speed
|
263
239
|
3. **Improved language support**: Better handling of non-English languages
|
264
240
|
4. **Higher accuracy**: Targeting <10% error rate while maintaining low memory footprint
|
265
241
|
|
266
242
|
## License
|
267
243
|
|
268
|
-
MIT License - see [LICENSE](./LICENSE) for details.
|
244
|
+
MIT License - see [LICENSE](./LICENSE) for details.
|
@@ -4,10 +4,11 @@ Script to load parameters from TOML files and update the Rust parameter files.
|
|
4
4
|
Handles all parameter types: simple, basic, multilingual, and multilingual_simple.
|
5
5
|
"""
|
6
6
|
|
7
|
-
import toml
|
8
7
|
from pathlib import Path
|
9
8
|
from typing import Any, Callable, TypedDict
|
10
9
|
|
10
|
+
import toml
|
11
|
+
|
11
12
|
|
12
13
|
class ParamConfig(TypedDict):
|
13
14
|
name: str
|
@@ -23,6 +24,29 @@ def load_params_from_toml(toml_path: str) -> dict[str, Any]:
|
|
23
24
|
return data
|
24
25
|
|
25
26
|
|
27
|
+
def format_f32(value: float) -> str:
|
28
|
+
"""Format a float value for f32 with underscores for readability."""
|
29
|
+
str_val = f"{value:.7g}" # Use 7 significant digits for f32 precision
|
30
|
+
|
31
|
+
if "e" in str_val.lower():
|
32
|
+
return str_val
|
33
|
+
|
34
|
+
if "." in str_val:
|
35
|
+
integer_part, decimal_part = str_val.split(".")
|
36
|
+
|
37
|
+
if len(decimal_part) > 3:
|
38
|
+
formatted_decimal = ""
|
39
|
+
for i, digit in enumerate(decimal_part):
|
40
|
+
if i > 0 and i % 3 == 0:
|
41
|
+
formatted_decimal += "_"
|
42
|
+
formatted_decimal += digit
|
43
|
+
return f"{integer_part}.{formatted_decimal}"
|
44
|
+
else:
|
45
|
+
return str_val
|
46
|
+
else:
|
47
|
+
return str_val
|
48
|
+
|
49
|
+
|
26
50
|
def generate_simple_params_rust(params_data: dict[str, Any]) -> str:
|
27
51
|
"""Generate Rust code for simple parameters."""
|
28
52
|
rust_code: list[str] = []
|
@@ -30,7 +54,7 @@ def generate_simple_params_rust(params_data: dict[str, Any]) -> str:
|
|
30
54
|
rust_code.append("impl Default for SimpleParameters {")
|
31
55
|
rust_code.append(" fn default() -> Self {")
|
32
56
|
rust_code.append(" Self {")
|
33
|
-
rust_code.append(f" coefficient: {params_data['coefficient']},")
|
57
|
+
rust_code.append(f" coefficient: {format_f32(params_data['coefficient'])},")
|
34
58
|
rust_code.append(" }")
|
35
59
|
rust_code.append(" }")
|
36
60
|
rust_code.append("}")
|
@@ -45,11 +69,13 @@ def generate_basic_params_rust(params_data: dict[str, Any]) -> str:
|
|
45
69
|
rust_code.append("impl Default for BasicParameters {")
|
46
70
|
rust_code.append(" fn default() -> Self {")
|
47
71
|
rust_code.append(" Self {")
|
48
|
-
rust_code.append(f" char_coef: {params_data['char_coef']},")
|
49
|
-
rust_code.append(f" word_coef: {params_data['word_coef']},")
|
50
|
-
rust_code.append(
|
51
|
-
|
52
|
-
|
72
|
+
rust_code.append(f" char_coef: {format_f32(params_data['char_coef'])},")
|
73
|
+
rust_code.append(f" word_coef: {format_f32(params_data['word_coef'])},")
|
74
|
+
rust_code.append(
|
75
|
+
f" avg_word_length_coef: {format_f32(params_data['avg_word_length_coef'])},"
|
76
|
+
)
|
77
|
+
rust_code.append(f" space_coef: {format_f32(params_data['space_coef'])},")
|
78
|
+
rust_code.append(f" intercept: {format_f32(params_data['intercept'])},")
|
53
79
|
rust_code.append(" }")
|
54
80
|
rust_code.append(" }")
|
55
81
|
rust_code.append("}")
|
@@ -66,11 +92,13 @@ def generate_multilingual_params_rust(params_data: dict[str, Any]) -> str:
|
|
66
92
|
rust_code.append("impl Default for MultilingualParameters {")
|
67
93
|
rust_code.append(" fn default() -> Self {")
|
68
94
|
rust_code.append(" Self {")
|
69
|
-
rust_code.append(f" char_coef: {default_params['char_coef']},")
|
70
|
-
rust_code.append(f" word_coef: {default_params['word_coef']},")
|
71
|
-
rust_code.append(
|
72
|
-
|
73
|
-
|
95
|
+
rust_code.append(f" char_coef: {format_f32(default_params['char_coef'])},")
|
96
|
+
rust_code.append(f" word_coef: {format_f32(default_params['word_coef'])},")
|
97
|
+
rust_code.append(
|
98
|
+
f" avg_word_length_coef: {format_f32(default_params['avg_word_length_coef'])},"
|
99
|
+
)
|
100
|
+
rust_code.append(f" space_coef: {format_f32(default_params['space_coef'])},")
|
101
|
+
rust_code.append(f" intercept: {format_f32(default_params['intercept'])},")
|
74
102
|
rust_code.append(" }")
|
75
103
|
rust_code.append(" }")
|
76
104
|
rust_code.append("}")
|
@@ -89,13 +117,13 @@ def generate_multilingual_params_rust(params_data: dict[str, Any]) -> str:
|
|
89
117
|
rust_code.append(" language_params.insert(")
|
90
118
|
rust_code.append(f' "{lang_key}".to_string(),')
|
91
119
|
rust_code.append(" MultilingualParameters {")
|
92
|
-
rust_code.append(f" char_coef: {lang_params['char_coef']},")
|
93
|
-
rust_code.append(f" word_coef: {lang_params['word_coef']},")
|
120
|
+
rust_code.append(f" char_coef: {format_f32(lang_params['char_coef'])},")
|
121
|
+
rust_code.append(f" word_coef: {format_f32(lang_params['word_coef'])},")
|
94
122
|
rust_code.append(
|
95
|
-
f" avg_word_length_coef: {lang_params['avg_word_length_coef']},"
|
123
|
+
f" avg_word_length_coef: {format_f32(lang_params['avg_word_length_coef'])},"
|
96
124
|
)
|
97
|
-
rust_code.append(f" space_coef: {lang_params['space_coef']},")
|
98
|
-
rust_code.append(f" intercept: {lang_params['intercept']},")
|
125
|
+
rust_code.append(f" space_coef: {format_f32(lang_params['space_coef'])},")
|
126
|
+
rust_code.append(f" intercept: {format_f32(lang_params['intercept'])},")
|
99
127
|
rust_code.append(" },")
|
100
128
|
rust_code.append(" );")
|
101
129
|
rust_code.append("")
|
@@ -119,7 +147,7 @@ def generate_multilingual_simple_params_rust(params_data: dict[str, Any]) -> str
|
|
119
147
|
rust_code.append("impl Default for MultilingualSimpleParameters {")
|
120
148
|
rust_code.append(" fn default() -> Self {")
|
121
149
|
rust_code.append(" Self {")
|
122
|
-
rust_code.append(f" coefficient: {default_params['coefficient']},")
|
150
|
+
rust_code.append(f" coefficient: {format_f32(default_params['coefficient'])},")
|
123
151
|
rust_code.append(" }")
|
124
152
|
rust_code.append(" }")
|
125
153
|
rust_code.append("}")
|
@@ -137,7 +165,7 @@ def generate_multilingual_simple_params_rust(params_data: dict[str, Any]) -> str
|
|
137
165
|
rust_code.append(" language_params.insert(")
|
138
166
|
rust_code.append(f' "{lang_key}".to_string(),')
|
139
167
|
rust_code.append(" MultilingualSimpleParameters {")
|
140
|
-
rust_code.append(f" coefficient: {lang_params['coefficient']},")
|
168
|
+
rust_code.append(f" coefficient: {format_f32(lang_params['coefficient'])},")
|
141
169
|
rust_code.append(" },")
|
142
170
|
rust_code.append(" );")
|
143
171
|
rust_code.append("")
|
@@ -133,7 +133,7 @@ fn _skimtoken_core(m: &Bound<'_, PyModule>) -> PyResult<()> {
|
|
133
133
|
|
134
134
|
#[pyfn(m)]
|
135
135
|
#[pyo3(name = "count_basic")]
|
136
|
-
fn count_basic_py(text: &Bound<'_, PyAny>) -> PyResult<(usize, usize,
|
136
|
+
fn count_basic_py(text: &Bound<'_, PyAny>) -> PyResult<(usize, usize, f32, usize)> {
|
137
137
|
let text_str = if let Ok(s) = text.extract::<String>() {
|
138
138
|
s
|
139
139
|
} else {
|
@@ -153,7 +153,7 @@ fn _skimtoken_core(m: &Bound<'_, PyModule>) -> PyResult<()> {
|
|
153
153
|
#[pyo3(name = "count_multilingual")]
|
154
154
|
fn count_multilingual_py(
|
155
155
|
text: &Bound<'_, PyAny>,
|
156
|
-
) -> PyResult<(usize, usize,
|
156
|
+
) -> PyResult<(usize, usize, f32, usize, String)> {
|
157
157
|
let text_str = if let Ok(s) = text.extract::<String>() {
|
158
158
|
s
|
159
159
|
} else {
|
@@ -5,27 +5,27 @@ use serde::{Deserialize, Serialize};
|
|
5
5
|
pub struct BasicFeatures {
|
6
6
|
pub char_count: usize,
|
7
7
|
pub word_count: usize,
|
8
|
-
pub avg_word_length:
|
8
|
+
pub avg_word_length: f32,
|
9
9
|
pub space_count: usize,
|
10
10
|
}
|
11
11
|
|
12
12
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
13
13
|
pub struct BasicParameters {
|
14
|
-
pub char_coef:
|
15
|
-
pub word_coef:
|
16
|
-
pub avg_word_length_coef:
|
17
|
-
pub space_coef:
|
18
|
-
pub intercept:
|
14
|
+
pub char_coef: f32,
|
15
|
+
pub word_coef: f32,
|
16
|
+
pub avg_word_length_coef: f32,
|
17
|
+
pub space_coef: f32,
|
18
|
+
pub intercept: f32,
|
19
19
|
}
|
20
20
|
|
21
21
|
impl Default for BasicParameters {
|
22
22
|
fn default() -> Self {
|
23
23
|
Self {
|
24
|
-
char_coef: 0.
|
25
|
-
word_coef: 0.
|
26
|
-
avg_word_length_coef: 0.
|
27
|
-
space_coef: -0.
|
28
|
-
intercept: 1.
|
24
|
+
char_coef: 0.321_774_5,
|
25
|
+
word_coef: 0.070_228_82,
|
26
|
+
avg_word_length_coef: 0.509_098_2,
|
27
|
+
space_coef: -0.158_310_9,
|
28
|
+
intercept: 1.591_021,
|
29
29
|
}
|
30
30
|
}
|
31
31
|
}
|
@@ -60,7 +60,7 @@ impl EstimationMethod for BasicMethod {
|
|
60
60
|
|
61
61
|
let avg_word_length = if word_count > 0 {
|
62
62
|
let total_word_chars: usize = words.iter().map(|w| w.chars().count()).sum();
|
63
|
-
total_word_chars as
|
63
|
+
total_word_chars as f32 / word_count as f32
|
64
64
|
} else {
|
65
65
|
0.0
|
66
66
|
};
|
@@ -75,10 +75,10 @@ impl EstimationMethod for BasicMethod {
|
|
75
75
|
|
76
76
|
fn estimate(&self, text: &str) -> usize {
|
77
77
|
let features = self.count(text);
|
78
|
-
let estimate = self.parameters.char_coef * features.char_count as
|
79
|
-
+ self.parameters.word_coef * features.word_count as
|
78
|
+
let estimate = self.parameters.char_coef * features.char_count as f32
|
79
|
+
+ self.parameters.word_coef * features.word_count as f32
|
80
80
|
+ self.parameters.avg_word_length_coef * features.avg_word_length
|
81
|
-
+ self.parameters.space_coef * features.space_count as
|
81
|
+
+ self.parameters.space_coef * features.space_count as f32
|
82
82
|
+ self.parameters.intercept;
|
83
83
|
|
84
84
|
estimate.round().max(0.0) as usize
|