skimtoken 0.2.1__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {skimtoken-0.2.1 → skimtoken-0.2.2}/Cargo.lock +1 -1
- {skimtoken-0.2.1 → skimtoken-0.2.2}/Cargo.toml +1 -1
- {skimtoken-0.2.1 → skimtoken-0.2.2}/PKG-INFO +24 -48
- {skimtoken-0.2.1 → skimtoken-0.2.2}/README.md +23 -47
- {skimtoken-0.2.1 → skimtoken-0.2.2}/pyproject.toml +1 -1
- {skimtoken-0.2.1 → skimtoken-0.2.2}/scripts/update_rust_params.py +47 -19
- {skimtoken-0.2.1 → skimtoken-0.2.2}/src/lib.rs +2 -2
- {skimtoken-0.2.1 → skimtoken-0.2.2}/src/methods/method_basic.rs +15 -15
- skimtoken-0.2.2/src/methods/method_multilingual.rs +965 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/src/methods/method_multilingual_simple.rs +72 -82
- {skimtoken-0.2.1 → skimtoken-0.2.2}/src/methods/method_simple.rs +3 -3
- skimtoken-0.2.1/src/methods/method_multilingual.rs +0 -965
- {skimtoken-0.2.1 → skimtoken-0.2.2}/.github/workflows/ci.yml +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/.github/workflows/release.yml +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/.gitignore +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/CONTRIBUTING.md +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/LICENSE +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/data/test_dataset.jsonl +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/examples/example.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/examples/multilingual_estimate.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/params/basic.toml +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/params/multilingual.toml +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/params/multilingual_simple.toml +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/params/simple.toml +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/scripts/benchmark.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/scripts/optimize/optimize_basic.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/scripts/optimize/optimize_multilingual.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/scripts/optimize/optimize_multilingual_simple.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/scripts/optimize/optimize_simple.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/scripts/optimize/utils.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/scripts/optimize_all.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/scripts/prepare_cc100_dataset.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/scripts/update_token_counts.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/skimtoken/__init__.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/skimtoken/__init__.pyi +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/skimtoken/basic.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/skimtoken/basic.pyi +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/skimtoken/multilingual.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/skimtoken/multilingual.pyi +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/skimtoken/multilingual_simple.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/skimtoken/multilingual_simple.pyi +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/skimtoken/simple.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/skimtoken/simple.pyi +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/src/main.rs +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/src/methods/method.rs +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/tests/test_comprehensive.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/tests/test_hypothesis.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/tests/test_simple.py +0 -0
- {skimtoken-0.2.1 → skimtoken-0.2.2}/uv.lock +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: skimtoken
|
3
|
-
Version: 0.2.1
|
3
|
+
Version: 0.2.2
|
4
4
|
License-File: LICENSE
|
5
5
|
Summary: Fast token count estimation library
|
6
6
|
Home-Page: https://github.com/masaishi/skimtoken
|
@@ -21,23 +21,16 @@ A lightweight, fast token count estimation library written in Rust with Python b
|
|
21
21
|
[](https://crates.io/crates/skimtoken)
|
22
22
|
[](https://github.com/masaishi/skimtoken/blob/main/LICENSE)
|
23
23
|
|
24
|
-
## ⚠️ Current Limitations
|
25
|
-
|
26
|
-
**This library is currently in early beta and has significant accuracy issues:**
|
27
|
-
|
28
|
-
- **Multilingual method**: Takes 1.13x longer than tiktoken due to inefficient implementation
|
29
|
-
- **Overall accuracy**: 15.11% error rate, which is too high for most use cases
|
30
|
-
|
31
24
|
|
32
25
|
## Why skimtoken?
|
33
26
|
|
34
|
-
**The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~
|
27
|
+
**The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~59.6MB of memory just to count tokens - problematic for memory-constrained environments.
|
35
28
|
|
36
29
|
**The Solution**: skimtoken estimates token counts using statistical patterns instead of loading entire vocabularies, achieving:
|
37
30
|
|
38
|
-
- ✅ **
|
39
|
-
- ✅ **
|
40
|
-
- ❌ **1.
|
31
|
+
- ✅ **65x less memory** (0.92MB vs 59.6MB)
|
32
|
+
- ✅ **421x faster startup** (2.389ms vs 1,005ms)
|
33
|
+
- ❌ **1.03x slower execution time** (6.912s vs 6.689s) for the Multilingual Simple method
|
41
34
|
- ❌ Trade-off: ~15.11% error rate vs exact counts
|
42
35
|
|
43
36
|
## Installation
|
@@ -100,32 +93,7 @@ print(f"Estimated tokens (multilingual): {token_count}")
|
|
100
93
|
|
101
94
|
### Large-Scale Benchmark (100k samples)
|
102
95
|
|
103
|
-
|
104
|
-
```
|
105
|
-
Results:
|
106
|
-
Total Samples: 100,726
|
107
|
-
Total Characters: 13,062,391
|
108
|
-
Mean RMSE: 38.4863 tokens
|
109
|
-
Mean Error Rate: 21.63%
|
110
|
-
|
111
|
-
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━┓
|
112
|
-
┃ Metric ┃ tiktoken ┃ skimtoken ┃ Ratio ┃
|
113
|
-
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━┩
|
114
|
-
│ Init Time │ 0.481672 s │ 0.182308 s │ 0.378x │
|
115
|
-
├──────────────┼────────────┼────────────┼────────┤
|
116
|
-
│ Init Memory │ 42.2386 MB │ 0.0291 MB │ 0.001x │
|
117
|
-
├──────────────┼────────────┼────────────┼────────┤
|
118
|
-
│ Exec Time │ 4.710224 s │ 0.805272 s │ 0.171x │
|
119
|
-
├──────────────┼────────────┼────────────┼────────┤
|
120
|
-
│ Exec Memory │ 17.3251 MB │ 0.8849 MB │ 0.051x │
|
121
|
-
├──────────────┼────────────┼────────────┼────────┤
|
122
|
-
│ Total Time │ 5.191896 s │ 0.928758 s │ 0.190x │
|
123
|
-
├──────────────┼────────────┼────────────┼────────┤
|
124
|
-
│ Total Memory │ 59.5637 MB │ 0.9214 MB │ 0.015x │
|
125
|
-
└──────────────┴────────────┴────────────┴────────┘
|
126
|
-
```
|
127
|
-
|
128
|
-
Multilingual simple method:
|
96
|
+
Multilingual simple method:
|
129
97
|
```
|
130
98
|
Results:
|
131
99
|
Total Samples: 100,726
|
@@ -136,27 +104,35 @@ Mean Error Rate: 15.11%
|
|
136
104
|
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━┓
|
137
105
|
┃ Metric ┃ tiktoken ┃ skimtoken ┃ Ratio ┃
|
138
106
|
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━┩
|
139
|
-
│ Init Time │
|
107
|
+
│ Init Time │ 1.005490 s │ 0.002389 s │ 0.002x │
|
140
108
|
├──────────────┼────────────┼────────────┼────────┤
|
141
|
-
│ Init Memory │ 42.
|
109
|
+
│ Init Memory │ 42.2310 MB │ 0.0265 MB │ 0.001x │
|
142
110
|
├──────────────┼────────────┼────────────┼────────┤
|
143
|
-
│ Exec Time │
|
111
|
+
│ Exec Time │ 6.689203 s │ 6.911931 s │ 1.033x │
|
144
112
|
├──────────────┼────────────┼────────────┼────────┤
|
145
|
-
│ Exec Memory │ 17.
|
113
|
+
│ Exec Memory │ 17.3251 MB │ 0.8950 MB │ 0.052x │
|
146
114
|
├──────────────┼────────────┼────────────┼────────┤
|
147
|
-
│ Total Time │
|
115
|
+
│ Total Time │ 7.694694 s │ 6.914320 s │ 0.899x │
|
148
116
|
├──────────────┼────────────┼────────────┼────────┤
|
149
|
-
│ Total Memory │ 59.
|
117
|
+
│ Total Memory │ 59.5561 MB │ 0.9215 MB │ 0.015x │
|
150
118
|
└──────────────┴────────────┴────────────┴────────┘
|
151
119
|
```
|
152
120
|
|
121
|
+
### Automated Benchmarks
|
122
|
+
|
123
|
+
For up-to-date performance comparisons and detailed accuracy metrics across all methods, visit the [skimtoken_benchmark](https://github.com/masaishi/skimtoken_benchmark) repository. This automated benchmark suite:
|
124
|
+
|
125
|
+
- Uses the CC-100 multilingual dataset (100k+ samples)
|
126
|
+
- Provides language-specific accuracy breakdowns
|
127
|
+
|
153
128
|
## Available Methods
|
154
129
|
|
155
130
|
| Method | Import | Memory | Error | Best For |
|
156
131
|
|--------|---------|--------|-------|----------|
|
157
|
-
| **Simple** | `from skimtoken.simple import estimate_tokens` |
|
158
|
-
| **Basic** | `from skimtoken.basic import estimate_tokens` | 0.
|
159
|
-
| **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15% | Non-English, mixed languages |
|
132
|
+
| **Simple** | `from skimtoken.simple import estimate_tokens` | 1.0MB | ~21.63% | English text, minimum memory |
|
133
|
+
| **Basic** | `from skimtoken.basic import estimate_tokens` | 0.9MB | ~27.05% | General use |
|
134
|
+
| **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15.93% | Non-English, mixed languages |
|
135
|
+
| **Multilingual Simple** | `from skimtoken.multilingual_simple import estimate_tokens` | 0.9MB | ~15.11% | Fast multilingual estimation |
|
160
136
|
|
161
137
|
```python
|
162
138
|
# Example: Choose method based on your needs
|
@@ -272,7 +248,7 @@ A: Beta = breaking changes possible.
|
|
272
248
|
We are actively working to improve skimtoken's accuracy and performance:
|
273
249
|
|
274
250
|
1. **Better estimation algorithms**: Moving beyond simple character multiplication to more sophisticated statistical models
|
275
|
-
2. **Performance optimization**:
|
251
|
+
2. **Performance optimization**: Further improving execution speed
|
276
252
|
3. **Improved language support**: Better handling of non-English languages
|
277
253
|
4. **Higher accuracy**: Targeting <10% error rate while maintaining low memory footprint
|
278
254
|
|
@@ -8,23 +8,16 @@ A lightweight, fast token count estimation library written in Rust with Python b
|
|
8
8
|
[](https://crates.io/crates/skimtoken)
|
9
9
|
[](https://github.com/masaishi/skimtoken/blob/main/LICENSE)
|
10
10
|
|
11
|
-
## ⚠️ Current Limitations
|
12
|
-
|
13
|
-
**This library is currently in early beta and has significant accuracy issues:**
|
14
|
-
|
15
|
-
- **Multilingual method**: Takes 1.13x longer than tiktoken due to inefficient implementation
|
16
|
-
- **Overall accuracy**: 15.11% error rate, which is too high for most use cases
|
17
|
-
|
18
11
|
|
19
12
|
## Why skimtoken?
|
20
13
|
|
21
|
-
**The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~
|
14
|
+
**The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~59.6MB of memory just to count tokens - problematic for memory-constrained environments.
|
22
15
|
|
23
16
|
**The Solution**: skimtoken estimates token counts using statistical patterns instead of loading entire vocabularies, achieving:
|
24
17
|
|
25
|
-
- ✅ **
|
26
|
-
- ✅ **
|
27
|
-
- ❌ **1.
|
18
|
+
- ✅ **65x less memory** (0.92MB vs 59.6MB)
|
19
|
+
- ✅ **421x faster startup** (2.389ms vs 1,005ms)
|
20
|
+
- ❌ **1.03x slower execution time** (6.912s vs 6.689s) for the Multilingual Simple method
|
28
21
|
- ❌ Trade-off: ~15.11% error rate vs exact counts
|
29
22
|
|
30
23
|
## Installation
|
@@ -87,32 +80,7 @@ print(f"Estimated tokens (multilingual): {token_count}")
|
|
87
80
|
|
88
81
|
### Large-Scale Benchmark (100k samples)
|
89
82
|
|
90
|
-
|
91
|
-
```
|
92
|
-
Results:
|
93
|
-
Total Samples: 100,726
|
94
|
-
Total Characters: 13,062,391
|
95
|
-
Mean RMSE: 38.4863 tokens
|
96
|
-
Mean Error Rate: 21.63%
|
97
|
-
|
98
|
-
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━┓
|
99
|
-
┃ Metric ┃ tiktoken ┃ skimtoken ┃ Ratio ┃
|
100
|
-
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━┩
|
101
|
-
│ Init Time │ 0.481672 s │ 0.182308 s │ 0.378x │
|
102
|
-
├──────────────┼────────────┼────────────┼────────┤
|
103
|
-
│ Init Memory │ 42.2386 MB │ 0.0291 MB │ 0.001x │
|
104
|
-
├──────────────┼────────────┼────────────┼────────┤
|
105
|
-
│ Exec Time │ 4.710224 s │ 0.805272 s │ 0.171x │
|
106
|
-
├──────────────┼────────────┼────────────┼────────┤
|
107
|
-
│ Exec Memory │ 17.3251 MB │ 0.8849 MB │ 0.051x │
|
108
|
-
├──────────────┼────────────┼────────────┼────────┤
|
109
|
-
│ Total Time │ 5.191896 s │ 0.928758 s │ 0.190x │
|
110
|
-
├──────────────┼────────────┼────────────┼────────┤
|
111
|
-
│ Total Memory │ 59.5637 MB │ 0.9214 MB │ 0.015x │
|
112
|
-
└──────────────┴────────────┴────────────┴────────┘
|
113
|
-
```
|
114
|
-
|
115
|
-
Multilingual simple method:
|
83
|
+
Multilingual simple method:
|
116
84
|
```
|
117
85
|
Results:
|
118
86
|
Total Samples: 100,726
|
@@ -123,27 +91,35 @@ Mean Error Rate: 15.11%
|
|
123
91
|
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━┓
|
124
92
|
┃ Metric ┃ tiktoken ┃ skimtoken ┃ Ratio ┃
|
125
93
|
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━┩
|
126
|
-
│ Init Time │
|
94
|
+
│ Init Time │ 1.005490 s │ 0.002389 s │ 0.002x │
|
127
95
|
├──────────────┼────────────┼────────────┼────────┤
|
128
|
-
│ Init Memory │ 42.
|
96
|
+
│ Init Memory │ 42.2310 MB │ 0.0265 MB │ 0.001x │
|
129
97
|
├──────────────┼────────────┼────────────┼────────┤
|
130
|
-
│ Exec Time │
|
98
|
+
│ Exec Time │ 6.689203 s │ 6.911931 s │ 1.033x │
|
131
99
|
├──────────────┼────────────┼────────────┼────────┤
|
132
|
-
│ Exec Memory │ 17.
|
100
|
+
│ Exec Memory │ 17.3251 MB │ 0.8950 MB │ 0.052x │
|
133
101
|
├──────────────┼────────────┼────────────┼────────┤
|
134
|
-
│ Total Time │
|
102
|
+
│ Total Time │ 7.694694 s │ 6.914320 s │ 0.899x │
|
135
103
|
├──────────────┼────────────┼────────────┼────────┤
|
136
|
-
│ Total Memory │ 59.
|
104
|
+
│ Total Memory │ 59.5561 MB │ 0.9215 MB │ 0.015x │
|
137
105
|
└──────────────┴────────────┴────────────┴────────┘
|
138
106
|
```
|
139
107
|
|
108
|
+
### Automated Benchmarks
|
109
|
+
|
110
|
+
For up-to-date performance comparisons and detailed accuracy metrics across all methods, visit the [skimtoken_benchmark](https://github.com/masaishi/skimtoken_benchmark) repository. This automated benchmark suite:
|
111
|
+
|
112
|
+
- Uses the CC-100 multilingual dataset (100k+ samples)
|
113
|
+
- Provides language-specific accuracy breakdowns
|
114
|
+
|
140
115
|
## Available Methods
|
141
116
|
|
142
117
|
| Method | Import | Memory | Error | Best For |
|
143
118
|
|--------|---------|--------|-------|----------|
|
144
|
-
| **Simple** | `from skimtoken.simple import estimate_tokens` |
|
145
|
-
| **Basic** | `from skimtoken.basic import estimate_tokens` | 0.
|
146
|
-
| **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15% | Non-English, mixed languages |
|
119
|
+
| **Simple** | `from skimtoken.simple import estimate_tokens` | 1.0MB | ~21.63% | English text, minimum memory |
|
120
|
+
| **Basic** | `from skimtoken.basic import estimate_tokens` | 0.9MB | ~27.05% | General use |
|
121
|
+
| **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15.93% | Non-English, mixed languages |
|
122
|
+
| **Multilingual Simple** | `from skimtoken.multilingual_simple import estimate_tokens` | 0.9MB | ~15.11% | Fast multilingual estimation |
|
147
123
|
|
148
124
|
```python
|
149
125
|
# Example: Choose method based on your needs
|
@@ -259,7 +235,7 @@ A: Beta = breaking changes possible.
|
|
259
235
|
We are actively working to improve skimtoken's accuracy and performance:
|
260
236
|
|
261
237
|
1. **Better estimation algorithms**: Moving beyond simple character multiplication to more sophisticated statistical models
|
262
|
-
2. **Performance optimization**:
|
238
|
+
2. **Performance optimization**: Further improving execution speed
|
263
239
|
3. **Improved language support**: Better handling of non-English languages
|
264
240
|
4. **Higher accuracy**: Targeting <10% error rate while maintaining low memory footprint
|
265
241
|
|
@@ -4,10 +4,11 @@ Script to load parameters from TOML files and update the Rust parameter files.
|
|
4
4
|
Handles all parameter types: simple, basic, multilingual, and multilingual_simple.
|
5
5
|
"""
|
6
6
|
|
7
|
-
import toml
|
8
7
|
from pathlib import Path
|
9
8
|
from typing import Any, Callable, TypedDict
|
10
9
|
|
10
|
+
import toml
|
11
|
+
|
11
12
|
|
12
13
|
class ParamConfig(TypedDict):
|
13
14
|
name: str
|
@@ -23,6 +24,29 @@ def load_params_from_toml(toml_path: str) -> dict[str, Any]:
|
|
23
24
|
return data
|
24
25
|
|
25
26
|
|
27
|
+
def format_f32(value: float) -> str:
|
28
|
+
"""Format a float value for f32 with underscores for readability."""
|
29
|
+
str_val = f"{value:.7g}" # Use 7 significant digits for f32 precision
|
30
|
+
|
31
|
+
if "e" in str_val.lower():
|
32
|
+
return str_val
|
33
|
+
|
34
|
+
if "." in str_val:
|
35
|
+
integer_part, decimal_part = str_val.split(".")
|
36
|
+
|
37
|
+
if len(decimal_part) > 3:
|
38
|
+
formatted_decimal = ""
|
39
|
+
for i, digit in enumerate(decimal_part):
|
40
|
+
if i > 0 and i % 3 == 0:
|
41
|
+
formatted_decimal += "_"
|
42
|
+
formatted_decimal += digit
|
43
|
+
return f"{integer_part}.{formatted_decimal}"
|
44
|
+
else:
|
45
|
+
return str_val
|
46
|
+
else:
|
47
|
+
return str_val
|
48
|
+
|
49
|
+
|
26
50
|
def generate_simple_params_rust(params_data: dict[str, Any]) -> str:
|
27
51
|
"""Generate Rust code for simple parameters."""
|
28
52
|
rust_code: list[str] = []
|
@@ -30,7 +54,7 @@ def generate_simple_params_rust(params_data: dict[str, Any]) -> str:
|
|
30
54
|
rust_code.append("impl Default for SimpleParameters {")
|
31
55
|
rust_code.append(" fn default() -> Self {")
|
32
56
|
rust_code.append(" Self {")
|
33
|
-
rust_code.append(f" coefficient: {params_data['coefficient']},")
|
57
|
+
rust_code.append(f" coefficient: {format_f32(params_data['coefficient'])},")
|
34
58
|
rust_code.append(" }")
|
35
59
|
rust_code.append(" }")
|
36
60
|
rust_code.append("}")
|
@@ -45,11 +69,13 @@ def generate_basic_params_rust(params_data: dict[str, Any]) -> str:
|
|
45
69
|
rust_code.append("impl Default for BasicParameters {")
|
46
70
|
rust_code.append(" fn default() -> Self {")
|
47
71
|
rust_code.append(" Self {")
|
48
|
-
rust_code.append(f" char_coef: {params_data['char_coef']},")
|
49
|
-
rust_code.append(f" word_coef: {params_data['word_coef']},")
|
50
|
-
rust_code.append(
|
51
|
-
|
52
|
-
|
72
|
+
rust_code.append(f" char_coef: {format_f32(params_data['char_coef'])},")
|
73
|
+
rust_code.append(f" word_coef: {format_f32(params_data['word_coef'])},")
|
74
|
+
rust_code.append(
|
75
|
+
f" avg_word_length_coef: {format_f32(params_data['avg_word_length_coef'])},"
|
76
|
+
)
|
77
|
+
rust_code.append(f" space_coef: {format_f32(params_data['space_coef'])},")
|
78
|
+
rust_code.append(f" intercept: {format_f32(params_data['intercept'])},")
|
53
79
|
rust_code.append(" }")
|
54
80
|
rust_code.append(" }")
|
55
81
|
rust_code.append("}")
|
@@ -66,11 +92,13 @@ def generate_multilingual_params_rust(params_data: dict[str, Any]) -> str:
|
|
66
92
|
rust_code.append("impl Default for MultilingualParameters {")
|
67
93
|
rust_code.append(" fn default() -> Self {")
|
68
94
|
rust_code.append(" Self {")
|
69
|
-
rust_code.append(f" char_coef: {default_params['char_coef']},")
|
70
|
-
rust_code.append(f" word_coef: {default_params['word_coef']},")
|
71
|
-
rust_code.append(
|
72
|
-
|
73
|
-
|
95
|
+
rust_code.append(f" char_coef: {format_f32(default_params['char_coef'])},")
|
96
|
+
rust_code.append(f" word_coef: {format_f32(default_params['word_coef'])},")
|
97
|
+
rust_code.append(
|
98
|
+
f" avg_word_length_coef: {format_f32(default_params['avg_word_length_coef'])},"
|
99
|
+
)
|
100
|
+
rust_code.append(f" space_coef: {format_f32(default_params['space_coef'])},")
|
101
|
+
rust_code.append(f" intercept: {format_f32(default_params['intercept'])},")
|
74
102
|
rust_code.append(" }")
|
75
103
|
rust_code.append(" }")
|
76
104
|
rust_code.append("}")
|
@@ -89,13 +117,13 @@ def generate_multilingual_params_rust(params_data: dict[str, Any]) -> str:
|
|
89
117
|
rust_code.append(" language_params.insert(")
|
90
118
|
rust_code.append(f' "{lang_key}".to_string(),')
|
91
119
|
rust_code.append(" MultilingualParameters {")
|
92
|
-
rust_code.append(f" char_coef: {lang_params['char_coef']},")
|
93
|
-
rust_code.append(f" word_coef: {lang_params['word_coef']},")
|
120
|
+
rust_code.append(f" char_coef: {format_f32(lang_params['char_coef'])},")
|
121
|
+
rust_code.append(f" word_coef: {format_f32(lang_params['word_coef'])},")
|
94
122
|
rust_code.append(
|
95
|
-
f" avg_word_length_coef: {lang_params['avg_word_length_coef']},"
|
123
|
+
f" avg_word_length_coef: {format_f32(lang_params['avg_word_length_coef'])},"
|
96
124
|
)
|
97
|
-
rust_code.append(f" space_coef: {lang_params['space_coef']},")
|
98
|
-
rust_code.append(f" intercept: {lang_params['intercept']},")
|
125
|
+
rust_code.append(f" space_coef: {format_f32(lang_params['space_coef'])},")
|
126
|
+
rust_code.append(f" intercept: {format_f32(lang_params['intercept'])},")
|
99
127
|
rust_code.append(" },")
|
100
128
|
rust_code.append(" );")
|
101
129
|
rust_code.append("")
|
@@ -119,7 +147,7 @@ def generate_multilingual_simple_params_rust(params_data: dict[str, Any]) -> str
|
|
119
147
|
rust_code.append("impl Default for MultilingualSimpleParameters {")
|
120
148
|
rust_code.append(" fn default() -> Self {")
|
121
149
|
rust_code.append(" Self {")
|
122
|
-
rust_code.append(f" coefficient: {default_params['coefficient']},")
|
150
|
+
rust_code.append(f" coefficient: {format_f32(default_params['coefficient'])},")
|
123
151
|
rust_code.append(" }")
|
124
152
|
rust_code.append(" }")
|
125
153
|
rust_code.append("}")
|
@@ -137,7 +165,7 @@ def generate_multilingual_simple_params_rust(params_data: dict[str, Any]) -> str
|
|
137
165
|
rust_code.append(" language_params.insert(")
|
138
166
|
rust_code.append(f' "{lang_key}".to_string(),')
|
139
167
|
rust_code.append(" MultilingualSimpleParameters {")
|
140
|
-
rust_code.append(f" coefficient: {lang_params['coefficient']},")
|
168
|
+
rust_code.append(f" coefficient: {format_f32(lang_params['coefficient'])},")
|
141
169
|
rust_code.append(" },")
|
142
170
|
rust_code.append(" );")
|
143
171
|
rust_code.append("")
|
@@ -133,7 +133,7 @@ fn _skimtoken_core(m: &Bound<'_, PyModule>) -> PyResult<()> {
|
|
133
133
|
|
134
134
|
#[pyfn(m)]
|
135
135
|
#[pyo3(name = "count_basic")]
|
136
|
-
fn count_basic_py(text: &Bound<'_, PyAny>) -> PyResult<(usize, usize,
|
136
|
+
fn count_basic_py(text: &Bound<'_, PyAny>) -> PyResult<(usize, usize, f32, usize)> {
|
137
137
|
let text_str = if let Ok(s) = text.extract::<String>() {
|
138
138
|
s
|
139
139
|
} else {
|
@@ -153,7 +153,7 @@ fn _skimtoken_core(m: &Bound<'_, PyModule>) -> PyResult<()> {
|
|
153
153
|
#[pyo3(name = "count_multilingual")]
|
154
154
|
fn count_multilingual_py(
|
155
155
|
text: &Bound<'_, PyAny>,
|
156
|
-
) -> PyResult<(usize, usize,
|
156
|
+
) -> PyResult<(usize, usize, f32, usize, String)> {
|
157
157
|
let text_str = if let Ok(s) = text.extract::<String>() {
|
158
158
|
s
|
159
159
|
} else {
|
@@ -5,27 +5,27 @@ use serde::{Deserialize, Serialize};
|
|
5
5
|
pub struct BasicFeatures {
|
6
6
|
pub char_count: usize,
|
7
7
|
pub word_count: usize,
|
8
|
-
pub avg_word_length:
|
8
|
+
pub avg_word_length: f32,
|
9
9
|
pub space_count: usize,
|
10
10
|
}
|
11
11
|
|
12
12
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
13
13
|
pub struct BasicParameters {
|
14
|
-
pub char_coef:
|
15
|
-
pub word_coef:
|
16
|
-
pub avg_word_length_coef:
|
17
|
-
pub space_coef:
|
18
|
-
pub intercept:
|
14
|
+
pub char_coef: f32,
|
15
|
+
pub word_coef: f32,
|
16
|
+
pub avg_word_length_coef: f32,
|
17
|
+
pub space_coef: f32,
|
18
|
+
pub intercept: f32,
|
19
19
|
}
|
20
20
|
|
21
21
|
impl Default for BasicParameters {
|
22
22
|
fn default() -> Self {
|
23
23
|
Self {
|
24
|
-
char_coef: 0.
|
25
|
-
word_coef: 0.
|
26
|
-
avg_word_length_coef: 0.
|
27
|
-
space_coef: -0.
|
28
|
-
intercept: 1.
|
24
|
+
char_coef: 0.321_774_5,
|
25
|
+
word_coef: 0.070_228_82,
|
26
|
+
avg_word_length_coef: 0.509_098_2,
|
27
|
+
space_coef: -0.158_310_9,
|
28
|
+
intercept: 1.591_021,
|
29
29
|
}
|
30
30
|
}
|
31
31
|
}
|
@@ -60,7 +60,7 @@ impl EstimationMethod for BasicMethod {
|
|
60
60
|
|
61
61
|
let avg_word_length = if word_count > 0 {
|
62
62
|
let total_word_chars: usize = words.iter().map(|w| w.chars().count()).sum();
|
63
|
-
total_word_chars as
|
63
|
+
total_word_chars as f32 / word_count as f32
|
64
64
|
} else {
|
65
65
|
0.0
|
66
66
|
};
|
@@ -75,10 +75,10 @@ impl EstimationMethod for BasicMethod {
|
|
75
75
|
|
76
76
|
fn estimate(&self, text: &str) -> usize {
|
77
77
|
let features = self.count(text);
|
78
|
-
let estimate = self.parameters.char_coef * features.char_count as
|
79
|
-
+ self.parameters.word_coef * features.word_count as
|
78
|
+
let estimate = self.parameters.char_coef * features.char_count as f32
|
79
|
+
+ self.parameters.word_coef * features.word_count as f32
|
80
80
|
+ self.parameters.avg_word_length_coef * features.avg_word_length
|
81
|
-
+ self.parameters.space_coef * features.space_count as
|
81
|
+
+ self.parameters.space_coef * features.space_count as f32
|
82
82
|
+ self.parameters.intercept;
|
83
83
|
|
84
84
|
estimate.round().max(0.0) as usize
|