PyPI - skimtoken - Versions diffs - 0.2.0__tar.gz → 0.2.2__tar.gz - Mend

skimtoken 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

{skimtoken-0.2.0 → skimtoken-0.2.2}/Cargo.lock RENAMED Viewed

@@ -392,7 +392,7 @@ dependencies = [
 [[package]]
 name = "skimtoken"
-version = "0.2.0"
+version = "0.2.2"
 dependencies = [
  "atty",
  "clap",

{skimtoken-0.2.0 → skimtoken-0.2.2}/Cargo.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "skimtoken"
-version = "0.2.0"
+version = "0.2.2"
 edition = "2021"
 authors = ["masaishi <mwishiha@ucsc.edu>"]
 license = "MIT"

{skimtoken-0.2.0 → skimtoken-0.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: skimtoken
-Version: 0.2.0
+Version: 0.2.2
 License-File: LICENSE
 Summary: Fast token count estimation library
 Home-Page: https://github.com/masaishi/skimtoken
@@ -21,23 +21,16 @@ A lightweight, fast token count estimation library written in Rust with Python b
 [![Crates.io](https://img.shields.io/crates/v/skimtoken)](https://crates.io/crates/skimtoken)
 [![License](https://img.shields.io/github/license/masaishi/skimtoken)](https://github.com/masaishi/skimtoken/blob/main/LICENSE)
-## ⚠️ Current Limitations
-**This library is currently in early beta and has significant accuracy issues:**
-- **Multilingual method**: Takes 48.60x longer than tiktoken due to inefficient implementation
-- **Overall accuracy**: 15.11% error rate, which is too high for most use cases
 ## Why skimtoken?
-**The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~60MB of memory just to count tokens - problematic for memory-constrained environments.
+**The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~59.6MB of memory just to count tokens - problematic for memory-constrained environments.
 **The Solution**: skimtoken estimates token counts using statistical patterns instead of loading entire vocabularies, achieving:
-- ✅ **64x less memory** (0.92MB vs 60MB)
-- ✅ **128x faster startup** (4ms vs 485ms)
-- ❌ **48.60x slower execution** (0.93s vs 4.59s) for multilingual method
+- ✅ **65x less memory** (0.92MB vs 59.6MB)
+- ✅ **421x faster startup** (2.389ms vs 1,005ms)
+- ❌ **1.03x slower execution time** (6.912s vs 6.689s) for Multilingual Simple method
 - ❌ Trade-off: ~15.11% error rate vs exact counts
 ## Installation
@@ -100,63 +93,46 @@ print(f"Estimated tokens (multilingual): {token_count}")
 ### Large-Scale Benchmark (100k samples)
-Simple method (Just char length x coefficient):
+Multilingual Simple method:
 ```
 Results:
 Total Samples: 100,726
 Total Characters: 13,062,391
-Mean RMSE: 38.4863 tokens
-Mean Error Rate: 21.63%
+Mean RMSE: 21.3034 tokens
+Mean Error Rate: 15.11%
 ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━┓
 ┃ Metric       ┃   tiktoken ┃  skimtoken ┃  Ratio ┃
 ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━┩
-│ Init Time    │ 0.481672 s │ 0.182308 s │ 0.378x │
+│ Init Time    │ 1.005490 s │ 0.002389 s │ 0.002x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Init Memory  │ 42.2386 MB │  0.0291 MB │ 0.001x │
+│ Init Memory  │ 42.2310 MB │  0.0265 MB │ 0.001x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Exec Time    │ 4.710224 s │ 0.805272 s │ 0.171x │
+│ Exec Time    │ 6.689203 s │ 6.911931 s │ 1.033x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Exec Memory  │ 17.3251 MB │  0.8849 MB │ 0.051x │
+│ Exec Memory  │ 17.3251 MB │  0.8950 MB │ 0.052x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Total Time   │ 5.191896 s │ 0.928758 s │ 0.190x │
+│ Total Time   │ 7.694694 s │ 6.914320 s │ 0.899x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Total Memory │ 59.5637 MB │  0.9214 MB │ 0.015x │
+│ Total Memory │ 59.5561 MB │  0.9215 MB │ 0.015x │
 └──────────────┴────────────┴────────────┴────────┘
 ```
-Multilingual simple method:
-```
-Results:
-Total Samples: 100,726
-Total Characters: 13,062,391
-Mean RMSE: 21.3034 tokens
-Mean Error Rate: 15.11%
+### Automated Benchmarks
-┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━┓
-┃ Metric       ┃   tiktoken ┃    skimtoken ┃   Ratio ┃
-┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━┩
-│ Init Time    │ 0.471222 s │   0.006207 s │  0.013x │
-├──────────────┼────────────┼──────────────┼─────────┤
-│ Init Memory  │ 42.2385 MB │    0.0283 MB │  0.001x │
-├──────────────┼────────────┼──────────────┼─────────┤
-│ Exec Time    │ 4.594160 s │ 246.164618 s │ 53.582x │
-├──────────────┼────────────┼──────────────┼─────────┤
-│ Exec Memory  │ 17.3251 MB │    0.8950 MB │  0.052x │
-├──────────────┼────────────┼──────────────┼─────────┤
-│ Total Time   │ 5.065382 s │ 246.170825 s │ 48.599x │
-├──────────────┼────────────┼──────────────┼─────────┤
-│ Total Memory │ 59.5636 MB │    0.9233 MB │  0.016x │
-└──────────────┴────────────┴──────────────┴─────────┘
-```
+For up-to-date performance comparisons and detailed accuracy metrics across all methods, visit the [skimtoken_benchmark](https://github.com/masaishi/skimtoken_benchmark) repository. This automated benchmark suite:
+- Uses the CC-100 multilingual dataset (100k+ samples)
+- Provides language-specific accuracy breakdowns
 ## Available Methods
 | Method | Import | Memory | Error | Best For |
 |--------|---------|--------|-------|----------|
-| **Simple** | `from skimtoken.simple import estimate_tokens` | 0.8MB | ~21% | English text, minimum memory |
-| **Basic** | `from skimtoken.basic import estimate_tokens` | 0.8MB | ~27% | General use |
-| **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15% | Non-English, mixed languages |
+| **Simple** | `from skimtoken.simple import estimate_tokens` | 1.0MB | ~21.63% | English text, minimum memory |
+| **Basic** | `from skimtoken.basic import estimate_tokens` | 0.9MB | ~27.05% | General use |
+| **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15.93% | Non-English, mixed languages |
+| **Multilingual Simple** | `from skimtoken.multilingual_simple import estimate_tokens` | 0.9MB | ~15.11% | Fast multilingual estimation |
 ```python
 # Example: Choose method based on your needs
@@ -272,10 +248,11 @@ A: Beta = breaking changes possible.
 We are actively working to improve skimtoken's accuracy and performance:
 1. **Better estimation algorithms**: Moving beyond simple character multiplication to more sophisticated statistical models
-2. **Performance optimization**: Fixing the 60x slowdown in multilingual method
+2. **Performance optimization**: Further improving execution speed
 3. **Improved language support**: Better handling of non-English languages
 4. **Higher accuracy**: Targeting <10% error rate while maintaining low memory footprint
 ## License
 MIT License - see [LICENSE](./LICENSE) for details.

{skimtoken-0.2.0 → skimtoken-0.2.2}/README.md RENAMED Viewed

@@ -8,23 +8,16 @@ A lightweight, fast token count estimation library written in Rust with Python b
 [![Crates.io](https://img.shields.io/crates/v/skimtoken)](https://crates.io/crates/skimtoken)
 [![License](https://img.shields.io/github/license/masaishi/skimtoken)](https://github.com/masaishi/skimtoken/blob/main/LICENSE)
-## ⚠️ Current Limitations
-**This library is currently in early beta and has significant accuracy issues:**
-- **Multilingual method**: Takes 48.60x longer than tiktoken due to inefficient implementation
-- **Overall accuracy**: 15.11% error rate, which is too high for most use cases
 ## Why skimtoken?
-**The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~60MB of memory just to count tokens - problematic for memory-constrained environments.
+**The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~59.6MB of memory just to count tokens - problematic for memory-constrained environments.
 **The Solution**: skimtoken estimates token counts using statistical patterns instead of loading entire vocabularies, achieving:
-- ✅ **64x less memory** (0.92MB vs 60MB)
-- ✅ **128x faster startup** (4ms vs 485ms)
-- ❌ **48.60x slower execution** (0.93s vs 4.59s) for multilingual method
+- ✅ **65x less memory** (0.92MB vs 59.6MB)
+- ✅ **421x faster startup** (2.389ms vs 1,005ms)
+- ❌ **1.03x slower execution time** (6.912s vs 6.689s) for Multilingual Simple method
 - ❌ Trade-off: ~15.11% error rate vs exact counts
 ## Installation
@@ -87,63 +80,46 @@ print(f"Estimated tokens (multilingual): {token_count}")
 ### Large-Scale Benchmark (100k samples)
-Simple method (Just char length x coefficient):
+Multilingual Simple method:
 ```
 Results:
 Total Samples: 100,726
 Total Characters: 13,062,391
-Mean RMSE: 38.4863 tokens
-Mean Error Rate: 21.63%
+Mean RMSE: 21.3034 tokens
+Mean Error Rate: 15.11%
 ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━┓
 ┃ Metric       ┃   tiktoken ┃  skimtoken ┃  Ratio ┃
 ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━┩
-│ Init Time    │ 0.481672 s │ 0.182308 s │ 0.378x │
+│ Init Time    │ 1.005490 s │ 0.002389 s │ 0.002x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Init Memory  │ 42.2386 MB │  0.0291 MB │ 0.001x │
+│ Init Memory  │ 42.2310 MB │  0.0265 MB │ 0.001x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Exec Time    │ 4.710224 s │ 0.805272 s │ 0.171x │
+│ Exec Time    │ 6.689203 s │ 6.911931 s │ 1.033x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Exec Memory  │ 17.3251 MB │  0.8849 MB │ 0.051x │
+│ Exec Memory  │ 17.3251 MB │  0.8950 MB │ 0.052x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Total Time   │ 5.191896 s │ 0.928758 s │ 0.190x │
+│ Total Time   │ 7.694694 s │ 6.914320 s │ 0.899x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Total Memory │ 59.5637 MB │  0.9214 MB │ 0.015x │
+│ Total Memory │ 59.5561 MB │  0.9215 MB │ 0.015x │
 └──────────────┴────────────┴────────────┴────────┘
 ```
-Multilingual simple method:
-```
-Results:
-Total Samples: 100,726
-Total Characters: 13,062,391
-Mean RMSE: 21.3034 tokens
-Mean Error Rate: 15.11%
+### Automated Benchmarks
-┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━┓
-┃ Metric       ┃   tiktoken ┃    skimtoken ┃   Ratio ┃
-┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━┩
-│ Init Time    │ 0.471222 s │   0.006207 s │  0.013x │
-├──────────────┼────────────┼──────────────┼─────────┤
-│ Init Memory  │ 42.2385 MB │    0.0283 MB │  0.001x │
-├──────────────┼────────────┼──────────────┼─────────┤
-│ Exec Time    │ 4.594160 s │ 246.164618 s │ 53.582x │
-├──────────────┼────────────┼──────────────┼─────────┤
-│ Exec Memory  │ 17.3251 MB │    0.8950 MB │  0.052x │
-├──────────────┼────────────┼──────────────┼─────────┤
-│ Total Time   │ 5.065382 s │ 246.170825 s │ 48.599x │
-├──────────────┼────────────┼──────────────┼─────────┤
-│ Total Memory │ 59.5636 MB │    0.9233 MB │  0.016x │
-└──────────────┴────────────┴──────────────┴─────────┘
-```
+For up-to-date performance comparisons and detailed accuracy metrics across all methods, visit the [skimtoken_benchmark](https://github.com/masaishi/skimtoken_benchmark) repository. This automated benchmark suite:
+- Uses the CC-100 multilingual dataset (100k+ samples)
+- Provides language-specific accuracy breakdowns
 ## Available Methods
 | Method | Import | Memory | Error | Best For |
 |--------|---------|--------|-------|----------|
-| **Simple** | `from skimtoken.simple import estimate_tokens` | 0.8MB | ~21% | English text, minimum memory |
-| **Basic** | `from skimtoken.basic import estimate_tokens` | 0.8MB | ~27% | General use |
-| **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15% | Non-English, mixed languages |
+| **Simple** | `from skimtoken.simple import estimate_tokens` | 1.0MB | ~21.63% | English text, minimum memory |
+| **Basic** | `from skimtoken.basic import estimate_tokens` | 0.9MB | ~27.05% | General use |
+| **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15.93% | Non-English, mixed languages |
+| **Multilingual Simple** | `from skimtoken.multilingual_simple import estimate_tokens` | 0.9MB | ~15.11% | Fast multilingual estimation |
 ```python
 # Example: Choose method based on your needs
@@ -259,10 +235,10 @@ A: Beta = breaking changes possible.
 We are actively working to improve skimtoken's accuracy and performance:
 1. **Better estimation algorithms**: Moving beyond simple character multiplication to more sophisticated statistical models
-2. **Performance optimization**: Fixing the 60x slowdown in multilingual method
+2. **Performance optimization**: Further improving execution speed
 3. **Improved language support**: Better handling of non-English languages
 4. **Higher accuracy**: Targeting <10% error rate while maintaining low memory footprint
 ## License
-MIT License - see [LICENSE](./LICENSE) for details.
+MIT License - see [LICENSE](./LICENSE) for details.

{skimtoken-0.2.0 → skimtoken-0.2.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "skimtoken"
-version = "0.2.0"
+version = "0.2.2"
 description = "Fast token count estimation library"
 readme = "README.md"
 requires-python = ">=3.9"

{skimtoken-0.2.0 → skimtoken-0.2.2}/scripts/update_rust_params.py RENAMED Viewed

@@ -4,10 +4,11 @@ Script to load parameters from TOML files and update the Rust parameter files.
 Handles all parameter types: simple, basic, multilingual, and multilingual_simple.
 """
-import toml
 from pathlib import Path
 from typing import Any, Callable, TypedDict
+import toml
 class ParamConfig(TypedDict):
     name: str
@@ -23,6 +24,29 @@ def load_params_from_toml(toml_path: str) -> dict[str, Any]:
     return data
+def format_f32(value: float) -> str:
+    """Format a float value for f32 with underscores for readability."""
+    str_val = f"{value:.7g}"  # Use 7 significant digits for f32 precision
+    if "e" in str_val.lower():
+        return str_val
+    if "." in str_val:
+        integer_part, decimal_part = str_val.split(".")
+        if len(decimal_part) > 3:
+            formatted_decimal = ""
+            for i, digit in enumerate(decimal_part):
+                if i > 0 and i % 3 == 0:
+                    formatted_decimal += "_"
+                formatted_decimal += digit
+            return f"{integer_part}.{formatted_decimal}"
+        else:
+            return str_val
+    else:
+        return str_val
 def generate_simple_params_rust(params_data: dict[str, Any]) -> str:
     """Generate Rust code for simple parameters."""
     rust_code: list[str] = []
@@ -30,7 +54,7 @@ def generate_simple_params_rust(params_data: dict[str, Any]) -> str:
     rust_code.append("impl Default for SimpleParameters {")
     rust_code.append("    fn default() -> Self {")
     rust_code.append("        Self {")
-    rust_code.append(f"            coefficient: {params_data['coefficient']},")
+    rust_code.append(f"            coefficient: {format_f32(params_data['coefficient'])},")
     rust_code.append("        }")
     rust_code.append("    }")
     rust_code.append("}")
@@ -45,11 +69,13 @@ def generate_basic_params_rust(params_data: dict[str, Any]) -> str:
     rust_code.append("impl Default for BasicParameters {")
     rust_code.append("    fn default() -> Self {")
     rust_code.append("        Self {")
-    rust_code.append(f"            char_coef: {params_data['char_coef']},")
-    rust_code.append(f"            word_coef: {params_data['word_coef']},")
-    rust_code.append(f"            avg_word_length_coef: {params_data['avg_word_length_coef']},")
-    rust_code.append(f"            space_coef: {params_data['space_coef']},")
-    rust_code.append(f"            intercept: {params_data['intercept']},")
+    rust_code.append(f"            char_coef: {format_f32(params_data['char_coef'])},")
+    rust_code.append(f"            word_coef: {format_f32(params_data['word_coef'])},")
+    rust_code.append(
+        f"            avg_word_length_coef: {format_f32(params_data['avg_word_length_coef'])},"
+    )
+    rust_code.append(f"            space_coef: {format_f32(params_data['space_coef'])},")
+    rust_code.append(f"            intercept: {format_f32(params_data['intercept'])},")
     rust_code.append("        }")
     rust_code.append("    }")
     rust_code.append("}")
@@ -66,11 +92,13 @@ def generate_multilingual_params_rust(params_data: dict[str, Any]) -> str:
     rust_code.append("impl Default for MultilingualParameters {")
     rust_code.append("    fn default() -> Self {")
     rust_code.append("        Self {")
-    rust_code.append(f"            char_coef: {default_params['char_coef']},")
-    rust_code.append(f"            word_coef: {default_params['word_coef']},")
-    rust_code.append(f"            avg_word_length_coef: {default_params['avg_word_length_coef']},")
-    rust_code.append(f"            space_coef: {default_params['space_coef']},")
-    rust_code.append(f"            intercept: {default_params['intercept']},")
+    rust_code.append(f"            char_coef: {format_f32(default_params['char_coef'])},")
+    rust_code.append(f"            word_coef: {format_f32(default_params['word_coef'])},")
+    rust_code.append(
+        f"            avg_word_length_coef: {format_f32(default_params['avg_word_length_coef'])},"
+    )
+    rust_code.append(f"            space_coef: {format_f32(default_params['space_coef'])},")
+    rust_code.append(f"            intercept: {format_f32(default_params['intercept'])},")
     rust_code.append("        }")
     rust_code.append("    }")
     rust_code.append("}")
@@ -89,13 +117,13 @@ def generate_multilingual_params_rust(params_data: dict[str, Any]) -> str:
         rust_code.append("        language_params.insert(")
         rust_code.append(f'            "{lang_key}".to_string(),')
         rust_code.append("            MultilingualParameters {")
-        rust_code.append(f"                char_coef: {lang_params['char_coef']},")
-        rust_code.append(f"                word_coef: {lang_params['word_coef']},")
+        rust_code.append(f"                char_coef: {format_f32(lang_params['char_coef'])},")
+        rust_code.append(f"                word_coef: {format_f32(lang_params['word_coef'])},")
         rust_code.append(
-            f"                avg_word_length_coef: {lang_params['avg_word_length_coef']},"
+            f"                avg_word_length_coef: {format_f32(lang_params['avg_word_length_coef'])},"
         )
-        rust_code.append(f"                space_coef: {lang_params['space_coef']},")
-        rust_code.append(f"                intercept: {lang_params['intercept']},")
+        rust_code.append(f"                space_coef: {format_f32(lang_params['space_coef'])},")
+        rust_code.append(f"                intercept: {format_f32(lang_params['intercept'])},")
         rust_code.append("            },")
         rust_code.append("        );")
         rust_code.append("")
@@ -119,7 +147,7 @@ def generate_multilingual_simple_params_rust(params_data: dict[str, Any]) -> str
     rust_code.append("impl Default for MultilingualSimpleParameters {")
     rust_code.append("    fn default() -> Self {")
     rust_code.append("        Self {")
-    rust_code.append(f"            coefficient: {default_params['coefficient']},")
+    rust_code.append(f"            coefficient: {format_f32(default_params['coefficient'])},")
     rust_code.append("        }")
     rust_code.append("    }")
     rust_code.append("}")
@@ -137,7 +165,7 @@ def generate_multilingual_simple_params_rust(params_data: dict[str, Any]) -> str
         rust_code.append("        language_params.insert(")
         rust_code.append(f'            "{lang_key}".to_string(),')
         rust_code.append("            MultilingualSimpleParameters {")
-        rust_code.append(f"                coefficient: {lang_params['coefficient']},")
+        rust_code.append(f"                coefficient: {format_f32(lang_params['coefficient'])},")
         rust_code.append("            },")
         rust_code.append("        );")
         rust_code.append("")

{skimtoken-0.2.0 → skimtoken-0.2.2}/src/lib.rs RENAMED Viewed

@@ -133,7 +133,7 @@ fn _skimtoken_core(m: &Bound<'_, PyModule>) -> PyResult<()> {
     #[pyfn(m)]
     #[pyo3(name = "count_basic")]
-    fn count_basic_py(text: &Bound<'_, PyAny>) -> PyResult<(usize, usize, f64, usize)> {
+    fn count_basic_py(text: &Bound<'_, PyAny>) -> PyResult<(usize, usize, f32, usize)> {
         let text_str = if let Ok(s) = text.extract::<String>() {
             s
         } else {
@@ -153,7 +153,7 @@ fn _skimtoken_core(m: &Bound<'_, PyModule>) -> PyResult<()> {
     #[pyo3(name = "count_multilingual")]
     fn count_multilingual_py(
         text: &Bound<'_, PyAny>,
-    ) -> PyResult<(usize, usize, f64, usize, String)> {
+    ) -> PyResult<(usize, usize, f32, usize, String)> {
         let text_str = if let Ok(s) = text.extract::<String>() {
             s
         } else {

{skimtoken-0.2.0 → skimtoken-0.2.2}/src/methods/method_basic.rs RENAMED Viewed

@@ -5,27 +5,27 @@ use serde::{Deserialize, Serialize};
 pub struct BasicFeatures {
     pub char_count: usize,
     pub word_count: usize,
-    pub avg_word_length: f64,
+    pub avg_word_length: f32,
     pub space_count: usize,
 }
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct BasicParameters {
-    pub char_coef: f64,
-    pub word_coef: f64,
-    pub avg_word_length_coef: f64,
-    pub space_coef: f64,
-    pub intercept: f64,
+    pub char_coef: f32,
+    pub word_coef: f32,
+    pub avg_word_length_coef: f32,
+    pub space_coef: f32,
+    pub intercept: f32,
 }
 impl Default for BasicParameters {
     fn default() -> Self {
         Self {
-            char_coef: 0.3217745347518016,
-            word_coef: 0.07022881669049061,
-            avg_word_length_coef: 0.5090982427870748,
-            space_coef: -0.15831091236345404,
-            intercept: 1.591021053665763,
+            char_coef: 0.321_774_5,
+            word_coef: 0.070_228_82,
+            avg_word_length_coef: 0.509_098_2,
+            space_coef: -0.158_310_9,
+            intercept: 1.591_021,
         }
     }
 }
@@ -60,7 +60,7 @@ impl EstimationMethod for BasicMethod {
         let avg_word_length = if word_count > 0 {
             let total_word_chars: usize = words.iter().map(|w| w.chars().count()).sum();
-            total_word_chars as f64 / word_count as f64
+            total_word_chars as f32 / word_count as f32
         } else {
             0.0
         };
@@ -75,10 +75,10 @@ impl EstimationMethod for BasicMethod {
     fn estimate(&self, text: &str) -> usize {
         let features = self.count(text);
-        let estimate = self.parameters.char_coef * features.char_count as f64
-            + self.parameters.word_coef * features.word_count as f64
+        let estimate = self.parameters.char_coef * features.char_count as f32
+            + self.parameters.word_coef * features.word_count as f32
             + self.parameters.avg_word_length_coef * features.avg_word_length
-            + self.parameters.space_coef * features.space_count as f64
+            + self.parameters.space_coef * features.space_count as f32
             + self.parameters.intercept;
         estimate.round().max(0.0) as usize

skimtoken 0.2.0__tar.gz → 0.2.2__tar.gz

skimtoken 0.2.0__tar.gz → 0.2.2__tar.gz