PyPI - skimtoken - Versions diffs - 0.2.1__tar.gz → 0.2.2__tar.gz - Mend

skimtoken 0.2.1tar.gz → 0.2.2tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

{skimtoken-0.2.1 → skimtoken-0.2.2}/Cargo.lock RENAMED Viewed

@@ -392,7 +392,7 @@ dependencies = [
 [[package]]
 name = "skimtoken"
-version = "0.2.1"
+version = "0.2.2"
 dependencies = [
  "atty",
  "clap",

{skimtoken-0.2.1 → skimtoken-0.2.2}/Cargo.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "skimtoken"
-version = "0.2.1"
+version = "0.2.2"
 edition = "2021"
 authors = ["masaishi <mwishiha@ucsc.edu>"]
 license = "MIT"

{skimtoken-0.2.1 → skimtoken-0.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: skimtoken
-Version: 0.2.1
+Version: 0.2.2
 License-File: LICENSE
 Summary: Fast token count estimation library
 Home-Page: https://github.com/masaishi/skimtoken
@@ -21,23 +21,16 @@ A lightweight, fast token count estimation library written in Rust with Python b
 [![Crates.io](https://img.shields.io/crates/v/skimtoken)](https://crates.io/crates/skimtoken)
 [![License](https://img.shields.io/github/license/masaishi/skimtoken)](https://github.com/masaishi/skimtoken/blob/main/LICENSE)
-## ⚠️ Current Limitations
-**This library is currently in early beta and has significant accuracy issues:**
-- **Multilingual method**: Takes 1.13x longer than tiktoken due to inefficient implementation
-- **Overall accuracy**: 15.11% error rate, which is too high for most use cases
 ## Why skimtoken?
-**The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~60MB of memory just to count tokens - problematic for memory-constrained environments.
+**The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~59.6MB of memory just to count tokens - problematic for memory-constrained environments.
 **The Solution**: skimtoken estimates token counts using statistical patterns instead of loading entire vocabularies, achieving:
-- ✅ **64x less memory** (0.92MB vs 60MB)
-- ✅ **128x faster startup** (4ms vs 485ms)
-- ❌ **1.13x slower execution** (5.51s vs 4.59s) for multilingual method
+- ✅ **65x less memory** (0.92MB vs 59.6MB)
+- ✅ **421x faster startup** (2.389ms vs 1,005ms)
+- ❌ **1.03x slower execution time** (6.912s vs 6.689s) for Multilingual simple method
 - ❌ Trade-off: ~15.11% error rate vs exact counts
 ## Installation
@@ -100,32 +93,7 @@ print(f"Estimated tokens (multilingual): {token_count}")
 ### Large-Scale Benchmark (100k samples)
-Simple method (Just char length x coefficient):
-```
-Results:
-Total Samples: 100,726
-Total Characters: 13,062,391
-Mean RMSE: 38.4863 tokens
-Mean Error Rate: 21.63%
-┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━┓
-┃ Metric       ┃   tiktoken ┃  skimtoken ┃  Ratio ┃
-┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━┩
-│ Init Time    │ 0.481672 s │ 0.182308 s │ 0.378x │
-├──────────────┼────────────┼────────────┼────────┤
-│ Init Memory  │ 42.2386 MB │  0.0291 MB │ 0.001x │
-├──────────────┼────────────┼────────────┼────────┤
-│ Exec Time    │ 4.710224 s │ 0.805272 s │ 0.171x │
-├──────────────┼────────────┼────────────┼────────┤
-│ Exec Memory  │ 17.3251 MB │  0.8849 MB │ 0.051x │
-├──────────────┼────────────┼────────────┼────────┤
-│ Total Time   │ 5.191896 s │ 0.928758 s │ 0.190x │
-├──────────────┼────────────┼────────────┼────────┤
-│ Total Memory │ 59.5637 MB │  0.9214 MB │ 0.015x │
-└──────────────┴────────────┴────────────┴────────┘
-```
-Multilingual simple method:
+Multilingual simple method:
 ```
 Results:
 Total Samples: 100,726
@@ -136,27 +104,35 @@ Mean Error Rate: 15.11%
 ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━┓
 ┃ Metric       ┃   tiktoken ┃  skimtoken ┃  Ratio ┃
 ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━┩
-│ Init Time    │ 0.815441 s │ 0.138714 s │ 0.170x │
+│ Init Time    │ 1.005490 s │ 0.002389 s │ 0.002x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Init Memory  │ 42.4791 MB │  0.1613 MB │ 0.004x │
+│ Init Memory  │ 42.2310 MB │  0.0265 MB │ 0.001x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Exec Time    │ 4.041857 s │ 5.380782 s │ 1.331x │
+│ Exec Time    │ 6.689203 s │ 6.911931 s │ 1.033x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Exec Memory  │ 17.3227 MB │  0.8950 MB │ 0.052x │
+│ Exec Memory  │ 17.3251 MB │  0.8950 MB │ 0.052x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Total Time   │ 4.857297 s │ 5.519496 s │ 1.136x │
+│ Total Time   │ 7.694694 s │ 6.914320 s │ 0.899x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Total Memory │ 59.8018 MB │  1.0563 MB │ 0.018x │
+│ Total Memory │ 59.5561 MB │  0.9215 MB │ 0.015x │
 └──────────────┴────────────┴────────────┴────────┘
 ```
+### Automated Benchmarks
+For up-to-date performance comparisons and detailed accuracy metrics across all methods, visit the [skimtoken_benchmark](https://github.com/masaishi/skimtoken_benchmark) repository. This automated benchmark suite:
+- Uses the CC-100 multilingual dataset (100k+ samples)
+- Provides language-specific accuracy breakdowns
 ## Available Methods
 | Method | Import | Memory | Error | Best For |
 |--------|---------|--------|-------|----------|
-| **Simple** | `from skimtoken.simple import estimate_tokens` | 0.8MB | ~21% | English text, minimum memory |
-| **Basic** | `from skimtoken.basic import estimate_tokens` | 0.8MB | ~27% | General use |
-| **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15% | Non-English, mixed languages |
+| **Simple** | `from skimtoken.simple import estimate_tokens` | 1.0MB | ~21.63% | English text, minimum memory |
+| **Basic** | `from skimtoken.basic import estimate_tokens` | 0.9MB | ~27.05% | General use |
+| **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15.93% | Non-English, mixed languages |
+| **Multilingual Simple** | `from skimtoken.multilingual_simple import estimate_tokens` | 0.9MB | ~15.11% | Fast multilingual estimation |
 ```python
 # Example: Choose method based on your needs
@@ -272,7 +248,7 @@ A: Beta = breaking changes possible.
 We are actively working to improve skimtoken's accuracy and performance:
 1. **Better estimation algorithms**: Moving beyond simple character multiplication to more sophisticated statistical models
-2. **Performance optimization**: Fixing the 60x slowdown in multilingual method
+2. **Performance optimization**: Further improving execution speed
 3. **Improved language support**: Better handling of non-English languages
 4. **Higher accuracy**: Targeting <10% error rate while maintaining low memory footprint

{skimtoken-0.2.1 → skimtoken-0.2.2}/README.md RENAMED Viewed

@@ -8,23 +8,16 @@ A lightweight, fast token count estimation library written in Rust with Python b
 [![Crates.io](https://img.shields.io/crates/v/skimtoken)](https://crates.io/crates/skimtoken)
 [![License](https://img.shields.io/github/license/masaishi/skimtoken)](https://github.com/masaishi/skimtoken/blob/main/LICENSE)
-## ⚠️ Current Limitations
-**This library is currently in early beta and has significant accuracy issues:**
-- **Multilingual method**: Takes 1.13x longer than tiktoken due to inefficient implementation
-- **Overall accuracy**: 15.11% error rate, which is too high for most use cases
 ## Why skimtoken?
-**The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~60MB of memory just to count tokens - problematic for memory-constrained environments.
+**The Problem**: [tiktoken](https://github.com/openai/tiktoken) is great for precise tokenization, but requires ~59.6MB of memory just to count tokens - problematic for memory-constrained environments.
 **The Solution**: skimtoken estimates token counts using statistical patterns instead of loading entire vocabularies, achieving:
-- ✅ **64x less memory** (0.92MB vs 60MB)
-- ✅ **128x faster startup** (4ms vs 485ms)
-- ❌ **1.13x slower execution** (5.51s vs 4.59s) for multilingual method
+- ✅ **65x less memory** (0.92MB vs 59.6MB)
+- ✅ **421x faster startup** (2.389ms vs 1,005ms)
+- ❌ **1.03x slower execution time** (6.912s vs 6.689s) for Multilingual simple method
 - ❌ Trade-off: ~15.11% error rate vs exact counts
 ## Installation
@@ -87,32 +80,7 @@ print(f"Estimated tokens (multilingual): {token_count}")
 ### Large-Scale Benchmark (100k samples)
-Simple method (Just char length x coefficient):
-```
-Results:
-Total Samples: 100,726
-Total Characters: 13,062,391
-Mean RMSE: 38.4863 tokens
-Mean Error Rate: 21.63%
-┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━┓
-┃ Metric       ┃   tiktoken ┃  skimtoken ┃  Ratio ┃
-┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━┩
-│ Init Time    │ 0.481672 s │ 0.182308 s │ 0.378x │
-├──────────────┼────────────┼────────────┼────────┤
-│ Init Memory  │ 42.2386 MB │  0.0291 MB │ 0.001x │
-├──────────────┼────────────┼────────────┼────────┤
-│ Exec Time    │ 4.710224 s │ 0.805272 s │ 0.171x │
-├──────────────┼────────────┼────────────┼────────┤
-│ Exec Memory  │ 17.3251 MB │  0.8849 MB │ 0.051x │
-├──────────────┼────────────┼────────────┼────────┤
-│ Total Time   │ 5.191896 s │ 0.928758 s │ 0.190x │
-├──────────────┼────────────┼────────────┼────────┤
-│ Total Memory │ 59.5637 MB │  0.9214 MB │ 0.015x │
-└──────────────┴────────────┴────────────┴────────┘
-```
-Multilingual simple method:
+Multilingual simple method:
 ```
 Results:
 Total Samples: 100,726
@@ -123,27 +91,35 @@ Mean Error Rate: 15.11%
 ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━┓
 ┃ Metric       ┃   tiktoken ┃  skimtoken ┃  Ratio ┃
 ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━┩
-│ Init Time    │ 0.815441 s │ 0.138714 s │ 0.170x │
+│ Init Time    │ 1.005490 s │ 0.002389 s │ 0.002x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Init Memory  │ 42.4791 MB │  0.1613 MB │ 0.004x │
+│ Init Memory  │ 42.2310 MB │  0.0265 MB │ 0.001x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Exec Time    │ 4.041857 s │ 5.380782 s │ 1.331x │
+│ Exec Time    │ 6.689203 s │ 6.911931 s │ 1.033x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Exec Memory  │ 17.3227 MB │  0.8950 MB │ 0.052x │
+│ Exec Memory  │ 17.3251 MB │  0.8950 MB │ 0.052x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Total Time   │ 4.857297 s │ 5.519496 s │ 1.136x │
+│ Total Time   │ 7.694694 s │ 6.914320 s │ 0.899x │
 ├──────────────┼────────────┼────────────┼────────┤
-│ Total Memory │ 59.8018 MB │  1.0563 MB │ 0.018x │
+│ Total Memory │ 59.5561 MB │  0.9215 MB │ 0.015x │
 └──────────────┴────────────┴────────────┴────────┘
 ```
+### Automated Benchmarks
+For up-to-date performance comparisons and detailed accuracy metrics across all methods, visit the [skimtoken_benchmark](https://github.com/masaishi/skimtoken_benchmark) repository. This automated benchmark suite:
+- Uses the CC-100 multilingual dataset (100k+ samples)
+- Provides language-specific accuracy breakdowns
 ## Available Methods
 | Method | Import | Memory | Error | Best For |
 |--------|---------|--------|-------|----------|
-| **Simple** | `from skimtoken.simple import estimate_tokens` | 0.8MB | ~21% | English text, minimum memory |
-| **Basic** | `from skimtoken.basic import estimate_tokens` | 0.8MB | ~27% | General use |
-| **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15% | Non-English, mixed languages |
+| **Simple** | `from skimtoken.simple import estimate_tokens` | 1.0MB | ~21.63% | English text, minimum memory |
+| **Basic** | `from skimtoken.basic import estimate_tokens` | 0.9MB | ~27.05% | General use |
+| **Multilingual** | `from skimtoken.multilingual import estimate_tokens` | 0.9MB | ~15.93% | Non-English, mixed languages |
+| **Multilingual Simple** | `from skimtoken.multilingual_simple import estimate_tokens` | 0.9MB | ~15.11% | Fast multilingual estimation |
 ```python
 # Example: Choose method based on your needs
@@ -259,7 +235,7 @@ A: Beta = breaking changes possible.
 We are actively working to improve skimtoken's accuracy and performance:
 1. **Better estimation algorithms**: Moving beyond simple character multiplication to more sophisticated statistical models
-2. **Performance optimization**: Fixing the 60x slowdown in multilingual method
+2. **Performance optimization**: Further improving execution speed
 3. **Improved language support**: Better handling of non-English languages
 4. **Higher accuracy**: Targeting <10% error rate while maintaining low memory footprint

{skimtoken-0.2.1 → skimtoken-0.2.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "skimtoken"
-version = "0.2.1"
+version = "0.2.2"
 description = "Fast token count estimation library"
 readme = "README.md"
 requires-python = ">=3.9"

{skimtoken-0.2.1 → skimtoken-0.2.2}/scripts/update_rust_params.py RENAMED Viewed

@@ -4,10 +4,11 @@ Script to load parameters from TOML files and update the Rust parameter files.
 Handles all parameter types: simple, basic, multilingual, and multilingual_simple.
 """
-import toml
 from pathlib import Path
 from typing import Any, Callable, TypedDict
+import toml
 class ParamConfig(TypedDict):
     name: str
@@ -23,6 +24,29 @@ def load_params_from_toml(toml_path: str) -> dict[str, Any]:
     return data
+def format_f32(value: float) -> str:
+    """Format a float value for f32 with underscores for readability."""
+    str_val = f"{value:.7g}"  # Use 7 significant digits for f32 precision
+    if "e" in str_val.lower():
+        return str_val
+    if "." in str_val:
+        integer_part, decimal_part = str_val.split(".")
+        if len(decimal_part) > 3:
+            formatted_decimal = ""
+            for i, digit in enumerate(decimal_part):
+                if i > 0 and i % 3 == 0:
+                    formatted_decimal += "_"
+                formatted_decimal += digit
+            return f"{integer_part}.{formatted_decimal}"
+        else:
+            return str_val
+    else:
+        return str_val
 def generate_simple_params_rust(params_data: dict[str, Any]) -> str:
     """Generate Rust code for simple parameters."""
     rust_code: list[str] = []
@@ -30,7 +54,7 @@ def generate_simple_params_rust(params_data: dict[str, Any]) -> str:
     rust_code.append("impl Default for SimpleParameters {")
     rust_code.append("    fn default() -> Self {")
     rust_code.append("        Self {")
-    rust_code.append(f"            coefficient: {params_data['coefficient']},")
+    rust_code.append(f"            coefficient: {format_f32(params_data['coefficient'])},")
     rust_code.append("        }")
     rust_code.append("    }")
     rust_code.append("}")
@@ -45,11 +69,13 @@ def generate_basic_params_rust(params_data: dict[str, Any]) -> str:
     rust_code.append("impl Default for BasicParameters {")
     rust_code.append("    fn default() -> Self {")
     rust_code.append("        Self {")
-    rust_code.append(f"            char_coef: {params_data['char_coef']},")
-    rust_code.append(f"            word_coef: {params_data['word_coef']},")
-    rust_code.append(f"            avg_word_length_coef: {params_data['avg_word_length_coef']},")
-    rust_code.append(f"            space_coef: {params_data['space_coef']},")
-    rust_code.append(f"            intercept: {params_data['intercept']},")
+    rust_code.append(f"            char_coef: {format_f32(params_data['char_coef'])},")
+    rust_code.append(f"            word_coef: {format_f32(params_data['word_coef'])},")
+    rust_code.append(
+        f"            avg_word_length_coef: {format_f32(params_data['avg_word_length_coef'])},"
+    )
+    rust_code.append(f"            space_coef: {format_f32(params_data['space_coef'])},")
+    rust_code.append(f"            intercept: {format_f32(params_data['intercept'])},")
     rust_code.append("        }")
     rust_code.append("    }")
     rust_code.append("}")
@@ -66,11 +92,13 @@ def generate_multilingual_params_rust(params_data: dict[str, Any]) -> str:
     rust_code.append("impl Default for MultilingualParameters {")
     rust_code.append("    fn default() -> Self {")
     rust_code.append("        Self {")
-    rust_code.append(f"            char_coef: {default_params['char_coef']},")
-    rust_code.append(f"            word_coef: {default_params['word_coef']},")
-    rust_code.append(f"            avg_word_length_coef: {default_params['avg_word_length_coef']},")
-    rust_code.append(f"            space_coef: {default_params['space_coef']},")
-    rust_code.append(f"            intercept: {default_params['intercept']},")
+    rust_code.append(f"            char_coef: {format_f32(default_params['char_coef'])},")
+    rust_code.append(f"            word_coef: {format_f32(default_params['word_coef'])},")
+    rust_code.append(
+        f"            avg_word_length_coef: {format_f32(default_params['avg_word_length_coef'])},"
+    )
+    rust_code.append(f"            space_coef: {format_f32(default_params['space_coef'])},")
+    rust_code.append(f"            intercept: {format_f32(default_params['intercept'])},")
     rust_code.append("        }")
     rust_code.append("    }")
     rust_code.append("}")
@@ -89,13 +117,13 @@ def generate_multilingual_params_rust(params_data: dict[str, Any]) -> str:
         rust_code.append("        language_params.insert(")
         rust_code.append(f'            "{lang_key}".to_string(),')
         rust_code.append("            MultilingualParameters {")
-        rust_code.append(f"                char_coef: {lang_params['char_coef']},")
-        rust_code.append(f"                word_coef: {lang_params['word_coef']},")
+        rust_code.append(f"                char_coef: {format_f32(lang_params['char_coef'])},")
+        rust_code.append(f"                word_coef: {format_f32(lang_params['word_coef'])},")
         rust_code.append(
-            f"                avg_word_length_coef: {lang_params['avg_word_length_coef']},"
+            f"                avg_word_length_coef: {format_f32(lang_params['avg_word_length_coef'])},"
         )
-        rust_code.append(f"                space_coef: {lang_params['space_coef']},")
-        rust_code.append(f"                intercept: {lang_params['intercept']},")
+        rust_code.append(f"                space_coef: {format_f32(lang_params['space_coef'])},")
+        rust_code.append(f"                intercept: {format_f32(lang_params['intercept'])},")
         rust_code.append("            },")
         rust_code.append("        );")
         rust_code.append("")
@@ -119,7 +147,7 @@ def generate_multilingual_simple_params_rust(params_data: dict[str, Any]) -> str
     rust_code.append("impl Default for MultilingualSimpleParameters {")
     rust_code.append("    fn default() -> Self {")
     rust_code.append("        Self {")
-    rust_code.append(f"            coefficient: {default_params['coefficient']},")
+    rust_code.append(f"            coefficient: {format_f32(default_params['coefficient'])},")
     rust_code.append("        }")
     rust_code.append("    }")
     rust_code.append("}")
@@ -137,7 +165,7 @@ def generate_multilingual_simple_params_rust(params_data: dict[str, Any]) -> str
         rust_code.append("        language_params.insert(")
         rust_code.append(f'            "{lang_key}".to_string(),')
         rust_code.append("            MultilingualSimpleParameters {")
-        rust_code.append(f"                coefficient: {lang_params['coefficient']},")
+        rust_code.append(f"                coefficient: {format_f32(lang_params['coefficient'])},")
         rust_code.append("            },")
         rust_code.append("        );")
         rust_code.append("")

{skimtoken-0.2.1 → skimtoken-0.2.2}/src/lib.rs RENAMED Viewed

@@ -133,7 +133,7 @@ fn _skimtoken_core(m: &Bound<'_, PyModule>) -> PyResult<()> {
     #[pyfn(m)]
     #[pyo3(name = "count_basic")]
-    fn count_basic_py(text: &Bound<'_, PyAny>) -> PyResult<(usize, usize, f64, usize)> {
+    fn count_basic_py(text: &Bound<'_, PyAny>) -> PyResult<(usize, usize, f32, usize)> {
         let text_str = if let Ok(s) = text.extract::<String>() {
             s
         } else {
@@ -153,7 +153,7 @@ fn _skimtoken_core(m: &Bound<'_, PyModule>) -> PyResult<()> {
     #[pyo3(name = "count_multilingual")]
     fn count_multilingual_py(
         text: &Bound<'_, PyAny>,
-    ) -> PyResult<(usize, usize, f64, usize, String)> {
+    ) -> PyResult<(usize, usize, f32, usize, String)> {
         let text_str = if let Ok(s) = text.extract::<String>() {
             s
         } else {

{skimtoken-0.2.1 → skimtoken-0.2.2}/src/methods/method_basic.rs RENAMED Viewed

@@ -5,27 +5,27 @@ use serde::{Deserialize, Serialize};
 pub struct BasicFeatures {
     pub char_count: usize,
     pub word_count: usize,
-    pub avg_word_length: f64,
+    pub avg_word_length: f32,
     pub space_count: usize,
 }
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct BasicParameters {
-    pub char_coef: f64,
-    pub word_coef: f64,
-    pub avg_word_length_coef: f64,
-    pub space_coef: f64,
-    pub intercept: f64,
+    pub char_coef: f32,
+    pub word_coef: f32,
+    pub avg_word_length_coef: f32,
+    pub space_coef: f32,
+    pub intercept: f32,
 }
 impl Default for BasicParameters {
     fn default() -> Self {
         Self {
-            char_coef: 0.3217745347518016,
-            word_coef: 0.07022881669049061,
-            avg_word_length_coef: 0.5090982427870748,
-            space_coef: -0.15831091236345404,
-            intercept: 1.591021053665763,
+            char_coef: 0.321_774_5,
+            word_coef: 0.070_228_82,
+            avg_word_length_coef: 0.509_098_2,
+            space_coef: -0.158_310_9,
+            intercept: 1.591_021,
         }
     }
 }
@@ -60,7 +60,7 @@ impl EstimationMethod for BasicMethod {
         let avg_word_length = if word_count > 0 {
             let total_word_chars: usize = words.iter().map(|w| w.chars().count()).sum();
-            total_word_chars as f64 / word_count as f64
+            total_word_chars as f32 / word_count as f32
         } else {
             0.0
         };
@@ -75,10 +75,10 @@ impl EstimationMethod for BasicMethod {
     fn estimate(&self, text: &str) -> usize {
         let features = self.count(text);
-        let estimate = self.parameters.char_coef * features.char_count as f64
-            + self.parameters.word_coef * features.word_count as f64
+        let estimate = self.parameters.char_coef * features.char_count as f32
+            + self.parameters.word_coef * features.word_count as f32
             + self.parameters.avg_word_length_coef * features.avg_word_length
-            + self.parameters.space_coef * features.space_count as f64
+            + self.parameters.space_coef * features.space_count as f32
             + self.parameters.intercept;
         estimate.round().max(0.0) as usize

skimtoken 0.2.1__tar.gz → 0.2.2__tar.gz

skimtoken 0.2.1tar.gz → 0.2.2tar.gz