PyPI - ocr-stringdist - Versions diffs - 0.0.6__tar.gz → 0.0.7__tar.gz - Mend

ocr-stringdist 0.0.6.tar.gz → 0.0.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

{ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/Cargo.lock RENAMED Viewed

@@ -74,7 +74,7 @@ dependencies = [
 [[package]]
 name = "ocr_stringdist"
-version = "0.0.6"
+version = "0.0.7"
 dependencies = [
  "pyo3",
  "rayon",

{ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/Cargo.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "ocr_stringdist"
-version = "0.0.6"
+version = "0.0.7"
 edition = "2021"
 description = "String distances considering OCR errors."
 authors = ["Niklas von Moers <niklasvmoers@protonmail.com>"]

{ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ocr_stringdist
-Version: 0.0.6
+Version: 0.0.7
 Classifier: Programming Language :: Rust
 Classifier: Programming Language :: Python
 Classifier: Operating System :: OS Independent
@@ -60,7 +60,6 @@ distance = osd.weighted_levenshtein_distance(
     "hi", "Ini",
     cost_map=custom_map,
     symmetric=True,
-    max_token_characters=2,
 )
 print(f"Distance with custom map: {distance}")
 ```

{ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/README.md RENAMED Viewed

@@ -45,7 +45,6 @@ distance = osd.weighted_levenshtein_distance(
     "hi", "Ini",
     cost_map=custom_map,
     symmetric=True,
-    max_token_characters=2,
 )
 print(f"Distance with custom map: {distance}")
 ```

{ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/docs/source/api/index.rst RENAMED Viewed

@@ -5,9 +5,10 @@ API Reference
 This page contains the auto-generated API reference documentation.
-.. autofunction:: ocr_stringdist.weighted_levenshtein_distance
-.. autofunction:: ocr_stringdist.batch_weighted_levenshtein_distance
+.. automodule:: ocr_stringdist.levenshtein
+   :members:
+   :undoc-members:
+   :show-inheritance:
 .. automodule:: ocr_stringdist.matching
    :members:

{ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/examples/batch_processing.py RENAMED Viewed

@@ -8,8 +8,6 @@ from typing import Any, Callable
 import ocr_stringdist as osd
-MAX_TOKEN_CHARACTERS = 1
 def benchmark(func: Callable, *args: Any, **kwargs: Any) -> tuple[Any, float]:  # type: ignore
     """Run a function and return the execution time in seconds."""
@@ -32,12 +30,7 @@ def compare_methods() -> None:
     # Standard loop approach
     _, time_loop = benchmark(
-        lambda: [
-            osd.weighted_levenshtein_distance(
-                source, cand, max_token_characters=MAX_TOKEN_CHARACTERS
-            )
-            for cand in candidates
-        ]
+        lambda: [osd.weighted_levenshtein_distance(source, cand) for cand in candidates]
     )
     print(
         f"Loop of single calls: {time_loop:.6f} seconds "
@@ -45,12 +38,7 @@ def compare_methods() -> None:
     )
     # Batch approach
-    _, time_batch = benchmark(
-        osd.batch_weighted_levenshtein_distance,
-        source,
-        candidates,
-        max_token_characters=MAX_TOKEN_CHARACTERS,
-    )
+    _, time_batch = benchmark(osd.batch_weighted_levenshtein_distance, source, candidates)
     print(
         f"Batch function: {time_batch:.6f} seconds "
         f"({1000 * time_batch / len(candidates):.6f}ms each)"

{ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/examples/weighted_levenshtein.py RENAMED Viewed

@@ -24,7 +24,6 @@ ic(
         "이탈리",
         "OI탈리",  # Korean syllables may be confused with multiple Latin letters at once
         {("이", "OI"): 0.5},
-        max_token_characters=2,
     ),
 )

ocr_stringdist-0.0.7/python/ocr_stringdist/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+from .default_ocr_distances import ocr_distance_map
+from .levenshtein import batch_weighted_levenshtein_distance, weighted_levenshtein_distance
+from .matching import find_best_candidate
+__all__ = [
+    "ocr_distance_map",
+    "weighted_levenshtein_distance",
+    "batch_weighted_levenshtein_distance",
+    "find_best_candidate",
+]

ocr_stringdist-0.0.6/python/ocr_stringdist/__init__.py → ocr_stringdist-0.0.7/python/ocr_stringdist/levenshtein.py RENAMED Viewed

@@ -2,14 +2,6 @@ from typing import Optional
 from ._rust_stringdist import *  # noqa: F403
 from .default_ocr_distances import ocr_distance_map
-from .matching import find_best_candidate
-__all__ = [
-    "ocr_distance_map",
-    "weighted_levenshtein_distance",
-    "batch_weighted_levenshtein_distance",
-    "find_best_candidate",
-]
 def weighted_levenshtein_distance(
@@ -20,7 +12,6 @@ def weighted_levenshtein_distance(
     *,
     symmetric: bool = True,
     default_cost: float = 1.0,
-    max_token_characters: int = 1,
 ) -> float:
     """
     Levenshtein distance with custom substitution costs.
@@ -34,26 +25,16 @@ def weighted_levenshtein_distance(
     :param cost_map: Dictionary mapping tuples of strings ("substitution tokens") to their
                      substitution costs.
                      Only one direction needs to be configured unless `symmetric` is False.
-                     Note that you need to set `max_token_characters` if the substitution tokens
-                     have more than one character, for example when substituting "w" for "vv".
+                     Note that the runtime scales in the length of the longest substitution token.
                      Defaults to `ocr_stringdist.ocr_distance_map`.
     :param symmetric: Should the keys of `cost_map` be considered to be symmetric? Defaults to True.
     :param default_cost: The default substitution cost for character pairs not found in `cost_map`.
-    :param max_token_characters: A positive integer, indicating the maximum number of characters a
-                                 substitution token in `cost_map` may have. The default 1 indicates
-                                 that only single characters can be substituted for each other.
-                                 Higher values lead to slower calculations.
     """
     if cost_map is None:
         cost_map = ocr_distance_map
     # _weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
     return _weighted_levenshtein_distance(  # type: ignore  # noqa: F405
-        s1,
-        s2,
-        cost_map=cost_map,
-        symmetric=symmetric,
-        default_cost=default_cost,
-        max_token_characters=max_token_characters,
+        s1, s2, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
     )
@@ -65,7 +46,6 @@ def batch_weighted_levenshtein_distance(
     *,
     symmetric: bool = True,
     default_cost: float = 1.0,
-    max_token_characters: int = 1,
 ) -> list[float]:
     """
     Calculate weighted Levenshtein distances between a string and multiple candidates.
@@ -74,25 +54,18 @@ def batch_weighted_levenshtein_distance(
     :param s: The string to compare
     :param candidates: List of candidate strings to compare against
-    :param cost_map: Dictionary mapping tuples of characters to their substitution cost.
+    :param cost_map: Dictionary mapping tuples of strings ("substitution tokens") to their
+                     substitution costs.
                      Only one direction needs to be configured unless `symmetric` is False.
+                     Note that the runtime scales in the length of the longest substitution token.
                      Defaults to `ocr_stringdist.ocr_distance_map`.
     :param symmetric: Should the keys of `cost_map` be considered to be symmetric? Defaults to True.
     :param default_cost: The default substitution cost for character pairs not found in `cost_map`.
-    :param max_token_characters: A positive integer, indicating the maximum number of characters a
-                                 substitution token in `cost_map` may have. The default 1 indicates
-                                 that only single characters can be substituted for each other.
-                                 Higher values lead to slower calculations.
     :return: A list of distances corresponding to each candidate
     """
     if cost_map is None:
         cost_map = ocr_distance_map
     # _batch_weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
     return _batch_weighted_levenshtein_distance(  # type: ignore  # noqa: F405
-        s,
-        candidates,
-        cost_map=cost_map,
-        symmetric=symmetric,
-        default_cost=default_cost,
-        max_token_characters=max_token_characters,
+        s, candidates, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
     )

{ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/src/lib.rs RENAMED Viewed

@@ -1,5 +1,7 @@
+mod longest_tokens;
 mod weighted_levenshtein;
+pub use longest_tokens::longest_key_string_length;
 pub use weighted_levenshtein::{custom_levenshtein_distance_with_cost_map, OcrCostMap};
 #[cfg(feature = "python")]

ocr_stringdist-0.0.7/src/longest_tokens.rs ADDED Viewed

@@ -0,0 +1,48 @@
+use std::collections::HashMap;
+/// Calculates the length of the longest string found within the key tuples of a HashMap.
+pub fn longest_key_string_length<V>(map: &HashMap<(String, String), V>) -> usize {
+    map.keys()
+        .flat_map(|(s1, s2)| [s1.len(), s2.len()].into_iter())
+        .max()
+        .unwrap_or(1)
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[test]
+    fn test_longest_key_string_length_basic() {
+        let mut map = HashMap::new();
+        map.insert(("apple".to_string(), "banana".to_string()), 1); // 5, 6
+        map.insert(("kiwi".to_string(), "grapefruit".to_string()), 2); // 4, 10
+        map.insert(("short".to_string(), "tiny".to_string()), 3); // 5, 4
+        assert_eq!(longest_key_string_length(&map), 10); // "grapefruit"
+    }
+    #[test]
+    fn test_longest_key_string_length_first_element() {
+        let mut map = HashMap::new();
+        map.insert(("a_very_long_string".to_string(), "short".to_string()), 1); // 18, 5
+        map.insert(("medium".to_string(), "small".to_string()), 2); // 6, 5
+        assert_eq!(longest_key_string_length(&map), 18);
+    }
+    #[test]
+    fn test_longest_key_string_length_empty_map() {
+        let map: HashMap<(String, String), bool> = HashMap::new();
+        assert_eq!(longest_key_string_length(&map), 1);
+    }
+    #[test]
+    fn test_longest_key_string_length_empty_strings() {
+        let mut map = HashMap::new();
+        map.insert(("".to_string(), "".to_string()), 1);
+        map.insert(("a".to_string(), "".to_string()), 2);
+        assert_eq!(longest_key_string_length(&map), 1);
+    }
+}

{ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/src/rust_stringdist.rs RENAMED Viewed

@@ -1,4 +1,5 @@
 use crate::custom_levenshtein_distance_with_cost_map as _weighted_lev_with_map;
+use crate::longest_key_string_length;
 use crate::OcrCostMap;
 use pyo3::prelude::*;
 use pyo3::types::PyDict;
@@ -6,42 +7,41 @@ use rayon::prelude::*;
 // Calculates the weighted Levenshtein distance with a custom cost map from Python.
 #[pyfunction]
-#[pyo3(signature = (a, b, cost_map, symmetric = true, default_cost = 1.0, max_token_characters = 1))]
+#[pyo3(signature = (a, b, cost_map, symmetric = true, default_cost = 1.0))]
 fn _weighted_levenshtein_distance(
     a: &str,
     b: &str,
     cost_map: &Bound<'_, PyDict>,
     symmetric: bool,
     default_cost: f64,
-    max_token_characters: usize,
 ) -> PyResult<f64> {
+    let ocr_cost_map = OcrCostMap::from_py_dict(cost_map, default_cost, symmetric);
+    let max_token_characters = longest_key_string_length(&ocr_cost_map.costs);
     Ok(_weighted_lev_with_map(
         a,
         b,
-        &OcrCostMap::from_py_dict(cost_map, default_cost, symmetric),
+        &ocr_cost_map,
         max_token_characters,
     ))
 }
 // Calculates the weighted Levenshtein distance between a string and a list of candidates.
 #[pyfunction]
-#[pyo3(signature = (s, candidates, cost_map, symmetric = true, default_cost = 1.0, max_token_characters = 1))]
+#[pyo3(signature = (s, candidates, cost_map, symmetric = true, default_cost = 1.0))]
 fn _batch_weighted_levenshtein_distance(
     s: &str,
     candidates: Vec<String>,
     cost_map: &Bound<'_, PyDict>,
     symmetric: bool,
     default_cost: f64,
-    max_token_characters: usize,
 ) -> PyResult<Vec<f64>> {
-    let custom_cost_map = OcrCostMap::from_py_dict(cost_map, default_cost, symmetric);
+    let ocr_cost_map = OcrCostMap::from_py_dict(cost_map, default_cost, symmetric);
+    let max_token_characters = longest_key_string_length(&ocr_cost_map.costs);
     // Calculate distances for each candidate in parallel
     let distances: Vec<f64> = candidates
         .par_iter()
-        .map(|candidate| {
-            _weighted_lev_with_map(s, candidate, &custom_cost_map, max_token_characters)
-        })
+        .map(|candidate| _weighted_lev_with_map(s, candidate, &ocr_cost_map, max_token_characters))
         .collect();
     Ok(distances)

{ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/src/weighted_levenshtein.rs RENAMED Viewed

@@ -6,7 +6,7 @@ use pyo3::prelude::*;
 #[derive(Clone, Debug)]
 pub struct OcrCostMap {
     /// Maps pairs of strings to their specific substitution cost.
-    costs: HashMap<(String, String), f64>,
+    pub costs: HashMap<(String, String), f64>,
     /// Default cost for substitutions not found in the map.
     default_substitution_cost: f64,
 }

{ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/tests/test_batch_functions.py RENAMED Viewed

@@ -40,14 +40,12 @@ def test_batch_vs_individual(
     """Test that batch results match individual function calls."""
     # Individual results
     individual_results = [
-        weighted_levenshtein_distance(source, candidate, cost_map=cost_map, max_token_characters=2)
+        weighted_levenshtein_distance(source, candidate, cost_map=cost_map)
         for candidate in candidates
     ]
     # Batch results
-    batch_results = batch_weighted_levenshtein_distance(
-        source, candidates, cost_map=cost_map, max_token_characters=2
-    )
+    batch_results = batch_weighted_levenshtein_distance(source, candidates, cost_map=cost_map)
     # Compare results
     for ind, batch in zip(individual_results, batch_results):
@@ -79,9 +77,7 @@ def test_batch_finds_best_match(
 ) -> None:
     """Test that batch processing correctly identifies the best match."""
     # Using OCR cost map
-    distances = batch_weighted_levenshtein_distance(
-        source, candidates, cost_map=OCR_COST_MAP, max_token_characters=2
-    )
+    distances = batch_weighted_levenshtein_distance(source, candidates, cost_map=OCR_COST_MAP)
     print(f"------------------------------------distances: {distances}")
     # Find the index with minimum distance
@@ -101,14 +97,12 @@ def test_batch_finds_best_match(
 )
 def test_custom_cost_map(test_string: str, expected_distance: float) -> None:
     """Test using a custom cost map for specific substitution costs."""
-    result = weighted_levenshtein_distance(
-        "hello", test_string, cost_map=OCR_COST_MAP, max_token_characters=2
-    )
+    result = weighted_levenshtein_distance("hello", test_string, cost_map=OCR_COST_MAP)
     assert result == pytest.approx(expected_distance)
     # Check that batch processing gives the same result
     batch_result = batch_weighted_levenshtein_distance(
-        "hello", [test_string], cost_map=OCR_COST_MAP, max_token_characters=2
+        "hello", [test_string], cost_map=OCR_COST_MAP
     )[0]
     assert batch_result == pytest.approx(expected_distance)
@@ -126,15 +120,11 @@ def test_empty_vs_default_cost_map(
 ) -> None:
     """Test that empty cost maps produce different results than default cost maps."""
     # With empty cost map (all costs are 1.0)
-    default_result = batch_weighted_levenshtein_distance(
-        string1, [string2], cost_map={}, max_token_characters=2
-    )
+    default_result = batch_weighted_levenshtein_distance(string1, [string2], cost_map={})
     assert default_result[0] == pytest.approx(default_map_distance)
     # With custom cost map (OCR-specific costs)
-    custom_result = batch_weighted_levenshtein_distance(
-        string1, [string2], cost_map=OCR_COST_MAP, max_token_characters=2
-    )
+    custom_result = batch_weighted_levenshtein_distance(string1, [string2], cost_map=OCR_COST_MAP)
     assert custom_result[0] == pytest.approx(custom_map_distance)
     # Custom map should give lower distance for OCR errors

{ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/tests/test_ocr_stringdist.py RENAMED Viewed

@@ -35,9 +35,7 @@ from ocr_stringdist import weighted_levenshtein_distance
 def test_weighted_levenshtein_distance(
     s1: str, s2: str, cost_map: dict[tuple[str, str], float], expected: float
 ) -> None:
-    assert weighted_levenshtein_distance(
-        s1, s2, cost_map=cost_map, max_token_characters=3
-    ) == pytest.approx(expected)
+    assert weighted_levenshtein_distance(s1, s2, cost_map=cost_map) == pytest.approx(expected)
 def test_complex_ocr_substitutions() -> None:
@@ -57,12 +55,8 @@ def test_complex_ocr_substitutions() -> None:
     original = "The man ran down the hill at 10 km/h."
     ocr_result = "Tine rnan ram dovvn tine Ini11 at 1O krn/In."
-    distance = weighted_levenshtein_distance(
-        original, ocr_result, cost_map=ocr_cost_map, max_token_characters=3
-    )
-    standard_distance = weighted_levenshtein_distance(
-        original, ocr_result, cost_map={}, max_token_characters=3
-    )
+    distance = weighted_levenshtein_distance(original, ocr_result, cost_map=ocr_cost_map)
+    standard_distance = weighted_levenshtein_distance(original, ocr_result, cost_map={})
     assert standard_distance > distance
@@ -83,7 +77,7 @@ def test_asymmetric_substitution_costs(s1: str, s2: str, expected: float) -> Non
         ("S", "5"): 0.6,
     }
     assert weighted_levenshtein_distance(
-        s1, s2, cost_map=asymmetric_cost_map, symmetric=False, max_token_characters=3
+        s1, s2, cost_map=asymmetric_cost_map, symmetric=False
     ) == pytest.approx(expected)
@@ -107,6 +101,6 @@ def test_nested_substitution_patterns(s1: str, s2: str, expected: float) -> None
         ("abc", "d"): 0.3,
         ("d", "abc"): 0.3,
     }
-    assert weighted_levenshtein_distance(
-        s1, s2, cost_map=nested_cost_map, max_token_characters=3
-    ) == pytest.approx(expected)
+    assert weighted_levenshtein_distance(s1, s2, cost_map=nested_cost_map) == pytest.approx(
+        expected
+    )