ocr-stringdist 0.0.5__tar.gz → 0.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/Cargo.lock +53 -1
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/Cargo.toml +2 -1
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/Justfile +6 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/PKG-INFO +2 -3
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/README.md +1 -2
- ocr_stringdist-0.0.7/docs/source/api/index.rst +23 -0
- ocr_stringdist-0.0.7/examples/batch_processing.py +58 -0
- ocr_stringdist-0.0.5/example.py → ocr_stringdist-0.0.7/examples/weighted_levenshtein.py +1 -0
- ocr_stringdist-0.0.7/python/ocr_stringdist/__init__.py +10 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/python/ocr_stringdist/default_ocr_distances.py +4 -0
- ocr_stringdist-0.0.7/python/ocr_stringdist/levenshtein.py +71 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/src/lib.rs +2 -0
- ocr_stringdist-0.0.7/src/longest_tokens.rs +48 -0
- ocr_stringdist-0.0.7/src/rust_stringdist.rs +56 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/src/weighted_levenshtein.rs +46 -16
- ocr_stringdist-0.0.7/tests/test_batch_functions.py +131 -0
- ocr_stringdist-0.0.5/docs/source/api/index.rst +0 -18
- ocr_stringdist-0.0.5/python/ocr_stringdist/__init__.py +0 -42
- ocr_stringdist-0.0.5/src/rust_stringdist.rs +0 -39
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/.github/workflows/CI.yml +0 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/.github/workflows/docs.yml +0 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/.gitignore +0 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/LICENSE +0 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/docs/Makefile +0 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/docs/make.bat +0 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/docs/source/conf.py +0 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/docs/source/index.rst +0 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/mypy.ini +0 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/pyproject.toml +0 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/python/ocr_stringdist/matching.py +0 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/python/ocr_stringdist/py.typed +0 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/ruff.toml +0 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/tests/test_matching.py +0 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/tests/test_ocr_stringdist.py +0 -0
- {ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/uv.lock +0 -0
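
The headline change in 0.0.7 is a batch API backed by a parallel Rust implementation (rayon). As a minimal usage sketch — not part of the released files — based on the new `python/ocr_stringdist/levenshtein.py` and `examples/batch_processing.py` shown further down in this diff:

```python
import ocr_stringdist as osd

# Compare one OCR output against many candidates in a single call.
# cost_map defaults to the built-in ocr_distance_map; the Rust backend
# (src/rust_stringdist.rs below) computes the distances in parallel via rayon.
candidates = ["recognition", "recogmtion", "recognltlon"]
distances = osd.batch_weighted_levenshtein_distance("recognition", candidates)

best = candidates[distances.index(min(distances))]
print(distances, best)
```
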
{ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/Cargo.lock

````diff
@@ -14,6 +14,37 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+
+[[package]]
+name = "either"
+version = "1.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
+
 [[package]]
 name = "heck"
 version = "0.5.0"
@@ -43,9 +74,10 @@ dependencies = [
 
 [[package]]
 name = "ocr_stringdist"
-version = "0.0.5"
+version = "0.0.7"
 dependencies = [
  "pyo3",
+ "rayon",
 ]
 
 [[package]]
@@ -141,6 +173,26 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "rayon"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
 [[package]]
 name = "syn"
 version = "2.0.100"
````

{ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/Cargo.toml

````diff
@@ -1,6 +1,6 @@
 [package]
 name = "ocr_stringdist"
-version = "0.0.5"
+version = "0.0.7"
 edition = "2021"
 description = "String distances considering OCR errors."
 authors = ["Niklas von Moers <niklasvmoers@protonmail.com>"]
@@ -15,6 +15,7 @@ crate-type = ["cdylib"]
 
 [dependencies]
 pyo3 = { version = "0.24.0", features = [] }
+rayon = "1.10.0"
 
 [features]
 python = []
````

{ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ocr_stringdist
-Version: 0.0.5
+Version: 0.0.7
 Classifier: Programming Language :: Rust
 Classifier: Programming Language :: Python
 Classifier: Operating System :: OS Independent
@@ -36,7 +36,7 @@ pip install ocr-stringdist
 
 ## Features
 
-- **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
+- **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models, including efficient batch processing.
 - **Unicode Support**: Arbitrary unicode strings can be compared.
 - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
 - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
@@ -60,7 +60,6 @@ distance = osd.weighted_levenshtein_distance(
     "hi", "Ini",
     cost_map=custom_map,
     symmetric=True,
-    default_cost=1.0,
 )
 print(f"Distance with custom map: {distance}")
 ```
````

{ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/README.md

````diff
@@ -21,7 +21,7 @@ pip install ocr-stringdist
 
 ## Features
 
-- **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
+- **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models, including efficient batch processing.
 - **Unicode Support**: Arbitrary unicode strings can be compared.
 - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
 - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
@@ -45,7 +45,6 @@ distance = osd.weighted_levenshtein_distance(
     "hi", "Ini",
     cost_map=custom_map,
     symmetric=True,
-    default_cost=1.0,
 )
 print(f"Distance with custom map: {distance}")
 ```
````

ocr_stringdist-0.0.7/docs/source/api/index.rst

````diff
@@ -0,0 +1,23 @@
+.. _api_reference:
+
+API Reference
+=============
+
+This page contains the auto-generated API reference documentation.
+
+.. automodule:: ocr_stringdist.levenshtein
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+.. automodule:: ocr_stringdist.matching
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+.. autodata:: ocr_stringdist.default_ocr_distances.ocr_distance_map
+    :annotation:
+.. literalinclude:: ../../../python/ocr_stringdist/default_ocr_distances.py
+    :language: python
+    :start-after: OCR_DISTANCE_MAP_START
+    :end-before: OCR_DISTANCE_MAP_END
````

ocr_stringdist-0.0.7/examples/batch_processing.py

````diff
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+"""
+Example demonstrating the usage of the batch processing functions from ocr_stringdist.
+"""
+
+import time
+from typing import Any, Callable
+
+import ocr_stringdist as osd
+
+
+def benchmark(func: Callable, *args: Any, **kwargs: Any) -> tuple[Any, float]:  # type: ignore
+    """Run a function and return the execution time in seconds."""
+    start = time.time()
+    result = func(*args, **kwargs)
+    end = time.time()
+    return result, end - start
+
+
+def compare_methods() -> None:
+    """
+    Compare the performance of different methods for calculating Levenshtein distances.
+    """
+    # Example data
+    source = "recognition"
+    candidates = ["recognition", "recogmtion", "recognltlon", "recogrtition", "recognitton"] * 1000
+
+    print("\nSingle string against multiple candidates:")
+    print("-" * 50)
+
+    # Standard loop approach
+    _, time_loop = benchmark(
+        lambda: [osd.weighted_levenshtein_distance(source, cand) for cand in candidates]
+    )
+    print(
+        f"Loop of single calls: {time_loop:.6f} seconds "
+        f"({1000 * time_loop / len(candidates):.6f}ms each)"
+    )
+
+    # Batch approach
+    _, time_batch = benchmark(osd.batch_weighted_levenshtein_distance, source, candidates)
+    print(
+        f"Batch function: {time_batch:.6f} seconds "
+        f"({1000 * time_batch / len(candidates):.6f}ms each)"
+    )
+    print(f"Speedup: {time_loop / time_batch:.2f}x")
+
+
+def main() -> None:
+    """Main function."""
+    print("Demonstrating batch processing functions from ocr_stringdist\n")
+
+    # Run the benchmarks
+    compare_methods()
+
+
+if __name__ == "__main__":
+    main()
````

ocr_stringdist-0.0.7/python/ocr_stringdist/__init__.py

````diff
@@ -0,0 +1,10 @@
+from .default_ocr_distances import ocr_distance_map
+from .levenshtein import batch_weighted_levenshtein_distance, weighted_levenshtein_distance
+from .matching import find_best_candidate
+
+__all__ = [
+    "ocr_distance_map",
+    "weighted_levenshtein_distance",
+    "batch_weighted_levenshtein_distance",
+    "find_best_candidate",
+]
````

{ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/python/ocr_stringdist/default_ocr_distances.py
RENAMED

````diff
@@ -1,3 +1,5 @@
+# Start marker for literalinclude, see docs/source/api/index.rst.
+# OCR_DISTANCE_MAP_START
 ocr_distance_map: dict[tuple[str, str], float] = {
     ("O", "0"): 0.1,
     ("l", "1"): 0.1,
@@ -31,6 +33,8 @@ ocr_distance_map: dict[tuple[str, str], float] = {
     ("é", "á"): 0.7,
     ("E", "F"): 0.8,
 }
+# OCR_DISTANCE_MAP_END
+# End marker for literalinclude
 """
 Pre-defined distance map between characters, considering common OCR errors.
 The distances are between 0 and 1.
````

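
The new markers exist so the documentation can `literalinclude` just the map. As an illustrative sketch (not part of the diff), a caller might extend the pre-defined map with an additional multi-character confusion:

```python
from ocr_stringdist import ocr_distance_map, weighted_levenshtein_distance

# Merge the shipped map with a project-specific confusion; keys are
# (token, token) pairs and values are substitution costs between 0 and 1.
cost_map = {**ocr_distance_map, ("rn", "m"): 0.1}

# Expected to be about 0.1: one "rn" -> "m" substitution, assuming the
# default map offers no cheaper path (insertions/deletions still cost 1.0).
print(weighted_levenshtein_distance("corner", "comer", cost_map=cost_map))
```
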
ocr_stringdist-0.0.7/python/ocr_stringdist/levenshtein.py

````diff
@@ -0,0 +1,71 @@
+from typing import Optional
+
+from ._rust_stringdist import *  # noqa: F403
+from .default_ocr_distances import ocr_distance_map
+
+
+def weighted_levenshtein_distance(
+    s1: str,
+    s2: str,
+    /,
+    cost_map: Optional[dict[tuple[str, str], float]] = None,
+    *,
+    symmetric: bool = True,
+    default_cost: float = 1.0,
+) -> float:
+    """
+    Levenshtein distance with custom substitution costs.
+    Insertion/deletion costs are 1.
+
+    The default `cost_map` considers common OCR errors, see
+    :py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
+
+    :param s1: First string
+    :param s2: Second string
+    :param cost_map: Dictionary mapping tuples of strings ("substitution tokens") to their
+        substitution costs.
+        Only one direction needs to be configured unless `symmetric` is False.
+        Note that the runtime scales in the length of the longest substitution token.
+        Defaults to `ocr_stringdist.ocr_distance_map`.
+    :param symmetric: Should the keys of `cost_map` be considered to be symmetric? Defaults to True.
+    :param default_cost: The default substitution cost for character pairs not found in `cost_map`.
+    """
+    if cost_map is None:
+        cost_map = ocr_distance_map
+    # _weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
+    return _weighted_levenshtein_distance(  # type: ignore # noqa: F405
+        s1, s2, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
+    )
+
+
+def batch_weighted_levenshtein_distance(
+    s: str,
+    candidates: list[str],
+    /,
+    cost_map: Optional[dict[tuple[str, str], float]] = None,
+    *,
+    symmetric: bool = True,
+    default_cost: float = 1.0,
+) -> list[float]:
+    """
+    Calculate weighted Levenshtein distances between a string and multiple candidates.
+
+    This is more efficient than calling :func:`weighted_levenshtein_distance` multiple times.
+
+    :param s: The string to compare
+    :param candidates: List of candidate strings to compare against
+    :param cost_map: Dictionary mapping tuples of strings ("substitution tokens") to their
+        substitution costs.
+        Only one direction needs to be configured unless `symmetric` is False.
+        Note that the runtime scales in the length of the longest substitution token.
+        Defaults to `ocr_stringdist.ocr_distance_map`.
+    :param symmetric: Should the keys of `cost_map` be considered to be symmetric? Defaults to True.
+    :param default_cost: The default substitution cost for character pairs not found in `cost_map`.
+    :return: A list of distances corresponding to each candidate
+    """
+    if cost_map is None:
+        cost_map = ocr_distance_map
+    # _batch_weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
+    return _batch_weighted_levenshtein_distance(  # type: ignore # noqa: F405
+        s, candidates, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
+    )
````

ocr_stringdist-0.0.7/src/longest_tokens.rs

````diff
@@ -0,0 +1,48 @@
+use std::collections::HashMap;
+
+/// Calculates the length of the longest string found within the key tuples of a HashMap.
+pub fn longest_key_string_length<V>(map: &HashMap<(String, String), V>) -> usize {
+    map.keys()
+        .flat_map(|(s1, s2)| [s1.len(), s2.len()].into_iter())
+        .max()
+        .unwrap_or(1)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_longest_key_string_length_basic() {
+        let mut map = HashMap::new();
+        map.insert(("apple".to_string(), "banana".to_string()), 1); // 5, 6
+        map.insert(("kiwi".to_string(), "grapefruit".to_string()), 2); // 4, 10
+        map.insert(("short".to_string(), "tiny".to_string()), 3); // 5, 4
+
+        assert_eq!(longest_key_string_length(&map), 10); // "grapefruit"
+    }
+
+    #[test]
+    fn test_longest_key_string_length_first_element() {
+        let mut map = HashMap::new();
+        map.insert(("a_very_long_string".to_string(), "short".to_string()), 1); // 18, 5
+        map.insert(("medium".to_string(), "small".to_string()), 2); // 6, 5
+
+        assert_eq!(longest_key_string_length(&map), 18);
+    }
+
+    #[test]
+    fn test_longest_key_string_length_empty_map() {
+        let map: HashMap<(String, String), bool> = HashMap::new();
+        assert_eq!(longest_key_string_length(&map), 1);
+    }
+
+    #[test]
+    fn test_longest_key_string_length_empty_strings() {
+        let mut map = HashMap::new();
+        map.insert(("".to_string(), "".to_string()), 1);
+        map.insert(("a".to_string(), "".to_string()), 2);
+
+        assert_eq!(longest_key_string_length(&map), 1);
+    }
+}
````

ocr_stringdist-0.0.7/src/rust_stringdist.rs

````diff
@@ -0,0 +1,56 @@
+use crate::custom_levenshtein_distance_with_cost_map as _weighted_lev_with_map;
+use crate::longest_key_string_length;
+use crate::OcrCostMap;
+use pyo3::prelude::*;
+use pyo3::types::PyDict;
+use rayon::prelude::*;
+
+// Calculates the weighted Levenshtein distance with a custom cost map from Python.
+#[pyfunction]
+#[pyo3(signature = (a, b, cost_map, symmetric = true, default_cost = 1.0))]
+fn _weighted_levenshtein_distance(
+    a: &str,
+    b: &str,
+    cost_map: &Bound<'_, PyDict>,
+    symmetric: bool,
+    default_cost: f64,
+) -> PyResult<f64> {
+    let ocr_cost_map = OcrCostMap::from_py_dict(cost_map, default_cost, symmetric);
+    let max_token_characters = longest_key_string_length(&ocr_cost_map.costs);
+    Ok(_weighted_lev_with_map(
+        a,
+        b,
+        &ocr_cost_map,
+        max_token_characters,
+    ))
+}
+
+// Calculates the weighted Levenshtein distance between a string and a list of candidates.
+#[pyfunction]
+#[pyo3(signature = (s, candidates, cost_map, symmetric = true, default_cost = 1.0))]
+fn _batch_weighted_levenshtein_distance(
+    s: &str,
+    candidates: Vec<String>,
+    cost_map: &Bound<'_, PyDict>,
+    symmetric: bool,
+    default_cost: f64,
+) -> PyResult<Vec<f64>> {
+    let ocr_cost_map = OcrCostMap::from_py_dict(cost_map, default_cost, symmetric);
+    let max_token_characters = longest_key_string_length(&ocr_cost_map.costs);
+
+    // Calculate distances for each candidate in parallel
+    let distances: Vec<f64> = candidates
+        .par_iter()
+        .map(|candidate| _weighted_lev_with_map(s, candidate, &ocr_cost_map, max_token_characters))
+        .collect();
+
+    Ok(distances)
+}
+
+/// A Python module implemented in Rust.
+#[pymodule]
+pub fn _rust_stringdist(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_function(wrap_pyfunction!(_weighted_levenshtein_distance, m)?)?;
+    m.add_function(wrap_pyfunction!(_batch_weighted_levenshtein_distance, m)?)?;
+    Ok(())
+}
````

{ocr_stringdist-0.0.5 → ocr_stringdist-0.0.7}/src/weighted_levenshtein.rs

````diff
@@ -1,9 +1,12 @@
 use std::collections::HashMap;
 
+#[cfg(feature = "python")]
+use pyo3::prelude::*;
+
 #[derive(Clone, Debug)]
 pub struct OcrCostMap {
     /// Maps pairs of strings to their specific substitution cost.
-    costs: HashMap<(String, String), f64>,
+    pub costs: HashMap<(String, String), f64>,
     /// Default cost for substitutions not found in the map.
     default_substitution_cost: f64,
 }
@@ -31,6 +34,28 @@ impl OcrCostMap {
         }
     }
 
+    #[cfg(feature = "python")]
+    /// Creates an OcrCostMap from a Python dictionary.
+    /// This method is only available when the "python" feature is enabled.
+    pub fn from_py_dict<'a, D>(py_dict: &'a D, default_cost: f64, symmetric: bool) -> Self
+    where
+        D: PyDictMethods<'a>,
+    {
+        let mut substitution_costs: HashMap<(String, String), f64> = HashMap::new();
+
+        // Convert Python dictionary to Rust HashMap
+        for (key, value) in py_dict.iter() {
+            if let Ok(key_tuple) = key.extract::<(String, String)>() {
+                if let Ok(cost) = value.extract::<f64>() {
+                    substitution_costs.insert((key_tuple.0, key_tuple.1), cost);
+                }
+            }
+        }
+
+        // Create the OcrCostMap
+        Self::new(substitution_costs, default_cost, symmetric)
+    }
+
     /// Gets the substitution cost between two strings.
     /// Checks the custom map first, then falls back to the
     /// default substitution cost configured within this map instance.
@@ -55,7 +80,12 @@ impl OcrCostMap {
 
 /// Calculates custom Levenshtein distance between two strings using a provided cost map.
 /// This implementation considers string-to-string substitutions rather than just characters.
-pub fn custom_levenshtein_distance_with_cost_map(s1: &str, s2: &str, cost_map: &OcrCostMap) -> f64 {
+pub fn custom_levenshtein_distance_with_cost_map(
+    s1: &str,
+    s2: &str,
+    cost_map: &OcrCostMap,
+    max_token_characters: usize,
+) -> f64 {
     if s1 == s2 {
         return 0.0;
     }
@@ -86,7 +116,7 @@ pub fn custom_levenshtein_distance_with_cost_map(s1: &str, s2: &str, cost_map: &
     }
 
     // Limit on substring lengths to check
-    let max_substr_len =
+    let max_substr_len = max_token_characters.min(len1.max(len2));
 
     // Fill the dp matrix
     for i in 1..=len1 {
@@ -176,7 +206,7 @@ mod test {
         );
 
         assert_approx_eq(
-            custom_levenshtein_distance_with_cost_map("abc", "bbc", &cost_map),
+            custom_levenshtein_distance_with_cost_map("abc", "bbc", &cost_map, 3),
            0.1,
            1e-9,
        );
@@ -192,14 +222,14 @@ mod test {
 
        // Test that "hi" with "Ini" has a low cost due to the special substitution
        assert_approx_eq(
-            custom_levenshtein_distance_with_cost_map("hi", "Ini", &cost_map),
+            custom_levenshtein_distance_with_cost_map("hi", "Ini", &cost_map, 2),
            0.2, // Only the h->In substitution cost
            1e-9,
        );
 
        // Test another example
        assert_approx_eq(
-            custom_levenshtein_distance_with_cost_map("hello", "Inello", &cost_map),
+            custom_levenshtein_distance_with_cost_map("hello", "Inello", &cost_map, 2),
            0.2, // Only the h->In substitution cost
            1e-9,
        );
@@ -214,7 +244,7 @@ mod test {
 
        // Test multiple substitutions in the same string
        assert_approx_eq(
-            custom_levenshtein_distance_with_cost_map("hello", "Ine11o", &cost_map),
+            custom_levenshtein_distance_with_cost_map("hello", "Ine11o", &cost_map, 2),
            0.8, // 0.2 for h->In and 0.3+0.3 for l->1 twice
            1e-9,
        );
@@ -229,14 +259,14 @@ mod test {
 
        // Test the rn->m substitution
        assert_approx_eq(
-            custom_levenshtein_distance_with_cost_map("corner", "comer", &cost_map),
+            custom_levenshtein_distance_with_cost_map("corner", "comer", &cost_map, 2),
            0.1,
            1e-9,
        );
 
        // Test the cl->d substitution
        assert_approx_eq(
-            custom_levenshtein_distance_with_cost_map("class", "dass", &cost_map),
+            custom_levenshtein_distance_with_cost_map("class", "dass", &cost_map, 2),
            0.2,
            1e-9,
        );
@@ -253,14 +283,14 @@ mod test {
 
        // Test 0->O substitution (lower cost)
        assert_approx_eq(
-            custom_levenshtein_distance_with_cost_map("R0AD", "ROAD", &cost_map),
+            custom_levenshtein_distance_with_cost_map("R0AD", "ROAD", &cost_map, 1),
            0.1,
            1e-9,
        );
 
        // Test O->0 substitution (higher cost)
        assert_approx_eq(
-            custom_levenshtein_distance_with_cost_map("rOad", "r0ad", &cost_map),
+            custom_levenshtein_distance_with_cost_map("rOad", "r0ad", &cost_map, 1),
            0.5,
            1e-9,
        );
@@ -274,14 +304,14 @@ mod test {
 
        // Test substitution at start of word
        assert_approx_eq(
-            custom_levenshtein_distance_with_cost_map("rnat", "mat", &cost_map),
+            custom_levenshtein_distance_with_cost_map("rnat", "mat", &cost_map, 2),
            0.1,
            1e-9,
        );
 
        // Test substitution at end of word
        assert_approx_eq(
-            custom_levenshtein_distance_with_cost_map("burn", "bum", &cost_map),
+            custom_levenshtein_distance_with_cost_map("burn", "bum", &cost_map, 2),
            0.1,
            1e-9,
        );
@@ -294,13 +324,13 @@ mod test {
 
        // Test that "h" -> "In" costs 2.0 (1 deletion + 1 substitution) since there's no custom mapping
        assert_approx_eq(
-            custom_levenshtein_distance_with_cost_map("h", "In", &cost_map),
+            custom_levenshtein_distance_with_cost_map("h", "In", &cost_map, 1),
            2.0,
            1e-9,
        );
 
        assert_approx_eq(
-            custom_levenshtein_distance_with_cost_map("kitten", "sitting", &cost_map),
+            custom_levenshtein_distance_with_cost_map("kitten", "sitting", &cost_map, 1),
            3.0,
            1e-9,
        );
@@ -314,7 +344,7 @@ mod test {
        // - Insert 'e' (1)
        // Total: 4 operations
        assert_approx_eq(
-            custom_levenshtein_distance_with_cost_map("café", "coffee", &cost_map),
+            custom_levenshtein_distance_with_cost_map("café", "coffee", &cost_map, 1),
            4.0, // 4 edits required
            1e-9,
        );
````

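
The `max_token_characters` argument threaded through the calls above caps the substring lengths the dynamic-programming step considers at the longest key in the cost map (computed by `src/longest_tokens.rs`). On the Python side the effect is only visible through the resulting costs; a small sketch (not part of the diff) mirroring the expectations in `tests/test_batch_functions.py` below:

```python
from ocr_stringdist import batch_weighted_levenshtein_distance

# An empty cost map makes every substitution cost default_cost (1.0),
# i.e. the classic Levenshtein distance: "hello" -> "he11o" needs two edits.
plain = batch_weighted_levenshtein_distance("hello", ["he11o"], cost_map={})

# With an OCR-aware cost for ("l", "1"), the same pair is much closer:
# two substitutions at 0.2 each, about 0.4 in total.
ocr = batch_weighted_levenshtein_distance("hello", ["he11o"], cost_map={("l", "1"): 0.2})

print(plain[0], ocr[0])  # expected: 2.0 and approximately 0.4
```
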
ocr_stringdist-0.0.7/tests/test_batch_functions.py

````diff
@@ -0,0 +1,131 @@
+"""
+Unit tests for the batch processing functions.
+"""
+
+import pytest
+from ocr_stringdist import batch_weighted_levenshtein_distance, weighted_levenshtein_distance
+
+# Define a custom cost map with some OCR confusions for testing
+OCR_COST_MAP = {
+    ("l", "1"): 0.2,  # l to 1 is a common OCR error
+    ("O", "0"): 0.1,  # O to 0 is a common OCR error
+    ("o", "0"): 0.1,
+    ("m", "rn"): 0.3,  # m to rn is a common OCR error
+}
+
+
+@pytest.mark.parametrize(
+    ["source", "candidates", "cost_map"],
+    [
+        (
+            "recognition",
+            ["recognition", "recogmtion", "recognltlon", "recogrtition", "recognitton"],
+            None,
+        ),
+        (
+            "hello",
+            ["hello", "he11o", "hell0"],
+            OCR_COST_MAP,
+        ),
+        (
+            "algorithm",
+            ["algorithm", "algorlthm", "a1gorithm"],
+            OCR_COST_MAP,
+        ),
+    ],
+)
+def test_batch_vs_individual(
+    source: str, candidates: list[str], cost_map: dict[tuple[str, str], float]
+) -> None:
+    """Test that batch results match individual function calls."""
+    # Individual results
+    individual_results = [
+        weighted_levenshtein_distance(source, candidate, cost_map=cost_map)
+        for candidate in candidates
+    ]
+
+    # Batch results
+    batch_results = batch_weighted_levenshtein_distance(source, candidates, cost_map=cost_map)
+
+    # Compare results
+    for ind, batch in zip(individual_results, batch_results):
+        assert ind == pytest.approx(batch)
+
+
+@pytest.mark.parametrize(
+    ["source", "candidates", "expected_indices"],
+    [
+        (
+            "hello",
+            ["hello", "he11o", "hell0", "hallo", "help"],
+            [0],  # exact match should be the best
+        ),
+        (
+            "algorithm",
+            ["a1gorithm", "algorithm", "algorlthm", "alg0rithm"],
+            [1],  # exact match should be the best
+        ),
+        (
+            "recognition",
+            ["wreck", "cognition", "recogmition", "wreckognition"],
+            [2],  # "recogmtion" should be closest to "recognition"
+        ),
+    ],
+)
+def test_batch_finds_best_match(
+    source: str, candidates: list[str], expected_indices: list[int]
+) -> None:
+    """Test that batch processing correctly identifies the best match."""
+    # Using OCR cost map
+    distances = batch_weighted_levenshtein_distance(source, candidates, cost_map=OCR_COST_MAP)
+    print(f"------------------------------------distances: {distances}")
+
+    # Find the index with minimum distance
+    min_index = distances.index(min(distances))
+
+    # Check if the minimum index is in the expected indices
+    assert min_index in expected_indices
+
+
+@pytest.mark.parametrize(
+    ["test_string", "expected_distance"],
+    [
+        ("hello", 0.0),  # exact match
+        ("he11o", 0.4),  # two l->1 substitutions at cost 0.2 each
+        ("hell0", 0.1),  # one O->0 substitution at cost 0.1
+    ],
+)
+def test_custom_cost_map(test_string: str, expected_distance: float) -> None:
+    """Test using a custom cost map for specific substitution costs."""
+    result = weighted_levenshtein_distance("hello", test_string, cost_map=OCR_COST_MAP)
+    assert result == pytest.approx(expected_distance)
+
+    # Check that batch processing gives the same result
+    batch_result = batch_weighted_levenshtein_distance(
+        "hello", [test_string], cost_map=OCR_COST_MAP
+    )[0]
+    assert batch_result == pytest.approx(expected_distance)
+
+
+@pytest.mark.parametrize(
+    ["string1", "string2", "default_map_distance", "custom_map_distance"],
+    [
+        ("hello", "he11o", 2.0, 0.4),  # l->1 costs 0.2 each instead of 1.0 each
+        ("hello", "hell0", 1.0, 0.1),  # o->0 costs 0.1 instead of 1.0
+        ("come", "corne", 2.0, 0.3),  # rn->m costs 0.3 instead of 2.0
+    ],
+)
+def test_empty_vs_default_cost_map(
+    string1: str, string2: str, default_map_distance: float, custom_map_distance: float
+) -> None:
+    """Test that empty cost maps produce different results than default cost maps."""
+    # With empty cost map (all costs are 1.0)
+    default_result = batch_weighted_levenshtein_distance(string1, [string2], cost_map={})
+    assert default_result[0] == pytest.approx(default_map_distance)
+
+    # With custom cost map (OCR-specific costs)
+    custom_result = batch_weighted_levenshtein_distance(string1, [string2], cost_map=OCR_COST_MAP)
+    assert custom_result[0] == pytest.approx(custom_map_distance)
+
+    # Custom map should give lower distance for OCR errors
+    assert custom_result[0] < default_result[0]
````

ocr_stringdist-0.0.5/docs/source/api/index.rst

````diff
@@ -1,18 +0,0 @@
-.. _api_reference:
-
-API Reference
-=============
-
-This page contains the auto-generated API reference documentation.
-
-.. autofunction:: ocr_stringdist.__init__.weighted_levenshtein_distance
-
-.. automodule:: ocr_stringdist.matching
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-.. automodule:: ocr_stringdist.default_ocr_distances
-    :members:
-    :undoc-members:
-    :show-inheritance:
````

ocr_stringdist-0.0.5/python/ocr_stringdist/__init__.py

````diff
@@ -1,42 +0,0 @@
-from typing import Optional
-
-from ._rust_stringdist import *  # noqa: F403
-from .default_ocr_distances import ocr_distance_map
-from .matching import find_best_candidate
-
-__all__ = [
-    "ocr_distance_map",
-    "weighted_levenshtein_distance",  # noqa: F405
-    "find_best_candidate",
-]
-
-
-def weighted_levenshtein_distance(
-    s1: str,
-    s2: str,
-    /,
-    cost_map: Optional[dict[tuple[str, str], float]] = None,
-    *,
-    symmetric: bool = True,
-    default_cost: float = 1.0,
-) -> float:
-    """
-    Levenshtein distance with custom substitution costs.
-    Insertion/deletion costs are 1.
-
-    The default `cost_map` considers common OCR errors, see `ocr_stringdist.ocr_distance_map`.
-
-    :param s1: First string
-    :param s2: Second string
-    :param cost_map: Dictionary mapping tuples of characters to their substitution cost.
-        Only one direction needs to be configured unless `symmetric` is False.
-        Defaults to `ocr_stringdist.ocr_distance_map`.
-    :param symmetric: Should the keys of `cost_map` be considered to be symmetric? Defaults to True.
-    :param default_cost: The default substitution cost for character pairs not found in `cost_map`.
-    """
-    if cost_map is None:
-        cost_map = ocr_distance_map
-    # _weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
-    return _weighted_levenshtein_distance(  # type: ignore # noqa: F405
-        s1, s2, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
-    )
````

ocr_stringdist-0.0.5/src/rust_stringdist.rs

````diff
@@ -1,39 +0,0 @@
-use crate::custom_levenshtein_distance_with_cost_map as _weighted_lev_with_map;
-use crate::OcrCostMap;
-use pyo3::prelude::*;
-use pyo3::types::PyDict;
-use std::collections::HashMap;
-
-// Calculates the weighted Levenshtein distance with a custom cost map from Python.
-#[pyfunction]
-#[pyo3(signature = (a, b, cost_map, symmetric = true, default_cost = 1.0))]
-fn _weighted_levenshtein_distance(
-    a: &str,
-    b: &str,
-    cost_map: &Bound<'_, PyDict>,
-    symmetric: bool,
-    default_cost: Option<f64>,
-) -> PyResult<f64> {
-    let default_cost_value = default_cost.unwrap_or(1.0);
-    let mut substitution_costs: HashMap<(String, String), f64> = HashMap::new();
-
-    // Convert Python dictionary to Rust HashMap
-    for (key, value) in cost_map.iter() {
-        if let Ok(key_tuple) = key.extract::<(String, String)>() {
-            if let Ok(cost) = value.extract::<f64>() {
-                substitution_costs.insert((key_tuple.0, key_tuple.1), cost);
-            }
-        }
-    }
-
-    // Create a custom cost map and calculate the distance
-    let custom_cost_map = OcrCostMap::new(substitution_costs, default_cost_value, symmetric);
-    Ok(_weighted_lev_with_map(a, b, &custom_cost_map))
-}
-
-/// A Python module implemented in Rust.
-#[pymodule]
-pub fn _rust_stringdist(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
-    m.add_function(wrap_pyfunction!(_weighted_levenshtein_distance, m)?)?;
-    Ok(())
-}
````
