PyPI - ocr-stringdist - Versions diffs - 0.1.0__cp310-cp310-win_amd64.whl → 0.2.1__cp310-cp310-win_amd64.whl - Mend

ocr-stringdist 0.1.0__cp310-cp310-win_amd64.whl → 0.2.1__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

ocr_stringdist/__init__.py +8 -1
ocr_stringdist/_rust_stringdist.cp310-win_amd64.pyd +0 -0
ocr_stringdist/levenshtein.py +148 -24
ocr_stringdist-0.2.1.dist-info/METADATA +81 -0
ocr_stringdist-0.2.1.dist-info/RECORD +10 -0
{ocr_stringdist-0.1.0.dist-info → ocr_stringdist-0.2.1.dist-info}/WHEEL +1 -1
ocr_stringdist-0.1.0.dist-info/METADATA +0 -85
ocr_stringdist-0.1.0.dist-info/RECORD +0 -10
{ocr_stringdist-0.1.0.dist-info → ocr_stringdist-0.2.1.dist-info}/licenses/LICENSE +0 -0

ocr_stringdist/__init__.py CHANGED Viewed

@@ -1,10 +1,17 @@
 from .default_ocr_distances import ocr_distance_map
-from .levenshtein import batch_weighted_levenshtein_distance, weighted_levenshtein_distance
+from .levenshtein import (
+    WeightedLevenshtein,
+    batch_weighted_levenshtein_distance,
+    explain_weighted_levenshtein,
+    weighted_levenshtein_distance,
+)
 from .matching import find_best_candidate
 __all__ = [
     "ocr_distance_map",
+    "WeightedLevenshtein",
     "weighted_levenshtein_distance",
     "batch_weighted_levenshtein_distance",
+    "explain_weighted_levenshtein",
     "find_best_candidate",
 ]

ocr_stringdist/_rust_stringdist.cp310-win_amd64.pyd CHANGED Viewed

Binary file

ocr_stringdist/levenshtein.py CHANGED Viewed

@@ -1,8 +1,95 @@
-from typing import Optional
+from __future__ import annotations
-from ._rust_stringdist import *  # noqa: F403
+from dataclasses import dataclass
+from typing import Literal, Optional
+from ._rust_stringdist import (
+    _batch_weighted_levenshtein_distance,
+    _explain_weighted_levenshtein_distance,
+    _weighted_levenshtein_distance,
+)
 from .default_ocr_distances import ocr_distance_map
+OperationType = Literal["substitute", "insert", "delete"]
+@dataclass(frozen=True)
+class EditOperation:
+    """
+    Represents a single edit operation (substitution, insertion, or deletion).
+    """
+    op_type: OperationType
+    source_token: Optional[str]
+    target_token: Optional[str]
+    cost: float
+class WeightedLevenshtein:
+    """
+    Calculates Levenshtein distance with custom, configurable costs.
+    This class is initialized with cost dictionaries and settings that define
+    how the distance is measured. Once created, its methods can be used to
+    efficiently compute distances and explain the edit operations.
+    :param substitution_costs: Maps (char, char) tuples to their substitution cost.
+                               Defaults to costs based on common OCR errors.
+    :param insertion_costs: Maps a character to its insertion cost.
+    :param deletion_costs: Maps a character to its deletion cost.
+    :param symmetric_substitution: If True, substitution costs are bidirectional.
+    :param default_substitution_cost: Default cost for substitutions not in the map.
+    :param default_insertion_cost: Default cost for insertions not in the map.
+    :param default_deletion_cost: Default cost for deletions not in the map.
+    """
+    substitution_costs: dict[tuple[str, str], float]
+    insertion_costs: dict[str, float]
+    deletion_costs: dict[str, float]
+    symmetric_substitution: bool
+    default_substitution_cost: float
+    default_insertion_cost: float
+    default_deletion_cost: float
+    def __init__(
+        self,
+        substitution_costs: Optional[dict[tuple[str, str], float]] = None,
+        insertion_costs: Optional[dict[str, float]] = None,
+        deletion_costs: Optional[dict[str, float]] = None,
+        *,
+        symmetric_substitution: bool = True,
+        default_substitution_cost: float = 1.0,
+        default_insertion_cost: float = 1.0,
+        default_deletion_cost: float = 1.0,
+    ) -> None:
+        self.substitution_costs = (
+            ocr_distance_map if substitution_costs is None else substitution_costs
+        )
+        self.insertion_costs = {} if insertion_costs is None else insertion_costs
+        self.deletion_costs = {} if deletion_costs is None else deletion_costs
+        self.symmetric_substitution = symmetric_substitution
+        self.default_substitution_cost = default_substitution_cost
+        self.default_insertion_cost = default_insertion_cost
+        self.default_deletion_cost = default_deletion_cost
+    @classmethod
+    def unweighted(cls) -> WeightedLevenshtein:
+        """Creates an instance with all operations having equal cost of 1.0."""
+        return cls(substitution_costs={}, insertion_costs={}, deletion_costs={})
+    def distance(self, s1: str, s2: str) -> float:
+        """Calculates the weighted Levenshtein distance between two strings."""
+        return _weighted_levenshtein_distance(s1, s2, **self.__dict__)  # type: ignore[no-any-return]
+    def explain(self, s1: str, s2: str) -> list[EditOperation]:
+        """Returns the list of edit operations to transform s1 into s2."""
+        raw_path = _explain_weighted_levenshtein_distance(s1, s2, **self.__dict__)
+        return [EditOperation(*op) for op in raw_path]
+    def batch_distance(self, s: str, candidates: list[str]) -> list[float]:
+        """Calculates distances between a string and a list of candidates."""
+        return _batch_weighted_levenshtein_distance(s, candidates, **self.__dict__)  # type: ignore[no-any-return]
 def weighted_levenshtein_distance(
     s1: str,
@@ -20,6 +107,8 @@ def weighted_levenshtein_distance(
     """
     Levenshtein distance with custom substitution, insertion and deletion costs.
+    See also :meth:`WeightedLevenshtein.distance`.
     The default `substitution_costs` considers common OCR errors, see
     :py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
@@ -41,16 +130,7 @@ def weighted_levenshtein_distance(
     :param default_deletion_cost: The default deletion cost for characters not found in
                                   `deletion_costs`.
     """
-    if substitution_costs is None:
-        substitution_costs = ocr_distance_map
-    if insertion_costs is None:
-        insertion_costs = {}
-    if deletion_costs is None:
-        deletion_costs = {}
-    # _weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
-    return _weighted_levenshtein_distance(  # type: ignore  # noqa: F405
-        s1,
-        s2,
+    return WeightedLevenshtein(
         substitution_costs=substitution_costs,
         insertion_costs=insertion_costs,
         deletion_costs=deletion_costs,
@@ -58,7 +138,7 @@ def weighted_levenshtein_distance(
         default_substitution_cost=default_substitution_cost,
         default_insertion_cost=default_insertion_cost,
         default_deletion_cost=default_deletion_cost,
-    )
+    ).distance(s1, s2)
 def batch_weighted_levenshtein_distance(
@@ -77,6 +157,8 @@ def batch_weighted_levenshtein_distance(
     """
     Calculate weighted Levenshtein distances between a string and multiple candidates.
+    See also :meth:`WeightedLevenshtein.batch_distance`.
     This is more efficient than calling :func:`weighted_levenshtein_distance` multiple times.
     :param s: The string to compare (interpreted as the string read via OCR)
@@ -98,16 +180,58 @@ def batch_weighted_levenshtein_distance(
                                   `deletion_costs`.
     :return: A list of distances corresponding to each candidate
     """
-    if substitution_costs is None:
-        substitution_costs = ocr_distance_map
-    if insertion_costs is None:
-        insertion_costs = {}
-    if deletion_costs is None:
-        deletion_costs = {}
-    # _batch_weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
-    return _batch_weighted_levenshtein_distance(  # type: ignore  # noqa: F405
-        s,
-        candidates,
+    return WeightedLevenshtein(
+        substitution_costs=substitution_costs,
+        insertion_costs=insertion_costs,
+        deletion_costs=deletion_costs,
+        symmetric_substitution=symmetric_substitution,
+        default_substitution_cost=default_substitution_cost,
+        default_insertion_cost=default_insertion_cost,
+        default_deletion_cost=default_deletion_cost,
+    ).batch_distance(s, candidates)
+def explain_weighted_levenshtein(
+    s1: str,
+    s2: str,
+    /,
+    substitution_costs: Optional[dict[tuple[str, str], float]] = None,
+    insertion_costs: Optional[dict[str, float]] = None,
+    deletion_costs: Optional[dict[str, float]] = None,
+    *,
+    symmetric_substitution: bool = True,
+    default_substitution_cost: float = 1.0,
+    default_insertion_cost: float = 1.0,
+    default_deletion_cost: float = 1.0,
+) -> list[EditOperation]:
+    """
+    Computes the path of operations associated with the custom Levenshtein distance.
+    See also :meth:`WeightedLevenshtein.explain`.
+    The default `substitution_costs` considers common OCR errors, see
+    :py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
+    :param s1: First string (interpreted as the string read via OCR)
+    :param s2: Second string
+    :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
+                     substitution costs. Only one direction needs to be configured unless
+                     `symmetric_substitution` is False.
+                     Note that the runtime scales in the length of the longest substitution token.
+                     Defaults to `ocr_stringdist.ocr_distance_map`.
+    :param insertion_costs: Dictionary mapping strings to their insertion costs.
+    :param deletion_costs: Dictionary mapping strings to their deletion costs.
+    :param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
+                                   symmetric? Defaults to True.
+    :param default_substitution_cost: The default substitution cost for character pairs not found
+                                      in `substitution_costs`.
+    :param default_insertion_cost: The default insertion cost for characters not found in
+                                   `insertion_costs`.
+    :param default_deletion_cost: The default deletion cost for characters not found in
+                                  `deletion_costs`.
+    :return: List of :class:`EditOperation` instances.
+    """
+    return WeightedLevenshtein(
         substitution_costs=substitution_costs,
         insertion_costs=insertion_costs,
         deletion_costs=deletion_costs,
@@ -115,4 +239,4 @@ def batch_weighted_levenshtein_distance(
         default_substitution_cost=default_substitution_cost,
         default_insertion_cost=default_insertion_cost,
         default_deletion_cost=default_deletion_cost,
-    )
+    ).explain(s1, s2)

ocr_stringdist-0.2.1.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,81 @@
+Metadata-Version: 2.4
+Name: ocr_stringdist
+Version: 0.2.1
+Classifier: Programming Language :: Rust
+Classifier: Programming Language :: Python
+Classifier: Operating System :: OS Independent
+License-File: LICENSE
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
+Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
+# OCR-StringDist
+A Python library for fast string distance calculations that account for common OCR (optical character recognition) errors.
+Documentation: https://niklasvonm.github.io/ocr-stringdist/
+[![PyPI](https://img.shields.io/badge/PyPI-Package-blue)](https://pypi.org/project/ocr-stringdist/)
+[![License](https://img.shields.io/badge/License-MIT-green)](LICENSE)
+## Overview
+Standard string distances (like Levenshtein) treat all character substitutions equally. This is suboptimal for text read from images via OCR, where errors like `O` vs `0` are far more common than, say, `O` vs `X`.
+OCR-StringDist uses a **weighted Levenshtein distance**, assigning lower costs to common OCR errors.
+**Example:** Matching against the correct word `CODE`:
+* **Standard Levenshtein:**
+    * $d(\text{"CODE"}, \text{"C0DE"}) = 1$ (O → 0)
+    * $d(\text{"CODE"}, \text{"CXDE"}) = 1$ (O → X)
+    * Result: Both appear equally likely/distant.
+* **OCR-StringDist (Weighted):**
+    * $d(\text{"CODE"}, \text{"C0DE"}) \approx 0.1$ (common error, low cost)
+    * $d(\text{"CODE"}, \text{"CXDE"}) = 1.0$ (unlikely error, high cost)
+    * Result: Correctly identifies `C0DE` as a much closer match.
+This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes, database entries).
+> **Note:** This project is in early development. APIs may change in future releases.
+## Installation
+```bash
+pip install ocr-stringdist
+```
+## Features
+- **Weighted Levenshtein Distance**: Calculates Levenshtein distance with customizable costs for substitutions, insertions, and deletions. Includes an efficient batch version (`batch_weighted_levenshtein_distance`) for comparing one string against many candidates.
+- **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
+- **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
+- **Unicode Support**: Works with arbitrary Unicode strings.
+- **Best Match Finder**: Includes a utility function `find_best_candidate` to efficiently find the best match from a list based on _any_ distance function.
+## Usage
+### Weighted Levenshtein Distance
+```python
+import ocr_stringdist as osd
+# Using default OCR distance map
+distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
+print(f"Distance between 'OCR5' and 'OCRS': {distance}")  # Will be less than 1.0
+# Custom cost map
+substitution_costs = {("In", "h"): 0.5}
+distance = osd.weighted_levenshtein_distance(
+    "hi", "Ini",
+    substitution_costs=substitution_costs,
+    symmetric_substitution=True,
+)
+print(f"Distance with custom map: {distance}")
+```
+## Acknowledgements
+This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), providing the base implementations of the algorithms used here.

ocr_stringdist-0.2.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,10 @@
+ocr_stringdist-0.2.1.dist-info/METADATA,sha256=I9_z87cgY0Ncejl5laQLI8ad1w_ZLHK1uf9S0iSMP24,3389
+ocr_stringdist-0.2.1.dist-info/WHEEL,sha256=Iz7QqxpWQRXToFIDkGspPPKDuV_klwuhW8ziiU5jhR8,96
+ocr_stringdist-0.2.1.dist-info/licenses/LICENSE,sha256=3cNRiJag5vI0KMMDNf0oiaY4vg43rLxRszbMJs1GBoU,1092
+ocr_stringdist/__init__.py,sha256=37hKeJm1qxv_DptaciEPby-7h2yojwr0djherLI88Hk,484
+ocr_stringdist/_rust_stringdist.cp310-win_amd64.pyd,sha256=-Q3hQ1TV79b4PcZR8g7u2KkN2Mn2zm4tjlpuDGkpgnA,411136
+ocr_stringdist/default_ocr_distances.py,sha256=vlhzQCCcE-D1xor5RvMW0oaMuL_HP_5Y7SO4ESkdb4w,1075
+ocr_stringdist/levenshtein.py,sha256=OzmkqSr2sxzTTo-cEf0qAfZAUIMdgj8dhucyL2txDnk,11485
+ocr_stringdist/matching.py,sha256=KEzYBBEHZhfLA9eD3MxDaehKiD9lUb0RQq74u5qWpVw,3376
+ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ocr_stringdist-0.2.1.dist-info/RECORD,,

{ocr_stringdist-0.1.0.dist-info → ocr_stringdist-0.2.1.dist-info}/WHEEL RENAMED Viewed

@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: maturin (1.8.3)
+Generator: maturin (1.9.4)
 Root-Is-Purelib: false
 Tag: cp310-cp310-win_amd64

ocr_stringdist-0.1.0.dist-info/METADATA DELETED Viewed

@@ -1,85 +0,0 @@
-Metadata-Version: 2.4
-Name: ocr_stringdist
-Version: 0.1.0
-Classifier: Programming Language :: Rust
-Classifier: Programming Language :: Python
-Classifier: Operating System :: OS Independent
-License-File: LICENSE
-Summary: String distances considering OCR errors.
-Author: Niklas von Moers <niklasvmoers@protonmail.com>
-Author-email: Niklas von Moers <niklasvmoers@protonmail.com>
-License: MIT
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
-Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
-# OCR-StringDist
-A Python library for string distance calculations that account for common OCR (optical character recognition) errors.
-Documentation: https://niklasvonm.github.io/ocr-stringdist/
-[![PyPI](https://img.shields.io/badge/PyPI-Package-blue)](https://pypi.org/project/ocr-stringdist/)
-[![License](https://img.shields.io/badge/License-MIT-green)](LICENSE)
-## Overview
-OCR-StringDist provides specialized string distance algorithms that accommodate for optical character recognition (OCR) errors. Unlike traditional string comparison algorithms, OCR-StringDist considers common OCR confusions (like "0" vs "O", "6" vs "G", etc.) when calculating distances between strings.
-> **Note:** This project is in early development. APIs may change in future releases.
-## Installation
-```bash
-pip install ocr-stringdist
-```
-## Features
-- **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models, including efficient batch processing.
-- **Unicode Support**: Arbitrary unicode strings can be compared.
-- **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
-- **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
-- **Best Match Finder**: Utility function `find_best_candidate` to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
-## Usage
-### Weighted Levenshtein Distance
-```python
-import ocr_stringdist as osd
-# Using default OCR distance map
-distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
-print(f"Distance between 'OCR5' and 'OCRS': {distance}")  # Will be less than 1.0
-# Custom cost map
-custom_map = {("In", "h"): 0.5}
-distance = osd.weighted_levenshtein_distance(
-    "hi", "Ini",
-    cost_map=custom_map,
-    symmetric=True,
-)
-print(f"Distance with custom map: {distance}")
-```
-### Finding the Best Candidate
-```python
-import ocr_stringdist as osd
-s = "apple"
-candidates = ["apply", "apples", "orange", "appIe"]  # 'appIe' has an OCR-like error
-def ocr_aware_distance(s1: str, s2: str) -> float:
-    return osd.weighted_levenshtein_distance(s1, s2, cost_map={("l", "I"): 0.1})
-best_candidate, best_dist = osd.find_best_candidate(s, candidates, ocr_aware_distance)
-print(f"Best candidate for '{s}' is '{best_candidate}' with distance {best_dist}")
-# Output: Best candidate for 'apple' is 'appIe' with distance 0.1
-```
-## Acknowledgements
-This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), providing the base implementations of the algorithms used here.

ocr_stringdist-0.1.0.dist-info/RECORD DELETED Viewed

@@ -1,10 +0,0 @@
-ocr_stringdist-0.1.0.dist-info/METADATA,sha256=O6qY72O8qG2qHJlqdvSkK-_ka_dt6xhQHI5Z3qOIz5Q,3457
-ocr_stringdist-0.1.0.dist-info/WHEEL,sha256=77DqkvxB4HqZitBRK_M49NRS207JKb0MotMEjnxEWQ8,96
-ocr_stringdist-0.1.0.dist-info/licenses/LICENSE,sha256=3cNRiJag5vI0KMMDNf0oiaY4vg43rLxRszbMJs1GBoU,1092
-ocr_stringdist/default_ocr_distances.py,sha256=vlhzQCCcE-D1xor5RvMW0oaMuL_HP_5Y7SO4ESkdb4w,1075
-ocr_stringdist/levenshtein.py,sha256=IQLNcTMVdRqo6hWEYErA5qhlzvQNjwFuXPV24yQoeP0,5745
-ocr_stringdist/matching.py,sha256=KEzYBBEHZhfLA9eD3MxDaehKiD9lUb0RQq74u5qWpVw,3376
-ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ocr_stringdist/__init__.py,sha256=MkIgLBJKXQRGfRoEdbrKBxwlRJKV85w-_jBdYDeH__0,342
-ocr_stringdist/_rust_stringdist.cp310-win_amd64.pyd,sha256=6AIx2_jaQg62xdJ_t1GazPy1AJRwuH28jJrBWWJ6suQ,398336
-ocr_stringdist-0.1.0.dist-info/RECORD,,

{ocr_stringdist-0.1.0.dist-info → ocr_stringdist-0.2.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes