ocr-stringdist 0.3.0__cp311-cp311-musllinux_1_1_i686.whl → 1.0.1__cp311-cp311-musllinux_1_1_i686.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocr_stringdist/__init__.py +5 -9
- ocr_stringdist/_rust_stringdist.cpython-311-i386-linux-musl.so +0 -0
- ocr_stringdist/edit_operation.py +5 -2
- ocr_stringdist/learner.py +111 -115
- ocr_stringdist/levenshtein.py +93 -158
- ocr_stringdist/protocols.py +9 -0
- {ocr_stringdist-0.3.0.dist-info → ocr_stringdist-1.0.1.dist-info}/METADATA +33 -49
- ocr_stringdist-1.0.1.dist-info/RECORD +14 -0
- ocr_stringdist-0.3.0.dist-info/RECORD +0 -13
- {ocr_stringdist-0.3.0.dist-info → ocr_stringdist-1.0.1.dist-info}/WHEEL +0 -0
- {ocr_stringdist-0.3.0.dist-info → ocr_stringdist-1.0.1.dist-info}/licenses/LICENSE +0 -0
ocr_stringdist/__init__.py CHANGED

@@ -1,17 +1,13 @@
 from .default_ocr_distances import ocr_distance_map
-from .levenshtein import (
-    WeightedLevenshtein,
-    batch_weighted_levenshtein_distance,
-    explain_weighted_levenshtein,
-    weighted_levenshtein_distance,
-)
+from .edit_operation import EditOperation
+from .learner import CostLearner
+from .levenshtein import WeightedLevenshtein
 from .matching import find_best_candidate
 
 __all__ = [
     "ocr_distance_map",
+    "EditOperation",
+    "CostLearner",
     "WeightedLevenshtein",
-    "weighted_levenshtein_distance",
-    "batch_weighted_levenshtein_distance",
-    "explain_weighted_levenshtein",
     "find_best_candidate",
 ]
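For orientation, the removed module-level functions map onto methods of `WeightedLevenshtein`. A minimal migration sketch, assuming the constructor defaults and the method names (`distance`, `batch_distance`, `explain`) that appear elsewhere in this diff:

```python
from ocr_stringdist import WeightedLevenshtein

# Defaults to the built-in OCR distance map, as the old module-level functions did.
wl = WeightedLevenshtein()

# 0.3.0: weighted_levenshtein_distance("C0DE", "CODE")
print(wl.distance("C0DE", "CODE"))

# 0.3.0: batch_weighted_levenshtein_distance("C0DE", ["CODE", "CADE"])
print(wl.batch_distance("C0DE", ["CODE", "CADE"]))

# 0.3.0: explain_weighted_levenshtein("C0DE", "CODE")
print(wl.explain("C0DE", "CODE"))
```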
ocr_stringdist/_rust_stringdist.cpython-311-i386-linux-musl.so CHANGED

Binary file (contents not shown).
ocr_stringdist/edit_operation.py CHANGED

@@ -1,5 +1,5 @@
-from dataclasses import dataclass
-from typing import Literal, Optional
+from dataclasses import asdict, dataclass
+from typing import Any, Literal, Optional
 
 OperationType = Literal["substitute", "insert", "delete", "match"]
 
@@ -14,3 +14,6 @@ class EditOperation:
     source_token: Optional[str]
     target_token: Optional[str]
     cost: float
+
+    def to_dict(self) -> dict[str, Any]:
+        return asdict(self)
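`EditOperation` is a dataclass, so the new `to_dict` is simply `dataclasses.asdict`. A small sketch of serializing an alignment to JSON, assuming the `explain` method shown elsewhere in this diff (which, per the removed `explain_weighted_levenshtein`, filters plain matches by default):

```python
import json

from ocr_stringdist import WeightedLevenshtein

wl = WeightedLevenshtein.unweighted()
ops = wl.explain("Hell0", "Hello")  # only the actual edits remain by default

# Each EditOperation becomes a plain dict of its fields, ready for json.dumps.
print(json.dumps([op.to_dict() for op in ops], indent=2))
```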
ocr_stringdist/learner.py CHANGED

@@ -1,3 +1,4 @@
+import itertools
 import math
 from collections import defaultdict
 from collections.abc import Iterable
@@ -7,12 +8,12 @@ from typing import TYPE_CHECKING, Callable, Optional
 if TYPE_CHECKING:
     from .edit_operation import EditOperation
     from .levenshtein import WeightedLevenshtein
+    from .protocols import Aligner
 
 CostFunction = Callable[[float], float]
 
 
 def negative_log_likelihood(probability: float) -> float:
-    """Standard cost function based on information theory. Common errors get low cost."""
     if probability <= 0.0:
         raise ValueError("Probability must be positive to compute negative log likelihood.")
     return -math.log(probability)
@@ -26,6 +27,7 @@ class TallyCounts:
     insertions: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
     deletions: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
     source_chars: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
+    target_chars: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
     vocab: set[str] = field(default_factory=set)
 
 
@@ -36,7 +38,7 @@ class _Costs:
     deletions: dict[str, float]
 
 
-class Learner:
+class CostLearner:
     """
     Configures and executes the process of learning Levenshtein costs from data.
 
@@ -45,17 +47,16 @@ class Learner:
 
     Example::
 
-        from ocr_stringdist import Learner
+        from ocr_stringdist import CostLearner
 
         data = [
            ("Hell0", "Hello"),
        ]
-        learner = Learner().with_smoothing(1.0)
+        learner = CostLearner().with_smoothing(1.0)
        wl = learner.fit(data)  # Substitution 0 -> o learned with cost < 1.0
    """
 
    # Configuration parameters
-    _cost_function: CostFunction
    _smoothing_k: float
 
    # These attributes are set during fitting
@@ -63,71 +64,37 @@ class CostLearner:
     vocab_size: Optional[int] = None
 
     def __init__(self) -> None:
-        self._cost_function = negative_log_likelihood
         self._smoothing_k = 1.0
 
-    def with_cost_function(self, cost_function: CostFunction) -> "Learner":
-        """
-        Sets a custom function to convert probabilities to costs.
-
-        :param cost_function: A callable that takes a float (probability)
-            and returns a float (cost).
-            Is negative log likelihood unless overridden.
-        :return: The Learner instance for method chaining.
-        """
-        self._cost_function = cost_function
-        return self
-
-    def with_smoothing(self, k: float) -> "Learner":
+    def with_smoothing(self, k: float) -> "CostLearner":
         r"""
         Sets the smoothing parameter `k`.
 
-        This parameter controls how strongly the model defaults to uniform …
+        This parameter controls how strongly the model defaults to a uniform
+        probability distribution by adding a "pseudo-count" of `k` to every
+        possible event.
 
-        :param k: The smoothing factor, which must be a positive number.
-        :return: The Learner instance for method chaining.
+        :param k: The smoothing factor, which must be a non-negative number.
+        :return: The CostLearner instance for method chaining.
+        :raises ValueError: If k < 0.
 
         Notes
         -----
-        [… twelve removed lines are truncated in the source diff …]
-        - **k > 1.0:** A larger `k` is useful for small or noisy datasets. It
-          regularizes the model by pulling the learned probabilities closer
-          to a uniform distribution.
-
-        **Bayesian Interpretation**
-
-        From a Bayesian perspective, `k` serves as the concentration parameter,
-        :math:`\alpha`, of a **symmetric Dirichlet prior distribution**.
-        This distribution acts as the conjugate prior for the
-        multinomial distribution of the observed error counts.
-
-        The smoothed probability of an event `i` is the posterior expectation of
-        the Dirichlet-multinomial model:
-
-        .. math::
-
-            P(\text{event}_i) = \frac{c_i + k}{N + k \cdot V}
-
-        Where:
-        - :math:`c_i` is the observed count of event :math:`i`.
-        - :math:`N` is the total number of observations in the given context
-          (e.g., the total count of a specific source character).
-        - :math:`V` is the vocabulary size (the number of possible unique events).
-        - :math:`k` is the smoothing parameter, representing the strength of the prior.
+        This parameter allows for a continuous transition between two modes:
+
+        - **k > 0 (recommended):** This enables additive smoothing, with `k = 1.0`
+          being Laplace smoothing. It regularizes the model by assuming no event is impossible.
+          The final costs are a measure of "relative surprisal", normalized by the vocabulary size.
+
+        - **k = 0:** This corresponds to a normalized Maximum Likelihood Estimation.
+          Probabilities are derived from the raw observed frequencies. The final costs are
+          normalized using the same logic as the `k > 0` case, making `k = 0` the continuous
+          limit of the smoothed model. In this mode, costs can only be calculated for events
+          observed in the training data. Unseen events will receive the default cost,
+          regardless of the value of `calculate_for_unseen` in :meth:`fit`.
         """
-        if k <= 0:
-            raise ValueError("Smoothing parameter k must be positive.")
+        if k < 0:
+            raise ValueError("Smoothing parameter k must be non-negative.")
         self._smoothing_k = k
         return self
 
@@ -138,6 +105,7 @@ class CostLearner:
         if op.source_token is not None:
             counts.vocab.add(op.source_token)
         if op.target_token is not None:
+            counts.target_chars[op.target_token] += 1
             counts.vocab.add(op.target_token)
 
         if op.op_type == "substitute":
@@ -160,85 +128,112 @@ class CostLearner:
             counts.source_chars[op.source_token] += 1
         return counts
 
-    def _calculate_single_scaled_cost(
-        self,
-        observed_count: int,
-        context_total: int,
-        vocab_size: int,
-    ) -> Optional[float]:
-        """Calculates a single scaled cost for an edit operation."""
-        denominator = context_total + self._smoothing_k * vocab_size
-        if denominator <= 0:
-            return None
-
-        # Calculate the cost of an unseen event in this context, used for scaling
-        prob_unseen = self._smoothing_k / denominator
-        scaling_factor = self._cost_function(prob_unseen)
-        if scaling_factor <= 0:
-            return None
-
-        # Calculate the cost for the actually observed event
-        prob_observed = (observed_count + self._smoothing_k) / denominator
-        cost_observed = self._cost_function(prob_observed)
-
-        return cost_observed / scaling_factor
-
-    def _calculate_costs(self, counts: TallyCounts, vocab_size: int) -> _Costs:
+    def _calculate_costs(
+        self, counts: TallyCounts, vocab: set[str], calculate_for_unseen: bool = False
+    ) -> _Costs:
         """
-        Calculates …, with a
-        scaling factor to ensure the effective default cost is 1.0.
+        Calculates the costs for edit operations based on tallied counts.
         """
-
-        # Substitutions
         sub_costs: dict[tuple[str, str], float] = {}
-        for (source, target), count in counts.substitutions.items():
-            source_char_count = counts.source_chars[source]
-            cost = self._calculate_single_scaled_cost(count, source_char_count, vocab_size)
-            if cost is not None:
-                sub_costs[(source, target)] = cost
-
-        # Insertions
         ins_costs: dict[str, float] = {}
-        [… five removed lines are truncated in the source diff …]
+        del_costs: dict[str, float] = {}
+        k = self._smoothing_k
+
+        if k == 0:
+            calculate_for_unseen = False
+
+        # Error space size V for all conditional probabilities.
+        # The space of possible outcomes for a given source character (from OCR)
+        # includes all vocab characters (for matches/substitutions) plus the empty
+        # character (for deletions). This gives V = len(vocab) + 1.
+        # Symmetrically, the space of outcomes for a given target character (from GT)
+        # includes all vocab characters plus the empty character (for insertions/misses).
+        V = len(vocab) + 1
+
+        # Normalization ceiling Z' = -log(1/V).
+        normalization_ceiling = math.log(V) if V > 1 else 1.0
+
+        # Substitutions
+        sub_iterator = (
+            itertools.product(vocab, vocab) if calculate_for_unseen else counts.substitutions.keys()
+        )
+        for source, target in sub_iterator:
+            count = counts.substitutions[(source, target)]
+            total_count = counts.source_chars[source]
+            prob = (count + k) / (total_count + k * V)
+            base_cost = negative_log_likelihood(prob)
+            sub_costs[(source, target)] = base_cost / normalization_ceiling
 
         # Deletions
-        del_costs: dict[str, float] = {}
-        for source, count in counts.deletions.items():
-            [… four removed lines are truncated in the source diff …]
+        del_iterator = vocab if calculate_for_unseen else counts.deletions.keys()
+        for source in del_iterator:
+            count = counts.deletions[source]
+            total_count = counts.source_chars[source]
+            prob = (count + k) / (total_count + k * V)
+            base_cost = negative_log_likelihood(prob)
+            del_costs[source] = base_cost / normalization_ceiling
+
+        # Insertions
+        ins_iterator = vocab if calculate_for_unseen else counts.insertions.keys()
+        for target in ins_iterator:
+            count = counts.insertions[target]
+            total_target_count = counts.target_chars[target]
+            prob = (count + k) / (total_target_count + k * V)
+            base_cost = negative_log_likelihood(prob)
+            ins_costs[target] = base_cost / normalization_ceiling
 
         return _Costs(substitutions=sub_costs, insertions=ins_costs, deletions=del_costs)
 
-    def _calculate_operations(
-        self, pairs: Iterable[tuple[str, str]]
-    ) -> list["EditOperation"]:
+    def _calculate_operations(
+        self, pairs: Iterable[tuple[str, str]], aligner: "Aligner"
+    ) -> list["EditOperation"]:
+        """Calculate edit operations for all string pairs using the provided aligner."""
 
-        unweighted_lev = WeightedLevenshtein.unweighted()
         all_ops = [
             op
             for ocr_str, truth_str in pairs
-            for op in unweighted_lev.explain(ocr_str, truth_str, filter_matches=False)
+            for op in aligner.explain(ocr_str, truth_str, filter_matches=False)
         ]
         return all_ops
 
-    def fit(self, pairs: Iterable[tuple[str, str]]) -> "WeightedLevenshtein":
+    def fit(
+        self,
+        pairs: Iterable[tuple[str, str]],
+        *,
+        initial_model: "Aligner | None" = None,
+        calculate_for_unseen: bool = False,
+    ) -> "WeightedLevenshtein":
         """
         Fits the costs of a WeightedLevenshtein instance to the provided data.
 
-        Note that learning multi-character tokens is …
+        Note that learning multi-character tokens is only supported if an initial alignment model
+        is provided that can handle those multi-character tokens.
+
+        This method analyzes pairs of strings to learn the costs of edit operations
+        based on their observed frequencies. The underlying model calculates costs
+        based on the principle of relative information cost.
+
+        For a detailed explanation of the methodology, please see the
+        :doc:`Cost Learning Model <cost_learning_model>` documentation page.
 
         :param pairs: An iterable of (ocr_string, ground_truth_string) tuples.
+        :param initial_model: Optional initial model used to align OCR outputs and ground truth
+            strings. By default, an unweighted Levenshtein distance is used.
+        :param calculate_for_unseen: If True (and k > 0), pre-calculates costs for all
+            possible edit operations based on the vocabulary.
+            If False (default), only calculates costs for operations
+            observed in the data.
         :return: A `WeightedLevenshtein` instance with the learned costs.
         """
         from .levenshtein import WeightedLevenshtein
 
-        all_ops = self._calculate_operations(pairs)
+        if not pairs:
+            return WeightedLevenshtein.unweighted()
+
+        if initial_model is None:
+            initial_model = WeightedLevenshtein.unweighted()
+
+        all_ops = self._calculate_operations(pairs, aligner=initial_model)
         self.counts = self._tally_operations(all_ops)
         vocab = self.counts.vocab
         self.vocab_size = len(vocab)
@@ -246,12 +241,13 @@ class CostLearner:
         if not self.vocab_size:
             return WeightedLevenshtein.unweighted()
 
-        costs = self._calculate_costs(self.counts, self.vocab_size)
+        costs = self._calculate_costs(self.counts, vocab, calculate_for_unseen=calculate_for_unseen)
 
         return WeightedLevenshtein(
             substitution_costs=costs.substitutions,
             insertion_costs=costs.insertions,
             deletion_costs=costs.deletions,
+            symmetric_substitution=False,
             default_substitution_cost=1.0,
             default_insertion_cost=1.0,
             default_deletion_cost=1.0,
ocr_stringdist/levenshtein.py CHANGED

@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from collections.abc import Iterable
-from typing import Optional
+from typing import Any, Optional
 
 from ._rust_stringdist import (
     _batch_weighted_levenshtein_distance,
@@ -24,10 +24,13 @@ class WeightedLevenshtein:
         Defaults to costs based on common OCR errors.
     :param insertion_costs: Maps a string to its insertion cost.
     :param deletion_costs: Maps a string to its deletion cost.
-    :param symmetric_substitution: If True, …
+    :param symmetric_substitution: If True, a cost defined for, e.g., ('0', 'O') will automatically
+        apply to ('O', '0'). If False, both must be defined explicitly.
     :param default_substitution_cost: Default cost for single-char substitutions not in the map.
     :param default_insertion_cost: Default cost for single-char insertions not in the map.
     :param default_deletion_cost: Default cost for single-char deletions not in the map.
+
+    :raises TypeError, ValueError: If the provided arguments are invalid.
     """
 
     substitution_costs: dict[tuple[str, str], float]
@@ -49,9 +52,37 @@ class WeightedLevenshtein:
         default_insertion_cost: float = 1.0,
         default_deletion_cost: float = 1.0,
     ) -> None:
-        self.substitution_costs = (
-            ocr_distance_map if substitution_costs is None else substitution_costs
-        )
+        # Validate default costs
+        for cost_name, cost_val in [
+            ("default_substitution_cost", default_substitution_cost),
+            ("default_insertion_cost", default_insertion_cost),
+            ("default_deletion_cost", default_deletion_cost),
+        ]:
+            if not isinstance(cost_val, (int, float)):
+                raise TypeError(f"{cost_name} must be a number, but got: {type(cost_val).__name__}")
+            if cost_val < 0:
+                raise ValueError(f"{cost_name} must be non-negative, got value: {cost_val}")
+
+        # Validate substitution_costs dictionary
+        sub_costs = ocr_distance_map if substitution_costs is None else substitution_costs
+        for key, cost in sub_costs.items():
+            if not (
+                isinstance(key, tuple)
+                and len(key) == 2
+                and isinstance(key[0], str)
+                and isinstance(key[1], str)
+            ):
+                raise TypeError(
+                    f"substitution_costs keys must be tuples of two strings, but found: {key}"
+                )
+            if not isinstance(cost, (int, float)):
+                raise TypeError(
+                    f"Cost for substitution {key} must be a number, but got: {type(cost).__name__}"
+                )
+            if cost < 0:
+                raise ValueError(f"Cost for substitution {key} cannot be negative, but got: {cost}")
+
+        self.substitution_costs = sub_costs
         self.insertion_costs = {} if insertion_costs is None else insertion_costs
         self.deletion_costs = {} if deletion_costs is None else deletion_costs
         self.symmetric_substitution = symmetric_substitution
@@ -92,7 +123,8 @@ class WeightedLevenshtein:
         """
         Creates an instance by learning costs from a dataset of (OCR, ground truth) string pairs.
 
-        For more advanced learning configuration, see the …
+        For more advanced learning configuration, see the
+        :class:`ocr_stringdist.learner.CostLearner` class.
 
         :param pairs: An iterable of (ocr_string, ground_truth_string) tuples. Correct pairs
             are not intended to be filtered; they are needed to learn well-aligned costs.
@@ -111,9 +143,9 @@ class WeightedLevenshtein:
         print(wl.substitution_costs)  # learned cost for substituting '8' with 'B'
         print(wl.deletion_costs)  # learned cost for deleting '.'
         """
-        from .learner import Learner
+        from .learner import CostLearner
 
-        return Learner().fit(pairs)
+        return CostLearner().fit(pairs)
 
     def __eq__(self, other: object) -> bool:
         if not isinstance(other, WeightedLevenshtein):
@@ -128,154 +160,57 @@ class WeightedLevenshtein:
             and self.default_deletion_cost == other.default_deletion_cost
         )
 
-[… roughly fifty removed lines are truncated in the source diff; judging by the old __init__.py they presumably contained the module-level weighted_levenshtein_distance function …]
-
-def batch_weighted_levenshtein_distance(
-    s: str,
-    candidates: list[str],
-    /,
-    substitution_costs: Optional[dict[tuple[str, str], float]] = None,
-    insertion_costs: Optional[dict[str, float]] = None,
-    deletion_costs: Optional[dict[str, float]] = None,
-    *,
-    symmetric_substitution: bool = True,
-    default_substitution_cost: float = 1.0,
-    default_insertion_cost: float = 1.0,
-    default_deletion_cost: float = 1.0,
-) -> list[float]:
-    """
-    Calculate weighted Levenshtein distances between a string and multiple candidates.
-
-    See also :meth:`WeightedLevenshtein.batch_distance`.
-
-    This is more efficient than calling :func:`weighted_levenshtein_distance` multiple times.
-
-    :param s: The string to compare (interpreted as the string read via OCR)
-    :param candidates: List of candidate strings to compare against
-    :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
-        substitution costs. Only one direction needs to be configured unless
-        `symmetric_substitution` is False.
-        Note that the runtime scales in the length of the longest substitution token.
-        Defaults to `ocr_stringdist.ocr_distance_map`.
-    :param insertion_costs: Dictionary mapping strings to their insertion costs.
-    :param deletion_costs: Dictionary mapping strings to their deletion costs.
-    :param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
-        symmetric? Defaults to True.
-    :param default_substitution_cost: The default substitution cost for character pairs not found
-        in `substitution_costs`.
-    :param default_insertion_cost: The default insertion cost for characters not found in
-        `insertion_costs`.
-    :param default_deletion_cost: The default deletion cost for characters not found in
-        `deletion_costs`.
-    :return: A list of distances corresponding to each candidate
-    """
-    return WeightedLevenshtein(
-        substitution_costs=substitution_costs,
-        insertion_costs=insertion_costs,
-        deletion_costs=deletion_costs,
-        symmetric_substitution=symmetric_substitution,
-        default_substitution_cost=default_substitution_cost,
-        default_insertion_cost=default_insertion_cost,
-        default_deletion_cost=default_deletion_cost,
-    ).batch_distance(s, candidates)
-
-
-def explain_weighted_levenshtein(
-    s1: str,
-    s2: str,
-    /,
-    substitution_costs: Optional[dict[tuple[str, str], float]] = None,
-    insertion_costs: Optional[dict[str, float]] = None,
-    deletion_costs: Optional[dict[str, float]] = None,
-    *,
-    symmetric_substitution: bool = True,
-    default_substitution_cost: float = 1.0,
-    default_insertion_cost: float = 1.0,
-    default_deletion_cost: float = 1.0,
-    filter_matches: bool = True,
-) -> list[EditOperation]:
-    """
-    Computes the path of operations associated with the custom Levenshtein distance.
-
-    See also :meth:`WeightedLevenshtein.explain`.
-
-    The default `substitution_costs` considers common OCR errors, see
-    :py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
-
-    :param s1: First string (interpreted as the string read via OCR)
-    :param s2: Second string
-    :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
-        substitution costs. Only one direction needs to be configured unless
-        `symmetric_substitution` is False.
-        Note that the runtime scales in the length of the longest substitution token.
-        Defaults to `ocr_stringdist.ocr_distance_map`.
-    :param insertion_costs: Dictionary mapping strings to their insertion costs.
-    :param deletion_costs: Dictionary mapping strings to their deletion costs.
-    :param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
-        symmetric? Defaults to True.
-    :param default_substitution_cost: The default substitution cost for character pairs not found
-        in `substitution_costs`.
-    :param default_insertion_cost: The default insertion cost for characters not found in
-        `insertion_costs`.
-    :param default_deletion_cost: The default deletion cost for characters not found in
-        `deletion_costs`.
-    :return: List of :class:`EditOperation` instances.
-    """
-    return WeightedLevenshtein(
-        substitution_costs=substitution_costs,
-        insertion_costs=insertion_costs,
-        deletion_costs=deletion_costs,
-        symmetric_substitution=symmetric_substitution,
-        default_substitution_cost=default_substitution_cost,
-        default_insertion_cost=default_insertion_cost,
-        default_deletion_cost=default_deletion_cost,
-    ).explain(s1, s2, filter_matches=filter_matches)
+    def to_dict(self) -> dict[str, Any]:
+        """
+        Serializes the instance's configuration to a dictionary.
+
+        The result can be written to, say, JSON.
+
+        For the counterpart, see :meth:`WeightedLevenshtein.from_dict`.
+        """
+        # Convert tuple keys to a list of lists/objects for broader compatibility (e.g., JSON)
+        sub_costs_serializable = [
+            {"from": k[0], "to": k[1], "cost": v} for k, v in self.substitution_costs.items()
+        ]
+
+        return {
+            "substitution_costs": sub_costs_serializable,
+            "insertion_costs": self.insertion_costs,
+            "deletion_costs": self.deletion_costs,
+            "symmetric_substitution": self.symmetric_substitution,
+            "default_substitution_cost": self.default_substitution_cost,
+            "default_insertion_cost": self.default_insertion_cost,
+            "default_deletion_cost": self.default_deletion_cost,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> WeightedLevenshtein:
+        """
+        Deserialize from a dictionary.
+
+        For the counterpart, see :meth:`WeightedLevenshtein.to_dict`.
+
+        :param data: A dictionary with (not necessarily all of) the following keys:
+
+            - "substitution_costs": a list of {"from": str, "to": str, "cost": float}
+            - "insertion_costs": dict[str, float]
+            - "deletion_costs": dict[str, float]
+            - "symmetric_substitution": bool
+            - "default_substitution_cost": float
+            - "default_insertion_cost": float
+            - "default_deletion_cost": float
+        """
+        # Convert the list of substitution costs back to the required dict format
+        sub_costs: dict[tuple[str, str], float] = {
+            (item["from"], item["to"]): item["cost"] for item in data.get("substitution_costs", {})
+        }
+
+        return cls(
+            substitution_costs=sub_costs,
+            insertion_costs=data.get("insertion_costs"),
+            deletion_costs=data.get("deletion_costs"),
+            symmetric_substitution=data.get("symmetric_substitution", True),
+            default_substitution_cost=data.get("default_substitution_cost", 1.0),
+            default_insertion_cost=data.get("default_insertion_cost", 1.0),
+            default_deletion_cost=data.get("default_deletion_cost", 1.0),
+        )
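Because `to_dict` turns the tuple-keyed substitution map into a list of `{"from", "to", "cost"}` entries, the configuration survives a JSON round trip. A quick sketch:

```python
import json

from ocr_stringdist import WeightedLevenshtein

wl = WeightedLevenshtein(substitution_costs={("0", "O"): 0.1})

payload = json.dumps(wl.to_dict())                     # serialize to a JSON string
restored = WeightedLevenshtein.from_dict(json.loads(payload))

assert restored == wl  # __eq__ compares all cost maps and defaults
```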
ocr_stringdist/protocols.py ADDED

@@ -0,0 +1,9 @@
+from typing import TYPE_CHECKING, Protocol, runtime_checkable
+
+if TYPE_CHECKING:
+    from .edit_operation import EditOperation
+
+
+@runtime_checkable
+class Aligner(Protocol):
+    def explain(self, s1: str, s2: str, filter_matches: bool) -> list["EditOperation"]: ...
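Since `Aligner` is a `runtime_checkable` structural protocol, any object with a compatible `explain` method can be passed to `CostLearner.fit` as `initial_model`. A sketch with a deliberately naive aligner; the position-by-position strategy and the `EditOperation` constructor field names are illustrative assumptions:

```python
from ocr_stringdist import CostLearner
from ocr_stringdist.edit_operation import EditOperation
from ocr_stringdist.protocols import Aligner


class NaiveAligner:
    """Aligns strings position by position; a real model would handle length differences."""

    def explain(self, s1: str, s2: str, filter_matches: bool) -> list[EditOperation]:
        ops = []
        for a, b in zip(s1, s2):
            op_type = "match" if a == b else "substitute"
            if filter_matches and op_type == "match":
                continue
            # The cost recorded here is a placeholder; the learner tallies events, not costs.
            ops.append(EditOperation(op_type=op_type, source_token=a, target_token=b, cost=0.0))
        return ops


aligner = NaiveAligner()
assert isinstance(aligner, Aligner)  # structural check enabled by @runtime_checkable

wl = CostLearner().fit([("Hell0", "Hello")], initial_model=aligner)
```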
{ocr_stringdist-0.3.0.dist-info → ocr_stringdist-1.0.1.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ocr-stringdist
-Version: 0.3.0
+Version: 1.0.1
 Classifier: Programming Language :: Rust
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: License :: OSI Approved :: MIT License
@@ -13,7 +13,7 @@ Project-URL: documentation, https://niklasvonm.github.io/ocr-stringdist/
 
 # OCR-StringDist
 
-A Python library …
+A Python library to learn, model, explain and correct OCR errors using a fast string distance engine.
 
 Documentation: https://niklasvonm.github.io/ocr-stringdist/
 
@@ -24,7 +24,7 @@ Documentation: https://niklasvonm.github.io/ocr-stringdist/
 
 Standard string distances (like Levenshtein) treat all character substitutions equally. This is suboptimal for text read from images via OCR, where errors like `O` vs `0` are far more common than, say, `O` vs `X`.
 
-OCR-StringDist …
+OCR-StringDist provides a learnable **weighted Levenshtein distance**, implementing part of the **Noisy Channel model**.
 
 **Example:** Matching against the correct word `CODE`:
 
@@ -33,12 +33,12 @@ OCR-StringDist uses a **weighted Levenshtein distance**, assigning lower costs t…
 * $d(\text{"CODE"}, \text{"CXDE"}) = 1$ (O → X)
 * Result: Both appear equally likely/distant.
 
-* **OCR-StringDist (…):**
+* **OCR-StringDist (Channel Model):**
 * $d(\text{"CODE"}, \text{"C0DE"}) \approx 0.1$ (common error, low cost)
 * $d(\text{"CODE"}, \text{"CXDE"}) = 1.0$ (unlikely error, high cost)
 * Result: Correctly identifies `C0DE` as a much closer match.
 
-This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes, …
+This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes). By combining this *channel model* with a *source model* (e.g., product code frequencies), you can build a complete and robust OCR correction system.
 
 ## Installation
 
@@ -48,63 +48,47 @@ pip install ocr-stringdist
 
 ## Features
 
-- **…
-- **Weighted Levenshtein Distance**: …
-- **…
+- **Learnable Costs**: Automatically learn substitution, insertion, and deletion costs from a dataset of (OCR string, ground truth string) pairs.
+- **Weighted Levenshtein Distance**: Models OCR error patterns by assigning custom costs to specific edit operations.
+- **High Performance**: Core logic in Rust and a batch_distance function for efficiently comparing one string against thousands of candidates.
 - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
+- **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
 - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
-- **…
-- **Unicode Support**: Works with arbitrary Unicode strings.
-- **Best Match Finder**: Includes a utility function `find_best_candidate` to efficiently find the best match from a list based on _any_ distance function.
+- **Full Unicode Support**: Works with arbitrary Unicode strings.
 
-## …
+## Core Workflow
 
-…
+The typical workflow involves
+- learning costs from your data and then
+- using the resulting model to find the best match from a list of candidates.
 
 ```python
 from ocr_stringdist import WeightedLevenshtein
 
-# …
-
-
-print(wl.…)
-```
-
-### Explain the Edit Path
-
-[… five removed lines are truncated in the source diff …]
-
-distances: list[float] = wl.batch_distance("CODE", ["CXDE", "C0DE"])
-# [1.0, 0.1]
+# 1. LEARN costs from your own data
+training_data = [
+    ("128", "123"),
+    ("567", "567"),
+]
+wl = WeightedLevenshtein.learn_from(training_data)
+
+# The engine has now learned that '8' -> '3' is a low-cost substitution
+print(f"Learned cost for ('8', '3'): {wl.substitution_costs[('8', '3')]:.2f}")
+
+
+# 2. MATCH new OCR output against a list of candidates
+ocr_output = "Product Code 128"
+candidates = [
+    "Product Code 123",
+    "Product Code 523",  # '5' -> '1' is an unlikely error
+]
+
+distances = wl.batch_distance(ocr_output, candidates)
+
+# Find the best match
+min_distance = min(distances)
+best_match = candidates[distances.index(min_distance)]
+
+print(f"Best match for '{ocr_output}': '{best_match}' (Cost: {min_distance:.2f})")
 ```
-
-### Multi-character Substitutions
-
-```python
-# Custom costs with multi-character substitution
-wl = WeightedLevenshtein(substitution_costs={("In", "h"): 0.5})
-
-print(wl.distance("hi", "Ini"))  # 0.5
-```
-
-### Learn Costs
-
-```python
-wl = WeightedLevenshtein.learn_from([("Hallo", "Hello")])
-print(wl.substitution_costs[("a", "e")])  # < 1
-```
-
-## Acknowledgements
-
-This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), providing the base implementations of the algorithms used here.
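The noisy-channel framing in the README can be made concrete: the weighted distance plays the role of the channel model, and candidate frequencies supply a source model. A sketch, not from the package, with invented priors:

```python
import math

from ocr_stringdist import WeightedLevenshtein

wl = WeightedLevenshtein()  # channel model: built-in OCR confusion costs

# Hypothetical source model: prior frequencies of the known values.
priors = {"CODE": 0.90, "CADE": 0.09, "CUBE": 0.01}

ocr_output = "C0DE"

# Noisy-channel score: channel cost plus source surprisal (-log prior); lower is better.
scores = {
    candidate: wl.distance(ocr_output, candidate) - math.log(prior)
    for candidate, prior in priors.items()
}
best = min(scores, key=scores.get)
print(best, scores)
```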
ocr_stringdist-1.0.1.dist-info/RECORD ADDED

@@ -0,0 +1,14 @@
+ocr_stringdist-1.0.1.dist-info/METADATA,sha256=86GHCkU8E3oyRoN14UPF_PJM55eD7_1NIVppBRr62Ro,3963
+ocr_stringdist-1.0.1.dist-info/WHEEL,sha256=nTH9UaXhMe2Z7vYjzTFaj4VIXUvQaRiC6yVlzzX1nis,105
+ocr_stringdist-1.0.1.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
+ocr_stringdist.libs/libgcc_s-27e5a392.so.1,sha256=x5sO63liVwXxrjGGP371wB0RyQe1KEnIynYm82T0G0M,449745
+ocr_stringdist/__init__.py,sha256=tU-70gwq3CAJ9nZdDSM-C59ShuDE4ClNYEPAkbntYeQ,347
+ocr_stringdist/_rust_stringdist.cpython-311-i386-linux-musl.so,sha256=CB04CRFCUo9wspvcM7jq1vosSy62R7wR_f8FmZUmIIk,780697
+ocr_stringdist/default_ocr_distances.py,sha256=oSu-TpHjPA4jxKpLAfmap8z0ZsC99jsOjnRVHW7Hj_Y,1033
+ocr_stringdist/edit_operation.py,sha256=JjCZulSwZpXLnLL3xcuz2j8EUzNQM9_hNxxVdCHMkvQ,478
+ocr_stringdist/learner.py,sha256=3qWvqHrAWm4seuwmBmFN4InRL20u8HnPATHjCTnU3I0,10491
+ocr_stringdist/levenshtein.py,sha256=m4WAh5iaQefDIQXzqcgC4XcCXntb7zNKhyWAx8A1yOs,9852
+ocr_stringdist/matching.py,sha256=28Xt-x_V_iVsohD3F64MfZ0mys4_qOZXTIAcmSOE0dA,3270
+ocr_stringdist/protocols.py,sha256=IyvGzzktPgmPRZyDRE0UKCYo4C0tdewU8IgwFbxZLls,265
+ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ocr_stringdist-1.0.1.dist-info/RECORD,,

ocr_stringdist-0.3.0.dist-info/RECORD DELETED

@@ -1,13 +0,0 @@
-ocr_stringdist-0.3.0.dist-info/METADATA,sha256=YZzEt3aeo26-vS_1HCdXtbV1w7YC9Rnfw2dU8uUfy-s,4188
-ocr_stringdist-0.3.0.dist-info/WHEEL,sha256=nTH9UaXhMe2Z7vYjzTFaj4VIXUvQaRiC6yVlzzX1nis,105
-ocr_stringdist-0.3.0.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
-ocr_stringdist.libs/libgcc_s-27e5a392.so.1,sha256=x5sO63liVwXxrjGGP371wB0RyQe1KEnIynYm82T0G0M,449745
-ocr_stringdist/__init__.py,sha256=ApxqraLRcWAkzXhGJXSf3EqGEVFbxghrYrfJ9dmQjQU,467
-ocr_stringdist/_rust_stringdist.cpython-311-i386-linux-musl.so,sha256=ry0S-F4wBOS6olO7CbZMffBIYSmSF4RnxFBbdUb47U4,780697
-ocr_stringdist/default_ocr_distances.py,sha256=oSu-TpHjPA4jxKpLAfmap8z0ZsC99jsOjnRVHW7Hj_Y,1033
-ocr_stringdist/edit_operation.py,sha256=EgEc-2_nOwLUZDOWtogYqKLXIQJxOd9sIAbcGkn-TMY,395
-ocr_stringdist/learner.py,sha256=9gZMqnF5Fm3gSKOnAfSI40JhGtMKGg0gZNil1-LVP9Q,10191
-ocr_stringdist/levenshtein.py,sha256=5A1k8Ana10tCbRbQXYIbMHIjeU9mhUK_DSFZZ2Znx2o,13360
-ocr_stringdist/matching.py,sha256=28Xt-x_V_iVsohD3F64MfZ0mys4_qOZXTIAcmSOE0dA,3270
-ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ocr_stringdist-0.3.0.dist-info/RECORD,,
{ocr_stringdist-0.3.0.dist-info → ocr_stringdist-1.0.1.dist-info}/WHEEL: file without changes
{ocr_stringdist-0.3.0.dist-info → ocr_stringdist-1.0.1.dist-info}/licenses/LICENSE: file without changes