ocr-stringdist 0.3.0__cp310-cp310-win_amd64.whl → 1.0.0__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocr_stringdist/__init__.py +3 -9
- ocr_stringdist/_rust_stringdist.cp310-win_amd64.pyd +0 -0
- ocr_stringdist/learner.py +111 -115
- ocr_stringdist/levenshtein.py +92 -158
- ocr_stringdist/protocols.py +9 -0
- ocr_stringdist-1.0.0.dist-info/METADATA +94 -0
- ocr_stringdist-1.0.0.dist-info/RECORD +13 -0
- ocr_stringdist-0.3.0.dist-info/METADATA +0 -110
- ocr_stringdist-0.3.0.dist-info/RECORD +0 -12
- {ocr_stringdist-0.3.0.dist-info → ocr_stringdist-1.0.0.dist-info}/WHEEL +0 -0
- {ocr_stringdist-0.3.0.dist-info → ocr_stringdist-1.0.0.dist-info}/licenses/LICENSE +0 -0
    
        ocr_stringdist/__init__.py
    CHANGED
    
    | @@ -1,17 +1,11 @@ | |
| 1 1 | 
             
            from .default_ocr_distances import ocr_distance_map
         | 
| 2 | 
            -
            from . | 
| 3 | 
            -
             | 
| 4 | 
            -
                batch_weighted_levenshtein_distance,
         | 
| 5 | 
            -
                explain_weighted_levenshtein,
         | 
| 6 | 
            -
                weighted_levenshtein_distance,
         | 
| 7 | 
            -
            )
         | 
| 2 | 
            +
            from .learner import CostLearner
         | 
| 3 | 
            +
            from .levenshtein import WeightedLevenshtein
         | 
| 8 4 | 
             
            from .matching import find_best_candidate
         | 
| 9 5 |  | 
| 10 6 | 
             
            __all__ = [
         | 
| 11 7 | 
             
                "ocr_distance_map",
         | 
| 8 | 
            +
                "CostLearner",
         | 
| 12 9 | 
             
                "WeightedLevenshtein",
         | 
| 13 | 
            -
                "weighted_levenshtein_distance",
         | 
| 14 | 
            -
                "batch_weighted_levenshtein_distance",
         | 
| 15 | 
            -
                "explain_weighted_levenshtein",
         | 
| 16 10 | 
             
                "find_best_candidate",
         | 
| 17 11 | 
             
            ]
         | 
| Binary file | 
    
        ocr_stringdist/learner.py
    CHANGED
    
    | @@ -1,3 +1,4 @@ | |
| 1 | 
            +
            import itertools
         | 
| 1 2 | 
             
            import math
         | 
| 2 3 | 
             
            from collections import defaultdict
         | 
| 3 4 | 
             
            from collections.abc import Iterable
         | 
| @@ -7,12 +8,12 @@ from typing import TYPE_CHECKING, Callable, Optional | |
| 7 8 | 
             
            if TYPE_CHECKING:
         | 
| 8 9 | 
             
                from .edit_operation import EditOperation
         | 
| 9 10 | 
             
                from .levenshtein import WeightedLevenshtein
         | 
| 11 | 
            +
                from .protocols import Aligner
         | 
| 10 12 |  | 
| 11 13 | 
             
            CostFunction = Callable[[float], float]
         | 
| 12 14 |  | 
| 13 15 |  | 
| 14 16 | 
             
            def negative_log_likelihood(probability: float) -> float:
         | 
| 15 | 
            -
                """Standard cost function based on information theory. Common errors get low cost."""
         | 
| 16 17 | 
             
                if probability <= 0.0:
         | 
| 17 18 | 
             
                    raise ValueError("Probability must be positive to compute negative log likelihood.")
         | 
| 18 19 | 
             
                return -math.log(probability)
         | 
| @@ -26,6 +27,7 @@ class TallyCounts: | |
| 26 27 | 
             
                insertions: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
         | 
| 27 28 | 
             
                deletions: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
         | 
| 28 29 | 
             
                source_chars: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
         | 
| 30 | 
            +
                target_chars: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
         | 
| 29 31 | 
             
                vocab: set[str] = field(default_factory=set)
         | 
| 30 32 |  | 
| 31 33 |  | 
| @@ -36,7 +38,7 @@ class _Costs: | |
| 36 38 | 
             
                deletions: dict[str, float]
         | 
| 37 39 |  | 
| 38 40 |  | 
| 39 | 
            -
            class  | 
| 41 | 
            +
            class CostLearner:
         | 
| 40 42 | 
             
                """
         | 
| 41 43 | 
             
                Configures and executes the process of learning Levenshtein costs from data.
         | 
| 42 44 |  | 
| @@ -45,17 +47,16 @@ class Learner: | |
| 45 47 |  | 
| 46 48 | 
             
                Example::
         | 
| 47 49 |  | 
| 48 | 
            -
                    from ocr_stringdist | 
| 50 | 
            +
                    from ocr_stringdist import CostLearner
         | 
| 49 51 |  | 
| 50 52 | 
             
                    data = [
         | 
| 51 53 | 
             
                        ("Hell0", "Hello"),
         | 
| 52 54 | 
             
                    ]
         | 
| 53 | 
            -
                    learner =  | 
| 55 | 
            +
                    learner = CostLearner().with_smoothing(1.0)
         | 
| 54 56 | 
             
                    wl = learner.fit(data) # Substitution 0 -> o learned with cost < 1.0
         | 
| 55 57 | 
             
                """
         | 
| 56 58 |  | 
| 57 59 | 
             
                # Configuration parameters
         | 
| 58 | 
            -
                _cost_function: CostFunction
         | 
| 59 60 | 
             
                _smoothing_k: float
         | 
| 60 61 |  | 
| 61 62 | 
             
                # These attributes are set during fitting
         | 
| @@ -63,71 +64,37 @@ class Learner: | |
| 63 64 | 
             
                vocab_size: Optional[int] = None
         | 
| 64 65 |  | 
| 65 66 | 
             
                def __init__(self) -> None:
         | 
| 66 | 
            -
                    self._cost_function = negative_log_likelihood
         | 
| 67 67 | 
             
                    self._smoothing_k = 1.0
         | 
| 68 68 |  | 
| 69 | 
            -
                def  | 
| 70 | 
            -
                    """
         | 
| 71 | 
            -
                    Sets a custom function to convert probabilities to costs.
         | 
| 72 | 
            -
             | 
| 73 | 
            -
                    :param cost_function: A callable that takes a float (probability)
         | 
| 74 | 
            -
                                          and returns a float (cost).
         | 
| 75 | 
            -
                                          Is negative log likelihood unless overridden.
         | 
| 76 | 
            -
                    :return: The Learner instance for method chaining.
         | 
| 77 | 
            -
                    """
         | 
| 78 | 
            -
                    self._cost_function = cost_function
         | 
| 79 | 
            -
                    return self
         | 
| 80 | 
            -
             | 
| 81 | 
            -
                def with_smoothing(self, k: float) -> "Learner":
         | 
| 69 | 
            +
                def with_smoothing(self, k: float) -> "CostLearner":
         | 
| 82 70 | 
             
                    r"""
         | 
| 83 71 | 
             
                    Sets the smoothing parameter `k`.
         | 
| 84 72 |  | 
| 85 | 
            -
                    This parameter controls how strongly the model defaults to uniform | 
| 73 | 
            +
                    This parameter controls how strongly the model defaults to a uniform
         | 
| 74 | 
            +
                    probability distribution by adding a "pseudo-count" of `k` to every
         | 
| 75 | 
            +
                    possible event.
         | 
| 86 76 |  | 
| 87 | 
            -
                    :param k: The smoothing factor, which must be a  | 
| 88 | 
            -
                    :return: The  | 
| 77 | 
            +
                    :param k: The smoothing factor, which must be a non-negative number.
         | 
| 78 | 
            +
                    :return: The CostLearner instance for method chaining.
         | 
| 79 | 
            +
                    :raises ValueError: If k < 0.
         | 
| 89 80 |  | 
| 90 81 | 
             
                    Notes
         | 
| 91 82 | 
             
                    -----
         | 
| 92 | 
            -
                     | 
| 93 | 
            -
             | 
| 94 | 
            -
                     | 
| 95 | 
            -
             | 
| 96 | 
            -
             | 
| 97 | 
            -
             | 
| 98 | 
            -
             | 
| 99 | 
            -
             | 
| 100 | 
            -
                       | 
| 101 | 
            -
                       | 
| 102 | 
            -
             | 
| 103 | 
            -
                       | 
| 104 | 
            -
                    - **k > 1.0:** A larger `k` is useful for small or noisy datasets. It
         | 
| 105 | 
            -
                      regularizes the model by pulling the learned probabilities closer
         | 
| 106 | 
            -
                      to a uniform distribution.
         | 
| 107 | 
            -
             | 
| 108 | 
            -
                    **Bayesian Interpretation**
         | 
| 109 | 
            -
             | 
| 110 | 
            -
                    From a Bayesian perspective, `k` serves as the concentration parameter,
         | 
| 111 | 
            -
                    :math:`\alpha`, of a **symmetric Dirichlet prior distribution**.
         | 
| 112 | 
            -
                    This distribution acts as the conjugate prior for the
         | 
| 113 | 
            -
                    multinomial distribution of the observed error counts.
         | 
| 114 | 
            -
             | 
| 115 | 
            -
                    The smoothed probability of an event `i` is the posterior expectation of
         | 
| 116 | 
            -
                    the Dirichlet-multinomial model:
         | 
| 117 | 
            -
             | 
| 118 | 
            -
                    .. math::
         | 
| 119 | 
            -
             | 
| 120 | 
            -
                        P(\text{event}_i) = \frac{c_i + k}{N + k \cdot V}
         | 
| 121 | 
            -
             | 
| 122 | 
            -
                    Where:
         | 
| 123 | 
            -
                        - :math:`c_i` is the observed count of event :math:`i`.
         | 
| 124 | 
            -
                        - :math:`N` is the total number of observations in the given context
         | 
| 125 | 
            -
                          (e.g., the total count of a specific source character).
         | 
| 126 | 
            -
                        - :math:`V` is the vocabulary size (the number of possible unique events).
         | 
| 127 | 
            -
                        - :math:`k` is the smoothing parameter, representing the strength of the prior.
         | 
| 83 | 
            +
                    This parameter allows for a continuous transition between two modes:
         | 
| 84 | 
            +
             | 
| 85 | 
            +
                    - **k > 0 (recommended):** This enables additive smoothing, with `k = 1.0`
         | 
| 86 | 
            +
                      being Laplace smoothing. It regularizes the model by assuming no event is impossible.
         | 
| 87 | 
            +
                      The final costs are a measure of "relative surprisal," normalized by the vocabulary size.
         | 
| 88 | 
            +
             | 
| 89 | 
            +
                    - **k = 0:** This corresponds to a normalized Maximum Likelihood Estimation.
         | 
| 90 | 
            +
                      Probabilities are derived from the raw observed frequencies. The final costs are
         | 
| 91 | 
            +
                      normalized using the same logic as the `k > 0` case, making `k=0` the continuous limit
         | 
| 92 | 
            +
                      of the smoothed model. In this mode, costs can only be calculated for events observed in
         | 
| 93 | 
            +
                      the training data. Unseen events will receive the default cost, regardless of
         | 
| 94 | 
            +
                      the value of `calculate_for_unseen` in :meth:`fit`.
         | 
| 128 95 | 
             
                    """
         | 
| 129 | 
            -
                    if k  | 
| 130 | 
            -
                        raise ValueError("Smoothing parameter k must be  | 
| 96 | 
            +
                    if k < 0:
         | 
| 97 | 
            +
                        raise ValueError("Smoothing parameter k must be non-negative.")
         | 
| 131 98 | 
             
                    self._smoothing_k = k
         | 
| 132 99 | 
             
                    return self
         | 
| 133 100 |  | 
| @@ -138,6 +105,7 @@ class Learner: | |
| 138 105 | 
             
                        if op.source_token is not None:
         | 
| 139 106 | 
             
                            counts.vocab.add(op.source_token)
         | 
| 140 107 | 
             
                        if op.target_token is not None:
         | 
| 108 | 
            +
                            counts.target_chars[op.target_token] += 1
         | 
| 141 109 | 
             
                            counts.vocab.add(op.target_token)
         | 
| 142 110 |  | 
| 143 111 | 
             
                        if op.op_type == "substitute":
         | 
| @@ -160,85 +128,112 @@ class Learner: | |
| 160 128 | 
             
                            counts.source_chars[op.source_token] += 1
         | 
| 161 129 | 
             
                    return counts
         | 
| 162 130 |  | 
| 163 | 
            -
                def  | 
| 164 | 
            -
                    self,
         | 
| 165 | 
            -
             | 
| 166 | 
            -
                    context_total: int,
         | 
| 167 | 
            -
                    vocab_size: int,
         | 
| 168 | 
            -
                ) -> Optional[float]:
         | 
| 169 | 
            -
                    """Calculates a single scaled cost for an edit operation."""
         | 
| 170 | 
            -
                    denominator = context_total + self._smoothing_k * vocab_size
         | 
| 171 | 
            -
                    if denominator <= 0:
         | 
| 172 | 
            -
                        return None
         | 
| 173 | 
            -
             | 
| 174 | 
            -
                    # Calculate the cost of an unseen event in this context, used for scaling
         | 
| 175 | 
            -
                    prob_unseen = self._smoothing_k / denominator
         | 
| 176 | 
            -
                    scaling_factor = self._cost_function(prob_unseen)
         | 
| 177 | 
            -
                    if scaling_factor <= 0:
         | 
| 178 | 
            -
                        return None
         | 
| 179 | 
            -
             | 
| 180 | 
            -
                    # Calculate the cost for the actually observed event
         | 
| 181 | 
            -
                    prob_observed = (observed_count + self._smoothing_k) / denominator
         | 
| 182 | 
            -
                    cost_observed = self._cost_function(prob_observed)
         | 
| 183 | 
            -
             | 
| 184 | 
            -
                    return cost_observed / scaling_factor
         | 
| 185 | 
            -
             | 
| 186 | 
            -
                def _calculate_costs(self, counts: TallyCounts, vocab_size: int) -> _Costs:
         | 
| 131 | 
            +
                def _calculate_costs(
         | 
| 132 | 
            +
                    self, counts: TallyCounts, vocab: set[str], calculate_for_unseen: bool = False
         | 
| 133 | 
            +
                ) -> _Costs:
         | 
| 187 134 | 
             
                    """
         | 
| 188 | 
            -
                    Calculates  | 
| 189 | 
            -
                    scaling factor to ensure the effective default cost is 1.0.
         | 
| 135 | 
            +
                    Calculates the costs for edit operations based on tallied counts.
         | 
| 190 136 | 
             
                    """
         | 
| 191 | 
            -
             | 
| 192 | 
            -
                    # Substitutions
         | 
| 193 137 | 
             
                    sub_costs: dict[tuple[str, str], float] = {}
         | 
| 194 | 
            -
                    for (source, target), count in counts.substitutions.items():
         | 
| 195 | 
            -
                        source_char_count = counts.source_chars[source]
         | 
| 196 | 
            -
                        cost = self._calculate_single_scaled_cost(count, source_char_count, vocab_size)
         | 
| 197 | 
            -
                        if cost is not None:
         | 
| 198 | 
            -
                            sub_costs[(source, target)] = cost
         | 
| 199 | 
            -
             | 
| 200 | 
            -
                    # Insertions
         | 
| 201 138 | 
             
                    ins_costs: dict[str, float] = {}
         | 
| 202 | 
            -
                     | 
| 203 | 
            -
                     | 
| 204 | 
            -
             | 
| 205 | 
            -
             | 
| 206 | 
            -
             | 
| 139 | 
            +
                    del_costs: dict[str, float] = {}
         | 
| 140 | 
            +
                    k = self._smoothing_k
         | 
| 141 | 
            +
             | 
| 142 | 
            +
                    if k == 0:
         | 
| 143 | 
            +
                        calculate_for_unseen = False
         | 
| 144 | 
            +
             | 
| 145 | 
            +
                    # Error space size V for all conditional probabilities.
         | 
| 146 | 
            +
                    # The space of possible outcomes for a given source character (from OCR)
         | 
| 147 | 
            +
                    # includes all vocab characters (for matches/substitutions) plus the empty
         | 
| 148 | 
            +
                    # character (for deletions). This gives V = len(vocab) + 1.
         | 
| 149 | 
            +
                    # Symmetrically, the space of outcomes for a given target character (from GT)
         | 
| 150 | 
            +
                    # includes all vocab characters plus the empty character (for insertions/misses).
         | 
| 151 | 
            +
                    V = len(vocab) + 1
         | 
| 152 | 
            +
             | 
| 153 | 
            +
                    # Normalization ceiling Z' = -log(1/V).
         | 
| 154 | 
            +
                    normalization_ceiling = math.log(V) if V > 1 else 1.0
         | 
| 155 | 
            +
             | 
| 156 | 
            +
                    # Substitutions
         | 
| 157 | 
            +
                    sub_iterator = (
         | 
| 158 | 
            +
                        itertools.product(vocab, vocab) if calculate_for_unseen else counts.substitutions.keys()
         | 
| 159 | 
            +
                    )
         | 
| 160 | 
            +
                    for source, target in sub_iterator:
         | 
| 161 | 
            +
                        count = counts.substitutions[(source, target)]
         | 
| 162 | 
            +
                        total_count = counts.source_chars[source]
         | 
| 163 | 
            +
                        prob = (count + k) / (total_count + k * V)
         | 
| 164 | 
            +
                        base_cost = negative_log_likelihood(prob)
         | 
| 165 | 
            +
                        sub_costs[(source, target)] = base_cost / normalization_ceiling
         | 
| 207 166 |  | 
| 208 167 | 
             
                    # Deletions
         | 
| 209 | 
            -
                     | 
| 210 | 
            -
                    for source | 
| 211 | 
            -
                         | 
| 212 | 
            -
                         | 
| 213 | 
            -
                         | 
| 214 | 
            -
             | 
| 168 | 
            +
                    del_iterator = vocab if calculate_for_unseen else counts.deletions.keys()
         | 
| 169 | 
            +
                    for source in del_iterator:
         | 
| 170 | 
            +
                        count = counts.deletions[source]
         | 
| 171 | 
            +
                        total_count = counts.source_chars[source]
         | 
| 172 | 
            +
                        prob = (count + k) / (total_count + k * V)
         | 
| 173 | 
            +
                        base_cost = negative_log_likelihood(prob)
         | 
| 174 | 
            +
                        del_costs[source] = base_cost / normalization_ceiling
         | 
| 175 | 
            +
             | 
| 176 | 
            +
                    # Insertions
         | 
| 177 | 
            +
                    ins_iterator = vocab if calculate_for_unseen else counts.insertions.keys()
         | 
| 178 | 
            +
                    for target in ins_iterator:
         | 
| 179 | 
            +
                        count = counts.insertions[target]
         | 
| 180 | 
            +
                        total_target_count = counts.target_chars[target]
         | 
| 181 | 
            +
                        prob = (count + k) / (total_target_count + k * V)
         | 
| 182 | 
            +
                        base_cost = negative_log_likelihood(prob)
         | 
| 183 | 
            +
                        ins_costs[target] = base_cost / normalization_ceiling
         | 
| 215 184 |  | 
| 216 185 | 
             
                    return _Costs(substitutions=sub_costs, insertions=ins_costs, deletions=del_costs)
         | 
| 217 186 |  | 
| 218 | 
            -
                def _calculate_operations( | 
| 219 | 
            -
                     | 
| 220 | 
            -
             | 
| 187 | 
            +
                def _calculate_operations(
         | 
| 188 | 
            +
                    self, pairs: Iterable[tuple[str, str]], aligner: "Aligner"
         | 
| 189 | 
            +
                ) -> list["EditOperation"]:
         | 
| 190 | 
            +
                    """Calculate edit operations for all string pairs using the provided aligner."""
         | 
| 221 191 |  | 
| 222 | 
            -
                    unweighted_lev = WeightedLevenshtein.unweighted()
         | 
| 223 192 | 
             
                    all_ops = [
         | 
| 224 193 | 
             
                        op
         | 
| 225 194 | 
             
                        for ocr_str, truth_str in pairs
         | 
| 226 | 
            -
                        for op in  | 
| 195 | 
            +
                        for op in aligner.explain(ocr_str, truth_str, filter_matches=False)
         | 
| 227 196 | 
             
                    ]
         | 
| 228 197 | 
             
                    return all_ops
         | 
| 229 198 |  | 
| 230 | 
            -
                def fit( | 
| 199 | 
            +
                def fit(
         | 
| 200 | 
            +
                    self,
         | 
| 201 | 
            +
                    pairs: Iterable[tuple[str, str]],
         | 
| 202 | 
            +
                    *,
         | 
| 203 | 
            +
                    initial_model: "Aligner | None" = None,
         | 
| 204 | 
            +
                    calculate_for_unseen: bool = False,
         | 
| 205 | 
            +
                ) -> "WeightedLevenshtein":
         | 
| 231 206 | 
             
                    """
         | 
| 232 207 | 
             
                    Fits the costs of a WeightedLevenshtein instance to the provided data.
         | 
| 233 208 |  | 
| 234 | 
            -
                    Note that learning multi-character tokens is  | 
| 209 | 
            +
                    Note that learning multi-character tokens is only supported if an initial alignment model
         | 
| 210 | 
            +
                    is provided that can handle those multi-character tokens.
         | 
| 211 | 
            +
             | 
| 212 | 
            +
                    This method analyzes pairs of strings to learn the costs of edit operations
         | 
| 213 | 
            +
                    based on their observed frequencies. The underlying model calculates costs
         | 
| 214 | 
            +
                    based on the principle of relative information cost.
         | 
| 215 | 
            +
             | 
| 216 | 
            +
                    For a detailed explanation of the methodology, please see the
         | 
| 217 | 
            +
                    :doc:`Cost Learning Model <cost_learning_model>` documentation page.
         | 
| 235 218 |  | 
| 236 219 | 
             
                    :param pairs: An iterable of (ocr_string, ground_truth_string) tuples.
         | 
| 220 | 
            +
                    :param initial_model: Optional initial model used to align OCR outputs and ground truth
         | 
| 221 | 
            +
                                          strings. By default, an unweighted Levenshtein distance is used.
         | 
| 222 | 
            +
                    :param calculate_for_unseen: If True (and k > 0), pre-calculates costs for all
         | 
| 223 | 
            +
                                                 possible edit operations based on the vocabulary.
         | 
| 224 | 
            +
                                                 If False (default), only calculates costs for operations
         | 
| 225 | 
            +
                                                 observed in the data.
         | 
| 237 226 | 
             
                    :return: A `WeightedLevenshtein` instance with the learned costs.
         | 
| 238 227 | 
             
                    """
         | 
| 239 228 | 
             
                    from .levenshtein import WeightedLevenshtein
         | 
| 240 229 |  | 
| 241 | 
            -
                     | 
| 230 | 
            +
                    if not pairs:
         | 
| 231 | 
            +
                        return WeightedLevenshtein.unweighted()
         | 
| 232 | 
            +
             | 
| 233 | 
            +
                    if initial_model is None:
         | 
| 234 | 
            +
                        initial_model = WeightedLevenshtein.unweighted()
         | 
| 235 | 
            +
             | 
| 236 | 
            +
                    all_ops = self._calculate_operations(pairs, aligner=initial_model)
         | 
| 242 237 | 
             
                    self.counts = self._tally_operations(all_ops)
         | 
| 243 238 | 
             
                    vocab = self.counts.vocab
         | 
| 244 239 | 
             
                    self.vocab_size = len(vocab)
         | 
| @@ -246,12 +241,13 @@ class Learner: | |
| 246 241 | 
             
                    if not self.vocab_size:
         | 
| 247 242 | 
             
                        return WeightedLevenshtein.unweighted()
         | 
| 248 243 |  | 
| 249 | 
            -
                    costs = self._calculate_costs(self.counts,  | 
| 244 | 
            +
                    costs = self._calculate_costs(self.counts, vocab, calculate_for_unseen=calculate_for_unseen)
         | 
| 250 245 |  | 
| 251 246 | 
             
                    return WeightedLevenshtein(
         | 
| 252 247 | 
             
                        substitution_costs=costs.substitutions,
         | 
| 253 248 | 
             
                        insertion_costs=costs.insertions,
         | 
| 254 249 | 
             
                        deletion_costs=costs.deletions,
         | 
| 250 | 
            +
                        symmetric_substitution=False,
         | 
| 255 251 | 
             
                        default_substitution_cost=1.0,
         | 
| 256 252 | 
             
                        default_insertion_cost=1.0,
         | 
| 257 253 | 
             
                        default_deletion_cost=1.0,
         | 
    
        ocr_stringdist/levenshtein.py
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            from __future__ import annotations
         | 
| 2 2 |  | 
| 3 3 | 
             
            from collections.abc import Iterable
         | 
| 4 | 
            -
            from typing import Optional
         | 
| 4 | 
            +
            from typing import Any, Optional
         | 
| 5 5 |  | 
| 6 6 | 
             
            from ._rust_stringdist import (
         | 
| 7 7 | 
             
                _batch_weighted_levenshtein_distance,
         | 
| @@ -24,10 +24,13 @@ class WeightedLevenshtein: | |
| 24 24 | 
             
                                           Defaults to costs based on common OCR errors.
         | 
| 25 25 | 
             
                :param insertion_costs: Maps a string to its insertion cost.
         | 
| 26 26 | 
             
                :param deletion_costs: Maps a string to its deletion cost.
         | 
| 27 | 
            -
                :param symmetric_substitution: If True,  | 
| 27 | 
            +
                :param symmetric_substitution: If True, a cost defined for, e.g., ('0', 'O') will automatically
         | 
| 28 | 
            +
                                               apply to ('O', '0'). If False, both must be defined explicitly.
         | 
| 28 29 | 
             
                :param default_substitution_cost: Default cost for single-char substitutions not in the map.
         | 
| 29 30 | 
             
                :param default_insertion_cost: Default cost for single-char insertions not in the map.
         | 
| 30 31 | 
             
                :param default_deletion_cost: Default cost for single-char deletions not in the map.
         | 
| 32 | 
            +
             | 
| 33 | 
            +
                :raises TypeError, ValueError: If the provided arguments are invalid.
         | 
| 31 34 | 
             
                """
         | 
| 32 35 |  | 
| 33 36 | 
             
                substitution_costs: dict[tuple[str, str], float]
         | 
| @@ -49,9 +52,37 @@ class WeightedLevenshtein: | |
| 49 52 | 
             
                    default_insertion_cost: float = 1.0,
         | 
| 50 53 | 
             
                    default_deletion_cost: float = 1.0,
         | 
| 51 54 | 
             
                ) -> None:
         | 
| 52 | 
            -
                     | 
| 53 | 
            -
             | 
| 54 | 
            -
             | 
| 55 | 
            +
                    # Validate default costs
         | 
| 56 | 
            +
                    for cost_name, cost_val in [
         | 
| 57 | 
            +
                        ("default_substitution_cost", default_substitution_cost),
         | 
| 58 | 
            +
                        ("default_insertion_cost", default_insertion_cost),
         | 
| 59 | 
            +
                        ("default_deletion_cost", default_deletion_cost),
         | 
| 60 | 
            +
                    ]:
         | 
| 61 | 
            +
                        if not isinstance(cost_val, (int, float)):
         | 
| 62 | 
            +
                            raise TypeError(f"{cost_name} must be a number, but got: {type(cost_val).__name__}")
         | 
| 63 | 
            +
                        if cost_val < 0:
         | 
| 64 | 
            +
                            raise ValueError(f"{cost_name} must be non-negative, got value: {cost_val}")
         | 
| 65 | 
            +
             | 
| 66 | 
            +
                    # Validate substitution_costs dictionary
         | 
| 67 | 
            +
                    sub_costs = ocr_distance_map if substitution_costs is None else substitution_costs
         | 
| 68 | 
            +
                    for key, cost in sub_costs.items():
         | 
| 69 | 
            +
                        if not (
         | 
| 70 | 
            +
                            isinstance(key, tuple)
         | 
| 71 | 
            +
                            and len(key) == 2
         | 
| 72 | 
            +
                            and isinstance(key[0], str)
         | 
| 73 | 
            +
                            and isinstance(key[1], str)
         | 
| 74 | 
            +
                        ):
         | 
| 75 | 
            +
                            raise TypeError(
         | 
| 76 | 
            +
                                f"substitution_costs keys must be tuples of two strings, but found: {key}"
         | 
| 77 | 
            +
                            )
         | 
| 78 | 
            +
                        if not isinstance(cost, (int, float)):
         | 
| 79 | 
            +
                            raise TypeError(
         | 
| 80 | 
            +
                                f"Cost for substitution {key} must be a number, but got: {type(cost).__name__}"
         | 
| 81 | 
            +
                            )
         | 
| 82 | 
            +
                        if cost < 0:
         | 
| 83 | 
            +
                            raise ValueError(f"Cost for substitution {key} cannot be negative, but got: {cost}")
         | 
| 84 | 
            +
             | 
| 85 | 
            +
                    self.substitution_costs = sub_costs
         | 
| 55 86 | 
             
                    self.insertion_costs = {} if insertion_costs is None else insertion_costs
         | 
| 56 87 | 
             
                    self.deletion_costs = {} if deletion_costs is None else deletion_costs
         | 
| 57 88 | 
             
                    self.symmetric_substitution = symmetric_substitution
         | 
| @@ -92,7 +123,8 @@ class WeightedLevenshtein: | |
| 92 123 | 
             
                    """
         | 
| 93 124 | 
             
                    Creates an instance by learning costs from a dataset of (OCR, ground truth) string pairs.
         | 
| 94 125 |  | 
| 95 | 
            -
                    For more advanced learning configuration, see the | 
| 126 | 
            +
                    For more advanced learning configuration, see the
         | 
| 127 | 
            +
                    :class:`ocr_stringdist.learner.CostLearner` class.
         | 
| 96 128 |  | 
| 97 129 | 
             
                    :param pairs: An iterable of (ocr_string, ground_truth_string) tuples. Correct pairs
         | 
| 98 130 | 
             
                                  are not intended to be filtered; they are needed to learn well-aligned costs.
         | 
| @@ -111,9 +143,9 @@ class WeightedLevenshtein: | |
| 111 143 | 
             
                        print(wl.substitution_costs) # learned cost for substituting '8' with 'B'
         | 
| 112 144 | 
             
                        print(wl.deletion_costs) # learned cost for deleting '.'
         | 
| 113 145 | 
             
                    """
         | 
| 114 | 
            -
                    from .learner import  | 
| 146 | 
            +
                    from .learner import CostLearner
         | 
| 115 147 |  | 
| 116 | 
            -
                    return  | 
| 148 | 
            +
                    return CostLearner().fit(pairs)
         | 
| 117 149 |  | 
| 118 150 | 
             
                def __eq__(self, other: object) -> bool:
         | 
| 119 151 | 
             
                    if not isinstance(other, WeightedLevenshtein):
         | 
| @@ -128,154 +160,56 @@ class WeightedLevenshtein: | |
| 128 160 | 
             
                        and self.default_deletion_cost == other.default_deletion_cost
         | 
| 129 161 | 
             
                    )
         | 
| 130 162 |  | 
| 163 | 
            +
                def to_dict(self) -> dict[str, Any]:
         | 
| 164 | 
            +
                    """
         | 
| 165 | 
            +
                    Serializes the instance's configuration to a dictionary.
         | 
| 131 166 |  | 
| 132 | 
            -
             | 
| 133 | 
            -
             | 
| 134 | 
            -
             | 
| 135 | 
            -
             | 
| 136 | 
            -
             | 
| 137 | 
            -
             | 
| 138 | 
            -
             | 
| 139 | 
            -
             | 
| 140 | 
            -
             | 
| 141 | 
            -
             | 
| 142 | 
            -
             | 
| 143 | 
            -
             | 
| 144 | 
            -
             | 
| 145 | 
            -
             | 
| 146 | 
            -
             | 
| 147 | 
            -
             | 
| 148 | 
            -
             | 
| 149 | 
            -
             | 
| 150 | 
            -
             | 
| 151 | 
            -
                 | 
| 152 | 
            -
             | 
| 153 | 
            -
             | 
| 154 | 
            -
             | 
| 155 | 
            -
             | 
| 156 | 
            -
             | 
| 157 | 
            -
             | 
| 158 | 
            -
             | 
| 159 | 
            -
                                  | 
| 160 | 
            -
             | 
| 161 | 
            -
             | 
| 162 | 
            -
             | 
| 163 | 
            -
             | 
| 164 | 
            -
             | 
| 165 | 
            -
             | 
| 166 | 
            -
             | 
| 167 | 
            -
             | 
| 168 | 
            -
             | 
| 169 | 
            -
             | 
| 170 | 
            -
             | 
| 171 | 
            -
             | 
| 172 | 
            -
                     | 
| 173 | 
            -
             | 
| 174 | 
            -
             | 
| 175 | 
            -
             | 
| 176 | 
            -
             | 
| 177 | 
            -
             | 
| 178 | 
            -
             | 
| 179 | 
            -
             | 
| 180 | 
            -
             | 
| 181 | 
            -
             | 
| 182 | 
            -
            def batch_weighted_levenshtein_distance(
         | 
| 183 | 
            -
                s: str,
         | 
| 184 | 
            -
                candidates: list[str],
         | 
| 185 | 
            -
                /,
         | 
| 186 | 
            -
                substitution_costs: Optional[dict[tuple[str, str], float]] = None,
         | 
| 187 | 
            -
                insertion_costs: Optional[dict[str, float]] = None,
         | 
| 188 | 
            -
                deletion_costs: Optional[dict[str, float]] = None,
         | 
| 189 | 
            -
                *,
         | 
| 190 | 
            -
                symmetric_substitution: bool = True,
         | 
| 191 | 
            -
                default_substitution_cost: float = 1.0,
         | 
| 192 | 
            -
                default_insertion_cost: float = 1.0,
         | 
| 193 | 
            -
                default_deletion_cost: float = 1.0,
         | 
| 194 | 
            -
            ) -> list[float]:
         | 
| 195 | 
            -
                """
         | 
| 196 | 
            -
                Calculate weighted Levenshtein distances between a string and multiple candidates.
         | 
| 197 | 
            -
             | 
| 198 | 
            -
                See also :meth:`WeightedLevenshtein.batch_distance`.
         | 
| 199 | 
            -
             | 
| 200 | 
            -
                This is more efficient than calling :func:`weighted_levenshtein_distance` multiple times.
         | 
| 201 | 
            -
             | 
| 202 | 
            -
                :param s: The string to compare (interpreted as the string read via OCR)
         | 
| 203 | 
            -
                :param candidates: List of candidate strings to compare against
         | 
| 204 | 
            -
                :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
         | 
| 205 | 
            -
                                 substitution costs. Only one direction needs to be configured unless
         | 
| 206 | 
            -
                                 `symmetric_substitution` is False.
         | 
| 207 | 
            -
                                 Note that the runtime scales in the length of the longest substitution token.
         | 
| 208 | 
            -
                                 Defaults to `ocr_stringdist.ocr_distance_map`.
         | 
| 209 | 
            -
                :param insertion_costs: Dictionary mapping strings to their insertion costs.
         | 
| 210 | 
            -
                :param deletion_costs: Dictionary mapping strings to their deletion costs.
         | 
| 211 | 
            -
                :param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
         | 
| 212 | 
            -
                                               symmetric? Defaults to True.
         | 
| 213 | 
            -
                :param default_substitution_cost: The default substitution cost for character pairs not found
         | 
| 214 | 
            -
                                                  in `substitution_costs`.
         | 
| 215 | 
            -
                :param default_insertion_cost: The default insertion cost for characters not found in
         | 
| 216 | 
            -
                                               `insertion_costs`.
         | 
| 217 | 
            -
                :param default_deletion_cost: The default deletion cost for characters not found in
         | 
| 218 | 
            -
                                              `deletion_costs`.
         | 
| 219 | 
            -
                :return: A list of distances corresponding to each candidate
         | 
| 220 | 
            -
                """
         | 
| 221 | 
            -
                return WeightedLevenshtein(
         | 
| 222 | 
            -
                    substitution_costs=substitution_costs,
         | 
| 223 | 
            -
                    insertion_costs=insertion_costs,
         | 
| 224 | 
            -
                    deletion_costs=deletion_costs,
         | 
| 225 | 
            -
                    symmetric_substitution=symmetric_substitution,
         | 
| 226 | 
            -
                    default_substitution_cost=default_substitution_cost,
         | 
| 227 | 
            -
                    default_insertion_cost=default_insertion_cost,
         | 
| 228 | 
            -
                    default_deletion_cost=default_deletion_cost,
         | 
| 229 | 
            -
                ).batch_distance(s, candidates)
         | 
| 230 | 
            -
             | 
| 231 | 
            -
             | 
| 232 | 
            -
            def explain_weighted_levenshtein(
         | 
| 233 | 
            -
                s1: str,
         | 
| 234 | 
            -
                s2: str,
         | 
| 235 | 
            -
                /,
         | 
| 236 | 
            -
                substitution_costs: Optional[dict[tuple[str, str], float]] = None,
         | 
| 237 | 
            -
                insertion_costs: Optional[dict[str, float]] = None,
         | 
| 238 | 
            -
                deletion_costs: Optional[dict[str, float]] = None,
         | 
| 239 | 
            -
                *,
         | 
| 240 | 
            -
                symmetric_substitution: bool = True,
         | 
| 241 | 
            -
                default_substitution_cost: float = 1.0,
         | 
| 242 | 
            -
                default_insertion_cost: float = 1.0,
         | 
| 243 | 
            -
                default_deletion_cost: float = 1.0,
         | 
| 244 | 
            -
                filter_matches: bool = True,
         | 
| 245 | 
            -
            ) -> list[EditOperation]:
         | 
| 246 | 
            -
                """
         | 
| 247 | 
            -
                Computes the path of operations associated with the custom Levenshtein distance.
         | 
| 248 | 
            -
             | 
| 249 | 
            -
                See also :meth:`WeightedLevenshtein.explain`.
         | 
| 250 | 
            -
             | 
| 251 | 
            -
                The default `substitution_costs` considers common OCR errors, see
         | 
| 252 | 
            -
                :py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
         | 
| 253 | 
            -
             | 
| 254 | 
            -
                :param s1: First string (interpreted as the string read via OCR)
         | 
| 255 | 
            -
                :param s2: Second string
         | 
| 256 | 
            -
                :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
         | 
| 257 | 
            -
                                 substitution costs. Only one direction needs to be configured unless
         | 
| 258 | 
            -
                                 `symmetric_substitution` is False.
         | 
| 259 | 
            -
                                 Note that the runtime scales in the length of the longest substitution token.
         | 
| 260 | 
            -
                                 Defaults to `ocr_stringdist.ocr_distance_map`.
         | 
| 261 | 
            -
                :param insertion_costs: Dictionary mapping strings to their insertion costs.
         | 
| 262 | 
            -
                :param deletion_costs: Dictionary mapping strings to their deletion costs.
         | 
| 263 | 
            -
                :param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
         | 
| 264 | 
            -
                                               symmetric? Defaults to True.
         | 
| 265 | 
            -
                :param default_substitution_cost: The default substitution cost for character pairs not found
         | 
| 266 | 
            -
                                                  in `substitution_costs`.
         | 
| 267 | 
            -
                :param default_insertion_cost: The default insertion cost for characters not found in
         | 
| 268 | 
            -
                                               `insertion_costs`.
         | 
| 269 | 
            -
                :param default_deletion_cost: The default deletion cost for characters not found in
         | 
| 270 | 
            -
                                              `deletion_costs`.
         | 
| 271 | 
            -
                :return: List of :class:`EditOperation` instances.
         | 
| 272 | 
            -
                """
         | 
| 273 | 
            -
                return WeightedLevenshtein(
         | 
| 274 | 
            -
                    substitution_costs=substitution_costs,
         | 
| 275 | 
            -
                    insertion_costs=insertion_costs,
         | 
| 276 | 
            -
                    deletion_costs=deletion_costs,
         | 
| 277 | 
            -
                    symmetric_substitution=symmetric_substitution,
         | 
| 278 | 
            -
                    default_substitution_cost=default_substitution_cost,
         | 
| 279 | 
            -
                    default_insertion_cost=default_insertion_cost,
         | 
| 280 | 
            -
                    default_deletion_cost=default_deletion_cost,
         | 
| 281 | 
            -
                ).explain(s1, s2, filter_matches=filter_matches)
         | 
| 167 | 
            +
                    The result can be written to, say, JSON.
         | 
| 168 | 
            +
             | 
| 169 | 
            +
                    For the counterpart, see :meth:`WeightedLevenshtein.from_dict`.
         | 
| 170 | 
            +
                    """
         | 
| 171 | 
            +
                    # Store each tuple-keyed entry as a {"from", "to", "cost"} dict, since JSON objects cannot have tuple keys
         | 
| 172 | 
            +
                    sub_costs_serializable = [
         | 
| 173 | 
            +
                        {"from": k[0], "to": k[1], "cost": v} for k, v in self.substitution_costs.items()
         | 
| 174 | 
            +
                    ]
         | 
| 175 | 
            +
             | 
| 176 | 
            +
                    return {
         | 
| 177 | 
            +
                        "substitution_costs": sub_costs_serializable,
         | 
| 178 | 
            +
                        "insertion_costs": self.insertion_costs,
         | 
| 179 | 
            +
                        "deletion_costs": self.deletion_costs,
         | 
| 180 | 
            +
                        "symmetric_substitution": self.symmetric_substitution,
         | 
| 181 | 
            +
                        "default_substitution_cost": self.default_substitution_cost,
         | 
| 182 | 
            +
                        "default_insertion_cost": self.default_insertion_cost,
         | 
| 183 | 
            +
                        "default_deletion_cost": self.default_deletion_cost,
         | 
| 184 | 
            +
                    }
         | 
| 185 | 
            +
             | 
| 186 | 
            +
                @classmethod
         | 
| 187 | 
            +
                def from_dict(cls, data: dict[str, Any]) -> WeightedLevenshtein:
         | 
| 188 | 
            +
                    """
         | 
| 189 | 
            +
                    Deserialize from a dictionary.
         | 
| 190 | 
            +
             | 
| 191 | 
            +
                    For the counterpart, see :meth:`WeightedLevenshtein.to_dict`.
         | 
| 192 | 
            +
             | 
| 193 | 
            +
                    :param data: A dictionary with (not necessarily all of) the following keys:
         | 
| 194 | 
            +
                                 - "substitution_costs": list of {"from": str, "to": str, "cost": float}
         | 
| 195 | 
            +
                                 - "insertion_costs": dict[str, float]
         | 
| 196 | 
            +
                                 - "deletion_costs": dict[str, float]
         | 
| 197 | 
            +
                                 - "symmetric_substitution": bool
         | 
| 198 | 
            +
                                 - "default_substitution_cost": float
         | 
| 199 | 
            +
                                 - "default_insertion_cost": float
         | 
| 200 | 
            +
                                 - "default_deletion_cost": float
         | 
| 201 | 
            +
                    """
         | 
| 202 | 
            +
                    # Convert the list of substitution costs back to the required dict format
         | 
| 203 | 
            +
                    sub_costs: dict[tuple[str, str], float] = {
         | 
| 204 | 
            +
                        (item["from"], item["to"]): item["cost"] for item in data.get("substitution_costs", {})
         | 
| 205 | 
            +
                    }
         | 
| 206 | 
            +
             | 
| 207 | 
            +
                    return cls(
         | 
| 208 | 
            +
                        substitution_costs=sub_costs,
         | 
| 209 | 
            +
                        insertion_costs=data.get("insertion_costs"),
         | 
| 210 | 
            +
                        deletion_costs=data.get("deletion_costs"),
         | 
| 211 | 
            +
                        symmetric_substitution=data.get("symmetric_substitution", True),
         | 
| 212 | 
            +
                        default_substitution_cost=data.get("default_substitution_cost", 1.0),
         | 
| 213 | 
            +
                        default_insertion_cost=data.get("default_insertion_cost", 1.0),
         | 
| 214 | 
            +
                        default_deletion_cost=data.get("default_deletion_cost", 1.0),
         | 
| 215 | 
            +
                    )
         | 
| @@ -0,0 +1,9 @@ | |
| 1 | 
            +
            from typing import TYPE_CHECKING, Protocol, runtime_checkable
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            if TYPE_CHECKING:
         | 
| 4 | 
            +
                from .edit_operation import EditOperation
         | 
| 5 | 
            +
             | 
| 6 | 
            +
             | 
| 7 | 
            +
            @runtime_checkable
         | 
| 8 | 
            +
            class Aligner(Protocol):
         | 
| 9 | 
            +
                def explain(self, s1: str, s2: str, filter_matches: bool) -> list["EditOperation"]: ...
         | 
| @@ -0,0 +1,94 @@ | |
| 1 | 
            +
            Metadata-Version: 2.4
         | 
| 2 | 
            +
            Name: ocr-stringdist
         | 
| 3 | 
            +
            Version: 1.0.0
         | 
| 4 | 
            +
            Classifier: Programming Language :: Rust
         | 
| 5 | 
            +
            Classifier: Programming Language :: Python :: Implementation :: CPython
         | 
| 6 | 
            +
            Classifier: License :: OSI Approved :: MIT License
         | 
| 7 | 
            +
            Classifier: Operating System :: OS Independent
         | 
| 8 | 
            +
            License-File: LICENSE
         | 
| 9 | 
            +
            Requires-Python: >=3.9
         | 
| 10 | 
            +
            Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
         | 
| 11 | 
            +
            Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
         | 
| 12 | 
            +
            Project-URL: documentation, https://niklasvonm.github.io/ocr-stringdist/
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            # OCR-StringDist
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            A Python library to learn, model, explain and correct OCR errors using a fast string distance engine.
         | 
| 17 | 
            +
             | 
| 18 | 
            +
            Documentation: https://niklasvonm.github.io/ocr-stringdist/
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            [PyPI version](https://badge.fury.io/py/ocr-stringdist)
         | 
| 21 | 
            +
            [License: MIT](LICENSE)
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            ## Overview
         | 
| 24 | 
            +
             | 
| 25 | 
            +
            Standard string distances (like Levenshtein) treat all character substitutions equally. This is suboptimal for text read from images via OCR, where errors like `O` vs `0` are far more common than, say, `O` vs `X`.
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            OCR-StringDist provides a learnable **weighted Levenshtein distance**, implementing part of the **Noisy Channel model**.
         | 
| 28 | 
            +
             | 
| 29 | 
            +
            **Example:** Matching against the correct word `CODE`:
         | 
| 30 | 
            +
             | 
| 31 | 
            +
            * **Standard Levenshtein:**
         | 
| 32 | 
            +
                * $d(\text{"CODE"}, \text{"C0DE"}) = 1$ (O → 0)
         | 
| 33 | 
            +
                * $d(\text{"CODE"}, \text{"CXDE"}) = 1$ (O → X)
         | 
| 34 | 
            +
                * Result: Both appear equally likely/distant.
         | 
| 35 | 
            +
             | 
| 36 | 
            +
            * **OCR-StringDist (Channel Model):**
         | 
| 37 | 
            +
                * $d(\text{"CODE"}, \text{"C0DE"}) \approx 0.1$ (common error, low cost)
         | 
| 38 | 
            +
                * $d(\text{"CODE"}, \text{"CXDE"}) = 1.0$ (unlikely error, high cost)
         | 
| 39 | 
            +
                * Result: Correctly identifies `C0DE` as a much closer match.
         | 
| 40 | 
            +
             | 
| 41 | 
            +
            This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes). By combining this *channel model* with a *source model* (e.g., product code frequencies), you can build a complete and robust OCR correction system.
         | 
| 42 | 
            +
             | 
| 43 | 
            +
            ## Installation
         | 
| 44 | 
            +
             | 
| 45 | 
            +
            ```bash
         | 
| 46 | 
            +
            pip install ocr-stringdist
         | 
| 47 | 
            +
            ```
         | 
| 48 | 
            +
             | 
| 49 | 
            +
            ## Features
         | 
| 50 | 
            +
             | 
| 51 | 
            +
            - **Learnable Costs**: Automatically learn substitution, insertion, and deletion costs from a dataset of (OCR string, ground truth string) pairs.
         | 
| 52 | 
            +
            - **Weighted Levenshtein Distance**: Models OCR error patterns by assigning custom costs to specific edit operations.
         | 
| 53 | 
            +
            - **High Performance**: The core logic is implemented in Rust, and the `batch_distance` method efficiently compares one string against thousands of candidates.
         | 
| 54 | 
            +
            - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
         | 
| 55 | 
            +
            - **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
         | 
| 56 | 
            +
            - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
         | 
| 57 | 
            +
            - **Full Unicode Support**: Works with arbitrary Unicode strings.
         | 
| 58 | 
            +
             | 
| 59 | 
            +
            ## Core Workflow
         | 
| 60 | 
            +
             | 
| 61 | 
            +
            The typical workflow involves
         | 
| 62 | 
            +
            - learning costs from your data and then
         | 
| 63 | 
            +
            - using the resulting model to find the best match from a list of candidates.
         | 
| 64 | 
            +
             | 
| 65 | 
            +
            ```python
         | 
| 66 | 
            +
            from ocr_stringdist import WeightedLevenshtein
         | 
| 67 | 
            +
             | 
| 68 | 
            +
            # 1. LEARN costs from your own data
         | 
| 69 | 
            +
            training_data = [
         | 
| 70 | 
            +
                ("128", "123"),
         | 
| 71 | 
            +
                ("567", "567"),
         | 
| 72 | 
            +
            ]
         | 
| 73 | 
            +
            wl = WeightedLevenshtein.learn_from(training_data)
         | 
| 74 | 
            +
             | 
| 75 | 
            +
            # The engine has now learned that '8' -> '3' is a low-cost substitution
         | 
| 76 | 
            +
            print(f"Learned cost for ('8', '3'): {wl.substitution_costs[('8', '3')]:.2f}")
         | 
| 77 | 
            +
             | 
| 78 | 
            +
             | 
| 79 | 
            +
            # 2. MATCH new OCR output against a list of candidates
         | 
| 80 | 
            +
            ocr_output = "Product Code 128"
         | 
| 81 | 
            +
            candidates = [
         | 
| 82 | 
            +
                "Product Code 123",
         | 
| 83 | 
            +
                "Product Code 523",  # '1' -> '5' is an unlikely error
         | 
| 84 | 
            +
            ]
         | 
| 85 | 
            +
             | 
| 86 | 
            +
            distances = wl.batch_distance(ocr_output, candidates)
         | 
| 87 | 
            +
             | 
| 88 | 
            +
            # Find the best match
         | 
| 89 | 
            +
            min_distance = min(distances)
         | 
| 90 | 
            +
            best_match = candidates[distances.index(min_distance)]
         | 
| 91 | 
            +
             | 
| 92 | 
            +
            print(f"Best match for '{ocr_output}': '{best_match}' (Cost: {min_distance:.2f})")
         | 
| 93 | 
            +
            ```
         | 
| 94 | 
            +
             | 
| @@ -0,0 +1,13 @@ | |
| 1 | 
            +
            ocr_stringdist-1.0.0.dist-info/METADATA,sha256=UQpqp6A67F89fMleGdVdNkIePwjHqPVze1QIOdu3etA,4043
         | 
| 2 | 
            +
            ocr_stringdist-1.0.0.dist-info/WHEEL,sha256=Iz7QqxpWQRXToFIDkGspPPKDuV_klwuhW8ziiU5jhR8,96
         | 
| 3 | 
            +
            ocr_stringdist-1.0.0.dist-info/licenses/LICENSE,sha256=3cNRiJag5vI0KMMDNf0oiaY4vg43rLxRszbMJs1GBoU,1092
         | 
| 4 | 
            +
            ocr_stringdist/__init__.py,sha256=HsCF7QPJmFfJiB5kSnIBDSGsc1lbCddS6u3Ea0fCF0M,295
         | 
| 5 | 
            +
            ocr_stringdist/_rust_stringdist.cp310-win_amd64.pyd,sha256=Or7PkuzqCKzy0itVPmU-3txXBVmn0JNuRWu8V5W42sc,413696
         | 
| 6 | 
            +
            ocr_stringdist/default_ocr_distances.py,sha256=vlhzQCCcE-D1xor5RvMW0oaMuL_HP_5Y7SO4ESkdb4w,1075
         | 
| 7 | 
            +
            ocr_stringdist/edit_operation.py,sha256=8yzz4BUBhqowMwUVWYpkXGc_0GwwGahCu2e55qeLAv4,411
         | 
| 8 | 
            +
            ocr_stringdist/learner.py,sha256=blzjifV0S-fjwzYP7iPQZm2iUuZKYqx9o787eB32tAk,10745
         | 
| 9 | 
            +
            ocr_stringdist/levenshtein.py,sha256=e1RQ4bFW-8yV24ajAh7tfqj0sK2W_p0j-vLH15FgPPU,10069
         | 
| 10 | 
            +
            ocr_stringdist/matching.py,sha256=hM-_M0jpzaC84ekjkWw8qSZUljIfEY86kT8tWL7bq0s,3353
         | 
| 11 | 
            +
            ocr_stringdist/protocols.py,sha256=Cat28DHGa53-b81J34RSiH6O7Hob8w-y8FcYlpeGLTM,274
         | 
| 12 | 
            +
            ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         | 
| 13 | 
            +
            ocr_stringdist-1.0.0.dist-info/RECORD,,
         | 
| @@ -1,110 +0,0 @@ | |
| 1 | 
            -
            Metadata-Version: 2.4
         | 
| 2 | 
            -
            Name: ocr-stringdist
         | 
| 3 | 
            -
            Version: 0.3.0
         | 
| 4 | 
            -
            Classifier: Programming Language :: Rust
         | 
| 5 | 
            -
            Classifier: Programming Language :: Python :: Implementation :: CPython
         | 
| 6 | 
            -
            Classifier: License :: OSI Approved :: MIT License
         | 
| 7 | 
            -
            Classifier: Operating System :: OS Independent
         | 
| 8 | 
            -
            License-File: LICENSE
         | 
| 9 | 
            -
            Requires-Python: >=3.9
         | 
| 10 | 
            -
            Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
         | 
| 11 | 
            -
            Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
         | 
| 12 | 
            -
            Project-URL: documentation, https://niklasvonm.github.io/ocr-stringdist/
         | 
| 13 | 
            -
             | 
| 14 | 
            -
            # OCR-StringDist
         | 
| 15 | 
            -
             | 
| 16 | 
            -
            A Python library for fast string distance calculations that account for common OCR (optical character recognition) errors.
         | 
| 17 | 
            -
             | 
| 18 | 
            -
            Documentation: https://niklasvonm.github.io/ocr-stringdist/
         | 
| 19 | 
            -
             | 
| 20 | 
            -
            [PyPI version](https://badge.fury.io/py/ocr-stringdist)
         | 
| 21 | 
            -
            [License: MIT](LICENSE)
         | 
| 22 | 
            -
             | 
| 23 | 
            -
            ## Overview
         | 
| 24 | 
            -
             | 
| 25 | 
            -
            Standard string distances (like Levenshtein) treat all character substitutions equally. This is suboptimal for text read from images via OCR, where errors like `O` vs `0` are far more common than, say, `O` vs `X`.
         | 
| 26 | 
            -
             | 
| 27 | 
            -
            OCR-StringDist uses a **weighted Levenshtein distance**, assigning lower costs to common OCR errors.
         | 
| 28 | 
            -
             | 
| 29 | 
            -
            **Example:** Matching against the correct word `CODE`:
         | 
| 30 | 
            -
             | 
| 31 | 
            -
            * **Standard Levenshtein:**
         | 
| 32 | 
            -
                * $d(\text{"CODE"}, \text{"C0DE"}) = 1$ (O → 0)
         | 
| 33 | 
            -
                * $d(\text{"CODE"}, \text{"CXDE"}) = 1$ (O → X)
         | 
| 34 | 
            -
                * Result: Both appear equally likely/distant.
         | 
| 35 | 
            -
             | 
| 36 | 
            -
            * **OCR-StringDist (Weighted):**
         | 
| 37 | 
            -
                * $d(\text{"CODE"}, \text{"C0DE"}) \approx 0.1$ (common error, low cost)
         | 
| 38 | 
            -
                * $d(\text{"CODE"}, \text{"CXDE"}) = 1.0$ (unlikely error, high cost)
         | 
| 39 | 
            -
                * Result: Correctly identifies `C0DE` as a much closer match.
         | 
| 40 | 
            -
             | 
| 41 | 
            -
            This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes, database entries).
         | 
| 42 | 
            -
             | 
| 43 | 
            -
            ## Installation
         | 
| 44 | 
            -
             | 
| 45 | 
            -
            ```bash
         | 
| 46 | 
            -
            pip install ocr-stringdist
         | 
| 47 | 
            -
            ```
         | 
| 48 | 
            -
             | 
| 49 | 
            -
            ## Features
         | 
| 50 | 
            -
             | 
| 51 | 
            -
            - **High Performance**: The core logic is implemented in Rust with speed in mind.
         | 
| 52 | 
            -
            - **Weighted Levenshtein Distance**: Calculates Levenshtein distance with customizable costs for substitutions, insertions, and deletions. Includes an efficient batch version (`batch_weighted_levenshtein_distance`) for comparing one string against many candidates.
         | 
| 53 | 
            -
            - **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
         | 
| 54 | 
            -
            - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
         | 
| 55 | 
            -
            - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
         | 
| 56 | 
            -
            - **Learnable Costs**: Easily learn costs from a dataset of (OCR string, ground truth string) pairs.
         | 
| 57 | 
            -
            - **Unicode Support**: Works with arbitrary Unicode strings.
         | 
| 58 | 
            -
            - **Best Match Finder**: Includes a utility function `find_best_candidate` to efficiently find the best match from a list based on _any_ distance function.
         | 
| 59 | 
            -
             | 
| 60 | 
            -
            ## Usage
         | 
| 61 | 
            -
             | 
| 62 | 
            -
            ### Basic usage
         | 
| 63 | 
            -
             | 
| 64 | 
            -
            ```python
         | 
| 65 | 
            -
            from ocr_stringdist import WeightedLevenshtein
         | 
| 66 | 
            -
             | 
| 67 | 
            -
            # Default substitution costs are ocr_stringdist.ocr_distance_map.
         | 
| 68 | 
            -
            wl = WeightedLevenshtein()
         | 
| 69 | 
            -
             | 
| 70 | 
            -
            print(wl.distance("CXDE", "CODE")) # == 1
         | 
| 71 | 
            -
            print(wl.distance("C0DE", "CODE")) # < 1
         | 
| 72 | 
            -
            ```
         | 
| 73 | 
            -
             | 
| 74 | 
            -
            ### Explain the Edit Path
         | 
| 75 | 
            -
             | 
| 76 | 
            -
            ```python
         | 
| 77 | 
            -
            edit_path = wl.explain("C0DE", "CODE")
         | 
| 78 | 
            -
            print(edit_path)
         | 
| 79 | 
            -
            # [EditOperation(op_type='substitute', source_token='0', target_token='O', cost=0.1)]
         | 
| 80 | 
            -
            ```
         | 
| 81 | 
            -
             | 
| 82 | 
            -
            ### Fast Batch Calculations
         | 
| 83 | 
            -
             | 
| 84 | 
            -
            Quickly compare a string to a list of candidates.
         | 
| 85 | 
            -
             | 
| 86 | 
            -
            ```python
         | 
| 87 | 
            -
            distances: list[float] = wl.batch_distance("CODE", ["CXDE", "C0DE"])
         | 
| 88 | 
            -
            # [1.0, 0.1]
         | 
| 89 | 
            -
            ```
         | 
| 90 | 
            -
             | 
| 91 | 
            -
            ### Multi-character Substitutions
         | 
| 92 | 
            -
             | 
| 93 | 
            -
            ```python
         | 
| 94 | 
            -
            # Custom costs with multi-character substitution
         | 
| 95 | 
            -
            wl = WeightedLevenshtein(substitution_costs={("In", "h"): 0.5})
         | 
| 96 | 
            -
             | 
| 97 | 
            -
            print(wl.distance("hi", "Ini")) # 0.5
         | 
| 98 | 
            -
            ```
         | 
| 99 | 
            -
             | 
| 100 | 
            -
            ### Learn Costs
         | 
| 101 | 
            -
             | 
| 102 | 
            -
            ```python
         | 
| 103 | 
            -
            wl = WeightedLevenshtein.learn_from([("Hallo", "Hello")])
         | 
| 104 | 
            -
            print(wl.substitution_costs[("a", "e")]) # < 1
         | 
| 105 | 
            -
            ```
         | 
| 106 | 
            -
             | 
| 107 | 
            -
            ## Acknowledgements
         | 
| 108 | 
            -
             | 
| 109 | 
            -
            This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), which provides the base implementations of the algorithms used here.
         | 
| 110 | 
            -
             | 
| @@ -1,12 +0,0 @@ | |
| 1 | 
            -
            ocr_stringdist-0.3.0.dist-info/METADATA,sha256=SxPzYW8GrsDPOKdXuF2ueUOFUI1426djGtUAleg0V4U,4284
         | 
| 2 | 
            -
            ocr_stringdist-0.3.0.dist-info/WHEEL,sha256=Iz7QqxpWQRXToFIDkGspPPKDuV_klwuhW8ziiU5jhR8,96
         | 
| 3 | 
            -
            ocr_stringdist-0.3.0.dist-info/licenses/LICENSE,sha256=3cNRiJag5vI0KMMDNf0oiaY4vg43rLxRszbMJs1GBoU,1092
         | 
| 4 | 
            -
            ocr_stringdist/__init__.py,sha256=37hKeJm1qxv_DptaciEPby-7h2yojwr0djherLI88Hk,484
         | 
| 5 | 
            -
            ocr_stringdist/_rust_stringdist.cp310-win_amd64.pyd,sha256=HYD6rledYevCdwxcbaOusEOM7G0onm7ijS3Jjbx7N6Q,413696
         | 
| 6 | 
            -
            ocr_stringdist/default_ocr_distances.py,sha256=vlhzQCCcE-D1xor5RvMW0oaMuL_HP_5Y7SO4ESkdb4w,1075
         | 
| 7 | 
            -
            ocr_stringdist/edit_operation.py,sha256=8yzz4BUBhqowMwUVWYpkXGc_0GwwGahCu2e55qeLAv4,411
         | 
| 8 | 
            -
            ocr_stringdist/learner.py,sha256=WNIrwaRpPUqKnq6Yd1rZ_rdxq9LmnezLEK1bEcsRqrU,10449
         | 
| 9 | 
            -
            ocr_stringdist/levenshtein.py,sha256=WZF8Sw6xQ6ZCMGW29pUN3jDwtW2bfVG9OYi7dRGIQhs,13641
         | 
| 10 | 
            -
            ocr_stringdist/matching.py,sha256=hM-_M0jpzaC84ekjkWw8qSZUljIfEY86kT8tWL7bq0s,3353
         | 
| 11 | 
            -
            ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         | 
| 12 | 
            -
            ocr_stringdist-0.3.0.dist-info/RECORD,,
         | 
| 
            File without changes
         | 
| 
            File without changes
         |