ocr-stringdist 0.2.2__cp312-cp312-musllinux_1_1_aarch64.whl → 1.0.0__cp312-cp312-musllinux_1_1_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,11 @@
1
1
  from .default_ocr_distances import ocr_distance_map
2
- from .levenshtein import (
3
- WeightedLevenshtein,
4
- batch_weighted_levenshtein_distance,
5
- explain_weighted_levenshtein,
6
- weighted_levenshtein_distance,
7
- )
2
+ from .learner import CostLearner
3
+ from .levenshtein import WeightedLevenshtein
8
4
  from .matching import find_best_candidate
9
5
 
10
6
  __all__ = [
11
7
  "ocr_distance_map",
8
+ "CostLearner",
12
9
  "WeightedLevenshtein",
13
- "weighted_levenshtein_distance",
14
- "batch_weighted_levenshtein_distance",
15
- "explain_weighted_levenshtein",
16
10
  "find_best_candidate",
17
11
  ]
@@ -0,0 +1,16 @@
1
+ from dataclasses import dataclass
2
+ from typing import Literal, Optional
3
+
4
# Closed set of operation labels an alignment step can carry.
OperationType = Literal["substitute", "insert", "delete", "match"]


@dataclass(frozen=True)
class EditOperation:
    """
    One step of an alignment between two strings.

    A step is a substitution, an insertion, a deletion or a match; which one
    is recorded in ``op_type``. Instances are immutable (frozen dataclass),
    so they compare by value and are hashable.
    """

    # Kind of step; one of the literals listed in ``OperationType``.
    op_type: OperationType
    # Token on the source side of the step; may be None.
    source_token: Optional[str]
    # Token on the target side of the step; may be None.
    target_token: Optional[str]
    # Cost attributed to this single step.
    cost: float
@@ -0,0 +1,254 @@
1
+ import itertools
2
+ import math
3
+ from collections import defaultdict
4
+ from collections.abc import Iterable
5
+ from dataclasses import dataclass, field
6
+ from typing import TYPE_CHECKING, Callable, Optional
7
+
8
+ if TYPE_CHECKING:
9
+ from .edit_operation import EditOperation
10
+ from .levenshtein import WeightedLevenshtein
11
+ from .protocols import Aligner
12
+
13
+ CostFunction = Callable[[float], float]
14
+
15
+
16
def negative_log_likelihood(probability: float) -> float:
    """
    Return the surprisal ``-log(probability)`` using the natural logarithm.

    :param probability: The probability to convert; must be strictly positive.
    :return: ``-math.log(probability)``.
    :raises ValueError: If ``probability`` is zero or negative.
    """
    is_non_positive = probability <= 0.0
    if is_non_positive:
        message = "Probability must be positive to compute negative log likelihood."
        raise ValueError(message)
    log_probability = math.log(probability)
    return -log_probability
20
+
21
+
22
def _zero_counter() -> defaultdict:
    """Build an empty counter whose missing keys read as 0."""
    return defaultdict(int)


@dataclass
class TallyCounts:
    """
    Frequency tallies gathered from a stream of edit operations.

    Every counter is a ``defaultdict(int)``, so reading a key with ``[]`` that
    was never counted yields 0 and inserts that key.
    """

    # (source token, target token) -> number of substitutions.
    substitutions: defaultdict[tuple[str, str], int] = field(default_factory=_zero_counter)
    # target token -> number of insertions.
    insertions: defaultdict[str, int] = field(default_factory=_zero_counter)
    # source token -> number of deletions.
    deletions: defaultdict[str, int] = field(default_factory=_zero_counter)
    # source token -> number of occurrences on the source side.
    source_chars: defaultdict[str, int] = field(default_factory=_zero_counter)
    # target token -> number of occurrences on the target side.
    target_chars: defaultdict[str, int] = field(default_factory=_zero_counter)
    # Every distinct token seen on either side.
    vocab: set[str] = field(default_factory=set)
32
+
33
+
34
@dataclass
class _Costs:
    """Bundle of the three learned cost tables produced by the cost calculation."""

    # (source token, target token) -> substitution cost.
    substitutions: dict[tuple[str, str], float]
    # target token -> insertion cost.
    insertions: dict[str, float]
    # source token -> deletion cost.
    deletions: dict[str, float]
39
+
40
+
41
class CostLearner:
    """
    Configures and executes the process of learning Levenshtein costs from data.

    This class uses a builder pattern, allowing chaining configuration methods
    before running the final calculation with .fit().

    Example::

        from ocr_stringdist import CostLearner

        data = [
            ("Hell0", "Hello"),
        ]
        learner = CostLearner().with_smoothing(1.0)
        wl = learner.fit(data)  # Substitution 0 -> o learned with cost < 1.0
    """

    # Configuration parameters
    _smoothing_k: float

    # These attributes are set during fitting
    counts: Optional[TallyCounts] = None
    vocab_size: Optional[int] = None

    def __init__(self) -> None:
        self._smoothing_k = 1.0

    def with_smoothing(self, k: float) -> "CostLearner":
        r"""
        Sets the smoothing parameter `k`.

        This parameter controls how strongly the model defaults to a uniform
        probability distribution by adding a "pseudo-count" of `k` to every
        possible event.

        :param k: The smoothing factor, which must be a non-negative number.
        :return: The CostLearner instance for method chaining.
        :raises ValueError: If k < 0.

        Notes
        -----
        This parameter allows for a continuous transition between two modes:

        - **k > 0 (recommended):** This enables additive smoothing, with `k = 1.0`
          being Laplace smoothing. It regularizes the model by assuming no event is impossible.
          The final costs are a measure of "relative surprisal," normalized by the vocabulary size.

        - **k = 0:** This corresponds to a normalized Maximum Likelihood Estimation.
          Probabilities are derived from the raw observed frequencies. The final costs are
          normalized using the same logic as the `k > 0` case, making `k=0` the continuous limit
          of the smoothed model. In this mode, costs can only be calculated for events observed in
          the training data. Unseen events will receive the default cost, regardless of
          the value of `calculate_for_unseen` in :meth:`fit`.
        """
        if k < 0:
            raise ValueError("Smoothing parameter k must be non-negative.")
        self._smoothing_k = k
        return self

    def _tally_operations(self, operations: Iterable["EditOperation"]) -> TallyCounts:
        """
        Tally all edit operations.

        :param operations: Edit operations, including 'match' operations.
        :return: Counts per operation type, per-token source/target totals and the vocabulary.
        :raises ValueError: If an operation lacks a token that its type requires.
        """
        counts = TallyCounts()
        for op in operations:
            # Vocabulary and target totals are recorded for every operation type.
            if op.source_token is not None:
                counts.vocab.add(op.source_token)
            if op.target_token is not None:
                counts.target_chars[op.target_token] += 1
                counts.vocab.add(op.target_token)

            # Source totals are only recorded for types that consume a source token,
            # i.e. everything except 'insert'.
            if op.op_type == "substitute":
                if op.source_token is None or op.target_token is None:
                    raise ValueError("Tokens cannot be None for 'substitute'")
                counts.substitutions[(op.source_token, op.target_token)] += 1
                counts.source_chars[op.source_token] += 1
            elif op.op_type == "delete":
                if op.source_token is None:
                    raise ValueError("Source token cannot be None for 'delete'")
                counts.deletions[op.source_token] += 1
                counts.source_chars[op.source_token] += 1
            elif op.op_type == "insert":
                if op.target_token is None:
                    raise ValueError("Target token cannot be None for 'insert'")
                counts.insertions[op.target_token] += 1
            elif op.op_type == "match":
                if op.source_token is None:
                    raise ValueError("Source token cannot be None for 'match'")
                counts.source_chars[op.source_token] += 1
        return counts

    def _calculate_costs(
        self, counts: TallyCounts, vocab: set[str], calculate_for_unseen: bool = False
    ) -> _Costs:
        """
        Calculates the costs for edit operations based on tallied counts.

        The tallies are only read: lookups use ``.get(key, 0)`` so that the
        ``defaultdict`` counters are not populated with zero entries for
        events that were never observed.

        :param counts: Tallies produced by :meth:`_tally_operations`.
        :param vocab: The set of tokens to consider.
        :param calculate_for_unseen: If True (and k > 0), costs are computed for every
                                     combination of vocabulary tokens instead of only
                                     the observed events.
        :return: The substitution, insertion and deletion cost tables.
        """
        k = self._smoothing_k

        # Without smoothing, unseen events have probability 0 and no finite cost.
        if k == 0:
            calculate_for_unseen = False

        # Error space size V for all conditional probabilities.
        # The space of possible outcomes for a given source character (from OCR)
        # includes all vocab characters (for matches/substitutions) plus the empty
        # character (for deletions). This gives V = len(vocab) + 1.
        # Symmetrically, the space of outcomes for a given target character (from GT)
        # includes all vocab characters plus the empty character (for insertions/misses).
        V = len(vocab) + 1

        # Normalization ceiling Z' = -log(1/V).
        normalization_ceiling = math.log(V) if V > 1 else 1.0

        def relative_cost(count: int, total_count: int) -> float:
            # Additively smoothed conditional probability, expressed as surprisal
            # relative to the surprisal of a uniform distribution over V outcomes.
            prob = (count + k) / (total_count + k * V)
            return negative_log_likelihood(prob) / normalization_ceiling

        # Substitutions: conditioned on the source token.
        sub_iterator = (
            itertools.product(vocab, vocab) if calculate_for_unseen else counts.substitutions.keys()
        )
        sub_costs: dict[tuple[str, str], float] = {
            (source, target): relative_cost(
                counts.substitutions.get((source, target), 0),
                counts.source_chars.get(source, 0),
            )
            for source, target in sub_iterator
        }

        # Deletions: conditioned on the source token.
        del_iterator = vocab if calculate_for_unseen else counts.deletions.keys()
        del_costs: dict[str, float] = {
            source: relative_cost(
                counts.deletions.get(source, 0), counts.source_chars.get(source, 0)
            )
            for source in del_iterator
        }

        # Insertions: conditioned on the target token.
        ins_iterator = vocab if calculate_for_unseen else counts.insertions.keys()
        ins_costs: dict[str, float] = {
            target: relative_cost(
                counts.insertions.get(target, 0), counts.target_chars.get(target, 0)
            )
            for target in ins_iterator
        }

        return _Costs(substitutions=sub_costs, insertions=ins_costs, deletions=del_costs)

    def _calculate_operations(
        self, pairs: Iterable[tuple[str, str]], aligner: "Aligner"
    ) -> list["EditOperation"]:
        """
        Calculate edit operations for all string pairs using the provided aligner.

        'match' operations are kept (``filter_matches=False``) because they are
        needed for the per-token totals.
        """
        all_ops = [
            op
            for ocr_str, truth_str in pairs
            for op in aligner.explain(ocr_str, truth_str, filter_matches=False)
        ]
        return all_ops

    def fit(
        self,
        pairs: Iterable[tuple[str, str]],
        *,
        initial_model: "Aligner | None" = None,
        calculate_for_unseen: bool = False,
    ) -> "WeightedLevenshtein":
        """
        Fits the costs of a WeightedLevenshtein instance to the provided data.

        Note that learning multi-character tokens is only supported if an initial alignment model
        is provided that can handle those multi-character tokens.

        This method analyzes pairs of strings to learn the costs of edit operations
        based on their observed frequencies. The underlying model calculates costs
        based on the principle of relative information cost.

        For a detailed explanation of the methodology, please see the
        :doc:`Cost Learning Model <cost_learning_model>` documentation page.

        :param pairs: An iterable of (ocr_string, ground_truth_string) tuples.
        :param initial_model: Optional initial model used to align OCR outputs and ground truth
                              strings. By default, an unweighted Levenshtein distance is used.
        :param calculate_for_unseen: If True (and k > 0), pre-calculates costs for all
                                     possible edit operations based on the vocabulary.
                                     If False (default), only calculates costs for operations
                                     observed in the data.
        :return: A `WeightedLevenshtein` instance with the learned costs.
        """
        from .levenshtein import WeightedLevenshtein

        # Fast path for empty sized containers. Objects without a length, such as
        # generators, are always truthy; empty ones are caught by the vocabulary
        # check further down.
        if not pairs:
            return WeightedLevenshtein.unweighted()

        if initial_model is None:
            initial_model = WeightedLevenshtein.unweighted()

        all_ops = self._calculate_operations(pairs, aligner=initial_model)
        self.counts = self._tally_operations(all_ops)
        vocab = self.counts.vocab
        self.vocab_size = len(vocab)

        # Nothing was aligned, so there is nothing to learn from.
        if not self.vocab_size:
            return WeightedLevenshtein.unweighted()

        costs = self._calculate_costs(self.counts, vocab, calculate_for_unseen=calculate_for_unseen)

        return WeightedLevenshtein(
            substitution_costs=costs.substitutions,
            insertion_costs=costs.insertions,
            deletion_costs=costs.deletions,
            symmetric_substitution=False,
            default_substitution_cost=1.0,
            default_insertion_cost=1.0,
            default_deletion_cost=1.0,
        )
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
- from dataclasses import dataclass
4
- from typing import Literal, Optional
3
+ from collections.abc import Iterable
4
+ from typing import Any, Optional
5
5
 
6
6
  from ._rust_stringdist import (
7
7
  _batch_weighted_levenshtein_distance,
@@ -9,20 +9,7 @@ from ._rust_stringdist import (
9
9
  _weighted_levenshtein_distance,
10
10
  )
11
11
  from .default_ocr_distances import ocr_distance_map
12
-
13
- OperationType = Literal["substitute", "insert", "delete"]
14
-
15
-
16
- @dataclass(frozen=True)
17
- class EditOperation:
18
- """
19
- Represents a single edit operation (substitution, insertion, or deletion).
20
- """
21
-
22
- op_type: OperationType
23
- source_token: Optional[str]
24
- target_token: Optional[str]
25
- cost: float
12
+ from .edit_operation import EditOperation
26
13
 
27
14
 
28
15
  class WeightedLevenshtein:
@@ -33,14 +20,17 @@ class WeightedLevenshtein:
33
20
  how the distance is measured. Once created, its methods can be used to
34
21
  efficiently compute distances and explain the edit operations.
35
22
 
36
- :param substitution_costs: Maps (char, char) tuples to their substitution cost.
23
+ :param substitution_costs: Maps (str, str) tuples to their substitution cost.
37
24
  Defaults to costs based on common OCR errors.
38
- :param insertion_costs: Maps a character to its insertion cost.
39
- :param deletion_costs: Maps a character to its deletion cost.
40
- :param symmetric_substitution: If True, substitution costs are bidirectional.
41
- :param default_substitution_cost: Default cost for substitutions not in the map.
42
- :param default_insertion_cost: Default cost for insertions not in the map.
43
- :param default_deletion_cost: Default cost for deletions not in the map.
25
+ :param insertion_costs: Maps a string to its insertion cost.
26
+ :param deletion_costs: Maps a string to its deletion cost.
27
+ :param symmetric_substitution: If True, a cost defined for, e.g., ('0', 'O') will automatically
28
+ apply to ('O', '0'). If False, both must be defined explicitly.
29
+ :param default_substitution_cost: Default cost for single-char substitutions not in the map.
30
+ :param default_insertion_cost: Default cost for single-char insertions not in the map.
31
+ :param default_deletion_cost: Default cost for single-char deletions not in the map.
32
+
33
+ :raises TypeError, ValueError: If the provided arguments are invalid.
44
34
  """
45
35
 
46
36
  substitution_costs: dict[tuple[str, str], float]
@@ -62,9 +52,37 @@ class WeightedLevenshtein:
62
52
  default_insertion_cost: float = 1.0,
63
53
  default_deletion_cost: float = 1.0,
64
54
  ) -> None:
65
- self.substitution_costs = (
66
- ocr_distance_map if substitution_costs is None else substitution_costs
67
- )
55
+ # Validate default costs
56
+ for cost_name, cost_val in [
57
+ ("default_substitution_cost", default_substitution_cost),
58
+ ("default_insertion_cost", default_insertion_cost),
59
+ ("default_deletion_cost", default_deletion_cost),
60
+ ]:
61
+ if not isinstance(cost_val, (int, float)):
62
+ raise TypeError(f"{cost_name} must be a number, but got: {type(cost_val).__name__}")
63
+ if cost_val < 0:
64
+ raise ValueError(f"{cost_name} must be non-negative, got value: {cost_val}")
65
+
66
+ # Validate substitution_costs dictionary
67
+ sub_costs = ocr_distance_map if substitution_costs is None else substitution_costs
68
+ for key, cost in sub_costs.items():
69
+ if not (
70
+ isinstance(key, tuple)
71
+ and len(key) == 2
72
+ and isinstance(key[0], str)
73
+ and isinstance(key[1], str)
74
+ ):
75
+ raise TypeError(
76
+ f"substitution_costs keys must be tuples of two strings, but found: {key}"
77
+ )
78
+ if not isinstance(cost, (int, float)):
79
+ raise TypeError(
80
+ f"Cost for substitution {key} must be a number, but got: {type(cost).__name__}"
81
+ )
82
+ if cost < 0:
83
+ raise ValueError(f"Cost for substitution {key} cannot be negative, but got: {cost}")
84
+
85
+ self.substitution_costs = sub_costs
68
86
  self.insertion_costs = {} if insertion_costs is None else insertion_costs
69
87
  self.deletion_costs = {} if deletion_costs is None else deletion_costs
70
88
  self.symmetric_substitution = symmetric_substitution
@@ -81,162 +99,117 @@ class WeightedLevenshtein:
81
99
  """Calculates the weighted Levenshtein distance between two strings."""
82
100
  return _weighted_levenshtein_distance(s1, s2, **self.__dict__) # type: ignore[no-any-return]
83
101
 
84
- def explain(self, s1: str, s2: str) -> list[EditOperation]:
85
- """Returns the list of edit operations to transform s1 into s2."""
102
def explain(self, s1: str, s2: str, filter_matches: bool = True) -> list[EditOperation]:
    """
    Return the edit operations that transform ``s1`` into ``s2``.

    :param s1: First string (interpreted as the string read via OCR)
    :param s2: Second string (interpreted as the target string)
    :param filter_matches: If True, 'match' operations are excluded from the result.
    :return: List of :class:`EditOperation` instances.
    """
    # The instance attributes are forwarded as keyword arguments to the native routine.
    raw_path = _explain_weighted_levenshtein_distance(s1, s2, **self.__dict__)
    operations = [EditOperation(*raw_op) for raw_op in raw_path]
    if not filter_matches:
        return operations
    return [operation for operation in operations if operation.op_type != "match"]
88
116
 
89
117
def batch_distance(self, s: str, candidates: list[str]) -> list[float]:
    """
    Calculate the distances between one string and each candidate.

    :param s: The string to compare.
    :param candidates: The strings to compare ``s`` against.
    :return: The list returned by the native batch routine for ``candidates``.
    """
    # The instance attributes are forwarded as keyword arguments to the native routine.
    distances: list[float] = _batch_weighted_levenshtein_distance(s, candidates, **self.__dict__)
    return distances
92
120
 
121
@classmethod
def learn_from(cls, pairs: Iterable[tuple[str, str]]) -> WeightedLevenshtein:
    """
    Build an instance whose costs are learned from (OCR, ground truth) string pairs.

    This is a shortcut for fitting a default-configured
    :class:`ocr_stringdist.learner.CostLearner`; use that class directly for
    more advanced learning configuration.

    :param pairs: An iterable of (ocr_string, ground_truth_string) tuples. Correct pairs
                  are not intended to be filtered; they are needed to learn well-aligned costs.
    :return: A new `WeightedLevenshtein` instance with the learned costs.

    Example::

        from ocr_stringdist import WeightedLevenshtein

        training_data = [
            ("8N234", "BN234"), # read '8' instead of 'B'
            ("BJK18", "BJK18"), # correct
            ("ABC0.", "ABC0"), # extra '.'
        ]
        wl = WeightedLevenshtein.learn_from(training_data)
        print(wl.substitution_costs) # learned cost for substituting '8' with 'B'
        print(wl.deletion_costs) # learned cost for deleting '.'
    """
    # Imported at call time rather than at module level
    # (presumably to avoid a circular import with the learner module; confirm).
    from .learner import CostLearner

    default_learner = CostLearner()
    return default_learner.fit(pairs)
149
+
150
+ def __eq__(self, other: object) -> bool:
151
+ if not isinstance(other, WeightedLevenshtein):
152
+ return NotImplemented
153
+ return (
154
+ self.substitution_costs == other.substitution_costs
155
+ and self.insertion_costs == other.insertion_costs
156
+ and self.deletion_costs == other.deletion_costs
157
+ and self.symmetric_substitution == other.symmetric_substitution
158
+ and self.default_substitution_cost == other.default_substitution_cost
159
+ and self.default_insertion_cost == other.default_insertion_cost
160
+ and self.default_deletion_cost == other.default_deletion_cost
161
+ )
93
162
 
94
- def weighted_levenshtein_distance(
95
- s1: str,
96
- s2: str,
97
- /,
98
- substitution_costs: Optional[dict[tuple[str, str], float]] = None,
99
- insertion_costs: Optional[dict[str, float]] = None,
100
- deletion_costs: Optional[dict[str, float]] = None,
101
- *,
102
- symmetric_substitution: bool = True,
103
- default_substitution_cost: float = 1.0,
104
- default_insertion_cost: float = 1.0,
105
- default_deletion_cost: float = 1.0,
106
- ) -> float:
107
- """
108
- Levenshtein distance with custom substitution, insertion and deletion costs.
109
-
110
- See also :meth:`WeightedLevenshtein.distance`.
111
-
112
- The default `substitution_costs` considers common OCR errors, see
113
- :py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
114
-
115
- :param s1: First string (interpreted as the string read via OCR)
116
- :param s2: Second string
117
- :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
118
- substitution costs. Only one direction needs to be configured unless
119
- `symmetric_substitution` is False.
120
- Note that the runtime scales in the length of the longest substitution token.
121
- Defaults to `ocr_stringdist.ocr_distance_map`.
122
- :param insertion_costs: Dictionary mapping strings to their insertion costs.
123
- :param deletion_costs: Dictionary mapping strings to their deletion costs.
124
- :param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
125
- symmetric? Defaults to True.
126
- :param default_substitution_cost: The default substitution cost for character pairs not found
127
- in `substitution_costs`.
128
- :param default_insertion_cost: The default insertion cost for characters not found in
129
- `insertion_costs`.
130
- :param default_deletion_cost: The default deletion cost for characters not found in
131
- `deletion_costs`.
132
- """
133
- return WeightedLevenshtein(
134
- substitution_costs=substitution_costs,
135
- insertion_costs=insertion_costs,
136
- deletion_costs=deletion_costs,
137
- symmetric_substitution=symmetric_substitution,
138
- default_substitution_cost=default_substitution_cost,
139
- default_insertion_cost=default_insertion_cost,
140
- default_deletion_cost=default_deletion_cost,
141
- ).distance(s1, s2)
142
-
143
-
144
- def batch_weighted_levenshtein_distance(
145
- s: str,
146
- candidates: list[str],
147
- /,
148
- substitution_costs: Optional[dict[tuple[str, str], float]] = None,
149
- insertion_costs: Optional[dict[str, float]] = None,
150
- deletion_costs: Optional[dict[str, float]] = None,
151
- *,
152
- symmetric_substitution: bool = True,
153
- default_substitution_cost: float = 1.0,
154
- default_insertion_cost: float = 1.0,
155
- default_deletion_cost: float = 1.0,
156
- ) -> list[float]:
157
- """
158
- Calculate weighted Levenshtein distances between a string and multiple candidates.
159
-
160
- See also :meth:`WeightedLevenshtein.batch_distance`.
161
-
162
- This is more efficient than calling :func:`weighted_levenshtein_distance` multiple times.
163
-
164
- :param s: The string to compare (interpreted as the string read via OCR)
165
- :param candidates: List of candidate strings to compare against
166
- :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
167
- substitution costs. Only one direction needs to be configured unless
168
- `symmetric_substitution` is False.
169
- Note that the runtime scales in the length of the longest substitution token.
170
- Defaults to `ocr_stringdist.ocr_distance_map`.
171
- :param insertion_costs: Dictionary mapping strings to their insertion costs.
172
- :param deletion_costs: Dictionary mapping strings to their deletion costs.
173
- :param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
174
- symmetric? Defaults to True.
175
- :param default_substitution_cost: The default substitution cost for character pairs not found
176
- in `substitution_costs`.
177
- :param default_insertion_cost: The default insertion cost for characters not found in
178
- `insertion_costs`.
179
- :param default_deletion_cost: The default deletion cost for characters not found in
180
- `deletion_costs`.
181
- :return: A list of distances corresponding to each candidate
182
- """
183
- return WeightedLevenshtein(
184
- substitution_costs=substitution_costs,
185
- insertion_costs=insertion_costs,
186
- deletion_costs=deletion_costs,
187
- symmetric_substitution=symmetric_substitution,
188
- default_substitution_cost=default_substitution_cost,
189
- default_insertion_cost=default_insertion_cost,
190
- default_deletion_cost=default_deletion_cost,
191
- ).batch_distance(s, candidates)
192
-
193
-
194
- def explain_weighted_levenshtein(
195
- s1: str,
196
- s2: str,
197
- /,
198
- substitution_costs: Optional[dict[tuple[str, str], float]] = None,
199
- insertion_costs: Optional[dict[str, float]] = None,
200
- deletion_costs: Optional[dict[str, float]] = None,
201
- *,
202
- symmetric_substitution: bool = True,
203
- default_substitution_cost: float = 1.0,
204
- default_insertion_cost: float = 1.0,
205
- default_deletion_cost: float = 1.0,
206
- ) -> list[EditOperation]:
207
- """
208
- Computes the path of operations associated with the custom Levenshtein distance.
209
-
210
- See also :meth:`WeightedLevenshtein.explain`.
211
-
212
- The default `substitution_costs` considers common OCR errors, see
213
- :py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
214
-
215
- :param s1: First string (interpreted as the string read via OCR)
216
- :param s2: Second string
217
- :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
218
- substitution costs. Only one direction needs to be configured unless
219
- `symmetric_substitution` is False.
220
- Note that the runtime scales in the length of the longest substitution token.
221
- Defaults to `ocr_stringdist.ocr_distance_map`.
222
- :param insertion_costs: Dictionary mapping strings to their insertion costs.
223
- :param deletion_costs: Dictionary mapping strings to their deletion costs.
224
- :param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
225
- symmetric? Defaults to True.
226
- :param default_substitution_cost: The default substitution cost for character pairs not found
227
- in `substitution_costs`.
228
- :param default_insertion_cost: The default insertion cost for characters not found in
229
- `insertion_costs`.
230
- :param default_deletion_cost: The default deletion cost for characters not found in
231
- `deletion_costs`.
232
- :return: List of :class:`EditOperation` instances.
233
- """
234
- return WeightedLevenshtein(
235
- substitution_costs=substitution_costs,
236
- insertion_costs=insertion_costs,
237
- deletion_costs=deletion_costs,
238
- symmetric_substitution=symmetric_substitution,
239
- default_substitution_cost=default_substitution_cost,
240
- default_insertion_cost=default_insertion_cost,
241
- default_deletion_cost=default_deletion_cost,
242
- ).explain(s1, s2)
163
+ def to_dict(self) -> dict[str, Any]:
164
+ """
165
+ Serializes the instance's configuration to a dictionary.
166
+
167
+ The result can be written to, say, JSON.
168
+
169
+ For the counterpart, see :meth:`WeightedLevenshtein.from_dict`.
170
+ """
171
+ # Convert tuple keys to a list of lists/objects for broader compatibility (e.g., JSON)
172
+ sub_costs_serializable = [
173
+ {"from": k[0], "to": k[1], "cost": v} for k, v in self.substitution_costs.items()
174
+ ]
175
+
176
+ return {
177
+ "substitution_costs": sub_costs_serializable,
178
+ "insertion_costs": self.insertion_costs,
179
+ "deletion_costs": self.deletion_costs,
180
+ "symmetric_substitution": self.symmetric_substitution,
181
+ "default_substitution_cost": self.default_substitution_cost,
182
+ "default_insertion_cost": self.default_insertion_cost,
183
+ "default_deletion_cost": self.default_deletion_cost,
184
+ }
185
+
186
+ @classmethod
187
+ def from_dict(cls, data: dict[str, Any]) -> WeightedLevenshtein:
188
+ """
189
+ Deserialize from a dictionary.
190
+
191
+ For the counterpart, see :meth:`WeightedLevenshtein.to_dict`.
192
+
193
+ :param data: A dictionary with (not necessarily all of) the following keys:
194
+ - "substitution_costs": {"from": str, "to": str, "cost": float}
195
+ - "substitution_costs": dict[str, float]
196
+ - "deletion_costs": dict[str, float]
197
+ - "symmetric_substitution": bool
198
+ - "default_substitution_cost": float
199
+ - "default_insertion_cost": float
200
+ - "default_deletion_cost": float
201
+ """
202
+ # Convert the list of substitution costs back to the required dict format
203
+ sub_costs: dict[tuple[str, str], float] = {
204
+ (item["from"], item["to"]): item["cost"] for item in data.get("substitution_costs", {})
205
+ }
206
+
207
+ return cls(
208
+ substitution_costs=sub_costs,
209
+ insertion_costs=data.get("substitution_costs"),
210
+ deletion_costs=data.get("deletion_costs"),
211
+ symmetric_substitution=data.get("symmetric_substitution", True),
212
+ default_substitution_cost=data.get("default_substitution_cost", 1.0),
213
+ default_insertion_cost=data.get("default_insertion_cost", 1.0),
214
+ default_deletion_cost=data.get("default_deletion_cost", 1.0),
215
+ )
@@ -39,13 +39,13 @@ def find_best_candidate(
39
39
  calculated distance/score.
40
40
  :rtype: tuple[str, float]
41
41
 
42
- :Example:
42
+ Example::
43
43
 
44
- >>> from ocr_stringdist import weighted_levenshtein_distance as distance
45
- >>> s = "apple"
46
- >>> candidates = ["apply", "apples", "orange", "appIe"]
47
- >>> find_best_match(s, candidates, lambda s1, s2: distance(s1, s2, {("l", "I"): 0.1}))
48
- ('appIe', 0.1)
44
+ from ocr_stringdist import find_best_candidate, WeightedLevenshtein
45
+
46
+ wl = WeightedLevenshtein({("l", "I"): 0.1})
47
+ find_best_candidate("apple", ["apply", "apples", "orange", "appIe"], wl.distance)
48
+ # ('appIe', 0.1)
49
49
  """
50
50
  if not candidates:
51
51
  raise ValueError("The 'candidates' iterable cannot be empty.")
@@ -0,0 +1,9 @@
1
+ from typing import TYPE_CHECKING, Protocol, runtime_checkable
2
+
3
+ if TYPE_CHECKING:
4
+ from .edit_operation import EditOperation
5
+
6
+
7
@runtime_checkable
class Aligner(Protocol):
    """
    Structural interface for objects that can align two strings.

    Any object with a compatible ``explain`` method satisfies the protocol;
    because it is runtime-checkable, ``isinstance`` checks for the presence of
    that method.
    """

    def explain(self, s1: str, s2: str, filter_matches: bool) -> list["EditOperation"]:
        """Return the edit operations aligning ``s1`` with ``s2``."""
        ...
@@ -0,0 +1,94 @@
1
+ Metadata-Version: 2.4
2
+ Name: ocr-stringdist
3
+ Version: 1.0.0
4
+ Classifier: Programming Language :: Rust
5
+ Classifier: Programming Language :: Python :: Implementation :: CPython
6
+ Classifier: License :: OSI Approved :: MIT License
7
+ Classifier: Operating System :: OS Independent
8
+ License-File: LICENSE
9
+ Requires-Python: >=3.9
10
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
11
+ Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
12
+ Project-URL: documentation, https://niklasvonm.github.io/ocr-stringdist/
13
+
14
+ # OCR-StringDist
15
+
16
+ A Python library to learn, model, explain and correct OCR errors using a fast string distance engine.
17
+
18
+ Documentation: https://niklasvonm.github.io/ocr-stringdist/
19
+
20
+ [![PyPI badge](https://badge.fury.io/py/ocr-stringdist.svg)](https://badge.fury.io/py/ocr-stringdist)
21
+ [![License](https://img.shields.io/badge/License-MIT-green)](LICENSE)
22
+
23
+ ## Overview
24
+
25
+ Standard string distances (like Levenshtein) treat all character substitutions equally. This is suboptimal for text read from images via OCR, where errors like `O` vs `0` are far more common than, say, `O` vs `X`.
26
+
27
+ OCR-StringDist provides a learnable **weighted Levenshtein distance**, implementing part of the **Noisy Channel model**.
28
+
29
+ **Example:** Matching against the correct word `CODE`:
30
+
31
+ * **Standard Levenshtein:**
32
+ * $d(\text{"CODE"}, \text{"C0DE"}) = 1$ (O → 0)
33
+ * $d(\text{"CODE"}, \text{"CXDE"}) = 1$ (O → X)
34
+ * Result: Both appear equally likely/distant.
35
+
36
+ * **OCR-StringDist (Channel Model):**
37
+ * $d(\text{"CODE"}, \text{"C0DE"}) \approx 0.1$ (common error, low cost)
38
+ * $d(\text{"CODE"}, \text{"CXDE"}) = 1.0$ (unlikely error, high cost)
39
+ * Result: Correctly identifies `C0DE` as a much closer match.
40
+
41
+ This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes). By combining this *channel model* with a *source model* (e.g., product code frequencies), you can build a complete and robust OCR correction system.
42
+
43
+ ## Installation
44
+
45
+ ```bash
46
+ pip install ocr-stringdist
47
+ ```
48
+
49
+ ## Features
50
+
51
+ - **Learnable Costs**: Automatically learn substitution, insertion, and deletion costs from a dataset of (OCR string, ground truth string) pairs.
52
+ - **Weighted Levenshtein Distance**: Models OCR error patterns by assigning custom costs to specific edit operations.
53
+ - **High Performance**: The core logic is implemented in Rust, and a `batch_distance` method efficiently compares one string against thousands of candidates.
54
+ - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
55
+ - **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
56
+ - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
57
+ - **Full Unicode Support**: Works with arbitrary Unicode strings.
58
+
59
+ ## Core Workflow
60
+
61
+ The typical workflow involves
62
+ - learning costs from your data and then
63
+ - using the resulting model to find the best match from a list of candidates.
64
+
65
+ ```python
66
+ from ocr_stringdist import WeightedLevenshtein
67
+
68
+ # 1. LEARN costs from your own data
69
+ training_data = [
70
+ ("128", "123"),
71
+ ("567", "567"),
72
+ ]
73
+ wl = WeightedLevenshtein.learn_from(training_data)
74
+
75
+ # The engine has now learned that '8' -> '3' is a low-cost substitution
76
+ print(f"Learned cost for ('8', '3'): {wl.substitution_costs[('8', '3')]:.2f}")
77
+
78
+
79
+ # 2. MATCH new OCR output against a list of candidates
80
+ ocr_output = "Product Code 128"
81
+ candidates = [
82
+ "Product Code 123",
83
+ "Product Code 523", # '1' -> '5' is an unlikely error
84
+ ]
85
+
86
+ distances = wl.batch_distance(ocr_output, candidates)
87
+
88
+ # Find the best match
89
+ min_distance = min(distances)
90
+ best_match = candidates[distances.index(min_distance)]
91
+
92
+ print(f"Best match for '{ocr_output}': '{best_match}' (Cost: {min_distance:.2f})")
93
+ ```
94
+
@@ -0,0 +1,14 @@
1
+ ocr_stringdist-1.0.0.dist-info/METADATA,sha256=sFZnhhX8kHoYFbMua4zHCq2tELQPXQw3vWGNRoStR-4,3963
2
+ ocr_stringdist-1.0.0.dist-info/WHEEL,sha256=2uhN7WPHLbqdXxY46NnX0Cg94h0mpGw_AYJ-hfDzYxc,108
3
+ ocr_stringdist-1.0.0.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
4
+ ocr_stringdist.libs/libgcc_s-39080030.so.1,sha256=fIO6GHOh8Ft9CR0Geu7wSUb9Xnl122iTtrxQQ9TAkTQ,789673
5
+ ocr_stringdist/__init__.py,sha256=mL-19TkQQElK5B6iVFCV7vjKVal-6JcsBOFKwiCPQnA,284
6
+ ocr_stringdist/_rust_stringdist.cpython-312-aarch64-linux-musl.so,sha256=9cln8ZSTbHnCzIB6vGDZpuCxqEYx0Dy_FUBiit8T5Uw,920209
7
+ ocr_stringdist/default_ocr_distances.py,sha256=oSu-TpHjPA4jxKpLAfmap8z0ZsC99jsOjnRVHW7Hj_Y,1033
8
+ ocr_stringdist/edit_operation.py,sha256=EgEc-2_nOwLUZDOWtogYqKLXIQJxOd9sIAbcGkn-TMY,395
9
+ ocr_stringdist/learner.py,sha256=3qWvqHrAWm4seuwmBmFN4InRL20u8HnPATHjCTnU3I0,10491
10
+ ocr_stringdist/levenshtein.py,sha256=t05FicwL5WTTsRSzDa92v79D2LpDiEUOYG_6te8oT28,9854
11
+ ocr_stringdist/matching.py,sha256=28Xt-x_V_iVsohD3F64MfZ0mys4_qOZXTIAcmSOE0dA,3270
12
+ ocr_stringdist/protocols.py,sha256=IyvGzzktPgmPRZyDRE0UKCYo4C0tdewU8IgwFbxZLls,265
13
+ ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ ocr_stringdist-1.0.0.dist-info/RECORD,,
@@ -1,102 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: ocr-stringdist
3
- Version: 0.2.2
4
- Classifier: Programming Language :: Rust
5
- Classifier: Programming Language :: Python
6
- Classifier: Operating System :: OS Independent
7
- License-File: LICENSE
8
- Requires-Python: >=3.9
9
- Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
10
- Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
11
- Project-URL: documentation, https://niklasvonm.github.io/ocr-stringdist/
12
-
13
- # OCR-StringDist
14
-
15
- A Python library for fast string distance calculations that account for common OCR (optical character recognition) errors.
16
-
17
- Documentation: https://niklasvonm.github.io/ocr-stringdist/
18
-
19
- [![PyPI](https://img.shields.io/badge/PyPI-Package-blue)](https://pypi.org/project/ocr-stringdist/)
20
- [![License](https://img.shields.io/badge/License-MIT-green)](LICENSE)
21
-
22
- ## Overview
23
-
24
- Standard string distances (like Levenshtein) treat all character substitutions equally. This is suboptimal for text read from images via OCR, where errors like `O` vs `0` are far more common than, say, `O` vs `X`.
25
-
26
- OCR-StringDist uses a **weighted Levenshtein distance**, assigning lower costs to common OCR errors.
27
-
28
- **Example:** Matching against the correct word `CODE`:
29
-
30
- * **Standard Levenshtein:**
31
- * $d(\text{"CODE"}, \text{"C0DE"}) = 1$ (O → 0)
32
- * $d(\text{"CODE"}, \text{"CXDE"}) = 1$ (O → X)
33
- * Result: Both appear equally likely/distant.
34
-
35
- * **OCR-StringDist (Weighted):**
36
- * $d(\text{"CODE"}, \text{"C0DE"}) \approx 0.1$ (common error, low cost)
37
- * $d(\text{"CODE"}, \text{"CXDE"}) = 1.0$ (unlikely error, high cost)
38
- * Result: Correctly identifies `C0DE` as a much closer match.
39
-
40
- This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes, database entries).
41
-
42
- ## Installation
43
-
44
- ```bash
45
- pip install ocr-stringdist
46
- ```
47
-
48
- ## Features
49
-
50
- - **High Performance**: The core logic is implemented in Rust with speed in mind.
51
- - **Weighted Levenshtein Distance**: Calculates Levenshtein distance with customizable costs for substitutions, insertions, and deletions. Includes an efficient batch version (`batch_weighted_levenshtein_distance`) for comparing one string against many candidates.
52
- - **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
53
- - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
54
- - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
55
- - **Unicode Support**: Works with arbitrary Unicode strings.
56
- - **Best Match Finder**: Includes a utility function `find_best_candidate` to efficiently find the best match from a list based on _any_ distance function.
57
-
58
- ## Usage
59
-
60
- ### Basic usage
61
-
62
- ```python
63
- from ocr_stringdist import WeightedLevenshtein
64
-
65
- # Default substitution costs are ocr_stringdist.ocr_distance_map.
66
- wl = WeightedLevenshtein()
67
-
68
- print(wl.distance("CXDE", "CODE")) # == 1
69
- print(wl.distance("C0DE", "CODE")) # < 1
70
- ```
71
-
72
- ### Explain the Edit Path
73
-
74
- ```python
75
- edit_path = wl.explain("C0DE", "CODE")
76
- print(edit_path)
77
- # [EditOperation(op_type='substitute', source_token='0', target_token='O', cost=0.1)]
78
- ```
79
-
80
- ### Fast Batch Calculations
81
-
82
- Quickly compare a string to a list of candidates.
83
-
84
- ```python
85
- distances: list[float] = wl.batch_distance("CODE", ["CXDE", "C0DE"])
86
- # [1.0, 0.1]
87
- ```
88
-
89
- ### Multi-character Substitutions
90
-
91
- ```python
92
- # Custom costs with multi-character substitution
93
- wl = WeightedLevenshtein(substitution_costs={("In", "h"): 0.5})
94
-
95
- print(wl.distance("hi", "Ini")) # 0.5
96
- ```
97
-
98
-
99
- ## Acknowledgements
100
-
101
- This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), which provided the base implementations of the algorithms used here.
102
-
@@ -1,11 +0,0 @@
1
- ocr_stringdist-0.2.2.dist-info/METADATA,sha256=2KjG6DHqpsannN0lPK4EwkYBbY3adZrl1oTCq-elnL8,3868
2
- ocr_stringdist-0.2.2.dist-info/WHEEL,sha256=2uhN7WPHLbqdXxY46NnX0Cg94h0mpGw_AYJ-hfDzYxc,108
3
- ocr_stringdist-0.2.2.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
4
- ocr_stringdist.libs/libgcc_s-39080030.so.1,sha256=fIO6GHOh8Ft9CR0Geu7wSUb9Xnl122iTtrxQQ9TAkTQ,789673
5
- ocr_stringdist/__init__.py,sha256=ApxqraLRcWAkzXhGJXSf3EqGEVFbxghrYrfJ9dmQjQU,467
6
- ocr_stringdist/_rust_stringdist.cpython-312-aarch64-linux-musl.so,sha256=4nCAnjLsi1JIUhyHFAmFPjftQDYZ_vWQ-tsUaCB44Ow,920193
7
- ocr_stringdist/default_ocr_distances.py,sha256=oSu-TpHjPA4jxKpLAfmap8z0ZsC99jsOjnRVHW7Hj_Y,1033
8
- ocr_stringdist/levenshtein.py,sha256=Jypg31BQyULipJ_Yh3dcBQDKNnbvEIlmf28tDr_gySw,11243
9
- ocr_stringdist/matching.py,sha256=rr8R63Ttu2hTf5Mni7_P8aGBbjWs6t2QPV3wxKXspAs,3293
10
- ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- ocr_stringdist-0.2.2.dist-info/RECORD,,