ocr-stringdist 0.3.0__cp313-cp313-musllinux_1_1_i686.whl → 1.0.0__cp313-cp313-musllinux_1_1_i686.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,11 @@
1
1
  from .default_ocr_distances import ocr_distance_map
2
- from .levenshtein import (
3
- WeightedLevenshtein,
4
- batch_weighted_levenshtein_distance,
5
- explain_weighted_levenshtein,
6
- weighted_levenshtein_distance,
7
- )
2
+ from .learner import CostLearner
3
+ from .levenshtein import WeightedLevenshtein
8
4
  from .matching import find_best_candidate
9
5
 
10
6
  __all__ = [
11
7
  "ocr_distance_map",
8
+ "CostLearner",
12
9
  "WeightedLevenshtein",
13
- "weighted_levenshtein_distance",
14
- "batch_weighted_levenshtein_distance",
15
- "explain_weighted_levenshtein",
16
10
  "find_best_candidate",
17
11
  ]
ocr_stringdist/learner.py CHANGED
@@ -1,3 +1,4 @@
1
+ import itertools
1
2
  import math
2
3
  from collections import defaultdict
3
4
  from collections.abc import Iterable
@@ -7,12 +8,12 @@ from typing import TYPE_CHECKING, Callable, Optional
7
8
  if TYPE_CHECKING:
8
9
  from .edit_operation import EditOperation
9
10
  from .levenshtein import WeightedLevenshtein
11
+ from .protocols import Aligner
10
12
 
11
13
  CostFunction = Callable[[float], float]
12
14
 
13
15
 
14
16
  def negative_log_likelihood(probability: float) -> float:
15
- """Standard cost function based on information theory. Common errors get low cost."""
16
17
  if probability <= 0.0:
17
18
  raise ValueError("Probability must be positive to compute negative log likelihood.")
18
19
  return -math.log(probability)
@@ -26,6 +27,7 @@ class TallyCounts:
26
27
  insertions: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
27
28
  deletions: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
28
29
  source_chars: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
30
+ target_chars: defaultdict[str, int] = field(default_factory=lambda: defaultdict(int))
29
31
  vocab: set[str] = field(default_factory=set)
30
32
 
31
33
 
@@ -36,7 +38,7 @@ class _Costs:
36
38
  deletions: dict[str, float]
37
39
 
38
40
 
39
- class Learner:
41
+ class CostLearner:
40
42
  """
41
43
  Configures and executes the process of learning Levenshtein costs from data.
42
44
 
@@ -45,17 +47,16 @@ class Learner:
45
47
 
46
48
  Example::
47
49
 
48
- from ocr_stringdist.learner import Learner
50
+ from ocr_stringdist import CostLearner
49
51
 
50
52
  data = [
51
53
  ("Hell0", "Hello"),
52
54
  ]
53
- learner = Learner().with_smoothing(1.0)
55
+ learner = CostLearner().with_smoothing(1.0)
54
56
  wl = learner.fit(data) # Substitution 0 -> o learned with cost < 1.0
55
57
  """
56
58
 
57
59
  # Configuration parameters
58
- _cost_function: CostFunction
59
60
  _smoothing_k: float
60
61
 
61
62
  # These attributes are set during fitting
@@ -63,71 +64,37 @@ class Learner:
63
64
  vocab_size: Optional[int] = None
64
65
 
65
66
  def __init__(self) -> None:
66
- self._cost_function = negative_log_likelihood
67
67
  self._smoothing_k = 1.0
68
68
 
69
- def with_cost_function(self, cost_function: CostFunction) -> "Learner":
70
- """
71
- Sets a custom function to convert probabilities to costs.
72
-
73
- :param cost_function: A callable that takes a float (probability)
74
- and returns a float (cost).
75
- Is negative log likelihood unless overridden.
76
- :return: The Learner instance for method chaining.
77
- """
78
- self._cost_function = cost_function
79
- return self
80
-
81
- def with_smoothing(self, k: float) -> "Learner":
69
+ def with_smoothing(self, k: float) -> "CostLearner":
82
70
  r"""
83
71
  Sets the smoothing parameter `k`.
84
72
 
85
- This parameter controls how strongly the model defaults to uniform probabilities.
73
+ This parameter controls how strongly the model defaults to a uniform
74
+ probability distribution by adding a "pseudo-count" of `k` to every
75
+ possible event.
86
76
 
87
- :param k: The smoothing factor, which must be a positive number.
88
- :return: The Learner instance for method chaining.
77
+ :param k: The smoothing factor, which must be a non-negative number.
78
+ :return: The CostLearner instance for method chaining.
79
+ :raises ValueError: If k < 0.
89
80
 
90
81
  Notes
91
82
  -----
92
- **Conceptual Framework**
93
-
94
- Additive smoothing works by adding a "pseudo-count" `k` to every possible
95
- event before calculating probabilities. This effectively pretends that
96
- every possible substitution, insertion, and deletion has already been seen
97
- `k` times.
98
-
99
- - **k = 1.0 (Default):** This is standard **Laplace smoothing**. It is a
100
- robust choice for most situations and corresponds to adding one
101
- pseudo-count for every possible event.
102
- - **0 < k < 1.0:** A smaller `k` is suitable for large and representative
103
- datasets, indicating higher confidence in the observed frequencies.
104
- - **k > 1.0:** A larger `k` is useful for small or noisy datasets. It
105
- regularizes the model by pulling the learned probabilities closer
106
- to a uniform distribution.
107
-
108
- **Bayesian Interpretation**
109
-
110
- From a Bayesian perspective, `k` serves as the concentration parameter,
111
- :math:`\alpha`, of a **symmetric Dirichlet prior distribution**.
112
- This distribution acts as the conjugate prior for the
113
- multinomial distribution of the observed error counts.
114
-
115
- The smoothed probability of an event `i` is the posterior expectation of
116
- the Dirichlet-multinomial model:
117
-
118
- .. math::
119
-
120
- P(\text{event}_i) = \frac{c_i + k}{N + k \cdot V}
121
-
122
- Where:
123
- - :math:`c_i` is the observed count of event :math:`i`.
124
- - :math:`N` is the total number of observations in the given context
125
- (e.g., the total count of a specific source character).
126
- - :math:`V` is the vocabulary size (the number of possible unique events).
127
- - :math:`k` is the smoothing parameter, representing the strength of the prior.
83
+ This parameter allows for a continuous transition between two modes:
84
+
85
+ - **k > 0 (recommended):** This enables additive smoothing, with `k = 1.0`
86
+ being Laplace smoothing. It regularizes the model by assuming no event is impossible.
87
+ The final costs are a measure of "relative surprisal," normalized by the vocabulary size.
88
+
89
+ - **k = 0:** This corresponds to a normalized Maximum Likelihood Estimation.
90
+ Probabilities are derived from the raw observed frequencies. The final costs are
91
+ normalized using the same logic as the `k > 0` case, making `k=0` the continuous limit
92
+ of the smoothed model. In this mode, costs can only be calculated for events observed in
93
+ the training data. Unseen events will receive the default cost, regardless of
94
+ the value of `calculate_for_unseen` in :meth:`fit`.
128
95
  """
129
- if k <= 0:
130
- raise ValueError("Smoothing parameter k must be positive.")
96
+ if k < 0:
97
+ raise ValueError("Smoothing parameter k must be non-negative.")
131
98
  self._smoothing_k = k
132
99
  return self
133
100
 
@@ -138,6 +105,7 @@ class Learner:
138
105
  if op.source_token is not None:
139
106
  counts.vocab.add(op.source_token)
140
107
  if op.target_token is not None:
108
+ counts.target_chars[op.target_token] += 1
141
109
  counts.vocab.add(op.target_token)
142
110
 
143
111
  if op.op_type == "substitute":
@@ -160,85 +128,112 @@ class Learner:
160
128
  counts.source_chars[op.source_token] += 1
161
129
  return counts
162
130
 
163
- def _calculate_single_scaled_cost(
164
- self,
165
- observed_count: int,
166
- context_total: int,
167
- vocab_size: int,
168
- ) -> Optional[float]:
169
- """Calculates a single scaled cost for an edit operation."""
170
- denominator = context_total + self._smoothing_k * vocab_size
171
- if denominator <= 0:
172
- return None
173
-
174
- # Calculate the cost of an unseen event in this context, used for scaling
175
- prob_unseen = self._smoothing_k / denominator
176
- scaling_factor = self._cost_function(prob_unseen)
177
- if scaling_factor <= 0:
178
- return None
179
-
180
- # Calculate the cost for the actually observed event
181
- prob_observed = (observed_count + self._smoothing_k) / denominator
182
- cost_observed = self._cost_function(prob_observed)
183
-
184
- return cost_observed / scaling_factor
185
-
186
- def _calculate_costs(self, counts: TallyCounts, vocab_size: int) -> _Costs:
131
+ def _calculate_costs(
132
+ self, counts: TallyCounts, vocab: set[str], calculate_for_unseen: bool = False
133
+ ) -> _Costs:
187
134
  """
188
- Calculates and scales costs for observed operations using a context-dependent
189
- scaling factor to ensure the effective default cost is 1.0.
135
+ Calculates the costs for edit operations based on tallied counts.
190
136
  """
191
-
192
- # Substitutions
193
137
  sub_costs: dict[tuple[str, str], float] = {}
194
- for (source, target), count in counts.substitutions.items():
195
- source_char_count = counts.source_chars[source]
196
- cost = self._calculate_single_scaled_cost(count, source_char_count, vocab_size)
197
- if cost is not None:
198
- sub_costs[(source, target)] = cost
199
-
200
- # Insertions
201
138
  ins_costs: dict[str, float] = {}
202
- total_chars = sum(counts.source_chars.values())
203
- for target, count in counts.insertions.items():
204
- cost = self._calculate_single_scaled_cost(count, total_chars, vocab_size)
205
- if cost is not None:
206
- ins_costs[target] = cost
139
+ del_costs: dict[str, float] = {}
140
+ k = self._smoothing_k
141
+
142
+ if k == 0:
143
+ calculate_for_unseen = False
144
+
145
+ # Error space size V for all conditional probabilities.
146
+ # The space of possible outcomes for a given source character (from OCR)
147
+ # includes all vocab characters (for matches/substitutions) plus the empty
148
+ # character (for deletions). This gives V = len(vocab) + 1.
149
+ # Symmetrically, the space of outcomes for a given target character (from GT)
150
+ # includes all vocab characters plus the empty character (for insertions/misses).
151
+ V = len(vocab) + 1
152
+
153
+ # Normalization ceiling Z' = -log(1/V).
154
+ normalization_ceiling = math.log(V) if V > 1 else 1.0
155
+
156
+ # Substitutions
157
+ sub_iterator = (
158
+ itertools.product(vocab, vocab) if calculate_for_unseen else counts.substitutions.keys()
159
+ )
160
+ for source, target in sub_iterator:
161
+ count = counts.substitutions[(source, target)]
162
+ total_count = counts.source_chars[source]
163
+ prob = (count + k) / (total_count + k * V)
164
+ base_cost = negative_log_likelihood(prob)
165
+ sub_costs[(source, target)] = base_cost / normalization_ceiling
207
166
 
208
167
  # Deletions
209
- del_costs: dict[str, float] = {}
210
- for source, count in counts.deletions.items():
211
- source_char_count = counts.source_chars[source]
212
- cost = self._calculate_single_scaled_cost(count, source_char_count, vocab_size)
213
- if cost is not None:
214
- del_costs[source] = cost
168
+ del_iterator = vocab if calculate_for_unseen else counts.deletions.keys()
169
+ for source in del_iterator:
170
+ count = counts.deletions[source]
171
+ total_count = counts.source_chars[source]
172
+ prob = (count + k) / (total_count + k * V)
173
+ base_cost = negative_log_likelihood(prob)
174
+ del_costs[source] = base_cost / normalization_ceiling
175
+
176
+ # Insertions
177
+ ins_iterator = vocab if calculate_for_unseen else counts.insertions.keys()
178
+ for target in ins_iterator:
179
+ count = counts.insertions[target]
180
+ total_target_count = counts.target_chars[target]
181
+ prob = (count + k) / (total_target_count + k * V)
182
+ base_cost = negative_log_likelihood(prob)
183
+ ins_costs[target] = base_cost / normalization_ceiling
215
184
 
216
185
  return _Costs(substitutions=sub_costs, insertions=ins_costs, deletions=del_costs)
217
186
 
218
- def _calculate_operations(self, pairs: Iterable[tuple[str, str]]) -> list["EditOperation"]:
219
- """Calculate edit operations for all string pairs using unweighted Levenshtein."""
220
- from .levenshtein import WeightedLevenshtein
187
+ def _calculate_operations(
188
+ self, pairs: Iterable[tuple[str, str]], aligner: "Aligner"
189
+ ) -> list["EditOperation"]:
190
+ """Calculate edit operations for all string pairs using the provided aligner."""
221
191
 
222
- unweighted_lev = WeightedLevenshtein.unweighted()
223
192
  all_ops = [
224
193
  op
225
194
  for ocr_str, truth_str in pairs
226
- for op in unweighted_lev.explain(ocr_str, truth_str, filter_matches=False)
195
+ for op in aligner.explain(ocr_str, truth_str, filter_matches=False)
227
196
  ]
228
197
  return all_ops
229
198
 
230
- def fit(self, pairs: Iterable[tuple[str, str]]) -> "WeightedLevenshtein":
199
+ def fit(
200
+ self,
201
+ pairs: Iterable[tuple[str, str]],
202
+ *,
203
+ initial_model: "Aligner | None" = None,
204
+ calculate_for_unseen: bool = False,
205
+ ) -> "WeightedLevenshtein":
231
206
  """
232
207
  Fits the costs of a WeightedLevenshtein instance to the provided data.
233
208
 
234
- Note that learning multi-character tokens is not yet supported.
209
+ Note that learning multi-character tokens is only supported if an initial alignment model
210
+ is provided that can handle those multi-character tokens.
211
+
212
+ This method analyzes pairs of strings to learn the costs of edit operations
213
+ based on their observed frequencies. The underlying model calculates costs
214
+ based on the principle of relative information cost.
215
+
216
+ For a detailed explanation of the methodology, please see the
217
+ :doc:`Cost Learning Model <cost_learning_model>` documentation page.
235
218
 
236
219
  :param pairs: An iterable of (ocr_string, ground_truth_string) tuples.
220
+ :param initial_model: Optional initial model used to align OCR outputs and ground truth
221
+ strings. By default, an unweighted Levenshtein distance is used.
222
+ :param calculate_for_unseen: If True (and k > 0), pre-calculates costs for all
223
+ possible edit operations based on the vocabulary.
224
+ If False (default), only calculates costs for operations
225
+ observed in the data.
237
226
  :return: A `WeightedLevenshtein` instance with the learned costs.
238
227
  """
239
228
  from .levenshtein import WeightedLevenshtein
240
229
 
241
- all_ops = self._calculate_operations(pairs)
230
+ if not pairs:
231
+ return WeightedLevenshtein.unweighted()
232
+
233
+ if initial_model is None:
234
+ initial_model = WeightedLevenshtein.unweighted()
235
+
236
+ all_ops = self._calculate_operations(pairs, aligner=initial_model)
242
237
  self.counts = self._tally_operations(all_ops)
243
238
  vocab = self.counts.vocab
244
239
  self.vocab_size = len(vocab)
@@ -246,12 +241,13 @@ class Learner:
246
241
  if not self.vocab_size:
247
242
  return WeightedLevenshtein.unweighted()
248
243
 
249
- costs = self._calculate_costs(self.counts, self.vocab_size)
244
+ costs = self._calculate_costs(self.counts, vocab, calculate_for_unseen=calculate_for_unseen)
250
245
 
251
246
  return WeightedLevenshtein(
252
247
  substitution_costs=costs.substitutions,
253
248
  insertion_costs=costs.insertions,
254
249
  deletion_costs=costs.deletions,
250
+ symmetric_substitution=False,
255
251
  default_substitution_cost=1.0,
256
252
  default_insertion_cost=1.0,
257
253
  default_deletion_cost=1.0,
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from collections.abc import Iterable
4
- from typing import Optional
4
+ from typing import Any, Optional
5
5
 
6
6
  from ._rust_stringdist import (
7
7
  _batch_weighted_levenshtein_distance,
@@ -24,10 +24,13 @@ class WeightedLevenshtein:
24
24
  Defaults to costs based on common OCR errors.
25
25
  :param insertion_costs: Maps a string to its insertion cost.
26
26
  :param deletion_costs: Maps a string to its deletion cost.
27
- :param symmetric_substitution: If True, substitution costs are bidirectional.
27
+ :param symmetric_substitution: If True, a cost defined for, e.g., ('0', 'O') will automatically
28
+ apply to ('O', '0'). If False, both must be defined explicitly.
28
29
  :param default_substitution_cost: Default cost for single-char substitutions not in the map.
29
30
  :param default_insertion_cost: Default cost for single-char insertions not in the map.
30
31
  :param default_deletion_cost: Default cost for single-char deletions not in the map.
32
+
33
+ :raises TypeError, ValueError: If the provided arguments are invalid.
31
34
  """
32
35
 
33
36
  substitution_costs: dict[tuple[str, str], float]
@@ -49,9 +52,37 @@ class WeightedLevenshtein:
49
52
  default_insertion_cost: float = 1.0,
50
53
  default_deletion_cost: float = 1.0,
51
54
  ) -> None:
52
- self.substitution_costs = (
53
- ocr_distance_map if substitution_costs is None else substitution_costs
54
- )
55
+ # Validate default costs
56
+ for cost_name, cost_val in [
57
+ ("default_substitution_cost", default_substitution_cost),
58
+ ("default_insertion_cost", default_insertion_cost),
59
+ ("default_deletion_cost", default_deletion_cost),
60
+ ]:
61
+ if not isinstance(cost_val, (int, float)):
62
+ raise TypeError(f"{cost_name} must be a number, but got: {type(cost_val).__name__}")
63
+ if cost_val < 0:
64
+ raise ValueError(f"{cost_name} must be non-negative, got value: {cost_val}")
65
+
66
+ # Validate substitution_costs dictionary
67
+ sub_costs = ocr_distance_map if substitution_costs is None else substitution_costs
68
+ for key, cost in sub_costs.items():
69
+ if not (
70
+ isinstance(key, tuple)
71
+ and len(key) == 2
72
+ and isinstance(key[0], str)
73
+ and isinstance(key[1], str)
74
+ ):
75
+ raise TypeError(
76
+ f"substitution_costs keys must be tuples of two strings, but found: {key}"
77
+ )
78
+ if not isinstance(cost, (int, float)):
79
+ raise TypeError(
80
+ f"Cost for substitution {key} must be a number, but got: {type(cost).__name__}"
81
+ )
82
+ if cost < 0:
83
+ raise ValueError(f"Cost for substitution {key} cannot be negative, but got: {cost}")
84
+
85
+ self.substitution_costs = sub_costs
55
86
  self.insertion_costs = {} if insertion_costs is None else insertion_costs
56
87
  self.deletion_costs = {} if deletion_costs is None else deletion_costs
57
88
  self.symmetric_substitution = symmetric_substitution
@@ -92,7 +123,8 @@ class WeightedLevenshtein:
92
123
  """
93
124
  Creates an instance by learning costs from a dataset of (OCR, ground truth) string pairs.
94
125
 
95
- For more advanced learning configuration, see the `ocr_stringdist.learner.Learner` class.
126
+ For more advanced learning configuration, see the
127
+ :class:`ocr_stringdist.learner.CostLearner` class.
96
128
 
97
129
  :param pairs: An iterable of (ocr_string, ground_truth_string) tuples. Correct pairs
98
130
  are not intended to be filtered; they are needed to learn well-aligned costs.
@@ -111,9 +143,9 @@ class WeightedLevenshtein:
111
143
  print(wl.substitution_costs) # learned cost for substituting '8' with 'B'
112
144
  print(wl.deletion_costs) # learned cost for deleting '.'
113
145
  """
114
- from .learner import Learner
146
+ from .learner import CostLearner
115
147
 
116
- return Learner().fit(pairs)
148
+ return CostLearner().fit(pairs)
117
149
 
118
150
  def __eq__(self, other: object) -> bool:
119
151
  if not isinstance(other, WeightedLevenshtein):
@@ -128,154 +160,56 @@ class WeightedLevenshtein:
128
160
  and self.default_deletion_cost == other.default_deletion_cost
129
161
  )
130
162
 
163
+ def to_dict(self) -> dict[str, Any]:
164
+ """
165
+ Serializes the instance's configuration to a dictionary.
131
166
 
132
- def weighted_levenshtein_distance(
133
- s1: str,
134
- s2: str,
135
- /,
136
- substitution_costs: Optional[dict[tuple[str, str], float]] = None,
137
- insertion_costs: Optional[dict[str, float]] = None,
138
- deletion_costs: Optional[dict[str, float]] = None,
139
- *,
140
- symmetric_substitution: bool = True,
141
- default_substitution_cost: float = 1.0,
142
- default_insertion_cost: float = 1.0,
143
- default_deletion_cost: float = 1.0,
144
- ) -> float:
145
- """
146
- Levenshtein distance with custom substitution, insertion and deletion costs.
147
-
148
- See also :meth:`WeightedLevenshtein.distance`.
149
-
150
- The default `substitution_costs` considers common OCR errors, see
151
- :py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
152
-
153
- :param s1: First string (interpreted as the string read via OCR)
154
- :param s2: Second string
155
- :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
156
- substitution costs. Only one direction needs to be configured unless
157
- `symmetric_substitution` is False.
158
- Note that the runtime scales in the length of the longest substitution token.
159
- Defaults to `ocr_stringdist.ocr_distance_map`.
160
- :param insertion_costs: Dictionary mapping strings to their insertion costs.
161
- :param deletion_costs: Dictionary mapping strings to their deletion costs.
162
- :param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
163
- symmetric? Defaults to True.
164
- :param default_substitution_cost: The default substitution cost for character pairs not found
165
- in `substitution_costs`.
166
- :param default_insertion_cost: The default insertion cost for characters not found in
167
- `insertion_costs`.
168
- :param default_deletion_cost: The default deletion cost for characters not found in
169
- `deletion_costs`.
170
- """
171
- return WeightedLevenshtein(
172
- substitution_costs=substitution_costs,
173
- insertion_costs=insertion_costs,
174
- deletion_costs=deletion_costs,
175
- symmetric_substitution=symmetric_substitution,
176
- default_substitution_cost=default_substitution_cost,
177
- default_insertion_cost=default_insertion_cost,
178
- default_deletion_cost=default_deletion_cost,
179
- ).distance(s1, s2)
180
-
181
-
182
- def batch_weighted_levenshtein_distance(
183
- s: str,
184
- candidates: list[str],
185
- /,
186
- substitution_costs: Optional[dict[tuple[str, str], float]] = None,
187
- insertion_costs: Optional[dict[str, float]] = None,
188
- deletion_costs: Optional[dict[str, float]] = None,
189
- *,
190
- symmetric_substitution: bool = True,
191
- default_substitution_cost: float = 1.0,
192
- default_insertion_cost: float = 1.0,
193
- default_deletion_cost: float = 1.0,
194
- ) -> list[float]:
195
- """
196
- Calculate weighted Levenshtein distances between a string and multiple candidates.
197
-
198
- See also :meth:`WeightedLevenshtein.batch_distance`.
199
-
200
- This is more efficient than calling :func:`weighted_levenshtein_distance` multiple times.
201
-
202
- :param s: The string to compare (interpreted as the string read via OCR)
203
- :param candidates: List of candidate strings to compare against
204
- :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
205
- substitution costs. Only one direction needs to be configured unless
206
- `symmetric_substitution` is False.
207
- Note that the runtime scales in the length of the longest substitution token.
208
- Defaults to `ocr_stringdist.ocr_distance_map`.
209
- :param insertion_costs: Dictionary mapping strings to their insertion costs.
210
- :param deletion_costs: Dictionary mapping strings to their deletion costs.
211
- :param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
212
- symmetric? Defaults to True.
213
- :param default_substitution_cost: The default substitution cost for character pairs not found
214
- in `substitution_costs`.
215
- :param default_insertion_cost: The default insertion cost for characters not found in
216
- `insertion_costs`.
217
- :param default_deletion_cost: The default deletion cost for characters not found in
218
- `deletion_costs`.
219
- :return: A list of distances corresponding to each candidate
220
- """
221
- return WeightedLevenshtein(
222
- substitution_costs=substitution_costs,
223
- insertion_costs=insertion_costs,
224
- deletion_costs=deletion_costs,
225
- symmetric_substitution=symmetric_substitution,
226
- default_substitution_cost=default_substitution_cost,
227
- default_insertion_cost=default_insertion_cost,
228
- default_deletion_cost=default_deletion_cost,
229
- ).batch_distance(s, candidates)
230
-
231
-
232
- def explain_weighted_levenshtein(
233
- s1: str,
234
- s2: str,
235
- /,
236
- substitution_costs: Optional[dict[tuple[str, str], float]] = None,
237
- insertion_costs: Optional[dict[str, float]] = None,
238
- deletion_costs: Optional[dict[str, float]] = None,
239
- *,
240
- symmetric_substitution: bool = True,
241
- default_substitution_cost: float = 1.0,
242
- default_insertion_cost: float = 1.0,
243
- default_deletion_cost: float = 1.0,
244
- filter_matches: bool = True,
245
- ) -> list[EditOperation]:
246
- """
247
- Computes the path of operations associated with the custom Levenshtein distance.
248
-
249
- See also :meth:`WeightedLevenshtein.explain`.
250
-
251
- The default `substitution_costs` considers common OCR errors, see
252
- :py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
253
-
254
- :param s1: First string (interpreted as the string read via OCR)
255
- :param s2: Second string
256
- :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
257
- substitution costs. Only one direction needs to be configured unless
258
- `symmetric_substitution` is False.
259
- Note that the runtime scales in the length of the longest substitution token.
260
- Defaults to `ocr_stringdist.ocr_distance_map`.
261
- :param insertion_costs: Dictionary mapping strings to their insertion costs.
262
- :param deletion_costs: Dictionary mapping strings to their deletion costs.
263
- :param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
264
- symmetric? Defaults to True.
265
- :param default_substitution_cost: The default substitution cost for character pairs not found
266
- in `substitution_costs`.
267
- :param default_insertion_cost: The default insertion cost for characters not found in
268
- `insertion_costs`.
269
- :param default_deletion_cost: The default deletion cost for characters not found in
270
- `deletion_costs`.
271
- :return: List of :class:`EditOperation` instances.
272
- """
273
- return WeightedLevenshtein(
274
- substitution_costs=substitution_costs,
275
- insertion_costs=insertion_costs,
276
- deletion_costs=deletion_costs,
277
- symmetric_substitution=symmetric_substitution,
278
- default_substitution_cost=default_substitution_cost,
279
- default_insertion_cost=default_insertion_cost,
280
- default_deletion_cost=default_deletion_cost,
281
- ).explain(s1, s2, filter_matches=filter_matches)
167
+ The result can be written to, say, JSON.
168
+
169
+ For the counterpart, see :meth:`WeightedLevenshtein.from_dict`.
170
+ """
171
+ # Convert tuple keys to a list of {"from", "to", "cost"} dicts for broader compatibility (e.g., JSON)
172
+ sub_costs_serializable = [
173
+ {"from": k[0], "to": k[1], "cost": v} for k, v in self.substitution_costs.items()
174
+ ]
175
+
176
+ return {
177
+ "substitution_costs": sub_costs_serializable,
178
+ "insertion_costs": self.insertion_costs,
179
+ "deletion_costs": self.deletion_costs,
180
+ "symmetric_substitution": self.symmetric_substitution,
181
+ "default_substitution_cost": self.default_substitution_cost,
182
+ "default_insertion_cost": self.default_insertion_cost,
183
+ "default_deletion_cost": self.default_deletion_cost,
184
+ }
185
+
186
@classmethod
def from_dict(cls, data: dict[str, Any]) -> WeightedLevenshtein:
    """
    Deserialize from a dictionary.

    For the counterpart, see :meth:`WeightedLevenshtein.to_dict`.

    :param data: A dictionary with (not necessarily all of) the following keys:
                 - "substitution_costs": list of {"from": str, "to": str, "cost": float}
                 - "insertion_costs": dict[str, float]
                 - "deletion_costs": dict[str, float]
                 - "symmetric_substitution": bool
                 - "default_substitution_cost": float
                 - "default_insertion_cost": float
                 - "default_deletion_cost": float
                 If "substitution_costs" is missing, an empty substitution map is passed
                 to the constructor.
    :return: An instance built by calling the class with the values found in ``data``.
    """
    # Convert the list of substitution costs back to the required dict format
    sub_costs: dict[tuple[str, str], float] = {
        (item["from"], item["to"]): item["cost"]
        for item in data.get("substitution_costs", [])
    }

    return cls(
        substitution_costs=sub_costs,
        # Read the insertion costs from their own key; reading "substitution_costs"
        # here would hand the substitution list to the insertion-cost parameter.
        insertion_costs=data.get("insertion_costs"),
        deletion_costs=data.get("deletion_costs"),
        symmetric_substitution=data.get("symmetric_substitution", True),
        default_substitution_cost=data.get("default_substitution_cost", 1.0),
        default_insertion_cost=data.get("default_insertion_cost", 1.0),
        default_deletion_cost=data.get("default_deletion_cost", 1.0),
    )
@@ -0,0 +1,9 @@
1
+ from typing import TYPE_CHECKING, Protocol, runtime_checkable
2
+
3
+ if TYPE_CHECKING:
4
+ from .edit_operation import EditOperation
5
+
6
+
7
@runtime_checkable
class Aligner(Protocol):
    """Structural type for any object exposing an ``explain`` method.

    Because the protocol is runtime-checkable, ``isinstance(obj, Aligner)``
    succeeds for any object that has an ``explain`` attribute; the signature
    itself is only verified by static type checkers.
    """

    def explain(self, s1: str, s2: str, filter_matches: bool) -> list["EditOperation"]:
        """Return the edit operations relating ``s1`` to ``s2``.

        NOTE(review): ``filter_matches`` presumably controls whether "match"
        operations are dropped from the result — confirm against implementers.
        """
        ...
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ocr-stringdist
3
- Version: 0.3.0
3
+ Version: 1.0.0
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python :: Implementation :: CPython
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -13,7 +13,7 @@ Project-URL: documentation, https://niklasvonm.github.io/ocr-stringdist/
13
13
 
14
14
  # OCR-StringDist
15
15
 
16
- A Python library for fast string distance calculations that account for common OCR (optical character recognition) errors.
16
+ A Python library to learn, model, explain and correct OCR errors using a fast string distance engine.
17
17
 
18
18
  Documentation: https://niklasvonm.github.io/ocr-stringdist/
19
19
 
@@ -24,7 +24,7 @@ Documentation: https://niklasvonm.github.io/ocr-stringdist/
24
24
 
25
25
  Standard string distances (like Levenshtein) treat all character substitutions equally. This is suboptimal for text read from images via OCR, where errors like `O` vs `0` are far more common than, say, `O` vs `X`.
26
26
 
27
- OCR-StringDist uses a **weighted Levenshtein distance**, assigning lower costs to common OCR errors.
27
+ OCR-StringDist provides a learnable **weighted Levenshtein distance**, implementing part of the **Noisy Channel model**.
28
28
 
29
29
  **Example:** Matching against the correct word `CODE`:
30
30
 
@@ -33,12 +33,12 @@ OCR-StringDist uses a **weighted Levenshtein distance**, assigning lower costs t
33
33
  * $d(\text{"CODE"}, \text{"CXDE"}) = 1$ (O → X)
34
34
  * Result: Both appear equally likely/distant.
35
35
 
36
- * **OCR-StringDist (Weighted):**
36
+ * **OCR-StringDist (Channel Model):**
37
37
  * $d(\text{"CODE"}, \text{"C0DE"}) \approx 0.1$ (common error, low cost)
38
38
  * $d(\text{"CODE"}, \text{"CXDE"}) = 1.0$ (unlikely error, high cost)
39
39
  * Result: Correctly identifies `C0DE` as a much closer match.
40
40
 
41
- This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes, database entries).
41
+ This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes). By combining this *channel model* with a *source model* (e.g., product code frequencies), you can build a complete and robust OCR correction system.
42
42
 
43
43
  ## Installation
44
44
 
@@ -48,63 +48,47 @@ pip install ocr-stringdist
48
48
 
49
49
  ## Features
50
50
 
51
- - **High Performance**: The core logic is implemented in Rust with speed in mind.
52
- - **Weighted Levenshtein Distance**: Calculates Levenshtein distance with customizable costs for substitutions, insertions, and deletions. Includes an efficient batch version (`batch_weighted_levenshtein_distance`) for comparing one string against many candidates.
53
- - **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
51
+ - **Learnable Costs**: Automatically learn substitution, insertion, and deletion costs from a dataset of (OCR string, ground truth string) pairs.
52
+ - **Weighted Levenshtein Distance**: Models OCR error patterns by assigning custom costs to specific edit operations.
53
+ - **High Performance**: Core logic in Rust and a `batch_distance` method for efficiently comparing one string against thousands of candidates.
54
54
  - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
55
+ - **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
55
56
  - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
56
- - **Learnable Costs**: Easily learn costs from a dataset of (OCR string, ground truth string)-pairs.
57
- - **Unicode Support**: Works with arbitrary Unicode strings.
58
- - **Best Match Finder**: Includes a utility function `find_best_candidate` to efficiently find the best match from a list based on _any_ distance function.
57
+ - **Full Unicode Support**: Works with arbitrary Unicode strings.
59
58
 
60
- ## Usage
59
+ ## Core Workflow
61
60
 
62
- ### Basic usage
61
+ The typical workflow involves:
62
+ - learning costs from your data and then
63
+ - using the resulting model to find the best match from a list of candidates.
63
64
 
64
65
  ```python
65
66
  from ocr_stringdist import WeightedLevenshtein
66
67
 
67
- # Default substitution costs are ocr_stringdist.ocr_distance_map.
68
- wl = WeightedLevenshtein()
68
+ # 1. LEARN costs from your own data
69
+ training_data = [
70
+ ("128", "123"),
71
+ ("567", "567"),
72
+ ]
73
+ wl = WeightedLevenshtein.learn_from(training_data)
69
74
 
70
- print(wl.distance("CXDE", "CODE")) # == 1
71
- print(wl.distance("C0DE", "CODE")) # < 1
72
- ```
75
+ # The engine has now learned that '8' -> '3' is a low-cost substitution
76
+ print(f"Learned cost for ('8', '3'): {wl.substitution_costs[('8', '3')]:.2f}")
73
77
 
74
- ### Explain the Edit Path
75
78
 
76
- ```python
77
- edit_path = wl.explain("C0DE", "CODE")
78
- print(edit_path)
79
- # [EditOperation(op_type='substitute', source_token='0', target_token='O', cost=0.1)]
80
- ```
79
+ # 2. MATCH new OCR output against a list of candidates
80
+ ocr_output = "Product Code 128"
81
+ candidates = [
82
+ "Product Code 123",
83
+ "Product Code 523", # '1' -> '5' is an unlikely error
84
+ ]
81
85
 
82
- ### Fast Batch Calculations
86
+ distances = wl.batch_distance(ocr_output, candidates)
83
87
 
84
- Quickly compare a string to a list of candidates.
88
+ # Find the best match
89
+ min_distance = min(distances)
90
+ best_match = candidates[distances.index(min_distance)]
85
91
 
86
- ```python
87
- distances: list[float] = wl.batch_distance("CODE", ["CXDE", "C0DE"])
88
- # [1.0, 0.1]
92
+ print(f"Best match for '{ocr_output}': '{best_match}' (Cost: {min_distance:.2f})")
89
93
  ```
90
94
 
91
- ### Multi-character Substitutions
92
-
93
- ```python
94
- # Custom costs with multi-character substitution
95
- wl = WeightedLevenshtein(substitution_costs={("In", "h"): 0.5})
96
-
97
- print(wl.distance("hi", "Ini")) # 0.5
98
- ```
99
-
100
- ### Learn Costs
101
-
102
- ```python
103
- wl = WeightedLevenshtein.learn_from([("Hallo", "Hello")])
104
- print(wl.substitution_costs[("a", "e")]) # < 1
105
- ```
106
-
107
- ## Acknowledgements
108
-
109
- This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), providing the base implementations of the algorithms used here.
110
-
@@ -0,0 +1,14 @@
1
+ ocr_stringdist-1.0.0.dist-info/METADATA,sha256=sFZnhhX8kHoYFbMua4zHCq2tELQPXQw3vWGNRoStR-4,3963
2
+ ocr_stringdist-1.0.0.dist-info/WHEEL,sha256=FluwvplnLTdP5a6iTRVk79q1kl0QHijDLCcqSF4Wei0,105
3
+ ocr_stringdist-1.0.0.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
4
+ ocr_stringdist.libs/libgcc_s-27e5a392.so.1,sha256=x5sO63liVwXxrjGGP371wB0RyQe1KEnIynYm82T0G0M,449745
5
+ ocr_stringdist/__init__.py,sha256=mL-19TkQQElK5B6iVFCV7vjKVal-6JcsBOFKwiCPQnA,284
6
+ ocr_stringdist/_rust_stringdist.cpython-313-i386-linux-musl.so,sha256=RM36Ip39T92gQdh9GdQ4CBidSCBRfQXclFCxHjtwy5Y,776649
7
+ ocr_stringdist/default_ocr_distances.py,sha256=oSu-TpHjPA4jxKpLAfmap8z0ZsC99jsOjnRVHW7Hj_Y,1033
8
+ ocr_stringdist/edit_operation.py,sha256=EgEc-2_nOwLUZDOWtogYqKLXIQJxOd9sIAbcGkn-TMY,395
9
+ ocr_stringdist/learner.py,sha256=3qWvqHrAWm4seuwmBmFN4InRL20u8HnPATHjCTnU3I0,10491
10
+ ocr_stringdist/levenshtein.py,sha256=t05FicwL5WTTsRSzDa92v79D2LpDiEUOYG_6te8oT28,9854
11
+ ocr_stringdist/matching.py,sha256=28Xt-x_V_iVsohD3F64MfZ0mys4_qOZXTIAcmSOE0dA,3270
12
+ ocr_stringdist/protocols.py,sha256=IyvGzzktPgmPRZyDRE0UKCYo4C0tdewU8IgwFbxZLls,265
13
+ ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ ocr_stringdist-1.0.0.dist-info/RECORD,,
@@ -1,13 +0,0 @@
1
- ocr_stringdist-0.3.0.dist-info/METADATA,sha256=YZzEt3aeo26-vS_1HCdXtbV1w7YC9Rnfw2dU8uUfy-s,4188
2
- ocr_stringdist-0.3.0.dist-info/WHEEL,sha256=FluwvplnLTdP5a6iTRVk79q1kl0QHijDLCcqSF4Wei0,105
3
- ocr_stringdist-0.3.0.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
4
- ocr_stringdist.libs/libgcc_s-27e5a392.so.1,sha256=x5sO63liVwXxrjGGP371wB0RyQe1KEnIynYm82T0G0M,449745
5
- ocr_stringdist/__init__.py,sha256=ApxqraLRcWAkzXhGJXSf3EqGEVFbxghrYrfJ9dmQjQU,467
6
- ocr_stringdist/_rust_stringdist.cpython-313-i386-linux-musl.so,sha256=CNHIY3dX9jKYz4831WbekYn76CBXSU8JEwY4QRjw83U,776649
7
- ocr_stringdist/default_ocr_distances.py,sha256=oSu-TpHjPA4jxKpLAfmap8z0ZsC99jsOjnRVHW7Hj_Y,1033
8
- ocr_stringdist/edit_operation.py,sha256=EgEc-2_nOwLUZDOWtogYqKLXIQJxOd9sIAbcGkn-TMY,395
9
- ocr_stringdist/learner.py,sha256=9gZMqnF5Fm3gSKOnAfSI40JhGtMKGg0gZNil1-LVP9Q,10191
10
- ocr_stringdist/levenshtein.py,sha256=5A1k8Ana10tCbRbQXYIbMHIjeU9mhUK_DSFZZ2Znx2o,13360
11
- ocr_stringdist/matching.py,sha256=28Xt-x_V_iVsohD3F64MfZ0mys4_qOZXTIAcmSOE0dA,3270
12
- ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
- ocr_stringdist-0.3.0.dist-info/RECORD,,