ocr-stringdist 0.2.2__cp313-cp313-musllinux_1_1_x86_64.whl → 1.0.0__cp313-cp313-musllinux_1_1_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocr_stringdist/__init__.py +3 -9
- ocr_stringdist/_rust_stringdist.cpython-313-x86_64-linux-musl.so +0 -0
- ocr_stringdist/edit_operation.py +16 -0
- ocr_stringdist/learner.py +254 -0
- ocr_stringdist/levenshtein.py +151 -178
- ocr_stringdist/matching.py +6 -6
- ocr_stringdist/protocols.py +9 -0
- ocr_stringdist-1.0.0.dist-info/METADATA +94 -0
- ocr_stringdist-1.0.0.dist-info/RECORD +14 -0
- ocr_stringdist-0.2.2.dist-info/METADATA +0 -102
- ocr_stringdist-0.2.2.dist-info/RECORD +0 -11
- {ocr_stringdist-0.2.2.dist-info → ocr_stringdist-1.0.0.dist-info}/WHEEL +0 -0
- {ocr_stringdist-0.2.2.dist-info → ocr_stringdist-1.0.0.dist-info}/licenses/LICENSE +0 -0
ocr_stringdist/__init__.py
CHANGED
|
@@ -1,17 +1,11 @@
|
|
|
1
1
|
from .default_ocr_distances import ocr_distance_map
|
|
2
|
-
from .
|
|
3
|
-
|
|
4
|
-
batch_weighted_levenshtein_distance,
|
|
5
|
-
explain_weighted_levenshtein,
|
|
6
|
-
weighted_levenshtein_distance,
|
|
7
|
-
)
|
|
2
|
+
from .learner import CostLearner
|
|
3
|
+
from .levenshtein import WeightedLevenshtein
|
|
8
4
|
from .matching import find_best_candidate
|
|
9
5
|
|
|
10
6
|
__all__ = [
|
|
11
7
|
"ocr_distance_map",
|
|
8
|
+
"CostLearner",
|
|
12
9
|
"WeightedLevenshtein",
|
|
13
|
-
"weighted_levenshtein_distance",
|
|
14
|
-
"batch_weighted_levenshtein_distance",
|
|
15
|
-
"explain_weighted_levenshtein",
|
|
16
10
|
"find_best_candidate",
|
|
17
11
|
]
|
|
Binary file
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Literal, Optional
|
|
3
|
+
|
|
4
|
+
# The closed set of operation kinds an alignment step can have.
OperationType = Literal["substitute", "insert", "delete", "match"]


@dataclass(frozen=True)
class EditOperation:
    """
    One step of an edit script: a substitution, insertion, deletion or match.

    Instances are immutable (the dataclass is frozen).
    """

    # Kind of the step; one of the ``OperationType`` literals.
    op_type: OperationType
    # Token on the source side; optional (may be ``None``).
    source_token: Optional[str]
    # Token on the target side; optional (may be ``None``).
    target_token: Optional[str]
    # Cost attributed to this single step.
    cost: float
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
import math
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import TYPE_CHECKING, Callable, Optional
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from .edit_operation import EditOperation
|
|
10
|
+
from .levenshtein import WeightedLevenshtein
|
|
11
|
+
from .protocols import Aligner
|
|
12
|
+
|
|
13
|
+
CostFunction = Callable[[float], float]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def negative_log_likelihood(probability: float) -> float:
    """
    Return the surprisal ``-ln(probability)`` of an event.

    :param probability: Probability of the event.
    :return: The negated natural logarithm of ``probability``.
    :raises ValueError: If ``probability`` is zero or negative.
    """
    is_non_positive = probability <= 0.0
    if is_non_positive:
        raise ValueError("Probability must be positive to compute negative log likelihood.")
    log_probability = math.log(probability)
    return -log_probability
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _new_counter() -> defaultdict:
    """Build an empty counter mapping whose missing keys read as zero."""
    return defaultdict(int)


@dataclass
class TallyCounts:
    """
    Frequency tables collected from a stream of edit operations.

    Every instance starts with its own empty counters and an empty vocabulary.
    """

    # (source token, target token) -> number of observed substitutions.
    substitutions: defaultdict[tuple[str, str], int] = field(default_factory=_new_counter)
    # target token -> number of observed insertions.
    insertions: defaultdict[str, int] = field(default_factory=_new_counter)
    # source token -> number of observed deletions.
    deletions: defaultdict[str, int] = field(default_factory=_new_counter)
    # source token -> number of operations in which it was counted as a source.
    source_chars: defaultdict[str, int] = field(default_factory=_new_counter)
    # target token -> number of operations in which it was counted as a target.
    target_chars: defaultdict[str, int] = field(default_factory=_new_counter)
    # Every token seen on either side.
    vocab: set[str] = field(default_factory=set)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class _Costs:
    """Container bundling the three cost tables produced by the learner."""

    # (source token, target token) -> substitution cost.
    substitutions: dict[tuple[str, str], float]
    # token -> insertion cost.
    insertions: dict[str, float]
    # token -> deletion cost.
    deletions: dict[str, float]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class CostLearner:
    """
    Configures and executes the process of learning Levenshtein costs from data.

    This class uses a builder pattern, allowing chaining configuration methods
    before running the final calculation with .fit().

    Example::

        from ocr_stringdist import CostLearner

        data = [
            ("Hell0", "Hello"),
        ]
        learner = CostLearner().with_smoothing(1.0)
        wl = learner.fit(data)  # Substitution 0 -> o learned with cost < 1.0
    """

    # Configuration parameters
    _smoothing_k: float

    # These attributes are set during fitting
    counts: Optional[TallyCounts] = None
    vocab_size: Optional[int] = None

    def __init__(self) -> None:
        self._smoothing_k = 1.0

    def with_smoothing(self, k: float) -> "CostLearner":
        r"""
        Sets the smoothing parameter `k`.

        This parameter controls how strongly the model defaults to a uniform
        probability distribution by adding a "pseudo-count" of `k` to every
        possible event.

        :param k: The smoothing factor, which must be a non-negative number.
        :return: The CostLearner instance for method chaining.
        :raises ValueError: If k < 0 or k is NaN.

        Notes
        -----
        This parameter allows for a continuous transition between two modes:

        - **k > 0 (recommended):** This enables additive smoothing, with `k = 1.0`
          being Laplace smoothing. It regularizes the model by assuming no event is impossible.
          The final costs are a measure of "relative surprisal," normalized by the vocabulary size.

        - **k = 0:** This corresponds to a normalized Maximum Likelihood Estimation.
          Probabilities are derived from the raw observed frequencies. The final costs are
          normalized using the same logic as the `k > 0` case, making `k=0` the continuous limit
          of the smoothed model. In this mode, costs can only be calculated for events observed in
          the training data. Unseen events will receive the default cost, regardless of
          the value of `calculate_for_unseen` in :meth:`fit`.
        """
        # `not k >= 0` (rather than `k < 0`) also rejects NaN, which would
        # otherwise slip through and turn every learned cost into NaN.
        if not k >= 0:
            raise ValueError("Smoothing parameter k must be non-negative.")
        self._smoothing_k = k
        return self

    def _tally_operations(self, operations: Iterable["EditOperation"]) -> TallyCounts:
        """
        Tally all edit operations.

        :param operations: Edit operations (including matches) to count.
        :return: The populated frequency tables.
        :raises ValueError: If an operation lacks a token that its type requires.
        """
        counts = TallyCounts()
        for op in operations:
            # Vocabulary and target totals are updated for every operation type.
            if op.source_token is not None:
                counts.vocab.add(op.source_token)
            if op.target_token is not None:
                counts.target_chars[op.target_token] += 1
                counts.vocab.add(op.target_token)

            if op.op_type == "substitute":
                if op.source_token is None or op.target_token is None:
                    raise ValueError("Tokens cannot be None for 'substitute'")
                counts.substitutions[(op.source_token, op.target_token)] += 1
                counts.source_chars[op.source_token] += 1
            elif op.op_type == "delete":
                if op.source_token is None:
                    raise ValueError("Source token cannot be None for 'delete'")
                counts.deletions[op.source_token] += 1
                counts.source_chars[op.source_token] += 1
            elif op.op_type == "insert":
                if op.target_token is None:
                    raise ValueError("Target token cannot be None for 'insert'")
                counts.insertions[op.target_token] += 1
            elif op.op_type == "match":
                if op.source_token is None:
                    raise ValueError("Source token cannot be None for 'match'")
                counts.source_chars[op.source_token] += 1
        return counts

    def _calculate_costs(
        self, counts: TallyCounts, vocab: set[str], calculate_for_unseen: bool = False
    ) -> _Costs:
        """
        Calculates the costs for edit operations based on tallied counts.

        The tallies in ``counts`` are only read; they are never modified here.

        :param counts: Frequency tables as produced by :meth:`_tally_operations`.
        :param vocab: All tokens to consider when ``calculate_for_unseen`` is True.
        :param calculate_for_unseen: If True (and k > 0), costs are computed for every
                                     combination of vocabulary tokens rather than only
                                     for the observed operations.
        :return: The substitution, insertion and deletion cost tables.
        """
        k = self._smoothing_k

        # Without smoothing an unseen event has probability zero, so no cost exists for it.
        if k == 0:
            calculate_for_unseen = False

        # Error space size V for all conditional probabilities.
        # The space of possible outcomes for a given source character (from OCR)
        # includes all vocab characters (for matches/substitutions) plus the empty
        # character (for deletions). This gives V = len(vocab) + 1.
        # Symmetrically, the space of outcomes for a given target character (from GT)
        # includes all vocab characters plus the empty character (for insertions/misses).
        V = len(vocab) + 1

        # Normalization ceiling Z' = -log(1/V).
        normalization_ceiling = math.log(V) if V > 1 else 1.0

        def relative_cost(count: int, total: int) -> float:
            """Smoothed surprisal of an event, scaled by the normalization ceiling."""
            prob = (count + k) / (total + k * V)
            return negative_log_likelihood(prob) / normalization_ceiling

        # All lookups below use `.get(..., 0)` instead of indexing: the tallies are
        # defaultdicts, and indexing a missing key would insert a zero entry into
        # the caller's counts for every unseen event.

        # Substitutions
        sub_keys = (
            itertools.product(vocab, vocab) if calculate_for_unseen else list(counts.substitutions)
        )
        sub_costs: dict[tuple[str, str], float] = {
            (source, target): relative_cost(
                counts.substitutions.get((source, target), 0),
                counts.source_chars.get(source, 0),
            )
            for source, target in sub_keys
        }

        # Deletions
        del_keys = vocab if calculate_for_unseen else list(counts.deletions)
        del_costs: dict[str, float] = {
            source: relative_cost(
                counts.deletions.get(source, 0), counts.source_chars.get(source, 0)
            )
            for source in del_keys
        }

        # Insertions
        ins_keys = vocab if calculate_for_unseen else list(counts.insertions)
        ins_costs: dict[str, float] = {
            target: relative_cost(
                counts.insertions.get(target, 0), counts.target_chars.get(target, 0)
            )
            for target in ins_keys
        }

        return _Costs(substitutions=sub_costs, insertions=ins_costs, deletions=del_costs)

    def _calculate_operations(
        self, pairs: Iterable[tuple[str, str]], aligner: "Aligner"
    ) -> list["EditOperation"]:
        """Calculate edit operations for all string pairs using the provided aligner."""
        # Matches are kept (filter_matches=False) because they are tallied as well.
        all_ops = [
            op
            for ocr_str, truth_str in pairs
            for op in aligner.explain(ocr_str, truth_str, filter_matches=False)
        ]
        return all_ops

    def fit(
        self,
        pairs: Iterable[tuple[str, str]],
        *,
        initial_model: "Aligner | None" = None,
        calculate_for_unseen: bool = False,
    ) -> "WeightedLevenshtein":
        """
        Fits the costs of a WeightedLevenshtein instance to the provided data.

        Note that learning multi-character tokens is only supported if an initial alignment model
        is provided that can handle those multi-character tokens.

        This method analyzes pairs of strings to learn the costs of edit operations
        based on their observed frequencies. The underlying model calculates costs
        based on the principle of relative information cost.

        For a detailed explanation of the methodology, please see the
        :doc:`Cost Learning Model <cost_learning_model>` documentation page.

        :param pairs: An iterable of (ocr_string, ground_truth_string) tuples.
        :param initial_model: Optional initial model used to align OCR outputs and ground truth
                              strings. By default, an unweighted Levenshtein distance is used.
        :param calculate_for_unseen: If True (and k > 0), pre-calculates costs for all
                                     possible edit operations based on the vocabulary.
                                     If False (default), only calculates costs for operations
                                     observed in the data.
        :return: A `WeightedLevenshtein` instance with the learned costs.
        """
        # Imported here rather than at module level (only imported there for type checking).
        from .levenshtein import WeightedLevenshtein

        if not pairs:
            return WeightedLevenshtein.unweighted()

        if initial_model is None:
            initial_model = WeightedLevenshtein.unweighted()

        all_ops = self._calculate_operations(pairs, aligner=initial_model)
        self.counts = self._tally_operations(all_ops)
        vocab = self.counts.vocab
        self.vocab_size = len(vocab)

        # Nothing was tallied (e.g. only empty strings): fall back to unit costs.
        if not self.vocab_size:
            return WeightedLevenshtein.unweighted()

        costs = self._calculate_costs(self.counts, vocab, calculate_for_unseen=calculate_for_unseen)

        return WeightedLevenshtein(
            substitution_costs=costs.substitutions,
            insertion_costs=costs.insertions,
            deletion_costs=costs.deletions,
            symmetric_substitution=False,
            default_substitution_cost=1.0,
            default_insertion_cost=1.0,
            default_deletion_cost=1.0,
        )
|
ocr_stringdist/levenshtein.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from
|
|
4
|
-
from typing import
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
from typing import Any, Optional
|
|
5
5
|
|
|
6
6
|
from ._rust_stringdist import (
|
|
7
7
|
_batch_weighted_levenshtein_distance,
|
|
@@ -9,20 +9,7 @@ from ._rust_stringdist import (
|
|
|
9
9
|
_weighted_levenshtein_distance,
|
|
10
10
|
)
|
|
11
11
|
from .default_ocr_distances import ocr_distance_map
|
|
12
|
-
|
|
13
|
-
OperationType = Literal["substitute", "insert", "delete"]
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
@dataclass(frozen=True)
|
|
17
|
-
class EditOperation:
|
|
18
|
-
"""
|
|
19
|
-
Represents a single edit operation (substitution, insertion, or deletion).
|
|
20
|
-
"""
|
|
21
|
-
|
|
22
|
-
op_type: OperationType
|
|
23
|
-
source_token: Optional[str]
|
|
24
|
-
target_token: Optional[str]
|
|
25
|
-
cost: float
|
|
12
|
+
from .edit_operation import EditOperation
|
|
26
13
|
|
|
27
14
|
|
|
28
15
|
class WeightedLevenshtein:
|
|
@@ -33,14 +20,17 @@ class WeightedLevenshtein:
|
|
|
33
20
|
how the distance is measured. Once created, its methods can be used to
|
|
34
21
|
efficiently compute distances and explain the edit operations.
|
|
35
22
|
|
|
36
|
-
:param substitution_costs: Maps (
|
|
23
|
+
:param substitution_costs: Maps (str, str) tuples to their substitution cost.
|
|
37
24
|
Defaults to costs based on common OCR errors.
|
|
38
|
-
:param insertion_costs: Maps a
|
|
39
|
-
:param deletion_costs: Maps a
|
|
40
|
-
:param symmetric_substitution: If True,
|
|
41
|
-
|
|
42
|
-
:param
|
|
43
|
-
:param
|
|
25
|
+
:param insertion_costs: Maps a string to its insertion cost.
|
|
26
|
+
:param deletion_costs: Maps a string to its deletion cost.
|
|
27
|
+
:param symmetric_substitution: If True, a cost defined for, e.g., ('0', 'O') will automatically
|
|
28
|
+
apply to ('O', '0'). If False, both must be defined explicitly.
|
|
29
|
+
:param default_substitution_cost: Default cost for single-char substitutions not in the map.
|
|
30
|
+
:param default_insertion_cost: Default cost for single-char insertions not in the map.
|
|
31
|
+
:param default_deletion_cost: Default cost for single-char deletions not in the map.
|
|
32
|
+
|
|
33
|
+
:raises TypeError, ValueError: If the provided arguments are invalid.
|
|
44
34
|
"""
|
|
45
35
|
|
|
46
36
|
substitution_costs: dict[tuple[str, str], float]
|
|
@@ -62,9 +52,37 @@ class WeightedLevenshtein:
|
|
|
62
52
|
default_insertion_cost: float = 1.0,
|
|
63
53
|
default_deletion_cost: float = 1.0,
|
|
64
54
|
) -> None:
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
55
|
+
# Validate default costs
|
|
56
|
+
for cost_name, cost_val in [
|
|
57
|
+
("default_substitution_cost", default_substitution_cost),
|
|
58
|
+
("default_insertion_cost", default_insertion_cost),
|
|
59
|
+
("default_deletion_cost", default_deletion_cost),
|
|
60
|
+
]:
|
|
61
|
+
if not isinstance(cost_val, (int, float)):
|
|
62
|
+
raise TypeError(f"{cost_name} must be a number, but got: {type(cost_val).__name__}")
|
|
63
|
+
if cost_val < 0:
|
|
64
|
+
raise ValueError(f"{cost_name} must be non-negative, got value: {cost_val}")
|
|
65
|
+
|
|
66
|
+
# Validate substitution_costs dictionary
|
|
67
|
+
sub_costs = ocr_distance_map if substitution_costs is None else substitution_costs
|
|
68
|
+
for key, cost in sub_costs.items():
|
|
69
|
+
if not (
|
|
70
|
+
isinstance(key, tuple)
|
|
71
|
+
and len(key) == 2
|
|
72
|
+
and isinstance(key[0], str)
|
|
73
|
+
and isinstance(key[1], str)
|
|
74
|
+
):
|
|
75
|
+
raise TypeError(
|
|
76
|
+
f"substitution_costs keys must be tuples of two strings, but found: {key}"
|
|
77
|
+
)
|
|
78
|
+
if not isinstance(cost, (int, float)):
|
|
79
|
+
raise TypeError(
|
|
80
|
+
f"Cost for substitution {key} must be a number, but got: {type(cost).__name__}"
|
|
81
|
+
)
|
|
82
|
+
if cost < 0:
|
|
83
|
+
raise ValueError(f"Cost for substitution {key} cannot be negative, but got: {cost}")
|
|
84
|
+
|
|
85
|
+
self.substitution_costs = sub_costs
|
|
68
86
|
self.insertion_costs = {} if insertion_costs is None else insertion_costs
|
|
69
87
|
self.deletion_costs = {} if deletion_costs is None else deletion_costs
|
|
70
88
|
self.symmetric_substitution = symmetric_substitution
|
|
@@ -81,162 +99,117 @@ class WeightedLevenshtein:
|
|
|
81
99
|
"""Calculates the weighted Levenshtein distance between two strings."""
|
|
82
100
|
return _weighted_levenshtein_distance(s1, s2, **self.__dict__) # type: ignore[no-any-return]
|
|
83
101
|
|
|
84
|
-
def explain(self, s1: str, s2: str) -> list[EditOperation]:
|
|
85
|
-
"""
|
|
102
|
+
def explain(self, s1: str, s2: str, filter_matches: bool = True) -> list[EditOperation]:
    """
    Compute the edit operations that transform ``s1`` into ``s2``.

    :param s1: First string (interpreted as the string read via OCR)
    :param s2: Second string (interpreted as the target string)
    :param filter_matches: If True, operations of type 'match' are left out of the result.
    :return: List of :class:`EditOperation` instances.
    """
    # Each raw entry is unpacked positionally into an EditOperation.
    operations = [
        EditOperation(*raw_op)
        for raw_op in _explain_weighted_levenshtein_distance(s1, s2, **self.__dict__)
    ]
    if not filter_matches:
        return operations
    return [operation for operation in operations if operation.op_type != "match"]
|
|
88
116
|
|
|
89
117
|
def batch_distance(self, s: str, candidates: list[str]) -> list[float]:
    """
    Compute the distance from ``s`` to each candidate string.

    :param s: The string to compare.
    :param candidates: The strings to compare ``s`` against.
    :return: The distances computed by the native batch routine.
    """
    # The instance attributes are forwarded as keyword arguments to the binding.
    distances: list[float] = _batch_weighted_levenshtein_distance(s, candidates, **self.__dict__)
    return distances
|
|
92
120
|
|
|
121
|
+
@classmethod
def learn_from(cls, pairs: Iterable[tuple[str, str]]) -> WeightedLevenshtein:
    """
    Build an instance whose costs are learned from (OCR, ground truth) string pairs.

    The learning is done by a :class:`ocr_stringdist.learner.CostLearner` in its
    default configuration; use that class directly for more advanced settings.

    :param pairs: An iterable of (ocr_string, ground_truth_string) tuples. Correct pairs
                  are not intended to be filtered; they are needed to learn well-aligned costs.
    :return: A new `WeightedLevenshtein` instance with the learned costs.

    Example::

        from ocr_stringdist import WeightedLevenshtein

        training_data = [
            ("8N234", "BN234"),  # read '8' instead of 'B'
            ("BJK18", "BJK18"),  # correct
            ("ABC0.", "ABC0"),  # extra '.'
        ]
        wl = WeightedLevenshtein.learn_from(training_data)
        print(wl.substitution_costs)  # learned cost for substituting '8' with 'B'
        print(wl.deletion_costs)  # learned cost for deleting '.'
    """
    # Imported inside the method rather than at module level.
    from .learner import CostLearner

    default_learner = CostLearner()
    return default_learner.fit(pairs)
|
|
149
|
+
|
|
150
|
+
def __eq__(self, other: object) -> bool:
|
|
151
|
+
if not isinstance(other, WeightedLevenshtein):
|
|
152
|
+
return NotImplemented
|
|
153
|
+
return (
|
|
154
|
+
self.substitution_costs == other.substitution_costs
|
|
155
|
+
and self.insertion_costs == other.insertion_costs
|
|
156
|
+
and self.deletion_costs == other.deletion_costs
|
|
157
|
+
and self.symmetric_substitution == other.symmetric_substitution
|
|
158
|
+
and self.default_substitution_cost == other.default_substitution_cost
|
|
159
|
+
and self.default_insertion_cost == other.default_insertion_cost
|
|
160
|
+
and self.default_deletion_cost == other.default_deletion_cost
|
|
161
|
+
)
|
|
93
162
|
|
|
94
|
-
def
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
/,
|
|
148
|
-
substitution_costs: Optional[dict[tuple[str, str], float]] = None,
|
|
149
|
-
insertion_costs: Optional[dict[str, float]] = None,
|
|
150
|
-
deletion_costs: Optional[dict[str, float]] = None,
|
|
151
|
-
*,
|
|
152
|
-
symmetric_substitution: bool = True,
|
|
153
|
-
default_substitution_cost: float = 1.0,
|
|
154
|
-
default_insertion_cost: float = 1.0,
|
|
155
|
-
default_deletion_cost: float = 1.0,
|
|
156
|
-
) -> list[float]:
|
|
157
|
-
"""
|
|
158
|
-
Calculate weighted Levenshtein distances between a string and multiple candidates.
|
|
159
|
-
|
|
160
|
-
See also :meth:`WeightedLevenshtein.batch_distance`.
|
|
161
|
-
|
|
162
|
-
This is more efficient than calling :func:`weighted_levenshtein_distance` multiple times.
|
|
163
|
-
|
|
164
|
-
:param s: The string to compare (interpreted as the string read via OCR)
|
|
165
|
-
:param candidates: List of candidate strings to compare against
|
|
166
|
-
:param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
|
|
167
|
-
substitution costs. Only one direction needs to be configured unless
|
|
168
|
-
`symmetric_substitution` is False.
|
|
169
|
-
Note that the runtime scales in the length of the longest substitution token.
|
|
170
|
-
Defaults to `ocr_stringdist.ocr_distance_map`.
|
|
171
|
-
:param insertion_costs: Dictionary mapping strings to their insertion costs.
|
|
172
|
-
:param deletion_costs: Dictionary mapping strings to their deletion costs.
|
|
173
|
-
:param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
|
|
174
|
-
symmetric? Defaults to True.
|
|
175
|
-
:param default_substitution_cost: The default substitution cost for character pairs not found
|
|
176
|
-
in `substitution_costs`.
|
|
177
|
-
:param default_insertion_cost: The default insertion cost for characters not found in
|
|
178
|
-
`insertion_costs`.
|
|
179
|
-
:param default_deletion_cost: The default deletion cost for characters not found in
|
|
180
|
-
`deletion_costs`.
|
|
181
|
-
:return: A list of distances corresponding to each candidate
|
|
182
|
-
"""
|
|
183
|
-
return WeightedLevenshtein(
|
|
184
|
-
substitution_costs=substitution_costs,
|
|
185
|
-
insertion_costs=insertion_costs,
|
|
186
|
-
deletion_costs=deletion_costs,
|
|
187
|
-
symmetric_substitution=symmetric_substitution,
|
|
188
|
-
default_substitution_cost=default_substitution_cost,
|
|
189
|
-
default_insertion_cost=default_insertion_cost,
|
|
190
|
-
default_deletion_cost=default_deletion_cost,
|
|
191
|
-
).batch_distance(s, candidates)
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
def explain_weighted_levenshtein(
|
|
195
|
-
s1: str,
|
|
196
|
-
s2: str,
|
|
197
|
-
/,
|
|
198
|
-
substitution_costs: Optional[dict[tuple[str, str], float]] = None,
|
|
199
|
-
insertion_costs: Optional[dict[str, float]] = None,
|
|
200
|
-
deletion_costs: Optional[dict[str, float]] = None,
|
|
201
|
-
*,
|
|
202
|
-
symmetric_substitution: bool = True,
|
|
203
|
-
default_substitution_cost: float = 1.0,
|
|
204
|
-
default_insertion_cost: float = 1.0,
|
|
205
|
-
default_deletion_cost: float = 1.0,
|
|
206
|
-
) -> list[EditOperation]:
|
|
207
|
-
"""
|
|
208
|
-
Computes the path of operations associated with the custom Levenshtein distance.
|
|
209
|
-
|
|
210
|
-
See also :meth:`WeightedLevenshtein.explain`.
|
|
211
|
-
|
|
212
|
-
The default `substitution_costs` considers common OCR errors, see
|
|
213
|
-
:py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
|
|
214
|
-
|
|
215
|
-
:param s1: First string (interpreted as the string read via OCR)
|
|
216
|
-
:param s2: Second string
|
|
217
|
-
:param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
|
|
218
|
-
substitution costs. Only one direction needs to be configured unless
|
|
219
|
-
`symmetric_substitution` is False.
|
|
220
|
-
Note that the runtime scales in the length of the longest substitution token.
|
|
221
|
-
Defaults to `ocr_stringdist.ocr_distance_map`.
|
|
222
|
-
:param insertion_costs: Dictionary mapping strings to their insertion costs.
|
|
223
|
-
:param deletion_costs: Dictionary mapping strings to their deletion costs.
|
|
224
|
-
:param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
|
|
225
|
-
symmetric? Defaults to True.
|
|
226
|
-
:param default_substitution_cost: The default substitution cost for character pairs not found
|
|
227
|
-
in `substitution_costs`.
|
|
228
|
-
:param default_insertion_cost: The default insertion cost for characters not found in
|
|
229
|
-
`insertion_costs`.
|
|
230
|
-
:param default_deletion_cost: The default deletion cost for characters not found in
|
|
231
|
-
`deletion_costs`.
|
|
232
|
-
:return: List of :class:`EditOperation` instances.
|
|
233
|
-
"""
|
|
234
|
-
return WeightedLevenshtein(
|
|
235
|
-
substitution_costs=substitution_costs,
|
|
236
|
-
insertion_costs=insertion_costs,
|
|
237
|
-
deletion_costs=deletion_costs,
|
|
238
|
-
symmetric_substitution=symmetric_substitution,
|
|
239
|
-
default_substitution_cost=default_substitution_cost,
|
|
240
|
-
default_insertion_cost=default_insertion_cost,
|
|
241
|
-
default_deletion_cost=default_deletion_cost,
|
|
242
|
-
).explain(s1, s2)
|
|
163
|
+
def to_dict(self) -> dict[str, Any]:
    """
    Serialize this instance's configuration into a plain dictionary.

    The result can be written to, say, JSON.

    For the counterpart, see :meth:`WeightedLevenshtein.from_dict`.

    :return: A dictionary with the substitution costs as a list of
             ``{"from", "to", "cost"}`` entries and every other setting under its
             attribute name.
    """
    # Tuple keys are flattened into one record per substitution for broader
    # compatibility (e.g., JSON).
    substitution_entries = []
    for pair, cost in self.substitution_costs.items():
        substitution_entries.append({"from": pair[0], "to": pair[1], "cost": cost})

    config: dict[str, Any] = {"substitution_costs": substitution_entries}
    # The remaining settings are stored as they are, under their attribute names.
    for name in (
        "insertion_costs",
        "deletion_costs",
        "symmetric_substitution",
        "default_substitution_cost",
        "default_insertion_cost",
        "default_deletion_cost",
    ):
        config[name] = getattr(self, name)
    return config
|
|
185
|
+
|
|
186
|
+
@classmethod
def from_dict(cls, data: dict[str, Any]) -> WeightedLevenshtein:
    """
    Deserialize from a dictionary.

    For the counterpart, see :meth:`WeightedLevenshtein.to_dict`.

    :param data: A dictionary with (not necessarily all of) the following keys:
        - "substitution_costs": list of {"from": str, "to": str, "cost": float}
        - "insertion_costs": dict[str, float]
        - "deletion_costs": dict[str, float]
        - "symmetric_substitution": bool
        - "default_substitution_cost": float
        - "default_insertion_cost": float
        - "default_deletion_cost": float
    :return: A new instance built from the given configuration.
    """
    # Convert the list of substitution costs back to the required dict format.
    # A missing key yields an empty mapping.
    sub_costs: dict[tuple[str, str], float] = {
        (item["from"], item["to"]): item["cost"] for item in data.get("substitution_costs", [])
    }

    return cls(
        substitution_costs=sub_costs,
        # Must be read from "insertion_costs", the key written by `to_dict`.
        insertion_costs=data.get("insertion_costs"),
        deletion_costs=data.get("deletion_costs"),
        symmetric_substitution=data.get("symmetric_substitution", True),
        default_substitution_cost=data.get("default_substitution_cost", 1.0),
        default_insertion_cost=data.get("default_insertion_cost", 1.0),
        default_deletion_cost=data.get("default_deletion_cost", 1.0),
    )
|
ocr_stringdist/matching.py
CHANGED
|
@@ -39,13 +39,13 @@ def find_best_candidate(
|
|
|
39
39
|
calculated distance/score.
|
|
40
40
|
:rtype: tuple[str, float]
|
|
41
41
|
|
|
42
|
-
|
|
42
|
+
Example::
|
|
43
43
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
44
|
+
from ocr_stringdist import find_best_candidate, WeightedLevenshtein
|
|
45
|
+
|
|
46
|
+
wl = WeightedLevenshtein({("l", "I"): 0.1})
|
|
47
|
+
find_best_candidate("apple", ["apply", "apples", "orange", "appIe"], wl.distance)
|
|
48
|
+
# ('appIe', 0.1)
|
|
49
49
|
"""
|
|
50
50
|
if not candidates:
|
|
51
51
|
raise ValueError("The 'candidates' iterable cannot be empty.")
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING, Protocol, runtime_checkable
|
|
2
|
+
|
|
3
|
+
if TYPE_CHECKING:
|
|
4
|
+
from .edit_operation import EditOperation
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@runtime_checkable
class Aligner(Protocol):
    """Structural interface of any object that can explain an alignment of two strings."""

    def explain(
        self,
        s1: str,
        s2: str,
        filter_matches: bool,
    ) -> list["EditOperation"]:
        """Return the edit operations aligning ``s1`` with ``s2``."""
        ...
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ocr-stringdist
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Classifier: Programming Language :: Rust
|
|
5
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
6
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
7
|
+
Classifier: Operating System :: OS Independent
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Python: >=3.9
|
|
10
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
11
|
+
Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
|
|
12
|
+
Project-URL: documentation, https://niklasvonm.github.io/ocr-stringdist/
|
|
13
|
+
|
|
14
|
+
# OCR-StringDist
|
|
15
|
+
|
|
16
|
+
A Python library to learn, model, explain and correct OCR errors using a fast string distance engine.
|
|
17
|
+
|
|
18
|
+
Documentation: https://niklasvonm.github.io/ocr-stringdist/
|
|
19
|
+
|
|
20
|
+
[PyPI package](https://badge.fury.io/py/ocr-stringdist)
|
|
21
|
+
[License](LICENSE)
|
|
22
|
+
|
|
23
|
+
## Overview
|
|
24
|
+
|
|
25
|
+
Standard string distances (like Levenshtein) treat all character substitutions equally. This is suboptimal for text read from images via OCR, where errors like `O` vs `0` are far more common than, say, `O` vs `X`.
|
|
26
|
+
|
|
27
|
+
OCR-StringDist provides a learnable **weighted Levenshtein distance**, implementing part of the **Noisy Channel model**.
|
|
28
|
+
|
|
29
|
+
**Example:** Matching against the correct word `CODE`:
|
|
30
|
+
|
|
31
|
+
* **Standard Levenshtein:**
|
|
32
|
+
* $d(\text{"CODE"}, \text{"C0DE"}) = 1$ (O → 0)
|
|
33
|
+
* $d(\text{"CODE"}, \text{"CXDE"}) = 1$ (O → X)
|
|
34
|
+
* Result: Both appear equally likely/distant.
|
|
35
|
+
|
|
36
|
+
* **OCR-StringDist (Channel Model):**
|
|
37
|
+
* $d(\text{"CODE"}, \text{"C0DE"}) \approx 0.1$ (common error, low cost)
|
|
38
|
+
* $d(\text{"CODE"}, \text{"CXDE"}) = 1.0$ (unlikely error, high cost)
|
|
39
|
+
* Result: Correctly identifies `C0DE` as a much closer match.
|
|
40
|
+
|
|
41
|
+
This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes). By combining this *channel model* with a *source model* (e.g., product code frequencies), you can build a complete and robust OCR correction system.
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install ocr-stringdist
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Features
|
|
50
|
+
|
|
51
|
+
- **Learnable Costs**: Automatically learn substitution, insertion, and deletion costs from a dataset of (OCR string, ground truth string) pairs.
|
|
52
|
+
- **Weighted Levenshtein Distance**: Models OCR error patterns by assigning custom costs to specific edit operations.
|
|
53
|
+
- **High Performance**: The core logic is implemented in Rust, and a `batch_distance` method efficiently compares one string against thousands of candidates.
|
|
54
|
+
- **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
|
|
55
|
+
- **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
|
|
56
|
+
- **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
|
|
57
|
+
- **Full Unicode Support**: Works with arbitrary Unicode strings.
|
|
58
|
+
|
|
59
|
+
## Core Workflow
|
|
60
|
+
|
|
61
|
+
The typical workflow involves
|
|
62
|
+
- learning costs from your data and then
|
|
63
|
+
- using the resulting model to find the best match from a list of candidates.
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from ocr_stringdist import WeightedLevenshtein
|
|
67
|
+
|
|
68
|
+
# 1. LEARN costs from your own data
|
|
69
|
+
training_data = [
|
|
70
|
+
("128", "123"),
|
|
71
|
+
("567", "567"),
|
|
72
|
+
]
|
|
73
|
+
wl = WeightedLevenshtein.learn_from(training_data)
|
|
74
|
+
|
|
75
|
+
# The engine has now learned that '8' -> '3' is a low-cost substitution
|
|
76
|
+
print(f"Learned cost for ('8', '3'): {wl.substitution_costs[('8', '3')]:.2f}")
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# 2. MATCH new OCR output against a list of candidates
|
|
80
|
+
ocr_output = "Product Code 128"
|
|
81
|
+
candidates = [
|
|
82
|
+
"Product Code 123",
|
|
83
|
+
"Product Code 523", # '1' -> '5' is an unlikely error
|
|
84
|
+
]
|
|
85
|
+
|
|
86
|
+
distances = wl.batch_distance(ocr_output, candidates)
|
|
87
|
+
|
|
88
|
+
# Find the best match
|
|
89
|
+
min_distance = min(distances)
|
|
90
|
+
best_match = candidates[distances.index(min_distance)]
|
|
91
|
+
|
|
92
|
+
print(f"Best match for '{ocr_output}': '{best_match}' (Cost: {min_distance:.2f})")
|
|
93
|
+
```
|
|
94
|
+
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
ocr_stringdist-1.0.0.dist-info/METADATA,sha256=sFZnhhX8kHoYFbMua4zHCq2tELQPXQw3vWGNRoStR-4,3963
|
|
2
|
+
ocr_stringdist-1.0.0.dist-info/WHEEL,sha256=6cglYgN2x9bsL8KQgndplH9dyQf2UyeKi6L__-GPsk0,107
|
|
3
|
+
ocr_stringdist-1.0.0.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
|
|
4
|
+
ocr_stringdist.libs/libgcc_s-98a1ef30.so.1,sha256=XOVRhHznCIpbSdhFoozhla-OfRqBtXftKPQ4cSMKjrs,433441
|
|
5
|
+
ocr_stringdist/__init__.py,sha256=mL-19TkQQElK5B6iVFCV7vjKVal-6JcsBOFKwiCPQnA,284
|
|
6
|
+
ocr_stringdist/_rust_stringdist.cpython-313-x86_64-linux-musl.so,sha256=ixwa7I8bvHbWfMNN40Wkdsnvn585CdzOC88U9gaM2b4,826057
|
|
7
|
+
ocr_stringdist/default_ocr_distances.py,sha256=oSu-TpHjPA4jxKpLAfmap8z0ZsC99jsOjnRVHW7Hj_Y,1033
|
|
8
|
+
ocr_stringdist/edit_operation.py,sha256=EgEc-2_nOwLUZDOWtogYqKLXIQJxOd9sIAbcGkn-TMY,395
|
|
9
|
+
ocr_stringdist/learner.py,sha256=3qWvqHrAWm4seuwmBmFN4InRL20u8HnPATHjCTnU3I0,10491
|
|
10
|
+
ocr_stringdist/levenshtein.py,sha256=t05FicwL5WTTsRSzDa92v79D2LpDiEUOYG_6te8oT28,9854
|
|
11
|
+
ocr_stringdist/matching.py,sha256=28Xt-x_V_iVsohD3F64MfZ0mys4_qOZXTIAcmSOE0dA,3270
|
|
12
|
+
ocr_stringdist/protocols.py,sha256=IyvGzzktPgmPRZyDRE0UKCYo4C0tdewU8IgwFbxZLls,265
|
|
13
|
+
ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
+
ocr_stringdist-1.0.0.dist-info/RECORD,,
|
|
@@ -1,102 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: ocr-stringdist
|
|
3
|
-
Version: 0.2.2
|
|
4
|
-
Classifier: Programming Language :: Rust
|
|
5
|
-
Classifier: Programming Language :: Python
|
|
6
|
-
Classifier: Operating System :: OS Independent
|
|
7
|
-
License-File: LICENSE
|
|
8
|
-
Requires-Python: >=3.9
|
|
9
|
-
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
10
|
-
Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
|
|
11
|
-
Project-URL: documentation, https://niklasvonm.github.io/ocr-stringdist/
|
|
12
|
-
|
|
13
|
-
# OCR-StringDist
|
|
14
|
-
|
|
15
|
-
A Python library for fast string distance calculations that account for common OCR (optical character recognition) errors.
|
|
16
|
-
|
|
17
|
-
Documentation: https://niklasvonm.github.io/ocr-stringdist/
|
|
18
|
-
|
|
19
|
-
[](https://pypi.org/project/ocr-stringdist/)
|
|
20
|
-
[](LICENSE)
|
|
21
|
-
|
|
22
|
-
## Overview
|
|
23
|
-
|
|
24
|
-
Standard string distances (like Levenshtein) treat all character substitutions equally. This is suboptimal for text read from images via OCR, where errors like `O` vs `0` are far more common than, say, `O` vs `X`.
|
|
25
|
-
|
|
26
|
-
OCR-StringDist uses a **weighted Levenshtein distance**, assigning lower costs to common OCR errors.
|
|
27
|
-
|
|
28
|
-
**Example:** Matching against the correct word `CODE`:
|
|
29
|
-
|
|
30
|
-
* **Standard Levenshtein:**
|
|
31
|
-
* $d(\text{"CODE"}, \text{"C0DE"}) = 1$ (O → 0)
|
|
32
|
-
* $d(\text{"CODE"}, \text{"CXDE"}) = 1$ (O → X)
|
|
33
|
-
* Result: Both appear equally likely/distant.
|
|
34
|
-
|
|
35
|
-
* **OCR-StringDist (Weighted):**
|
|
36
|
-
* $d(\text{"CODE"}, \text{"C0DE"}) \approx 0.1$ (common error, low cost)
|
|
37
|
-
* $d(\text{"CODE"}, \text{"CXDE"}) = 1.0$ (unlikely error, high cost)
|
|
38
|
-
* Result: Correctly identifies `C0DE` as a much closer match.
|
|
39
|
-
|
|
40
|
-
This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes, database entries).
|
|
41
|
-
|
|
42
|
-
## Installation
|
|
43
|
-
|
|
44
|
-
```bash
|
|
45
|
-
pip install ocr-stringdist
|
|
46
|
-
```
|
|
47
|
-
|
|
48
|
-
## Features
|
|
49
|
-
|
|
50
|
-
- **High Performance**: The core logic is implemented in Rust with speed in mind.
|
|
51
|
-
- **Weighted Levenshtein Distance**: Calculates Levenshtein distance with customizable costs for substitutions, insertions, and deletions. Includes an efficient batch version (`batch_weighted_levenshtein_distance`) for comparing one string against many candidates.
|
|
52
|
-
- **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
|
|
53
|
-
- **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
|
|
54
|
-
- **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
|
|
55
|
-
- **Unicode Support**: Works with arbitrary Unicode strings.
|
|
56
|
-
- **Best Match Finder**: Includes a utility function `find_best_candidate` to efficiently find the best match from a list based on _any_ distance function.
|
|
57
|
-
|
|
58
|
-
## Usage
|
|
59
|
-
|
|
60
|
-
### Basic usage
|
|
61
|
-
|
|
62
|
-
```python
|
|
63
|
-
from ocr_stringdist import WeightedLevenshtein
|
|
64
|
-
|
|
65
|
-
# Default substitution costs are ocr_stringdist.ocr_distance_map.
|
|
66
|
-
wl = WeightedLevenshtein()
|
|
67
|
-
|
|
68
|
-
print(wl.distance("CXDE", "CODE")) # == 1
|
|
69
|
-
print(wl.distance("C0DE", "CODE")) # < 1
|
|
70
|
-
```
|
|
71
|
-
|
|
72
|
-
### Explain the Edit Path
|
|
73
|
-
|
|
74
|
-
```python
|
|
75
|
-
edit_path = wl.explain("C0DE", "CODE")
|
|
76
|
-
print(edit_path)
|
|
77
|
-
# [EditOperation(op_type='substitute', source_token='0', target_token='O', cost=0.1)]
|
|
78
|
-
```
|
|
79
|
-
|
|
80
|
-
### Fast Batch Calculations
|
|
81
|
-
|
|
82
|
-
Quickly compare a string to a list of candidates.
|
|
83
|
-
|
|
84
|
-
```python
|
|
85
|
-
distances: list[float] = wl.batch_distance("CODE", ["CXDE", "C0DE"])
|
|
86
|
-
# [1.0, 0.1]
|
|
87
|
-
```
|
|
88
|
-
|
|
89
|
-
### Multi-character Substitutions
|
|
90
|
-
|
|
91
|
-
```python
|
|
92
|
-
# Custom costs with multi-character substitution
|
|
93
|
-
wl = WeightedLevenshtein(substitution_costs={("In", "h"): 0.5})
|
|
94
|
-
|
|
95
|
-
print(wl.distance("hi", "Ini")) # 0.5
|
|
96
|
-
```
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
## Acknowledgements
|
|
100
|
-
|
|
101
|
-
This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), providing the base implementations of the algorithms used here.
|
|
102
|
-
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
ocr_stringdist-0.2.2.dist-info/METADATA,sha256=2KjG6DHqpsannN0lPK4EwkYBbY3adZrl1oTCq-elnL8,3868
|
|
2
|
-
ocr_stringdist-0.2.2.dist-info/WHEEL,sha256=6cglYgN2x9bsL8KQgndplH9dyQf2UyeKi6L__-GPsk0,107
|
|
3
|
-
ocr_stringdist-0.2.2.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
|
|
4
|
-
ocr_stringdist.libs/libgcc_s-98a1ef30.so.1,sha256=XOVRhHznCIpbSdhFoozhla-OfRqBtXftKPQ4cSMKjrs,433441
|
|
5
|
-
ocr_stringdist/__init__.py,sha256=ApxqraLRcWAkzXhGJXSf3EqGEVFbxghrYrfJ9dmQjQU,467
|
|
6
|
-
ocr_stringdist/_rust_stringdist.cpython-313-x86_64-linux-musl.so,sha256=EcPdJhjPSI5_vBlDULel2-m6U_5pncK31yxEyRFZMVY,821945
|
|
7
|
-
ocr_stringdist/default_ocr_distances.py,sha256=oSu-TpHjPA4jxKpLAfmap8z0ZsC99jsOjnRVHW7Hj_Y,1033
|
|
8
|
-
ocr_stringdist/levenshtein.py,sha256=Jypg31BQyULipJ_Yh3dcBQDKNnbvEIlmf28tDr_gySw,11243
|
|
9
|
-
ocr_stringdist/matching.py,sha256=rr8R63Ttu2hTf5Mni7_P8aGBbjWs6t2QPV3wxKXspAs,3293
|
|
10
|
-
ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
-
ocr_stringdist-0.2.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|