ocr-stringdist 0.0.7__cp311-cp311-musllinux_1_1_aarch64.whl → 0.2.0__cp311-cp311-musllinux_1_1_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,17 @@
1
1
  from .default_ocr_distances import ocr_distance_map
2
- from .levenshtein import batch_weighted_levenshtein_distance, weighted_levenshtein_distance
2
+ from .levenshtein import (
3
+ WeightedLevenshtein,
4
+ batch_weighted_levenshtein_distance,
5
+ explain_weighted_levenshtein,
6
+ weighted_levenshtein_distance,
7
+ )
3
8
  from .matching import find_best_candidate
4
9
 
5
10
  __all__ = [
6
11
  "ocr_distance_map",
12
+ "WeightedLevenshtein",
7
13
  "weighted_levenshtein_distance",
8
14
  "batch_weighted_levenshtein_distance",
15
+ "explain_weighted_levenshtein",
9
16
  "find_best_candidate",
10
17
  ]
@@ -1,71 +1,242 @@
1
- from typing import Optional
1
+ from __future__ import annotations
2
2
 
3
- from ._rust_stringdist import * # noqa: F403
3
+ from dataclasses import dataclass
4
+ from typing import Literal, Optional
5
+
6
+ from ._rust_stringdist import (
7
+ _batch_weighted_levenshtein_distance,
8
+ _explain_weighted_levenshtein_distance,
9
+ _weighted_levenshtein_distance,
10
+ )
4
11
  from .default_ocr_distances import ocr_distance_map
5
12
 
13
# Closed set of edit kinds an :class:`EditOperation` can carry.
OperationType = Literal["substitute", "insert", "delete"]


@dataclass(frozen=True)
class EditOperation:
    """
    One step of an edit path: a substitution, an insertion, or a deletion.

    Instances are immutable because the dataclass is frozen.

    :ivar op_type: Kind of edit; one of ``"substitute"``, ``"insert"`` or ``"delete"``.
    :ivar source_token: Token on the source side of the edit, or ``None``.
    :ivar target_token: Token on the target side of the edit, or ``None``.
    :ivar cost: Cost attributed to this single operation.
    """

    op_type: OperationType
    # NOTE(review): presumably None for insertions (nothing consumed from the
    # source string) -- confirm against the Rust implementation.
    source_token: Optional[str]
    # NOTE(review): presumably None for deletions (nothing produced in the
    # target string) -- confirm against the Rust implementation.
    target_token: Optional[str]
    cost: float
26
+
27
+
28
class WeightedLevenshtein:
    """
    Calculates Levenshtein distance with custom, configurable costs.

    An instance bundles the cost dictionaries and settings that define how the
    distance is measured. Its methods forward that configuration, together with
    the strings to compare, to the Rust implementation.

    :param substitution_costs: Maps (token, token) tuples to their substitution cost.
                               Defaults to ``ocr_distance_map`` (costs based on common
                               OCR errors) when ``None``.
    :param insertion_costs: Maps a token to its insertion cost. Defaults to an empty dict.
    :param deletion_costs: Maps a token to its deletion cost. Defaults to an empty dict.
    :param symmetric_substitution: If True, substitution costs are bidirectional.
    :param default_substitution_cost: Cost for substitutions not in ``substitution_costs``.
    :param default_insertion_cost: Cost for insertions not in ``insertion_costs``.
    :param default_deletion_cost: Cost for deletions not in ``deletion_costs``.
    """

    substitution_costs: dict[tuple[str, str], float]
    insertion_costs: dict[str, float]
    deletion_costs: dict[str, float]
    symmetric_substitution: bool
    default_substitution_cost: float
    default_insertion_cost: float
    default_deletion_cost: float

    def __init__(
        self,
        substitution_costs: Optional[dict[tuple[str, str], float]] = None,
        insertion_costs: Optional[dict[str, float]] = None,
        deletion_costs: Optional[dict[str, float]] = None,
        *,
        symmetric_substitution: bool = True,
        default_substitution_cost: float = 1.0,
        default_insertion_cost: float = 1.0,
        default_deletion_cost: float = 1.0,
    ) -> None:
        # Only ``None`` selects the defaults; an explicitly passed empty dict is
        # kept as-is (``unweighted`` relies on this).
        self.substitution_costs = (
            ocr_distance_map if substitution_costs is None else substitution_costs
        )
        self.insertion_costs = {} if insertion_costs is None else insertion_costs
        self.deletion_costs = {} if deletion_costs is None else deletion_costs
        self.symmetric_substitution = symmetric_substitution
        self.default_substitution_cost = default_substitution_cost
        self.default_insertion_cost = default_insertion_cost
        self.default_deletion_cost = default_deletion_cost

    @classmethod
    def unweighted(cls) -> "WeightedLevenshtein":
        """Creates an instance with all operations having equal cost of 1.0."""
        return cls(substitution_costs={}, insertion_costs={}, deletion_costs={})

    def _cost_kwargs(self) -> dict[str, object]:
        """
        Builds the keyword arguments forwarded to the Rust functions.

        The fields are listed explicitly instead of splatting ``self.__dict__`` so
        that any additional instance attribute (set by a subclass or by calling
        code) is never passed on as an unexpected keyword argument.
        """
        return {
            "substitution_costs": self.substitution_costs,
            "insertion_costs": self.insertion_costs,
            "deletion_costs": self.deletion_costs,
            "symmetric_substitution": self.symmetric_substitution,
            "default_substitution_cost": self.default_substitution_cost,
            "default_insertion_cost": self.default_insertion_cost,
            "default_deletion_cost": self.default_deletion_cost,
        }

    def distance(self, s1: str, s2: str) -> float:
        """
        Calculates the weighted Levenshtein distance between two strings.

        :param s1: First string
        :param s2: Second string
        :return: The distance computed by the Rust implementation.
        """
        return _weighted_levenshtein_distance(s1, s2, **self._cost_kwargs())  # type: ignore[no-any-return]

    def explain(self, s1: str, s2: str) -> "list[EditOperation]":
        """
        Returns the list of edit operations to transform s1 into s2.

        :param s1: First string
        :param s2: Second string
        :return: One :class:`EditOperation` per entry of the path reported by the
                 Rust implementation.
        """
        raw_path = _explain_weighted_levenshtein_distance(s1, s2, **self._cost_kwargs())
        # Each raw entry is unpacked positionally into the EditOperation fields.
        return [EditOperation(*op) for op in raw_path]

    def batch_distance(self, s: str, candidates: list[str]) -> list[float]:
        """
        Calculates distances between a string and a list of candidates.

        :param s: The string to compare
        :param candidates: List of candidate strings to compare against
        :return: A list of distances as returned by the Rust implementation.
        """
        return _batch_weighted_levenshtein_distance(s, candidates, **self._cost_kwargs())  # type: ignore[no-any-return]
92
+
6
93
 
7
94
def weighted_levenshtein_distance(
    s1: str,
    s2: str,
    /,
    substitution_costs: Optional[dict[tuple[str, str], float]] = None,
    insertion_costs: Optional[dict[str, float]] = None,
    deletion_costs: Optional[dict[str, float]] = None,
    *,
    symmetric_substitution: bool = True,
    default_substitution_cost: float = 1.0,
    default_insertion_cost: float = 1.0,
    default_deletion_cost: float = 1.0,
) -> float:
    """
    Weighted Levenshtein distance with custom substitution, insertion and deletion costs.

    Functional shortcut that builds a :class:`WeightedLevenshtein` from the given
    settings and calls :meth:`WeightedLevenshtein.distance` on it.

    When `substitution_costs` is omitted, the default map of common OCR errors is
    used, see :py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.

    :param s1: First string (interpreted as the string read via OCR)
    :param s2: Second string
    :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens")
                               to their substitution costs. Only one direction needs to be
                               configured unless `symmetric_substitution` is False.
                               Note that the runtime scales with the length of the longest
                               substitution token.
                               Defaults to `ocr_stringdist.ocr_distance_map`.
    :param insertion_costs: Dictionary mapping strings to their insertion costs.
    :param deletion_costs: Dictionary mapping strings to their deletion costs.
    :param symmetric_substitution: Should the keys of `substitution_costs` be considered to
                                   be symmetric? Defaults to True.
    :param default_substitution_cost: The default substitution cost for character pairs not
                                      found in `substitution_costs`.
    :param default_insertion_cost: The default insertion cost for characters not found in
                                   `insertion_costs`.
    :param default_deletion_cost: The default deletion cost for characters not found in
                                  `deletion_costs`.
    :return: The weighted distance between `s1` and `s2`.
    """
    calculator = WeightedLevenshtein(
        substitution_costs,
        insertion_costs,
        deletion_costs,
        symmetric_substitution=symmetric_substitution,
        default_substitution_cost=default_substitution_cost,
        default_insertion_cost=default_insertion_cost,
        default_deletion_cost=default_deletion_cost,
    )
    return calculator.distance(s1, s2)
39
142
 
40
143
 
41
144
def batch_weighted_levenshtein_distance(
    s: str,
    candidates: list[str],
    /,
    substitution_costs: Optional[dict[tuple[str, str], float]] = None,
    insertion_costs: Optional[dict[str, float]] = None,
    deletion_costs: Optional[dict[str, float]] = None,
    *,
    symmetric_substitution: bool = True,
    default_substitution_cost: float = 1.0,
    default_insertion_cost: float = 1.0,
    default_deletion_cost: float = 1.0,
) -> list[float]:
    """
    Weighted Levenshtein distances between one string and several candidates.

    Functional shortcut that builds a :class:`WeightedLevenshtein` from the given
    settings and calls :meth:`WeightedLevenshtein.batch_distance` on it.

    This is more efficient than calling :func:`weighted_levenshtein_distance` multiple times.

    :param s: The string to compare (interpreted as the string read via OCR)
    :param candidates: List of candidate strings to compare against
    :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens")
                               to their substitution costs. Only one direction needs to be
                               configured unless `symmetric_substitution` is False.
                               Note that the runtime scales with the length of the longest
                               substitution token.
                               Defaults to `ocr_stringdist.ocr_distance_map`.
    :param insertion_costs: Dictionary mapping strings to their insertion costs.
    :param deletion_costs: Dictionary mapping strings to their deletion costs.
    :param symmetric_substitution: Should the keys of `substitution_costs` be considered to
                                   be symmetric? Defaults to True.
    :param default_substitution_cost: The default substitution cost for character pairs not
                                      found in `substitution_costs`.
    :param default_insertion_cost: The default insertion cost for characters not found in
                                   `insertion_costs`.
    :param default_deletion_cost: The default deletion cost for characters not found in
                                  `deletion_costs`.
    :return: A list of distances corresponding to each candidate
    """
    calculator = WeightedLevenshtein(
        substitution_costs,
        insertion_costs,
        deletion_costs,
        symmetric_substitution=symmetric_substitution,
        default_substitution_cost=default_substitution_cost,
        default_insertion_cost=default_insertion_cost,
        default_deletion_cost=default_deletion_cost,
    )
    return calculator.batch_distance(s, candidates)
192
+
193
+
194
def explain_weighted_levenshtein(
    s1: str,
    s2: str,
    /,
    substitution_costs: Optional[dict[tuple[str, str], float]] = None,
    insertion_costs: Optional[dict[str, float]] = None,
    deletion_costs: Optional[dict[str, float]] = None,
    *,
    symmetric_substitution: bool = True,
    default_substitution_cost: float = 1.0,
    default_insertion_cost: float = 1.0,
    default_deletion_cost: float = 1.0,
) -> list[EditOperation]:
    """
    Path of edit operations associated with the custom Levenshtein distance.

    Functional shortcut that builds a :class:`WeightedLevenshtein` from the given
    settings and calls :meth:`WeightedLevenshtein.explain` on it.

    When `substitution_costs` is omitted, the default map of common OCR errors is
    used, see :py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.

    :param s1: First string (interpreted as the string read via OCR)
    :param s2: Second string
    :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens")
                               to their substitution costs. Only one direction needs to be
                               configured unless `symmetric_substitution` is False.
                               Note that the runtime scales with the length of the longest
                               substitution token.
                               Defaults to `ocr_stringdist.ocr_distance_map`.
    :param insertion_costs: Dictionary mapping strings to their insertion costs.
    :param deletion_costs: Dictionary mapping strings to their deletion costs.
    :param symmetric_substitution: Should the keys of `substitution_costs` be considered to
                                   be symmetric? Defaults to True.
    :param default_substitution_cost: The default substitution cost for character pairs not
                                      found in `substitution_costs`.
    :param default_insertion_cost: The default insertion cost for characters not found in
                                   `insertion_costs`.
    :param default_deletion_cost: The default deletion cost for characters not found in
                                  `deletion_costs`.
    :return: List of :class:`EditOperation` instances.
    """
    calculator = WeightedLevenshtein(
        substitution_costs,
        insertion_costs,
        deletion_costs,
        symmetric_substitution=symmetric_substitution,
        default_substitution_cost=default_substitution_cost,
        default_insertion_cost=default_insertion_cost,
        default_deletion_cost=default_deletion_cost,
    )
    return calculator.explain(s1, s2)
@@ -0,0 +1,9 @@
1
+ Metadata-Version: 2.4
2
+ Name: ocr_stringdist
3
+ Version: 0.2.0
4
+ Classifier: Programming Language :: Rust
5
+ Classifier: Programming Language :: Python
6
+ Classifier: Operating System :: OS Independent
7
+ License-File: LICENSE
8
+ Requires-Python: >=3.9
9
+ Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
@@ -0,0 +1,11 @@
1
+ ocr_stringdist-0.2.0.dist-info/METADATA,sha256=OVF3jUKVM038ogWfwZIHmpu3eUXdeuS1Cy-t96G8Tgo,304
2
+ ocr_stringdist-0.2.0.dist-info/WHEEL,sha256=YZ-7NO8DvWz9hvgmCkZeUq1zdF91KUkb-9x6mgD1Qlg,108
3
+ ocr_stringdist-0.2.0.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
4
+ ocr_stringdist.libs/libgcc_s-e52197c3.so.1,sha256=vkPW1Auw6CH9Bjk7frmX3hry_1H9c0tRI0Ncyg71WUI,724137
5
+ ocr_stringdist/__init__.py,sha256=ApxqraLRcWAkzXhGJXSf3EqGEVFbxghrYrfJ9dmQjQU,467
6
+ ocr_stringdist/_rust_stringdist.cpython-311-aarch64-linux-musl.so,sha256=IQyxOxiF8SXAyxpLrbtWSPNjKTyQ4r3KmlthYLqXnwU,920193
7
+ ocr_stringdist/default_ocr_distances.py,sha256=oSu-TpHjPA4jxKpLAfmap8z0ZsC99jsOjnRVHW7Hj_Y,1033
8
+ ocr_stringdist/levenshtein.py,sha256=Jypg31BQyULipJ_Yh3dcBQDKNnbvEIlmf28tDr_gySw,11243
9
+ ocr_stringdist/matching.py,sha256=rr8R63Ttu2hTf5Mni7_P8aGBbjWs6t2QPV3wxKXspAs,3293
10
+ ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ ocr_stringdist-0.2.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: maturin (1.8.3)
2
+ Generator: maturin (1.9.4)
3
3
  Root-Is-Purelib: false
4
4
  Tag: cp311-cp311-musllinux_1_1_aarch64
@@ -1,86 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: ocr_stringdist
3
- Version: 0.0.7
4
- Classifier: Programming Language :: Rust
5
- Classifier: Programming Language :: Python
6
- Classifier: Operating System :: OS Independent
7
- License-File: LICENSE
8
- Summary: String distances considering OCR errors.
9
- Author: Niklas von Moers <niklasvmoers@protonmail.com>
10
- Author-email: Niklas von Moers <niklasvmoers@protonmail.com>
11
- License: MIT
12
- Requires-Python: >=3.9
13
- Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
14
- Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
15
-
16
- # OCR-StringDist
17
-
18
- A Python library for string distance calculations that account for common OCR (optical character recognition) errors.
19
-
20
- Documentation: https://niklasvonm.github.io/ocr-stringdist/
21
-
22
- [![PyPI](https://img.shields.io/badge/PyPI-Package-blue)](https://pypi.org/project/ocr-stringdist/)
23
- [![License](https://img.shields.io/badge/License-MIT-green)](LICENSE)
24
-
25
- ## Overview
26
-
27
- OCR-StringDist provides specialized string distance algorithms that accommodate for optical character recognition (OCR) errors. Unlike traditional string comparison algorithms, OCR-StringDist considers common OCR confusions (like "0" vs "O", "6" vs "G", etc.) when calculating distances between strings.
28
-
29
- > **Note:** This project is in early development. APIs may change in future releases.
30
-
31
- ## Installation
32
-
33
- ```bash
34
- pip install ocr-stringdist
35
- ```
36
-
37
- ## Features
38
-
39
- - **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models, including efficient batch processing.
40
- - **Unicode Support**: Arbitrary unicode strings can be compared.
41
- - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
42
- - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
43
- - **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
44
- - **Best Match Finder**: Utility function `find_best_candidate` to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
45
-
46
- ## Usage
47
-
48
- ### Weighted Levenshtein Distance
49
-
50
- ```python
51
- import ocr_stringdist as osd
52
-
53
- # Using default OCR distance map
54
- distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
55
- print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
56
-
57
- # Custom cost map
58
- custom_map = {("In", "h"): 0.5}
59
- distance = osd.weighted_levenshtein_distance(
60
- "hi", "Ini",
61
- cost_map=custom_map,
62
- symmetric=True,
63
- )
64
- print(f"Distance with custom map: {distance}")
65
- ```
66
-
67
- ### Finding the Best Candidate
68
-
69
- ```python
70
- import ocr_stringdist as osd
71
-
72
- s = "apple"
73
- candidates = ["apply", "apples", "orange", "appIe"] # 'appIe' has an OCR-like error
74
-
75
- def ocr_aware_distance(s1: str, s2: str) -> float:
76
- return osd.weighted_levenshtein_distance(s1, s2, cost_map={("l", "I"): 0.1})
77
-
78
- best_candidate, best_dist = osd.find_best_candidate(s, candidates, ocr_aware_distance)
79
- print(f"Best candidate for '{s}' is '{best_candidate}' with distance {best_dist}")
80
- # Output: Best candidate for 'apple' is 'appIe' with distance 0.1
81
- ```
82
-
83
- ## Acknowledgements
84
-
85
- This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), providing the base implementations of the algorithms used here.
86
-
@@ -1,11 +0,0 @@
1
- ocr_stringdist-0.0.7.dist-info/METADATA,sha256=Rx1RtodaH76hveE0246t_Lo-lID1h6rI_j47joUb6Ak,3494
2
- ocr_stringdist-0.0.7.dist-info/WHEEL,sha256=_6kdhUAwdXFNoVDlhgbbljfv0z-zRN2adeNGPnKJZD8,108
3
- ocr_stringdist-0.0.7.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
4
- ocr_stringdist.libs/libgcc_s-e52197c3.so.1,sha256=vkPW1Auw6CH9Bjk7frmX3hry_1H9c0tRI0Ncyg71WUI,724137
5
- ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- ocr_stringdist/__init__.py,sha256=haj_CNOovN9O6j1ixmku6BNTV4U3NQ5JEN2dC_4TmXc,332
7
- ocr_stringdist/default_ocr_distances.py,sha256=oSu-TpHjPA4jxKpLAfmap8z0ZsC99jsOjnRVHW7Hj_Y,1033
8
- ocr_stringdist/matching.py,sha256=rr8R63Ttu2hTf5Mni7_P8aGBbjWs6t2QPV3wxKXspAs,3293
9
- ocr_stringdist/levenshtein.py,sha256=kJBJM5sG_g4MBXizbYVsGMQPWHk2vXWOvMspCYnthCA,3053
10
- ocr_stringdist/_rust_stringdist.cpython-311-aarch64-linux-musl.so,sha256=5lSxG3u28KJNdAU-apGJXOuSeASuUYqb5tQBe4g_rHA,854657
11
- ocr_stringdist-0.0.7.dist-info/RECORD,,