ocr-stringdist 0.1.0__pp311-pypy311_pp73-musllinux_1_1_aarch64.whl → 0.2.1__pp311-pypy311_pp73-musllinux_1_1_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,17 @@
1
1
  from .default_ocr_distances import ocr_distance_map
2
- from .levenshtein import batch_weighted_levenshtein_distance, weighted_levenshtein_distance
2
+ from .levenshtein import (
3
+ WeightedLevenshtein,
4
+ batch_weighted_levenshtein_distance,
5
+ explain_weighted_levenshtein,
6
+ weighted_levenshtein_distance,
7
+ )
3
8
  from .matching import find_best_candidate
4
9
 
5
10
  __all__ = [
6
11
  "ocr_distance_map",
12
+ "WeightedLevenshtein",
7
13
  "weighted_levenshtein_distance",
8
14
  "batch_weighted_levenshtein_distance",
15
+ "explain_weighted_levenshtein",
9
16
  "find_best_candidate",
10
17
  ]
@@ -1,8 +1,95 @@
1
- from typing import Optional
1
+ from __future__ import annotations
2
2
 
3
- from ._rust_stringdist import * # noqa: F403
3
+ from dataclasses import dataclass
4
+ from typing import Literal, Optional
5
+
6
+ from ._rust_stringdist import (
7
+ _batch_weighted_levenshtein_distance,
8
+ _explain_weighted_levenshtein_distance,
9
+ _weighted_levenshtein_distance,
10
+ )
4
11
  from .default_ocr_distances import ocr_distance_map
5
12
 
13
+ OperationType = Literal["substitute", "insert", "delete"]
14
+
15
+
16
+ @dataclass(frozen=True)
17
+ class EditOperation:
18
+ """
19
+ Represents a single edit operation (substitution, insertion, or deletion).
20
+ """
21
+
22
+ op_type: OperationType
23
+ source_token: Optional[str]
24
+ target_token: Optional[str]
25
+ cost: float
26
+
27
+
28
+ class WeightedLevenshtein:
29
+ """
30
+ Calculates Levenshtein distance with custom, configurable costs.
31
+
32
+ This class is initialized with cost dictionaries and settings that define
33
+ how the distance is measured. Once created, its methods can be used to
34
+ efficiently compute distances and explain the edit operations.
35
+
36
+ :param substitution_costs: Maps (char, char) tuples to their substitution cost.
37
+ Defaults to costs based on common OCR errors.
38
+ :param insertion_costs: Maps a character to its insertion cost.
39
+ :param deletion_costs: Maps a character to its deletion cost.
40
+ :param symmetric_substitution: If True, substitution costs are bidirectional.
41
+ :param default_substitution_cost: Default cost for substitutions not in the map.
42
+ :param default_insertion_cost: Default cost for insertions not in the map.
43
+ :param default_deletion_cost: Default cost for deletions not in the map.
44
+ """
45
+
46
+ substitution_costs: dict[tuple[str, str], float]
47
+ insertion_costs: dict[str, float]
48
+ deletion_costs: dict[str, float]
49
+ symmetric_substitution: bool
50
+ default_substitution_cost: float
51
+ default_insertion_cost: float
52
+ default_deletion_cost: float
53
+
54
+ def __init__(
55
+ self,
56
+ substitution_costs: Optional[dict[tuple[str, str], float]] = None,
57
+ insertion_costs: Optional[dict[str, float]] = None,
58
+ deletion_costs: Optional[dict[str, float]] = None,
59
+ *,
60
+ symmetric_substitution: bool = True,
61
+ default_substitution_cost: float = 1.0,
62
+ default_insertion_cost: float = 1.0,
63
+ default_deletion_cost: float = 1.0,
64
+ ) -> None:
65
+ self.substitution_costs = (
66
+ ocr_distance_map if substitution_costs is None else substitution_costs
67
+ )
68
+ self.insertion_costs = {} if insertion_costs is None else insertion_costs
69
+ self.deletion_costs = {} if deletion_costs is None else deletion_costs
70
+ self.symmetric_substitution = symmetric_substitution
71
+ self.default_substitution_cost = default_substitution_cost
72
+ self.default_insertion_cost = default_insertion_cost
73
+ self.default_deletion_cost = default_deletion_cost
74
+
75
+ @classmethod
76
+ def unweighted(cls) -> WeightedLevenshtein:
77
+ """Creates an instance with all operations having equal cost of 1.0."""
78
+ return cls(substitution_costs={}, insertion_costs={}, deletion_costs={})
79
+
80
+ def distance(self, s1: str, s2: str) -> float:
81
+ """Calculates the weighted Levenshtein distance between two strings."""
82
+ return _weighted_levenshtein_distance(s1, s2, **self.__dict__) # type: ignore[no-any-return]
83
+
84
+ def explain(self, s1: str, s2: str) -> list[EditOperation]:
85
+ """Returns the list of edit operations to transform s1 into s2."""
86
+ raw_path = _explain_weighted_levenshtein_distance(s1, s2, **self.__dict__)
87
+ return [EditOperation(*op) for op in raw_path]
88
+
89
+ def batch_distance(self, s: str, candidates: list[str]) -> list[float]:
90
+ """Calculates distances between a string and a list of candidates."""
91
+ return _batch_weighted_levenshtein_distance(s, candidates, **self.__dict__) # type: ignore[no-any-return]
92
+
6
93
 
7
94
  def weighted_levenshtein_distance(
8
95
  s1: str,
@@ -20,6 +107,8 @@ def weighted_levenshtein_distance(
20
107
  """
21
108
  Levenshtein distance with custom substitution, insertion and deletion costs.
22
109
 
110
+ See also :meth:`WeightedLevenshtein.distance`.
111
+
23
112
  The default `substitution_costs` considers common OCR errors, see
24
113
  :py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
25
114
 
@@ -41,16 +130,7 @@ def weighted_levenshtein_distance(
41
130
  :param default_deletion_cost: The default deletion cost for characters not found in
42
131
  `deletion_costs`.
43
132
  """
44
- if substitution_costs is None:
45
- substitution_costs = ocr_distance_map
46
- if insertion_costs is None:
47
- insertion_costs = {}
48
- if deletion_costs is None:
49
- deletion_costs = {}
50
- # _weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
51
- return _weighted_levenshtein_distance( # type: ignore # noqa: F405
52
- s1,
53
- s2,
133
+ return WeightedLevenshtein(
54
134
  substitution_costs=substitution_costs,
55
135
  insertion_costs=insertion_costs,
56
136
  deletion_costs=deletion_costs,
@@ -58,7 +138,7 @@ def weighted_levenshtein_distance(
58
138
  default_substitution_cost=default_substitution_cost,
59
139
  default_insertion_cost=default_insertion_cost,
60
140
  default_deletion_cost=default_deletion_cost,
61
- )
141
+ ).distance(s1, s2)
62
142
 
63
143
 
64
144
  def batch_weighted_levenshtein_distance(
@@ -77,6 +157,8 @@ def batch_weighted_levenshtein_distance(
77
157
  """
78
158
  Calculate weighted Levenshtein distances between a string and multiple candidates.
79
159
 
160
+ See also :meth:`WeightedLevenshtein.batch_distance`.
161
+
80
162
  This is more efficient than calling :func:`weighted_levenshtein_distance` multiple times.
81
163
 
82
164
  :param s: The string to compare (interpreted as the string read via OCR)
@@ -98,16 +180,58 @@ def batch_weighted_levenshtein_distance(
98
180
  `deletion_costs`.
99
181
  :return: A list of distances corresponding to each candidate
100
182
  """
101
- if substitution_costs is None:
102
- substitution_costs = ocr_distance_map
103
- if insertion_costs is None:
104
- insertion_costs = {}
105
- if deletion_costs is None:
106
- deletion_costs = {}
107
- # _batch_weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
108
- return _batch_weighted_levenshtein_distance( # type: ignore # noqa: F405
109
- s,
110
- candidates,
183
+ return WeightedLevenshtein(
184
+ substitution_costs=substitution_costs,
185
+ insertion_costs=insertion_costs,
186
+ deletion_costs=deletion_costs,
187
+ symmetric_substitution=symmetric_substitution,
188
+ default_substitution_cost=default_substitution_cost,
189
+ default_insertion_cost=default_insertion_cost,
190
+ default_deletion_cost=default_deletion_cost,
191
+ ).batch_distance(s, candidates)
192
+
193
+
194
+ def explain_weighted_levenshtein(
195
+ s1: str,
196
+ s2: str,
197
+ /,
198
+ substitution_costs: Optional[dict[tuple[str, str], float]] = None,
199
+ insertion_costs: Optional[dict[str, float]] = None,
200
+ deletion_costs: Optional[dict[str, float]] = None,
201
+ *,
202
+ symmetric_substitution: bool = True,
203
+ default_substitution_cost: float = 1.0,
204
+ default_insertion_cost: float = 1.0,
205
+ default_deletion_cost: float = 1.0,
206
+ ) -> list[EditOperation]:
207
+ """
208
+ Computes the path of operations associated with the custom Levenshtein distance.
209
+
210
+ See also :meth:`WeightedLevenshtein.explain`.
211
+
212
+ The default `substitution_costs` considers common OCR errors, see
213
+ :py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
214
+
215
+ :param s1: First string (interpreted as the string read via OCR)
216
+ :param s2: Second string
217
+ :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
218
+ substitution costs. Only one direction needs to be configured unless
219
+ `symmetric_substitution` is False.
220
+ Note that the runtime scales in the length of the longest substitution token.
221
+ Defaults to `ocr_stringdist.ocr_distance_map`.
222
+ :param insertion_costs: Dictionary mapping strings to their insertion costs.
223
+ :param deletion_costs: Dictionary mapping strings to their deletion costs.
224
+ :param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
225
+ symmetric? Defaults to True.
226
+ :param default_substitution_cost: The default substitution cost for character pairs not found
227
+ in `substitution_costs`.
228
+ :param default_insertion_cost: The default insertion cost for characters not found in
229
+ `insertion_costs`.
230
+ :param default_deletion_cost: The default deletion cost for characters not found in
231
+ `deletion_costs`.
232
+ :return: List of :class:`EditOperation` instances.
233
+ """
234
+ return WeightedLevenshtein(
111
235
  substitution_costs=substitution_costs,
112
236
  insertion_costs=insertion_costs,
113
237
  deletion_costs=deletion_costs,
@@ -115,4 +239,4 @@ def batch_weighted_levenshtein_distance(
115
239
  default_substitution_cost=default_substitution_cost,
116
240
  default_insertion_cost=default_insertion_cost,
117
241
  default_deletion_cost=default_deletion_cost,
118
- )
242
+ ).explain(s1, s2)
@@ -0,0 +1,81 @@
1
+ Metadata-Version: 2.4
2
+ Name: ocr_stringdist
3
+ Version: 0.2.1
4
+ Classifier: Programming Language :: Rust
5
+ Classifier: Programming Language :: Python
6
+ Classifier: Operating System :: OS Independent
7
+ License-File: LICENSE
8
+ Requires-Python: >=3.9
9
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
10
+ Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
11
+
12
+ # OCR-StringDist
13
+
14
+ A Python library for fast string distance calculations that account for common OCR (optical character recognition) errors.
15
+
16
+ Documentation: https://niklasvonm.github.io/ocr-stringdist/
17
+
18
+ [![PyPI](https://img.shields.io/badge/PyPI-Package-blue)](https://pypi.org/project/ocr-stringdist/)
19
+ [![License](https://img.shields.io/badge/License-MIT-green)](LICENSE)
20
+
21
+ ## Overview
22
+
23
+ Standard string distances (like Levenshtein) treat all character substitutions equally. This is suboptimal for text read from images via OCR, where errors like `O` vs `0` are far more common than, say, `O` vs `X`.
24
+
25
+ OCR-StringDist uses a **weighted Levenshtein distance**, assigning lower costs to common OCR errors.
26
+
27
+ **Example:** Matching against the correct word `CODE`:
28
+
29
+ * **Standard Levenshtein:**
30
+ * $d(\text{"CODE"}, \text{"C0DE"}) = 1$ (O → 0)
31
+ * $d(\text{"CODE"}, \text{"CXDE"}) = 1$ (O → X)
32
+ * Result: Both appear equally likely/distant.
33
+
34
+ * **OCR-StringDist (Weighted):**
35
+ * $d(\text{"CODE"}, \text{"C0DE"}) \approx 0.1$ (common error, low cost)
36
+ * $d(\text{"CODE"}, \text{"CXDE"}) = 1.0$ (unlikely error, high cost)
37
+ * Result: Correctly identifies `C0DE` as a much closer match.
38
+
39
+ This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes, database entries).
40
+
41
+ > **Note:** This project is in early development. APIs may change in future releases.
42
+
43
+ ## Installation
44
+
45
+ ```bash
46
+ pip install ocr-stringdist
47
+ ```
48
+
49
+ ## Features
50
+
51
+ - **Weighted Levenshtein Distance**: Calculates Levenshtein distance with customizable costs for substitutions, insertions, and deletions. Includes an efficient batch version (`batch_weighted_levenshtein_distance`) for comparing one string against many candidates.
52
+ - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
53
+ - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
54
+ - **Unicode Support**: Works with arbitrary Unicode strings.
55
+ - **Best Match Finder**: Includes a utility function `find_best_candidate` to efficiently find the best match from a list based on _any_ distance function.
56
+
57
+ ## Usage
58
+
59
+ ### Weighted Levenshtein Distance
60
+
61
+ ```python
62
+ import ocr_stringdist as osd
63
+
64
+ # Using default OCR distance map
65
+ distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
66
+ print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
67
+
68
+ # Custom cost map
69
+ substitution_costs = {("In", "h"): 0.5}
70
+ distance = osd.weighted_levenshtein_distance(
71
+ "hi", "Ini",
72
+ substitution_costs=substitution_costs,
73
+ symmetric_substitution=True,
74
+ )
75
+ print(f"Distance with custom map: {distance}")
76
+ ```
77
+
78
+ ## Acknowledgements
79
+
80
+ This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), which provides the base implementations of the algorithms used here.
81
+
@@ -0,0 +1,11 @@
1
+ ocr_stringdist-0.2.1.dist-info/METADATA,sha256=dIjhLqdKIzSgqyX45jQ6mZTzZjm3UOcghfe2zYoKeS0,3320
2
+ ocr_stringdist-0.2.1.dist-info/WHEEL,sha256=oFlwWNYZ4chUb5MrQOIOJ496c7LWejWxpKDjpxQikkk,115
3
+ ocr_stringdist-0.2.1.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
4
+ ocr_stringdist.libs/libgcc_s-e52197c3.so.1,sha256=vkPW1Auw6CH9Bjk7frmX3hry_1H9c0tRI0Ncyg71WUI,724137
5
+ ocr_stringdist/__init__.py,sha256=ApxqraLRcWAkzXhGJXSf3EqGEVFbxghrYrfJ9dmQjQU,467
6
+ ocr_stringdist/_rust_stringdist.pypy311-pp73-aarch64-linux-gnu.so,sha256=9FxbjWu14W6X4HX-O-ngrovT2qp77p9RsHdmj7kNMwA,920353
7
+ ocr_stringdist/default_ocr_distances.py,sha256=oSu-TpHjPA4jxKpLAfmap8z0ZsC99jsOjnRVHW7Hj_Y,1033
8
+ ocr_stringdist/levenshtein.py,sha256=Jypg31BQyULipJ_Yh3dcBQDKNnbvEIlmf28tDr_gySw,11243
9
+ ocr_stringdist/matching.py,sha256=rr8R63Ttu2hTf5Mni7_P8aGBbjWs6t2QPV3wxKXspAs,3293
10
+ ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ ocr_stringdist-0.2.1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: maturin (1.8.3)
2
+ Generator: maturin (1.9.4)
3
3
  Root-Is-Purelib: false
4
4
  Tag: pp311-pypy311_pp73-musllinux_1_1_aarch64
@@ -1,85 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: ocr_stringdist
3
- Version: 0.1.0
4
- Classifier: Programming Language :: Rust
5
- Classifier: Programming Language :: Python
6
- Classifier: Operating System :: OS Independent
7
- License-File: LICENSE
8
- Summary: String distances considering OCR errors.
9
- Author: Niklas von Moers <niklasvmoers@protonmail.com>
10
- Author-email: Niklas von Moers <niklasvmoers@protonmail.com>
11
- License: MIT
12
- Requires-Python: >=3.9
13
- Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
14
- Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
15
-
16
- # OCR-StringDist
17
-
18
- A Python library for string distance calculations that account for common OCR (optical character recognition) errors.
19
-
20
- Documentation: https://niklasvonm.github.io/ocr-stringdist/
21
-
22
- [![PyPI](https://img.shields.io/badge/PyPI-Package-blue)](https://pypi.org/project/ocr-stringdist/)
23
- [![License](https://img.shields.io/badge/License-MIT-green)](LICENSE)
24
-
25
- ## Overview
26
-
27
- OCR-StringDist provides specialized string distance algorithms that accommodate for optical character recognition (OCR) errors. Unlike traditional string comparison algorithms, OCR-StringDist considers common OCR confusions (like "0" vs "O", "6" vs "G", etc.) when calculating distances between strings.
28
-
29
- > **Note:** This project is in early development. APIs may change in future releases.
30
-
31
- ## Installation
32
-
33
- ```bash
34
- pip install ocr-stringdist
35
- ```
36
-
37
- ## Features
38
-
39
- - **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models, including efficient batch processing.
40
- - **Unicode Support**: Arbitrary unicode strings can be compared.
41
- - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
42
- - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
43
- - **Best Match Finder**: Utility function `find_best_candidate` to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
44
-
45
- ## Usage
46
-
47
- ### Weighted Levenshtein Distance
48
-
49
- ```python
50
- import ocr_stringdist as osd
51
-
52
- # Using default OCR distance map
53
- distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
54
- print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
55
-
56
- # Custom cost map
57
- custom_map = {("In", "h"): 0.5}
58
- distance = osd.weighted_levenshtein_distance(
59
- "hi", "Ini",
60
- cost_map=custom_map,
61
- symmetric=True,
62
- )
63
- print(f"Distance with custom map: {distance}")
64
- ```
65
-
66
- ### Finding the Best Candidate
67
-
68
- ```python
69
- import ocr_stringdist as osd
70
-
71
- s = "apple"
72
- candidates = ["apply", "apples", "orange", "appIe"] # 'appIe' has an OCR-like error
73
-
74
- def ocr_aware_distance(s1: str, s2: str) -> float:
75
- return osd.weighted_levenshtein_distance(s1, s2, cost_map={("l", "I"): 0.1})
76
-
77
- best_candidate, best_dist = osd.find_best_candidate(s, candidates, ocr_aware_distance)
78
- print(f"Best candidate for '{s}' is '{best_candidate}' with distance {best_dist}")
79
- # Output: Best candidate for 'apple' is 'appIe' with distance 0.1
80
- ```
81
-
82
- ## Acknowledgements
83
-
84
- This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), providing the base implementations of the algorithms used here.
85
-
@@ -1,11 +0,0 @@
1
- ocr_stringdist-0.1.0.dist-info/METADATA,sha256=vQJRhn0AEoIm159PyKQGMmH7JW35ZWUrQ-PWEBRteg8,3388
2
- ocr_stringdist-0.1.0.dist-info/WHEEL,sha256=A8-e_GHIEbo9CenYBC58SYnAVCE4DtU0zUUTexlpjXQ,115
3
- ocr_stringdist-0.1.0.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
4
- ocr_stringdist.libs/libgcc_s-e52197c3.so.1,sha256=vkPW1Auw6CH9Bjk7frmX3hry_1H9c0tRI0Ncyg71WUI,724137
5
- ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- ocr_stringdist/default_ocr_distances.py,sha256=oSu-TpHjPA4jxKpLAfmap8z0ZsC99jsOjnRVHW7Hj_Y,1033
7
- ocr_stringdist/levenshtein.py,sha256=hHcarxjOzxhuopNNW3ZkPIcrSx2NjXYUWh9rR59W0Tc,5627
8
- ocr_stringdist/matching.py,sha256=rr8R63Ttu2hTf5Mni7_P8aGBbjWs6t2QPV3wxKXspAs,3293
9
- ocr_stringdist/__init__.py,sha256=haj_CNOovN9O6j1ixmku6BNTV4U3NQ5JEN2dC_4TmXc,332
10
- ocr_stringdist/_rust_stringdist.pypy311-pp73-aarch64-linux-gnu.so,sha256=Dh__T5Gi3Ra81uPiG0Dq359YcQ6CQ4BkhHCJYIoDgFg,920353
11
- ocr_stringdist-0.1.0.dist-info/RECORD,,