ocr-stringdist 0.0.3__cp39-cp39-win32.whl → 0.0.4__cp39-cp39-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocr_stringdist/__init__.py +3 -3
- ocr_stringdist/_rust_stringdist.cp39-win32.pyd +0 -0
- ocr_stringdist/matching.py +83 -0
- {ocr_stringdist-0.0.3.dist-info → ocr_stringdist-0.0.4.dist-info}/METADATA +21 -2
- ocr_stringdist-0.0.4.dist-info/RECORD +9 -0
- ocr_stringdist-0.0.3.dist-info/RECORD +0 -8
- {ocr_stringdist-0.0.3.dist-info → ocr_stringdist-0.0.4.dist-info}/WHEEL +0 -0
- {ocr_stringdist-0.0.3.dist-info → ocr_stringdist-0.0.4.dist-info}/licenses/LICENSE +0 -0
ocr_stringdist/__init__.py
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
from typing import Optional
|
2
2
|
|
3
3
|
from ._rust_stringdist import * # noqa: F403
|
4
|
-
|
5
4
|
from .default_ocr_distances import ocr_distance_map
|
6
|
-
|
5
|
+
from .matching import find_best_candidate
|
7
6
|
|
8
7
|
__all__ = [
|
9
8
|
"ocr_distance_map",
|
10
9
|
"weighted_levenshtein_distance", # noqa: F405
|
10
|
+
"find_best_candidate",
|
11
11
|
]
|
12
12
|
|
13
13
|
|
@@ -36,6 +36,6 @@ def weighted_levenshtein_distance(
|
|
36
36
|
"""
|
37
37
|
if cost_map is None:
|
38
38
|
cost_map = ocr_distance_map
|
39
|
-
return _weighted_levenshtein_distance( # noqa: F405
|
39
|
+
return _weighted_levenshtein_distance( # type: ignore # noqa: F405
|
40
40
|
s1, s2, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
|
41
41
|
)
|
Binary file
|
@@ -0,0 +1,83 @@
|
|
1
|
+
from collections.abc import Callable, Iterable
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
|
5
|
+
def find_best_candidate(
    s: str,
    candidates: Iterable[str],
    distance_fun: Callable[[str, str], float],
    *,
    minimize: bool = True,
    early_return_value: Optional[float] = None,
) -> tuple[str, float]:
    """
    Finds the best matching string from a collection of candidates based on a distance function.

    Compares a given string against each string in the 'candidates'
    iterable using the provided 'distance_fun'. It identifies the candidate
    that yields the minimum (or maximum, if minimize=False) distance.
    On ties, the candidate encountered first wins. The iterable is consumed
    at most once, so lazy iterables such as generators are supported.

    :param s: The reference string to compare against.
    :type s: str
    :param candidates: An iterable of candidate strings to compare with 's'.
    :type candidates: Iterable[str]
    :param distance_fun: A function that takes two strings (s, candidate) and
                         returns a float representing their distance or similarity.
    :type distance_fun: Callable[[str, str], float]
    :param minimize: If True (default), finds the candidate with the minimum
                     distance. If False, finds the candidate with the maximum
                     distance (useful for similarity scores).
    :type minimize: bool
    :param early_return_value: If provided, the function will return immediately
                               if a distance is found that is less than or equal
                               to this value (if minimize=True) or greater than
                               or equal to this value (if minimize=False).
                               If None (default), all candidates are checked.
    :type early_return_value: Optional[float]
    :raises ValueError: If the 'candidates' iterable yields no items.
    :return: A tuple containing the best matching candidate string and its
             calculated distance/score. If no candidate strictly improves on
             the initial sentinel (e.g. every distance is infinite), the first
             candidate and its distance are returned.
    :rtype: tuple[str, float]

    :Example:

    >>> from ocr_stringdist import weighted_levenshtein_distance as distance
    >>> s = "apple"
    >>> candidates = ["apply", "apples", "orange", "appIe"]
    >>> find_best_candidate(s, candidates, lambda s1, s2: distance(s1, s2, {("l", "I"): 0.1}))
    ('appIe', 0.1)
    """
    # Sentinel that any finite distance strictly improves upon.
    best_distance = float("inf") if minimize else -float("inf")
    best_candidate: Optional[str] = None
    # First (candidate, distance) pair seen. Doubles as the "was anything
    # consumed?" flag, because truth-testing 'candidates' itself cannot detect
    # an empty generator/iterator (those are always truthy).
    first_seen: Optional[tuple[str, float]] = None

    for candidate in candidates:
        current_distance = distance_fun(s, candidate)
        if first_seen is None:
            first_seen = (candidate, current_distance)

        if early_return_value is not None:
            if minimize:
                good_enough = current_distance <= early_return_value
            else:
                good_enough = current_distance >= early_return_value
            if good_enough:
                return candidate, current_distance

        if minimize:
            improved = current_distance < best_distance
        else:
            improved = current_distance > best_distance
        if improved:
            best_distance = current_distance
            best_candidate = candidate

    if first_seen is None:
        raise ValueError("The 'candidates' iterable cannot be empty.")

    if best_candidate is None:
        # Nothing beat the sentinel; return a real candidate rather than a
        # placeholder string that was never part of the input.
        return first_seen

    return best_candidate, best_distance
|
@@ -1,8 +1,8 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: ocr_stringdist
|
3
|
-
Version: 0.0.3
|
3
|
+
Version: 0.0.4
|
4
4
|
Classifier: Programming Language :: Rust
|
5
|
-
Classifier: Programming Language :: Python
|
5
|
+
Classifier: Programming Language :: Python
|
6
6
|
Classifier: Operating System :: OS Independent
|
7
7
|
License-File: LICENSE
|
8
8
|
Summary: String distances considering OCR errors.
|
@@ -37,9 +37,12 @@ pip install ocr-stringdist
|
|
37
37
|
- **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
|
38
38
|
- **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
|
39
39
|
- **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
|
40
|
+
- **Best Match Finder**: Utility function find_best_candidate to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones). Supports early stopping for performance optimization.
|
40
41
|
|
41
42
|
## Usage
|
42
43
|
|
44
|
+
### Weighted Levenshtein Distance
|
45
|
+
|
43
46
|
```python
|
44
47
|
import ocr_stringdist as osd
|
45
48
|
|
@@ -58,6 +61,22 @@ distance = osd.weighted_levenshtein_distance(
|
|
58
61
|
print(f"Distance with custom map: {distance}")
|
59
62
|
```
|
60
63
|
|
64
|
+
### Finding the Best Candidate
|
65
|
+
|
66
|
+
```python
|
67
|
+
import ocr_stringdist as osd
|
68
|
+
|
69
|
+
s = "apple"
|
70
|
+
candidates = ["apply", "apples", "orange", "appIe"] # 'appIe' has an OCR-like error
|
71
|
+
|
72
|
+
def ocr_aware_distance(s1: str, s2: str) -> float:
|
73
|
+
return osd.weighted_levenshtein_distance(s1, s2, cost_map={("l", "I"): 0.1})
|
74
|
+
|
75
|
+
best_candidate, best_dist = osd.find_best_candidate(s, candidates, ocr_aware_distance)
|
76
|
+
print(f"Best candidate for '{s}' is '{best_candidate}' with distance {best_dist}")
|
77
|
+
# Output: Best candidate for 'apple' is 'appIe' with distance 0.1
|
78
|
+
```
|
79
|
+
|
61
80
|
## Acknowledgements
|
62
81
|
|
63
82
|
This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), providing the base implementations of the algorithms used here.
|
@@ -0,0 +1,9 @@
|
|
1
|
+
ocr_stringdist-0.0.4.dist-info/METADATA,sha256=1lSh8HZ9TrBv0BWdBbMZk-_0qbE1tEEUd25xfAFnd7s,3320
|
2
|
+
ocr_stringdist-0.0.4.dist-info/WHEEL,sha256=0Nk1AmIV1z6U4gJsrIxrgrzvCxDT7puUGM_j24me7T0,90
|
3
|
+
ocr_stringdist-0.0.4.dist-info/licenses/LICENSE,sha256=3cNRiJag5vI0KMMDNf0oiaY4vg43rLxRszbMJs1GBoU,1092
|
4
|
+
ocr_stringdist/default_ocr_distances.py,sha256=STNRMGWEYOCHo11uP51JUQfvNrSZleMCxt6wsPkctfg,925
|
5
|
+
ocr_stringdist/matching.py,sha256=KEzYBBEHZhfLA9eD3MxDaehKiD9lUb0RQq74u5qWpVw,3376
|
6
|
+
ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
+
ocr_stringdist/__init__.py,sha256=Z6ZeTSfpKRaUM15FPW00MfLBKVDUCP21Xh5VLtnC4Tk,1471
|
8
|
+
ocr_stringdist/_rust_stringdist.cp39-win32.pyd,sha256=PWbzLvhNtGpBERO5T54Ug7MZGeKOERhBUu2xRxp1Rdg,204800
|
9
|
+
ocr_stringdist-0.0.4.dist-info/RECORD,,
|
@@ -1,8 +0,0 @@
|
|
1
|
-
ocr_stringdist-0.0.3.dist-info/METADATA,sha256=xoqr4v-Rf9nommTK1eLLOCj2YPuubbCq4aW2FBhDsYc,2475
|
2
|
-
ocr_stringdist-0.0.3.dist-info/WHEEL,sha256=0Nk1AmIV1z6U4gJsrIxrgrzvCxDT7puUGM_j24me7T0,90
|
3
|
-
ocr_stringdist-0.0.3.dist-info/licenses/LICENSE,sha256=3cNRiJag5vI0KMMDNf0oiaY4vg43rLxRszbMJs1GBoU,1092
|
4
|
-
ocr_stringdist/default_ocr_distances.py,sha256=STNRMGWEYOCHo11uP51JUQfvNrSZleMCxt6wsPkctfg,925
|
5
|
-
ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
ocr_stringdist/__init__.py,sha256=9u9MIGHrywSF0aC0ZW9RJiJJEB-EVbl74O9eGvzlELQ,1388
|
7
|
-
ocr_stringdist/_rust_stringdist.cp39-win32.pyd,sha256=jLq3GpQGiFDUOYnwN0Gwmo_dYyf8Wh_brHbgtPf73Gw,204800
|
8
|
-
ocr_stringdist-0.0.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|