ocr-stringdist 0.0.6__cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl → 0.0.7__cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocr_stringdist/__init__.py +1 -89
- ocr_stringdist/_rust_stringdist.cpython-312-arm-linux-gnueabihf.so +0 -0
- ocr_stringdist/levenshtein.py +71 -0
- {ocr_stringdist-0.0.6.dist-info → ocr_stringdist-0.0.7.dist-info}/METADATA +1 -2
- ocr_stringdist-0.0.7.dist-info/RECORD +10 -0
- ocr_stringdist-0.0.6.dist-info/RECORD +0 -9
- {ocr_stringdist-0.0.6.dist-info → ocr_stringdist-0.0.7.dist-info}/WHEEL +0 -0
- {ocr_stringdist-0.0.6.dist-info → ocr_stringdist-0.0.7.dist-info}/licenses/LICENSE +0 -0
ocr_stringdist/__init__.py
CHANGED
@@ -1,7 +1,5 @@
|
|
1
|
-
from typing import Optional
|
2
|
-
|
3
|
-
from ._rust_stringdist import * # noqa: F403
|
4
1
|
from .default_ocr_distances import ocr_distance_map
|
2
|
+
from .levenshtein import batch_weighted_levenshtein_distance, weighted_levenshtein_distance
|
5
3
|
from .matching import find_best_candidate
|
6
4
|
|
7
5
|
__all__ = [
|
@@ -10,89 +8,3 @@ __all__ = [
|
|
10
8
|
"batch_weighted_levenshtein_distance",
|
11
9
|
"find_best_candidate",
|
12
10
|
]
|
13
|
-
|
14
|
-
|
15
|
-
def weighted_levenshtein_distance(
|
16
|
-
s1: str,
|
17
|
-
s2: str,
|
18
|
-
/,
|
19
|
-
cost_map: Optional[dict[tuple[str, str], float]] = None,
|
20
|
-
*,
|
21
|
-
symmetric: bool = True,
|
22
|
-
default_cost: float = 1.0,
|
23
|
-
max_token_characters: int = 1,
|
24
|
-
) -> float:
|
25
|
-
"""
|
26
|
-
Levenshtein distance with custom substitution costs.
|
27
|
-
Insertion/deletion costs are 1.
|
28
|
-
|
29
|
-
The default `cost_map` considers common OCR errors, see
|
30
|
-
:py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
|
31
|
-
|
32
|
-
:param s1: First string
|
33
|
-
:param s2: Second string
|
34
|
-
:param cost_map: Dictionary mapping tuples of strings ("substitution tokens") to their
|
35
|
-
substitution costs.
|
36
|
-
Only one direction needs to be configured unless `symmetric` is False.
|
37
|
-
Note that you need to set `max_token_characters` if the substitution tokens
|
38
|
-
have more than one character, for example when substituting "w" for "vv".
|
39
|
-
Defaults to `ocr_stringdist.ocr_distance_map`.
|
40
|
-
:param symmetric: Should the keys of `cost_map` be considered to be symmetric? Defaults to True.
|
41
|
-
:param default_cost: The default substitution cost for character pairs not found in `cost_map`.
|
42
|
-
:param max_token_characters: A positive integer, indicating the maximum number of characters a
|
43
|
-
substitution token in `cost_map` may have. The default 1 indicates
|
44
|
-
that only single characters can be substituted for each other.
|
45
|
-
Higher values lead to slower calculations.
|
46
|
-
"""
|
47
|
-
if cost_map is None:
|
48
|
-
cost_map = ocr_distance_map
|
49
|
-
# _weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
|
50
|
-
return _weighted_levenshtein_distance( # type: ignore # noqa: F405
|
51
|
-
s1,
|
52
|
-
s2,
|
53
|
-
cost_map=cost_map,
|
54
|
-
symmetric=symmetric,
|
55
|
-
default_cost=default_cost,
|
56
|
-
max_token_characters=max_token_characters,
|
57
|
-
)
|
58
|
-
|
59
|
-
|
60
|
-
def batch_weighted_levenshtein_distance(
|
61
|
-
s: str,
|
62
|
-
candidates: list[str],
|
63
|
-
/,
|
64
|
-
cost_map: Optional[dict[tuple[str, str], float]] = None,
|
65
|
-
*,
|
66
|
-
symmetric: bool = True,
|
67
|
-
default_cost: float = 1.0,
|
68
|
-
max_token_characters: int = 1,
|
69
|
-
) -> list[float]:
|
70
|
-
"""
|
71
|
-
Calculate weighted Levenshtein distances between a string and multiple candidates.
|
72
|
-
|
73
|
-
This is more efficient than calling :func:`weighted_levenshtein_distance` multiple times.
|
74
|
-
|
75
|
-
:param s: The string to compare
|
76
|
-
:param candidates: List of candidate strings to compare against
|
77
|
-
:param cost_map: Dictionary mapping tuples of characters to their substitution cost.
|
78
|
-
Only one direction needs to be configured unless `symmetric` is False.
|
79
|
-
Defaults to `ocr_stringdist.ocr_distance_map`.
|
80
|
-
:param symmetric: Should the keys of `cost_map` be considered to be symmetric? Defaults to True.
|
81
|
-
:param default_cost: The default substitution cost for character pairs not found in `cost_map`.
|
82
|
-
:param max_token_characters: A positive integer, indicating the maximum number of characters a
|
83
|
-
substitution token in `cost_map` may have. The default 1 indicates
|
84
|
-
that only single characters can be substituted for each other.
|
85
|
-
Higher values lead to slower calculations.
|
86
|
-
:return: A list of distances corresponding to each candidate
|
87
|
-
"""
|
88
|
-
if cost_map is None:
|
89
|
-
cost_map = ocr_distance_map
|
90
|
-
# _batch_weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
|
91
|
-
return _batch_weighted_levenshtein_distance( # type: ignore # noqa: F405
|
92
|
-
s,
|
93
|
-
candidates,
|
94
|
-
cost_map=cost_map,
|
95
|
-
symmetric=symmetric,
|
96
|
-
default_cost=default_cost,
|
97
|
-
max_token_characters=max_token_characters,
|
98
|
-
)
|
Binary file
|
@@ -0,0 +1,71 @@
|
|
1
|
+
from typing import Optional
|
2
|
+
|
3
|
+
from ._rust_stringdist import * # noqa: F403
|
4
|
+
from .default_ocr_distances import ocr_distance_map
|
5
|
+
|
6
|
+
|
7
|
+
def weighted_levenshtein_distance(
|
8
|
+
s1: str,
|
9
|
+
s2: str,
|
10
|
+
/,
|
11
|
+
cost_map: Optional[dict[tuple[str, str], float]] = None,
|
12
|
+
*,
|
13
|
+
symmetric: bool = True,
|
14
|
+
default_cost: float = 1.0,
|
15
|
+
) -> float:
|
16
|
+
"""
|
17
|
+
Levenshtein distance with custom substitution costs.
|
18
|
+
Insertion/deletion costs are 1.
|
19
|
+
|
20
|
+
The default `cost_map` considers common OCR errors, see
|
21
|
+
:py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
|
22
|
+
|
23
|
+
:param s1: First string
|
24
|
+
:param s2: Second string
|
25
|
+
:param cost_map: Dictionary mapping tuples of strings ("substitution tokens") to their
|
26
|
+
substitution costs.
|
27
|
+
Only one direction needs to be configured unless `symmetric` is False.
|
28
|
+
Note that the runtime scales in the length of the longest substitution token.
|
29
|
+
Defaults to `ocr_stringdist.ocr_distance_map`.
|
30
|
+
:param symmetric: Should the keys of `cost_map` be considered to be symmetric? Defaults to True.
|
31
|
+
:param default_cost: The default substitution cost for character pairs not found in `cost_map`.
|
32
|
+
"""
|
33
|
+
if cost_map is None:
|
34
|
+
cost_map = ocr_distance_map
|
35
|
+
# _weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
|
36
|
+
return _weighted_levenshtein_distance( # type: ignore # noqa: F405
|
37
|
+
s1, s2, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
|
38
|
+
)
|
39
|
+
|
40
|
+
|
41
|
+
def batch_weighted_levenshtein_distance(
|
42
|
+
s: str,
|
43
|
+
candidates: list[str],
|
44
|
+
/,
|
45
|
+
cost_map: Optional[dict[tuple[str, str], float]] = None,
|
46
|
+
*,
|
47
|
+
symmetric: bool = True,
|
48
|
+
default_cost: float = 1.0,
|
49
|
+
) -> list[float]:
|
50
|
+
"""
|
51
|
+
Calculate weighted Levenshtein distances between a string and multiple candidates.
|
52
|
+
|
53
|
+
This is more efficient than calling :func:`weighted_levenshtein_distance` multiple times.
|
54
|
+
|
55
|
+
:param s: The string to compare
|
56
|
+
:param candidates: List of candidate strings to compare against
|
57
|
+
:param cost_map: Dictionary mapping tuples of strings ("substitution tokens") to their
|
58
|
+
substitution costs.
|
59
|
+
Only one direction needs to be configured unless `symmetric` is False.
|
60
|
+
Note that the runtime scales in the length of the longest substitution token.
|
61
|
+
Defaults to `ocr_stringdist.ocr_distance_map`.
|
62
|
+
:param symmetric: Should the keys of `cost_map` be considered to be symmetric? Defaults to True.
|
63
|
+
:param default_cost: The default substitution cost for character pairs not found in `cost_map`.
|
64
|
+
:return: A list of distances corresponding to each candidate
|
65
|
+
"""
|
66
|
+
if cost_map is None:
|
67
|
+
cost_map = ocr_distance_map
|
68
|
+
# _batch_weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
|
69
|
+
return _batch_weighted_levenshtein_distance( # type: ignore # noqa: F405
|
70
|
+
s, candidates, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
|
71
|
+
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: ocr_stringdist
|
3
|
-
Version: 0.0.6
|
3
|
+
Version: 0.0.7
|
4
4
|
Classifier: Programming Language :: Rust
|
5
5
|
Classifier: Programming Language :: Python
|
6
6
|
Classifier: Operating System :: OS Independent
|
@@ -60,7 +60,6 @@ distance = osd.weighted_levenshtein_distance(
|
|
60
60
|
"hi", "Ini",
|
61
61
|
cost_map=custom_map,
|
62
62
|
symmetric=True,
|
63
|
-
max_token_characters=2,
|
64
63
|
)
|
65
64
|
print(f"Distance with custom map: {distance}")
|
66
65
|
```
|
@@ -0,0 +1,10 @@
|
|
1
|
+
ocr_stringdist-0.0.7.dist-info/METADATA,sha256=Rx1RtodaH76hveE0246t_Lo-lID1h6rI_j47joUb6Ak,3494
|
2
|
+
ocr_stringdist-0.0.7.dist-info/WHEEL,sha256=SW30N8kywnSNNSd8rZHXljiViD5k9KuBxLtnK5N91_Q,129
|
3
|
+
ocr_stringdist-0.0.7.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
|
4
|
+
ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
ocr_stringdist/__init__.py,sha256=haj_CNOovN9O6j1ixmku6BNTV4U3NQ5JEN2dC_4TmXc,332
|
6
|
+
ocr_stringdist/default_ocr_distances.py,sha256=oSu-TpHjPA4jxKpLAfmap8z0ZsC99jsOjnRVHW7Hj_Y,1033
|
7
|
+
ocr_stringdist/matching.py,sha256=rr8R63Ttu2hTf5Mni7_P8aGBbjWs6t2QPV3wxKXspAs,3293
|
8
|
+
ocr_stringdist/levenshtein.py,sha256=kJBJM5sG_g4MBXizbYVsGMQPWHk2vXWOvMspCYnthCA,3053
|
9
|
+
ocr_stringdist/_rust_stringdist.cpython-312-arm-linux-gnueabihf.so,sha256=1c17EgzaztMl_v2bG3mK5c5-exqb3f1Xbbq1o2kpN6s,727928
|
10
|
+
ocr_stringdist-0.0.7.dist-info/RECORD,,
|
@@ -1,9 +0,0 @@
|
|
1
|
-
ocr_stringdist-0.0.6.dist-info/METADATA,sha256=w_dnhka08_rjLZ3LxFzTqeaUuvUrzer7FkjrqQI2xIg,3522
|
2
|
-
ocr_stringdist-0.0.6.dist-info/WHEEL,sha256=SW30N8kywnSNNSd8rZHXljiViD5k9KuBxLtnK5N91_Q,129
|
3
|
-
ocr_stringdist-0.0.6.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
|
4
|
-
ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
ocr_stringdist/__init__.py,sha256=45mZy8Gx3ygO60XAcWR8ZVS4LAoPcvKd946MwbM9-vc,4172
|
6
|
-
ocr_stringdist/default_ocr_distances.py,sha256=oSu-TpHjPA4jxKpLAfmap8z0ZsC99jsOjnRVHW7Hj_Y,1033
|
7
|
-
ocr_stringdist/matching.py,sha256=rr8R63Ttu2hTf5Mni7_P8aGBbjWs6t2QPV3wxKXspAs,3293
|
8
|
-
ocr_stringdist/_rust_stringdist.cpython-312-arm-linux-gnueabihf.so,sha256=D7IVyBttsogLhQq8jzmWjW0OVo_jx8b7nJffA1EgM0c,736272
|
9
|
-
ocr_stringdist-0.0.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|