ocr-stringdist 0.0.7__pp311-pypy311_pp73-macosx_11_0_arm64.whl → 0.1.0__pp311-pypy311_pp73-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocr_stringdist/_rust_stringdist.pypy311-pp73-darwin.so +0 -0
- ocr_stringdist/levenshtein.py +74 -27
- {ocr_stringdist-0.0.7.dist-info → ocr_stringdist-0.1.0.dist-info}/METADATA +1 -2
- ocr_stringdist-0.1.0.dist-info/RECORD +10 -0
- ocr_stringdist-0.0.7.dist-info/RECORD +0 -10
- {ocr_stringdist-0.0.7.dist-info → ocr_stringdist-0.1.0.dist-info}/WHEEL +0 -0
- {ocr_stringdist-0.0.7.dist-info → ocr_stringdist-0.1.0.dist-info}/licenses/LICENSE +0 -0
Binary file
|
ocr_stringdist/levenshtein.py
CHANGED
@@ -8,33 +8,56 @@ def weighted_levenshtein_distance(
|
|
8
8
|
s1: str,
|
9
9
|
s2: str,
|
10
10
|
/,
|
11
|
-
|
11
|
+
substitution_costs: Optional[dict[tuple[str, str], float]] = None,
|
12
|
+
insertion_costs: Optional[dict[str, float]] = None,
|
13
|
+
deletion_costs: Optional[dict[str, float]] = None,
|
12
14
|
*,
|
13
|
-
|
14
|
-
|
15
|
+
symmetric_substitution: bool = True,
|
16
|
+
default_substitution_cost: float = 1.0,
|
17
|
+
default_insertion_cost: float = 1.0,
|
18
|
+
default_deletion_cost: float = 1.0,
|
15
19
|
) -> float:
|
16
20
|
"""
|
17
|
-
Levenshtein distance with custom substitution costs.
|
18
|
-
Insertion/deletion costs are 1.
|
21
|
+
Levenshtein distance with custom substitution, insertion and deletion costs.
|
19
22
|
|
20
|
-
The default `
|
23
|
+
The default `substitution_costs` considers common OCR errors, see
|
21
24
|
:py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
|
22
25
|
|
23
|
-
:param s1: First string
|
26
|
+
:param s1: First string (interpreted as the string read via OCR)
|
24
27
|
:param s2: Second string
|
25
|
-
:param
|
26
|
-
substitution costs.
|
27
|
-
|
28
|
+
:param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
|
29
|
+
substitution costs. Only one direction needs to be configured unless
|
30
|
+
`symmetric_substitution` is False.
|
28
31
|
Note that the runtime scales in the length of the longest substitution token.
|
29
32
|
Defaults to `ocr_stringdist.ocr_distance_map`.
|
30
|
-
:param
|
31
|
-
:param
|
33
|
+
:param insertion_costs: Dictionary mapping strings to their insertion costs.
|
34
|
+
:param deletion_costs: Dictionary mapping strings to their deletion costs.
|
35
|
+
:param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
|
36
|
+
symmetric? Defaults to True.
|
37
|
+
:param default_substitution_cost: The default substitution cost for character pairs not found
|
38
|
+
in `substitution_costs`.
|
39
|
+
:param default_insertion_cost: The default insertion cost for characters not found in
|
40
|
+
`insertion_costs`.
|
41
|
+
:param default_deletion_cost: The default deletion cost for characters not found in
|
42
|
+
`deletion_costs`.
|
32
43
|
"""
|
33
|
-
if
|
34
|
-
|
44
|
+
if substitution_costs is None:
|
45
|
+
substitution_costs = ocr_distance_map
|
46
|
+
if insertion_costs is None:
|
47
|
+
insertion_costs = {}
|
48
|
+
if deletion_costs is None:
|
49
|
+
deletion_costs = {}
|
35
50
|
# _weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
|
36
51
|
return _weighted_levenshtein_distance( # type: ignore # noqa: F405
|
37
|
-
s1,
|
52
|
+
s1,
|
53
|
+
s2,
|
54
|
+
substitution_costs=substitution_costs,
|
55
|
+
insertion_costs=insertion_costs,
|
56
|
+
deletion_costs=deletion_costs,
|
57
|
+
symmetric_substitution=symmetric_substitution,
|
58
|
+
default_substitution_cost=default_substitution_cost,
|
59
|
+
default_insertion_cost=default_insertion_cost,
|
60
|
+
default_deletion_cost=default_deletion_cost,
|
38
61
|
)
|
39
62
|
|
40
63
|
|
@@ -42,30 +65,54 @@ def batch_weighted_levenshtein_distance(
|
|
42
65
|
s: str,
|
43
66
|
candidates: list[str],
|
44
67
|
/,
|
45
|
-
|
68
|
+
substitution_costs: Optional[dict[tuple[str, str], float]] = None,
|
69
|
+
insertion_costs: Optional[dict[str, float]] = None,
|
70
|
+
deletion_costs: Optional[dict[str, float]] = None,
|
46
71
|
*,
|
47
|
-
|
48
|
-
|
72
|
+
symmetric_substitution: bool = True,
|
73
|
+
default_substitution_cost: float = 1.0,
|
74
|
+
default_insertion_cost: float = 1.0,
|
75
|
+
default_deletion_cost: float = 1.0,
|
49
76
|
) -> list[float]:
|
50
77
|
"""
|
51
78
|
Calculate weighted Levenshtein distances between a string and multiple candidates.
|
52
79
|
|
53
80
|
This is more efficient than calling :func:`weighted_levenshtein_distance` multiple times.
|
54
81
|
|
55
|
-
:param s: The string to compare
|
82
|
+
:param s: The string to compare (interpreted as the string read via OCR)
|
56
83
|
:param candidates: List of candidate strings to compare against
|
57
|
-
:param
|
58
|
-
substitution costs.
|
59
|
-
|
84
|
+
:param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
|
85
|
+
substitution costs. Only one direction needs to be configured unless
|
86
|
+
`symmetric_substitution` is False.
|
60
87
|
Note that the runtime scales in the length of the longest substitution token.
|
61
88
|
Defaults to `ocr_stringdist.ocr_distance_map`.
|
62
|
-
:param
|
63
|
-
:param
|
89
|
+
:param insertion_costs: Dictionary mapping strings to their insertion costs.
|
90
|
+
:param deletion_costs: Dictionary mapping strings to their deletion costs.
|
91
|
+
:param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
|
92
|
+
symmetric? Defaults to True.
|
93
|
+
:param default_substitution_cost: The default substitution cost for character pairs not found
|
94
|
+
in `substitution_costs`.
|
95
|
+
:param default_insertion_cost: The default insertion cost for characters not found in
|
96
|
+
`insertion_costs`.
|
97
|
+
:param default_deletion_cost: The default deletion cost for characters not found in
|
98
|
+
`deletion_costs`.
|
64
99
|
:return: A list of distances corresponding to each candidate
|
65
100
|
"""
|
66
|
-
if
|
67
|
-
|
101
|
+
if substitution_costs is None:
|
102
|
+
substitution_costs = ocr_distance_map
|
103
|
+
if insertion_costs is None:
|
104
|
+
insertion_costs = {}
|
105
|
+
if deletion_costs is None:
|
106
|
+
deletion_costs = {}
|
68
107
|
# _batch_weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
|
69
108
|
return _batch_weighted_levenshtein_distance( # type: ignore # noqa: F405
|
70
|
-
s,
|
109
|
+
s,
|
110
|
+
candidates,
|
111
|
+
substitution_costs=substitution_costs,
|
112
|
+
insertion_costs=insertion_costs,
|
113
|
+
deletion_costs=deletion_costs,
|
114
|
+
symmetric_substitution=symmetric_substitution,
|
115
|
+
default_substitution_cost=default_substitution_cost,
|
116
|
+
default_insertion_cost=default_insertion_cost,
|
117
|
+
default_deletion_cost=default_deletion_cost,
|
71
118
|
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: ocr_stringdist
|
3
|
-
Version: 0.0.7
|
3
|
+
Version: 0.1.0
|
4
4
|
Classifier: Programming Language :: Rust
|
5
5
|
Classifier: Programming Language :: Python
|
6
6
|
Classifier: Operating System :: OS Independent
|
@@ -40,7 +40,6 @@ pip install ocr-stringdist
|
|
40
40
|
- **Unicode Support**: Arbitrary unicode strings can be compared.
|
41
41
|
- **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
|
42
42
|
- **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
|
43
|
-
- **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
|
44
43
|
- **Best Match Finder**: Utility function `find_best_candidate` to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
|
45
44
|
|
46
45
|
## Usage
|
@@ -0,0 +1,10 @@
|
|
1
|
+
ocr_stringdist-0.1.0.dist-info/METADATA,sha256=vQJRhn0AEoIm159PyKQGMmH7JW35ZWUrQ-PWEBRteg8,3388
|
2
|
+
ocr_stringdist-0.1.0.dist-info/WHEEL,sha256=d4nkB8nDBtE7QiNpIO4hl5gGjw3iQWNAqvU5LfTpJ00,111
|
3
|
+
ocr_stringdist-0.1.0.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
|
4
|
+
ocr_stringdist/default_ocr_distances.py,sha256=oSu-TpHjPA4jxKpLAfmap8z0ZsC99jsOjnRVHW7Hj_Y,1033
|
5
|
+
ocr_stringdist/__init__.py,sha256=haj_CNOovN9O6j1ixmku6BNTV4U3NQ5JEN2dC_4TmXc,332
|
6
|
+
ocr_stringdist/matching.py,sha256=rr8R63Ttu2hTf5Mni7_P8aGBbjWs6t2QPV3wxKXspAs,3293
|
7
|
+
ocr_stringdist/levenshtein.py,sha256=hHcarxjOzxhuopNNW3ZkPIcrSx2NjXYUWh9rR59W0Tc,5627
|
8
|
+
ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
+
ocr_stringdist/_rust_stringdist.pypy311-pp73-darwin.so,sha256=r4Ltn067saa8O-9ezDvzcbNMHBUeAGMb0M6gqvbTRco,716912
|
10
|
+
ocr_stringdist-0.1.0.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
ocr_stringdist-0.0.7.dist-info/METADATA,sha256=Rx1RtodaH76hveE0246t_Lo-lID1h6rI_j47joUb6Ak,3494
|
2
|
-
ocr_stringdist-0.0.7.dist-info/WHEEL,sha256=d4nkB8nDBtE7QiNpIO4hl5gGjw3iQWNAqvU5LfTpJ00,111
|
3
|
-
ocr_stringdist-0.0.7.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
|
4
|
-
ocr_stringdist/default_ocr_distances.py,sha256=oSu-TpHjPA4jxKpLAfmap8z0ZsC99jsOjnRVHW7Hj_Y,1033
|
5
|
-
ocr_stringdist/__init__.py,sha256=haj_CNOovN9O6j1ixmku6BNTV4U3NQ5JEN2dC_4TmXc,332
|
6
|
-
ocr_stringdist/matching.py,sha256=rr8R63Ttu2hTf5Mni7_P8aGBbjWs6t2QPV3wxKXspAs,3293
|
7
|
-
ocr_stringdist/levenshtein.py,sha256=kJBJM5sG_g4MBXizbYVsGMQPWHk2vXWOvMspCYnthCA,3053
|
8
|
-
ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
-
ocr_stringdist/_rust_stringdist.pypy311-pp73-darwin.so,sha256=TYzBkI-7RS126tUb6CLlF6G-O6w7stbMk-aXZ9qbynY,676368
|
10
|
-
ocr_stringdist-0.0.7.dist-info/RECORD,,
|
File without changes
|
File without changes
|