ocr-stringdist 0.0.4__cp313-cp313-win32.whl → 0.0.5__cp313-cp313-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocr_stringdist/__init__.py +1 -0
- ocr_stringdist/_rust_stringdist.cp313-win32.pyd +0 -0
- {ocr_stringdist-0.0.4.dist-info → ocr_stringdist-0.0.5.dist-info}/METADATA +9 -5
- ocr_stringdist-0.0.5.dist-info/RECORD +9 -0
- ocr_stringdist-0.0.4.dist-info/RECORD +0 -9
- {ocr_stringdist-0.0.4.dist-info → ocr_stringdist-0.0.5.dist-info}/WHEEL +0 -0
- {ocr_stringdist-0.0.4.dist-info → ocr_stringdist-0.0.5.dist-info}/licenses/LICENSE +0 -0
ocr_stringdist/__init__.py
CHANGED
@@ -36,6 +36,7 @@ def weighted_levenshtein_distance(
|
|
36
36
|
"""
|
37
37
|
if cost_map is None:
|
38
38
|
cost_map = ocr_distance_map
|
39
|
+
# _weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
|
39
40
|
return _weighted_levenshtein_distance( # type: ignore # noqa: F405
|
40
41
|
s1, s2, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
|
41
42
|
)
|
Binary file
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: ocr_stringdist
|
3
|
-
Version: 0.0.4
|
3
|
+
Version: 0.0.5
|
4
4
|
Classifier: Programming Language :: Rust
|
5
5
|
Classifier: Programming Language :: Python
|
6
6
|
Classifier: Operating System :: OS Independent
|
@@ -17,6 +17,8 @@ Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
|
|
17
17
|
|
18
18
|
A Python library for string distance calculations that account for common OCR (optical character recognition) errors.
|
19
19
|
|
20
|
+
Documentation: https://niklasvonm.github.io/ocr-stringdist/
|
21
|
+
|
20
22
|
[PyPI](https://pypi.org/project/ocr-stringdist/)
|
21
23
|
[License](LICENSE)
|
22
24
|
|
@@ -35,9 +37,11 @@ pip install ocr-stringdist
|
|
35
37
|
## Features
|
36
38
|
|
37
39
|
- **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
|
40
|
+
- **Unicode Support**: Arbitrary unicode strings can be compared.
|
41
|
+
- **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
|
38
42
|
- **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
|
39
43
|
- **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
|
40
|
-
- **Best Match Finder**: Utility function find_best_candidate to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
|
44
|
+
- **Best Match Finder**: Utility function `find_best_candidate` to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
|
41
45
|
|
42
46
|
## Usage
|
43
47
|
|
@@ -51,12 +55,12 @@ distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
|
|
51
55
|
print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
|
52
56
|
|
53
57
|
# Custom cost map
|
54
|
-
custom_map = {("
|
58
|
+
custom_map = {("In", "h"): 0.5}
|
55
59
|
distance = osd.weighted_levenshtein_distance(
|
56
|
-
"
|
60
|
+
"hi", "Ini",
|
57
61
|
cost_map=custom_map,
|
58
62
|
symmetric=True,
|
59
|
-
default_cost=1.0
|
63
|
+
default_cost=1.0,
|
60
64
|
)
|
61
65
|
print(f"Distance with custom map: {distance}")
|
62
66
|
```
|
@@ -0,0 +1,9 @@
|
|
1
|
+
ocr_stringdist-0.0.5.dist-info/METADATA,sha256=R1JX6n_V46exR7NjMVdc4tachj3ceA-JsD-OMkQ1AfE,3549
|
2
|
+
ocr_stringdist-0.0.5.dist-info/WHEEL,sha256=eh90R9THiv1HYPhYUCnpm_RAErMfEQKvZWVMxF3uaCM,92
|
3
|
+
ocr_stringdist-0.0.5.dist-info/licenses/LICENSE,sha256=3cNRiJag5vI0KMMDNf0oiaY4vg43rLxRszbMJs1GBoU,1092
|
4
|
+
ocr_stringdist/default_ocr_distances.py,sha256=STNRMGWEYOCHo11uP51JUQfvNrSZleMCxt6wsPkctfg,925
|
5
|
+
ocr_stringdist/matching.py,sha256=KEzYBBEHZhfLA9eD3MxDaehKiD9lUb0RQq74u5qWpVw,3376
|
6
|
+
ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
+
ocr_stringdist/__init__.py,sha256=rEW1u0sWzXDSoziEJzfCgxbKQiYNmrLCdQzFfZ1mDtM,1557
|
8
|
+
ocr_stringdist/_rust_stringdist.cp313-win32.pyd,sha256=tC7HRCuACl-L1_RsgaJw93uaBTAtFHciXps3Q3HwDbU,208384
|
9
|
+
ocr_stringdist-0.0.5.dist-info/RECORD,,
|
@@ -1,9 +0,0 @@
|
|
1
|
-
ocr_stringdist-0.0.4.dist-info/METADATA,sha256=1lSh8HZ9TrBv0BWdBbMZk-_0qbE1tEEUd25xfAFnd7s,3320
|
2
|
-
ocr_stringdist-0.0.4.dist-info/WHEEL,sha256=eh90R9THiv1HYPhYUCnpm_RAErMfEQKvZWVMxF3uaCM,92
|
3
|
-
ocr_stringdist-0.0.4.dist-info/licenses/LICENSE,sha256=3cNRiJag5vI0KMMDNf0oiaY4vg43rLxRszbMJs1GBoU,1092
|
4
|
-
ocr_stringdist/default_ocr_distances.py,sha256=STNRMGWEYOCHo11uP51JUQfvNrSZleMCxt6wsPkctfg,925
|
5
|
-
ocr_stringdist/matching.py,sha256=KEzYBBEHZhfLA9eD3MxDaehKiD9lUb0RQq74u5qWpVw,3376
|
6
|
-
ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
ocr_stringdist/__init__.py,sha256=Z6ZeTSfpKRaUM15FPW00MfLBKVDUCP21Xh5VLtnC4Tk,1471
|
8
|
-
ocr_stringdist/_rust_stringdist.cp313-win32.pyd,sha256=_gG1gEjjgQmcDh0QaxCEI1OLMw5NrUngnvxckxjhl94,199680
|
9
|
-
ocr_stringdist-0.0.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|