ocr-stringdist 0.0.1__cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl → 0.0.3__cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1,41 @@
1
- from ._rust_stringdist import *
1
+ from typing import Optional
2
+
3
+ from ._rust_stringdist import * # noqa: F403
4
+
5
+ from .default_ocr_distances import ocr_distance_map
6
+
7
+
8
+ __all__ = [
9
+ "ocr_distance_map",
10
+ "weighted_levenshtein_distance", # noqa: F405
11
+ ]
12
+
13
+
14
def weighted_levenshtein_distance(
    s1: str,
    s2: str,
    /,
    cost_map: Optional[dict[tuple[str, str], float]] = None,
    *,
    symmetric: bool = True,
    default_cost: float = 1.0,
) -> float:
    """
    Compute a Levenshtein distance whose substitution costs are configurable.

    Insertions and deletions always cost 1. Substitution costs are looked up
    in `cost_map`; when no map is passed, `ocr_stringdist.ocr_distance_map`
    (costs for common OCR confusions) is used.

    :param s1: First string
    :param s2: Second string
    :param cost_map: Dictionary mapping tuples of characters to their substitution cost.
                     Only one direction needs to be configured unless `symmetric` is False.
                     Defaults to `ocr_stringdist.ocr_distance_map`.
    :param symmetric: Whether the keys of `cost_map` apply in both directions. Defaults to True.
    :param default_cost: Substitution cost for character pairs not found in `cost_map`.
    :return: The distance computed by the compiled `_weighted_levenshtein_distance` backend.
    """
    # An omitted map selects the packaged OCR confusion costs.
    substitution_costs = ocr_distance_map if cost_map is None else cost_map
    return _weighted_levenshtein_distance(  # noqa: F405
        s1,
        s2,
        cost_map=substitution_costs,
        symmetric=symmetric,
        default_cost=default_cost,
    )
@@ -0,0 +1,38 @@
1
# Substitution costs for character pairs that OCR commonly confuses.
# Entries are ordered by ascending cost, and each unordered pair appears once.
ocr_distance_map: dict[tuple[str, str], float] = {
    ("O", "0"): 0.1,
    ("l", "1"): 0.1,
    ("I", "1"): 0.15,
    ("o", "0"): 0.2,
    ("B", "8"): 0.25,
    ("S", "5"): 0.3,
    ("G", "6"): 0.3,
    # Listed once only: a reversed ("2", "Z") entry with a different cost
    # would give this pair two conflicting costs under symmetric=True.
    ("Z", "2"): 0.3,
    ("C", "c"): 0.3,
    ("é", "e"): 0.3,
    ("Ä", "A"): 0.4,
    ("Ö", "O"): 0.4,
    ("Ü", "U"): 0.4,
    ("c", "e"): 0.4,
    ("a", "o"): 0.4,
    ("u", "v"): 0.4,
    ("i", "l"): 0.4,
    ("s", "5"): 0.4,
    ("m", "n"): 0.5,
    ("f", "s"): 0.5,
    (".", ","): 0.5,
    ("t", "f"): 0.6,
    ("r", "n"): 0.6,
    ("-", "_"): 0.6,
    ("ß", "B"): 0.6,
    ("h", "b"): 0.7,
    ("v", "y"): 0.7,
    ("i", "j"): 0.7,
    ("é", "á"): 0.7,
    ("E", "F"): 0.8,
}
"""
Pre-defined distance map between characters, considering common OCR errors.
The distances are between 0 and 1.
This map is intended to be used with `symmetric=True`, so every character pair
is listed in one direction only.
"""
@@ -0,0 +1,64 @@
1
+ Metadata-Version: 2.4
2
+ Name: ocr_stringdist
3
+ Version: 0.0.3
4
+ Classifier: Programming Language :: Rust
5
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
6
+ Classifier: Operating System :: OS Independent
7
+ License-File: LICENSE
8
+ Summary: String distances considering OCR errors.
9
+ Author: Niklas von Moers <niklasvmoers@protonmail.com>
10
+ Author-email: Niklas von Moers <niklasvmoers@protonmail.com>
11
+ License: MIT
12
+ Requires-Python: >=3.9
13
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
14
+ Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
15
+
16
+ # OCR-StringDist
17
+
18
+ A Python library for string distance calculations that account for common OCR (optical character recognition) errors.
19
+
20
+ [![PyPI](https://img.shields.io/badge/PyPI-Package-blue)](https://pypi.org/project/ocr-stringdist/)
21
+ [![License](https://img.shields.io/badge/License-MIT-green)](LICENSE)
22
+
23
+ ## Overview
24
+
25
+ OCR-StringDist provides specialized string distance algorithms that account for optical character recognition (OCR) errors. Unlike traditional string comparison algorithms, OCR-StringDist considers common OCR confusions (like "0" vs "O", "6" vs "G", etc.) when calculating distances between strings.
26
+
27
+ > **Note:** This project is in early development. APIs may change in future releases.
28
+
29
+ ## Installation
30
+
31
+ ```bash
32
+ pip install ocr-stringdist
33
+ ```
34
+
35
+ ## Features
36
+
37
+ - **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
38
+ - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
39
+ - **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
40
+
41
+ ## Usage
42
+
43
+ ```python
44
+ import ocr_stringdist as osd
45
+
46
+ # Using default OCR distance map
47
+ distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
48
+ print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
49
+
50
+ # Custom cost map
51
+ custom_map = {("f", "t"): 0.2, ("m", "n"): 0.1}
52
+ distance = osd.weighted_levenshtein_distance(
53
+ "first", "tirst",
54
+ cost_map=custom_map,
55
+ symmetric=True,
56
+ default_cost=1.0
57
+ )
58
+ print(f"Distance with custom map: {distance}")
59
+ ```
60
+
61
+ ## Acknowledgements
62
+
63
+ This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), which provides the base implementations of the algorithms used here.
64
+
@@ -0,0 +1,8 @@
1
+ ocr_stringdist-0.0.3.dist-info/METADATA,sha256=8zNTXdk7LHrW3hhpozRJoxUX_lFfUAwi04c3HajrDQ0,2427
2
+ ocr_stringdist-0.0.3.dist-info/WHEEL,sha256=g5K2e2K_d6dSxc4iNLXoY6_5bqvbTor5WhBlTFP_8rM,129
3
+ ocr_stringdist-0.0.3.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
4
+ ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ ocr_stringdist/__init__.py,sha256=80efQ2jxe_BjTFQHbsPXYfpk5Cu6R9sOxrEY7tI8sGk,1347
6
+ ocr_stringdist/default_ocr_distances.py,sha256=8jmR5aLrEfrm5Fj2-nEqdTCKEmoEcm8DxBWv7IQd5_k,887
7
+ ocr_stringdist/_rust_stringdist.cpython-311-arm-linux-gnueabihf.so,sha256=YCOBeLCQQohUNHvgojIW2EcNNbtoX18RsRYJQx0hTm4,576676
8
+ ocr_stringdist-0.0.3.dist-info/RECORD,,
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Niklas von Moers
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -1,8 +0,0 @@
1
- def levenshtein_distance(s1: str, s2: str) -> int: ...
2
- def ocr_weighted_levenshtein_distance(s1: str, s2: str) -> float: ...
3
- def custom_weighted_levenshtein_distance(
4
- s1: str,
5
- s2: str,
6
- cost_map: dict[tuple[str, str], float],
7
- default_cost: float | None = None
8
- ) -> float: ...
@@ -1,16 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: ocr_stringdist
3
- Version: 0.0.1
4
- Classifier: Programming Language :: Rust
5
- Classifier: Programming Language :: Python :: Implementation :: PyPy
6
- Classifier: Operating System :: OS Independent
7
- Summary: String distances considering OCR errors.
8
- Author: Niklas von Moers <niklasvmoers@protonmail.com>
9
- Author-email: Niklas von Moers <niklasvmoers@protonmail.com>
10
- License: MIT
11
- Requires-Python: >=3.9
12
- Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
13
- Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
14
-
15
- # OCR-Stringdist
16
-
@@ -1,7 +0,0 @@
1
- ocr_stringdist-0.0.1.dist-info/METADATA,sha256=wzwsLfsR7-TwdpbHNwd09kUufLa_mq7Q38qwMjkQRI8,574
2
- ocr_stringdist-0.0.1.dist-info/WHEEL,sha256=g5K2e2K_d6dSxc4iNLXoY6_5bqvbTor5WhBlTFP_8rM,129
3
- ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- ocr_stringdist/__init__.py,sha256=XDpr7RXrjnR6Ct3cFl8f_6VoSXbshoIKWha-g3p6aR4,32
5
- ocr_stringdist/__init__.pyi,sha256=YCjks23HgatBVfwi3EkDPPrgM4ctQ3f2apesgA5S13Q,294
6
- ocr_stringdist/_rust_stringdist.cpython-311-arm-linux-gnueabihf.so,sha256=BLc51sMpBOM9D5lo8ZI6wAGpNMvqLTthHgtWShWiEww,578000
7
- ocr_stringdist-0.0.1.dist-info/RECORD,,