ocr-stringdist 0.0.3__cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl → 0.0.5__cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,13 @@
1
1
  from typing import Optional
2
2
 
3
3
  from ._rust_stringdist import * # noqa: F403
4
-
5
4
  from .default_ocr_distances import ocr_distance_map
6
-
5
+ from .matching import find_best_candidate
7
6
 
8
7
  __all__ = [
9
8
  "ocr_distance_map",
10
9
  "weighted_levenshtein_distance", # noqa: F405
10
+ "find_best_candidate",
11
11
  ]
12
12
 
13
13
 
@@ -36,6 +36,7 @@ def weighted_levenshtein_distance(
36
36
  """
37
37
  if cost_map is None:
38
38
  cost_map = ocr_distance_map
39
- return _weighted_levenshtein_distance( # noqa: F405
39
+ # _weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
40
+ return _weighted_levenshtein_distance( # type: ignore # noqa: F405
40
41
  s1, s2, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
41
42
  )
@@ -0,0 +1,83 @@
1
+ from collections.abc import Callable, Iterable
2
+ from typing import Optional
3
+
4
+
5
def find_best_candidate(
    s: str,
    candidates: Iterable[str],
    distance_fun: Callable[[str, str], float],
    *,
    minimize: bool = True,
    early_return_value: Optional[float] = None,
) -> tuple[str, float]:
    """
    Finds the best matching string from a collection of candidates based on a distance function.

    Compares a given string against each string in the 'candidates'
    iterable using the provided 'distance_fun'. It identifies the candidate
    that yields the minimum (or maximum, if minimize=False) distance.
    On ties, the candidate encountered first wins.

    :param s: The reference string to compare against.
    :type s: str
    :param candidates: An iterable of candidate strings to compare with 's'.
                       Any iterable is accepted, including one-shot generators;
                       it is consumed at most once.
    :type candidates: Iterable[str]
    :param distance_fun: A function that takes two strings (s, candidate) and
                         returns a float representing their distance or similarity.
    :type distance_fun: Callable[[str, str], float]
    :param minimize: If True (default), finds the candidate with the minimum
                     distance. If False, finds the candidate with the maximum
                     distance (useful for similarity scores).
    :type minimize: bool
    :param early_return_value: If provided, the function will return immediately
                               if a distance is found that is less than or equal
                               to this value (if minimize=True) or greater than
                               or equal to this value (if minimize=False).
                               If None (default), all candidates are checked.
    :type early_return_value: Optional[float]
    :raises ValueError: If the 'candidates' iterable is empty.
    :return: A tuple containing the best matching candidate string and its
             calculated distance/score.
    :rtype: tuple[str, float]

    :Example:

    >>> from ocr_stringdist import weighted_levenshtein_distance as distance
    >>> s = "apple"
    >>> candidates = ["apply", "apples", "orange", "appIe"]
    >>> find_best_candidate(s, candidates, lambda s1, s2: distance(s1, s2, cost_map={("l", "I"): 0.1}))
    ('appIe', 0.1)
    """
    if minimize:
        best_distance = float("inf")

        def is_next_best(current: float, best: float) -> bool:
            return current < best

        def can_return_early(current: float, threshold: float) -> bool:
            return current <= threshold
    else:
        best_distance = -float("inf")

        def is_next_best(current: float, best: float) -> bool:
            return current > best

        def can_return_early(current: float, threshold: float) -> bool:
            return current >= threshold

    best_candidate: Optional[str] = None
    # Emptiness cannot be tested with ``not candidates``: generators and other
    # iterators are always truthy. Instead, remember the first item actually seen.
    first_seen: Optional[tuple[str, float]] = None

    for candidate in candidates:
        current_distance = distance_fun(s, candidate)
        if first_seen is None:
            first_seen = (candidate, current_distance)

        if early_return_value is not None and can_return_early(
            current_distance, early_return_value
        ):
            return candidate, current_distance
        if is_next_best(current_distance, best_distance):
            best_distance = current_distance
            best_candidate = candidate

    if first_seen is None:
        raise ValueError("The 'candidates' iterable cannot be empty.")
    if best_candidate is None:
        # No distance strictly improved on the +/-inf sentinel (e.g. every
        # distance was inf or NaN). Return the first real candidate rather
        # than a made-up empty string that was never among the candidates.
        return first_seen
    return best_candidate, best_distance
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ocr_stringdist
3
- Version: 0.0.3
3
+ Version: 0.0.5
4
4
  Classifier: Programming Language :: Rust
5
- Classifier: Programming Language :: Python :: Implementation :: PyPy
5
+ Classifier: Programming Language :: Python
6
6
  Classifier: Operating System :: OS Independent
7
7
  License-File: LICENSE
8
8
  Summary: String distances considering OCR errors.
@@ -17,6 +17,8 @@ Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
17
17
 
18
18
  A Python library for string distance calculations that account for common OCR (optical character recognition) errors.
19
19
 
20
+ Documentation: https://niklasvonm.github.io/ocr-stringdist/
21
+
20
22
  [![PyPI](https://img.shields.io/badge/PyPI-Package-blue)](https://pypi.org/project/ocr-stringdist/)
21
23
  [![License](https://img.shields.io/badge/License-MIT-green)](LICENSE)
22
24
 
@@ -35,11 +37,16 @@ pip install ocr-stringdist
35
37
  ## Features
36
38
 
37
39
  - **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
40
+ - **Unicode Support**: Arbitrary unicode strings can be compared.
41
+ - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
38
42
  - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
39
43
  - **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
44
+ - **Best Match Finder**: Utility function `find_best_candidate` to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
40
45
 
41
46
  ## Usage
42
47
 
48
+ ### Weighted Levenshtein Distance
49
+
43
50
  ```python
44
51
  import ocr_stringdist as osd
45
52
 
@@ -48,16 +55,32 @@ distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
48
55
  print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
49
56
 
50
57
  # Custom cost map
51
- custom_map = {("f", "t"): 0.2, ("m", "n"): 0.1}
58
+ custom_map = {("In", "h"): 0.5}
52
59
  distance = osd.weighted_levenshtein_distance(
53
- "first", "tirst",
60
+ "hi", "Ini",
54
61
  cost_map=custom_map,
55
62
  symmetric=True,
56
- default_cost=1.0
63
+ default_cost=1.0,
57
64
  )
58
65
  print(f"Distance with custom map: {distance}")
59
66
  ```
60
67
 
68
+ ### Finding the Best Candidate
69
+
70
+ ```python
71
+ import ocr_stringdist as osd
72
+
73
+ s = "apple"
74
+ candidates = ["apply", "apples", "orange", "appIe"] # 'appIe' has an OCR-like error
75
+
76
+ def ocr_aware_distance(s1: str, s2: str) -> float:
77
+ return osd.weighted_levenshtein_distance(s1, s2, cost_map={("l", "I"): 0.1})
78
+
79
+ best_candidate, best_dist = osd.find_best_candidate(s, candidates, ocr_aware_distance)
80
+ print(f"Best candidate for '{s}' is '{best_candidate}' with distance {best_dist}")
81
+ # Output: Best candidate for 'apple' is 'appIe' with distance 0.1
82
+ ```
83
+
61
84
  ## Acknowledgements
62
85
 
63
86
  This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), providing the base implementations of the algorithms used here.
@@ -0,0 +1,9 @@
1
+ ocr_stringdist-0.0.5.dist-info/METADATA,sha256=vX4O04CresnzPoQnQSnejENFPcgSK5XWqRewrBvl9BU,3478
2
+ ocr_stringdist-0.0.5.dist-info/WHEEL,sha256=txrf3mWfYT8fuidWcn4w4bCG5e7tCVvnYzYyHWO_WFg,129
3
+ ocr_stringdist-0.0.5.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
4
+ ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ ocr_stringdist/__init__.py,sha256=zL-0Bmo6jCas2QlDCGxL2vPbjoNrdpLQ9kPfsAjC0QI,1515
6
+ ocr_stringdist/default_ocr_distances.py,sha256=8jmR5aLrEfrm5Fj2-nEqdTCKEmoEcm8DxBWv7IQd5_k,887
7
+ ocr_stringdist/matching.py,sha256=rr8R63Ttu2hTf5Mni7_P8aGBbjWs6t2QPV3wxKXspAs,3293
8
+ ocr_stringdist/_rust_stringdist.cpython-310-arm-linux-gnueabihf.so,sha256=64cHzzTkkCjYrJpATtOb726gEygPdlGopd47mqiDxkc,581768
9
+ ocr_stringdist-0.0.5.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- ocr_stringdist-0.0.3.dist-info/METADATA,sha256=8zNTXdk7LHrW3hhpozRJoxUX_lFfUAwi04c3HajrDQ0,2427
2
- ocr_stringdist-0.0.3.dist-info/WHEEL,sha256=txrf3mWfYT8fuidWcn4w4bCG5e7tCVvnYzYyHWO_WFg,129
3
- ocr_stringdist-0.0.3.dist-info/licenses/LICENSE,sha256=5BPRcjlnbl2t4TidSgpfGrtC_birSf8JlZfA-qmVoQE,1072
4
- ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- ocr_stringdist/__init__.py,sha256=80efQ2jxe_BjTFQHbsPXYfpk5Cu6R9sOxrEY7tI8sGk,1347
6
- ocr_stringdist/default_ocr_distances.py,sha256=8jmR5aLrEfrm5Fj2-nEqdTCKEmoEcm8DxBWv7IQd5_k,887
7
- ocr_stringdist/_rust_stringdist.cpython-310-arm-linux-gnueabihf.so,sha256=1iq3f9YfiZzUkpqhaf8DWUXP0QXt4jYC9oEUNeLkveI,577264
8
- ocr_stringdist-0.0.3.dist-info/RECORD,,