ocr-stringdist 0.0.2__cp39-cp39-win32.whl → 0.0.4__cp39-cp39-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,13 @@
1
1
  from typing import Optional
2
2
 
3
3
  from ._rust_stringdist import * # noqa: F403
4
-
5
4
  from .default_ocr_distances import ocr_distance_map
6
-
5
+ from .matching import find_best_candidate
7
6
 
8
7
  __all__ = [
9
8
  "ocr_distance_map",
10
9
  "weighted_levenshtein_distance", # noqa: F405
10
+ "find_best_candidate",
11
11
  ]
12
12
 
13
13
 
@@ -36,6 +36,6 @@ def weighted_levenshtein_distance(
36
36
  """
37
37
  if cost_map is None:
38
38
  cost_map = ocr_distance_map
39
- return _weighted_levenshtein_distance( # noqa: F405
39
+ return _weighted_levenshtein_distance( # type: ignore # noqa: F405
40
40
  s1, s2, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
41
41
  )
@@ -1,16 +1,38 @@
1
1
  ocr_distance_map: dict[tuple[str, str], float] = {
2
- ("G", "6"): 0.2,
3
- ("O", "0"): 0.2,
2
+ ("O", "0"): 0.1,
3
+ ("l", "1"): 0.1,
4
+ ("I", "1"): 0.15,
4
5
  ("o", "0"): 0.2,
5
- ("l", "1"): 0.2,
6
- ("I", "1"): 0.2,
7
- ("2", "Z"): 0.2,
8
- ("B", "8"): 0.2,
6
+ ("B", "8"): 0.25,
9
7
  ("S", "5"): 0.3,
10
- ("s", "5"): 0.3,
8
+ ("G", "6"): 0.3,
9
+ ("Z", "2"): 0.3,
10
+ ("C", "c"): 0.3,
11
+ ("é", "e"): 0.3,
12
+ ("Ä", "A"): 0.4,
13
+ ("Ö", "O"): 0.4,
14
+ ("Ü", "U"): 0.4,
15
+ ("c", "e"): 0.4,
16
+ ("a", "o"): 0.4,
17
+ ("u", "v"): 0.4,
18
+ ("i", "l"): 0.4,
19
+ ("s", "5"): 0.4,
20
+ ("m", "n"): 0.5,
21
+ ("f", "s"): 0.5,
22
+ (".", ","): 0.5,
23
+ ("2", "Z"): 0.5,
24
+ ("t", "f"): 0.6,
25
+ ("r", "n"): 0.6,
26
+ ("-", "_"): 0.6,
27
+ ("ß", "B"): 0.6,
28
+ ("h", "b"): 0.7,
29
+ ("v", "y"): 0.7,
30
+ ("i", "j"): 0.7,
31
+ ("é", "á"): 0.7,
11
32
  ("E", "F"): 0.8,
12
33
  }
13
34
  """
14
35
  Pre-defined distance map between characters, considering common OCR errors.
15
36
  The distances are between 0 and 1.
37
+ This map is intended to be used with `symmetric=True`.
16
38
  """
@@ -0,0 +1,83 @@
1
+ from collections.abc import Callable, Iterable
2
+ from typing import Optional
3
+
4
+
5
def find_best_candidate(
    s: str,
    candidates: Iterable[str],
    distance_fun: Callable[[str, str], float],
    *,
    minimize: bool = True,
    early_return_value: Optional[float] = None,
) -> tuple[str, float]:
    """
    Finds the best matching string from a collection of candidates based on a distance function.

    Compares a given string against each string in the 'candidates'
    iterable using the provided 'distance_fun'. It identifies the candidate
    that yields the minimum (or maximum, if minimize=False) distance.
    On ties, the candidate encountered first wins, because a later candidate
    only replaces the current best when it is strictly better.

    The iterable is consumed in a single pass, so one-shot iterables such as
    generators are supported (and are correctly detected as empty).

    :param s: The reference string to compare against.
    :type s: str
    :param candidates: An iterable of candidate strings to compare with 's'.
    :type candidates: Iterable[str]
    :param distance_fun: A function that takes two strings (s, candidate) and
                         returns a float representing their distance or similarity.
    :type distance_fun: Callable[[str, str], float]
    :param minimize: If True (default), finds the candidate with the minimum
                     distance. If False, finds the candidate with the maximum
                     distance (useful for similarity scores).
    :type minimize: bool
    :param early_return_value: If provided, the function will return immediately
                               if a distance is found that is less than or equal
                               to this value (if minimize=True) or greater than
                               or equal to this value (if minimize=False).
                               If None (default), all candidates are checked.
    :type early_return_value: Optional[float]
    :raises ValueError: If the 'candidates' iterable is empty.
    :return: A tuple containing the best matching candidate string and its
             calculated distance/score. If no candidate's distance compares
             strictly better than the initial bound (+inf when minimizing,
             -inf when maximizing), e.g. because every distance is infinite
             or NaN, the result is ``("", bound)``.
    :rtype: tuple[str, float]

    :Example:

    >>> from ocr_stringdist import weighted_levenshtein_distance as distance
    >>> s = "apple"
    >>> candidates = ["apply", "apples", "orange", "appIe"]
    >>> find_best_candidate(s, candidates, lambda s1, s2: distance(s1, s2, {("l", "I"): 0.1}))
    ('appIe', 0.1)
    """
    best_candidate: str = ""
    # Emptiness is detected while iterating rather than via truthiness:
    # generators and other iterators are always truthy, so `not candidates`
    # would silently let an empty iterator through.
    saw_candidate = False

    if minimize:
        best_distance = float("inf")

        def is_next_best(current: float, best: float) -> bool:
            return current < best

        def can_return_early(current: float, threshold: float) -> bool:
            return current <= threshold
    else:
        best_distance = -float("inf")

        def is_next_best(current: float, best: float) -> bool:
            return current > best

        def can_return_early(current: float, threshold: float) -> bool:
            return current >= threshold

    for candidate in candidates:
        saw_candidate = True
        current_distance = distance_fun(s, candidate)

        # A "good enough" match short-circuits the scan of remaining candidates.
        if early_return_value is not None and can_return_early(
            current_distance, early_return_value
        ):
            return candidate, current_distance
        if is_next_best(current_distance, best_distance):
            best_distance = current_distance
            best_candidate = candidate

    if not saw_candidate:
        raise ValueError("The 'candidates' iterable cannot be empty.")

    return best_candidate, best_distance
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ocr_stringdist
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Classifier: Programming Language :: Rust
5
- Classifier: Programming Language :: Python :: Implementation :: PyPy
5
+ Classifier: Programming Language :: Python
6
6
  Classifier: Operating System :: OS Independent
7
7
  License-File: LICENSE
8
8
  Summary: String distances considering OCR errors.
@@ -37,9 +37,12 @@ pip install ocr-stringdist
37
37
  - **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
38
38
  - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
39
39
  - **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
40
+ - **Best Match Finder**: The utility function `find_best_candidate` efficiently finds the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones). It supports early stopping for performance optimization.
40
41
 
41
42
  ## Usage
42
43
 
44
+ ### Weighted Levenshtein Distance
45
+
43
46
  ```python
44
47
  import ocr_stringdist as osd
45
48
 
@@ -58,6 +61,22 @@ distance = osd.weighted_levenshtein_distance(
58
61
  print(f"Distance with custom map: {distance}")
59
62
  ```
60
63
 
64
+ ### Finding the Best Candidate
65
+
66
+ ```python
67
+ import ocr_stringdist as osd
68
+
69
+ s = "apple"
70
+ candidates = ["apply", "apples", "orange", "appIe"] # 'appIe' has an OCR-like error
71
+
72
+ def ocr_aware_distance(s1: str, s2: str) -> float:
73
+ return osd.weighted_levenshtein_distance(s1, s2, cost_map={("l", "I"): 0.1})
74
+
75
+ best_candidate, best_dist = osd.find_best_candidate(s, candidates, ocr_aware_distance)
76
+ print(f"Best candidate for '{s}' is '{best_candidate}' with distance {best_dist}")
77
+ # Output: Best candidate for 'apple' is 'appIe' with distance 0.1
78
+ ```
79
+
61
80
  ## Acknowledgements
62
81
 
63
82
  This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), providing the base implementations of the algorithms used here.
@@ -0,0 +1,9 @@
1
+ ocr_stringdist-0.0.4.dist-info/METADATA,sha256=1lSh8HZ9TrBv0BWdBbMZk-_0qbE1tEEUd25xfAFnd7s,3320
2
+ ocr_stringdist-0.0.4.dist-info/WHEEL,sha256=0Nk1AmIV1z6U4gJsrIxrgrzvCxDT7puUGM_j24me7T0,90
3
+ ocr_stringdist-0.0.4.dist-info/licenses/LICENSE,sha256=3cNRiJag5vI0KMMDNf0oiaY4vg43rLxRszbMJs1GBoU,1092
4
+ ocr_stringdist/default_ocr_distances.py,sha256=STNRMGWEYOCHo11uP51JUQfvNrSZleMCxt6wsPkctfg,925
5
+ ocr_stringdist/matching.py,sha256=KEzYBBEHZhfLA9eD3MxDaehKiD9lUb0RQq74u5qWpVw,3376
6
+ ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ ocr_stringdist/__init__.py,sha256=Z6ZeTSfpKRaUM15FPW00MfLBKVDUCP21Xh5VLtnC4Tk,1471
8
+ ocr_stringdist/_rust_stringdist.cp39-win32.pyd,sha256=PWbzLvhNtGpBERO5T54Ug7MZGeKOERhBUu2xRxp1Rdg,204800
9
+ ocr_stringdist-0.0.4.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- ocr_stringdist-0.0.2.dist-info/METADATA,sha256=nRGqWiRLfw3GSkCg0ylWayaIXxJJiTiNLMA2y-QLuuA,2475
2
- ocr_stringdist-0.0.2.dist-info/WHEEL,sha256=0Nk1AmIV1z6U4gJsrIxrgrzvCxDT7puUGM_j24me7T0,90
3
- ocr_stringdist-0.0.2.dist-info/licenses/LICENSE,sha256=3cNRiJag5vI0KMMDNf0oiaY4vg43rLxRszbMJs1GBoU,1092
4
- ocr_stringdist/default_ocr_distances.py,sha256=UYI0_CgYmaMRR4KfPO8FMmYoJeYml51p3PYBNWRMUBc,398
5
- ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- ocr_stringdist/__init__.py,sha256=9u9MIGHrywSF0aC0ZW9RJiJJEB-EVbl74O9eGvzlELQ,1388
7
- ocr_stringdist/_rust_stringdist.cp39-win32.pyd,sha256=VqfIQgSg9dUdHtJeCceNNM8fOknV4FM4gou2L_s82No,204800
8
- ocr_stringdist-0.0.2.dist-info/RECORD,,