ocr-stringdist 0.0.5__cp310-cp310-win_amd64.whl → 0.0.7__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,42 +1,10 @@
1
- from typing import Optional
2
-
3
- from ._rust_stringdist import * # noqa: F403
4
1
  from .default_ocr_distances import ocr_distance_map
2
+ from .levenshtein import batch_weighted_levenshtein_distance, weighted_levenshtein_distance
5
3
  from .matching import find_best_candidate
6
4
 
7
5
  __all__ = [
8
6
  "ocr_distance_map",
9
- "weighted_levenshtein_distance", # noqa: F405
7
+ "weighted_levenshtein_distance",
8
+ "batch_weighted_levenshtein_distance",
10
9
  "find_best_candidate",
11
10
  ]
12
-
13
-
14
- def weighted_levenshtein_distance(
15
- s1: str,
16
- s2: str,
17
- /,
18
- cost_map: Optional[dict[tuple[str, str], float]] = None,
19
- *,
20
- symmetric: bool = True,
21
- default_cost: float = 1.0,
22
- ) -> float:
23
- """
24
- Levenshtein distance with custom substitution costs.
25
- Insertion/deletion costs are 1.
26
-
27
- The default `cost_map` considers common OCR errors, see `ocr_stringdist.ocr_distance_map`.
28
-
29
- :param s1: First string
30
- :param s2: Second string
31
- :param cost_map: Dictionary mapping tuples of characters to their substitution cost.
32
- Only one direction needs to be configured unless `symmetric` is False.
33
- Defaults to `ocr_stringdist.ocr_distance_map`.
34
- :param symmetric: Should the keys of `cost_map` be considered to be symmetric? Defaults to True.
35
- :param default_cost: The default substitution cost for character pairs not found in `cost_map`.
36
- """
37
- if cost_map is None:
38
- cost_map = ocr_distance_map
39
- # _weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
40
- return _weighted_levenshtein_distance( # type: ignore # noqa: F405
41
- s1, s2, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
42
- )
@@ -1,3 +1,5 @@
1
+ # Start marker for literalinclude, see docs/source/api/index.rst.
2
+ # OCR_DISTANCE_MAP_START
1
3
  ocr_distance_map: dict[tuple[str, str], float] = {
2
4
  ("O", "0"): 0.1,
3
5
  ("l", "1"): 0.1,
@@ -31,6 +33,8 @@ ocr_distance_map: dict[tuple[str, str], float] = {
31
33
  ("é", "á"): 0.7,
32
34
  ("E", "F"): 0.8,
33
35
  }
36
+ # OCR_DISTANCE_MAP_END
37
+ # End marker for literalinclude
34
38
  """
35
39
  Pre-defined distance map between characters, considering common OCR errors.
36
40
  The distances are between 0 and 1.
@@ -0,0 +1,71 @@
1
+ from typing import Optional
2
+
3
+ from ._rust_stringdist import * # noqa: F403
4
+ from .default_ocr_distances import ocr_distance_map
5
+
6
+
7
+ def weighted_levenshtein_distance(
8
+ s1: str,
9
+ s2: str,
10
+ /,
11
+ cost_map: Optional[dict[tuple[str, str], float]] = None,
12
+ *,
13
+ symmetric: bool = True,
14
+ default_cost: float = 1.0,
15
+ ) -> float:
16
+ """
17
+ Levenshtein distance with custom substitution costs.
18
+ Insertion/deletion costs are 1.
19
+
20
+ The default `cost_map` considers common OCR errors, see
21
+ :py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
22
+
23
+ :param s1: First string
24
+ :param s2: Second string
25
+ :param cost_map: Dictionary mapping tuples of strings ("substitution tokens") to their
26
+ substitution costs.
27
+ Only one direction needs to be configured unless `symmetric` is False.
28
+ Note that the runtime scales with the length of the longest substitution token.
29
+ Defaults to `ocr_stringdist.ocr_distance_map`.
30
+ :param symmetric: Should the keys of `cost_map` be considered to be symmetric? Defaults to True.
31
+ :param default_cost: The default substitution cost for character pairs not found in `cost_map`.
32
+ """
33
+ if cost_map is None:
34
+ cost_map = ocr_distance_map
35
+ # _weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
36
+ return _weighted_levenshtein_distance( # type: ignore # noqa: F405
37
+ s1, s2, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
38
+ )
39
+
40
+
41
+ def batch_weighted_levenshtein_distance(
42
+ s: str,
43
+ candidates: list[str],
44
+ /,
45
+ cost_map: Optional[dict[tuple[str, str], float]] = None,
46
+ *,
47
+ symmetric: bool = True,
48
+ default_cost: float = 1.0,
49
+ ) -> list[float]:
50
+ """
51
+ Calculate weighted Levenshtein distances between a string and multiple candidates.
52
+
53
+ This is more efficient than calling :func:`weighted_levenshtein_distance` multiple times.
54
+
55
+ :param s: The string to compare
56
+ :param candidates: List of candidate strings to compare against
57
+ :param cost_map: Dictionary mapping tuples of strings ("substitution tokens") to their
58
+ substitution costs.
59
+ Only one direction needs to be configured unless `symmetric` is False.
60
+ Note that the runtime scales with the length of the longest substitution token.
61
+ Defaults to `ocr_stringdist.ocr_distance_map`.
62
+ :param symmetric: Should the keys of `cost_map` be considered to be symmetric? Defaults to True.
63
+ :param default_cost: The default substitution cost for character pairs not found in `cost_map`.
64
+ :return: A list of distances corresponding to each candidate
65
+ """
66
+ if cost_map is None:
67
+ cost_map = ocr_distance_map
68
+ # _batch_weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
69
+ return _batch_weighted_levenshtein_distance( # type: ignore # noqa: F405
70
+ s, candidates, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
71
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ocr_stringdist
3
- Version: 0.0.5
3
+ Version: 0.0.7
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python
6
6
  Classifier: Operating System :: OS Independent
@@ -36,7 +36,7 @@ pip install ocr-stringdist
36
36
 
37
37
  ## Features
38
38
 
39
- - **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
39
+ - **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models. Efficient batch processing of one string against multiple candidates is also supported.
40
40
  - **Unicode Support**: Arbitrary unicode strings can be compared.
41
41
  - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
42
42
  - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
@@ -60,7 +60,6 @@ distance = osd.weighted_levenshtein_distance(
60
60
  "hi", "Ini",
61
61
  cost_map=custom_map,
62
62
  symmetric=True,
63
- default_cost=1.0,
64
63
  )
65
64
  print(f"Distance with custom map: {distance}")
66
65
  ```
@@ -0,0 +1,10 @@
1
+ ocr_stringdist-0.0.7.dist-info/METADATA,sha256=VQRJCE3NI3ZpbuesyXKgVqvp4yqmSQnJXDqtdVk9IQE,3564
2
+ ocr_stringdist-0.0.7.dist-info/WHEEL,sha256=77DqkvxB4HqZitBRK_M49NRS207JKb0MotMEjnxEWQ8,96
3
+ ocr_stringdist-0.0.7.dist-info/licenses/LICENSE,sha256=3cNRiJag5vI0KMMDNf0oiaY4vg43rLxRszbMJs1GBoU,1092
4
+ ocr_stringdist/default_ocr_distances.py,sha256=vlhzQCCcE-D1xor5RvMW0oaMuL_HP_5Y7SO4ESkdb4w,1075
5
+ ocr_stringdist/levenshtein.py,sha256=RF2B9dtlqDb6D3IjLlrJMYI5037N7U7-1mJwztijbBQ,3124
6
+ ocr_stringdist/matching.py,sha256=KEzYBBEHZhfLA9eD3MxDaehKiD9lUb0RQq74u5qWpVw,3376
7
+ ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ ocr_stringdist/__init__.py,sha256=MkIgLBJKXQRGfRoEdbrKBxwlRJKV85w-_jBdYDeH__0,342
9
+ ocr_stringdist/_rust_stringdist.cp310-win_amd64.pyd,sha256=P9Mfe_5iu50XKb8kswsoLxR3k6oTsceJmGQRxfuiRoQ,355840
10
+ ocr_stringdist-0.0.7.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- ocr_stringdist-0.0.5.dist-info/METADATA,sha256=R1JX6n_V46exR7NjMVdc4tachj3ceA-JsD-OMkQ1AfE,3549
2
- ocr_stringdist-0.0.5.dist-info/WHEEL,sha256=77DqkvxB4HqZitBRK_M49NRS207JKb0MotMEjnxEWQ8,96
3
- ocr_stringdist-0.0.5.dist-info/licenses/LICENSE,sha256=3cNRiJag5vI0KMMDNf0oiaY4vg43rLxRszbMJs1GBoU,1092
4
- ocr_stringdist/default_ocr_distances.py,sha256=STNRMGWEYOCHo11uP51JUQfvNrSZleMCxt6wsPkctfg,925
5
- ocr_stringdist/matching.py,sha256=KEzYBBEHZhfLA9eD3MxDaehKiD9lUb0RQq74u5qWpVw,3376
6
- ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- ocr_stringdist/__init__.py,sha256=rEW1u0sWzXDSoziEJzfCgxbKQiYNmrLCdQzFfZ1mDtM,1557
8
- ocr_stringdist/_rust_stringdist.cp310-win_amd64.pyd,sha256=vkKitiAiQEq2IeCOtQWTxI9TDom2qNW8nE6ryjTRCh0,242176
9
- ocr_stringdist-0.0.5.dist-info/RECORD,,