ocr-stringdist 0.0.4__cp313-cp313-win32.whl → 0.0.5__cp313-cp313-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -36,6 +36,7 @@ def weighted_levenshtein_distance(
36
36
  """
37
37
  if cost_map is None:
38
38
  cost_map = ocr_distance_map
39
+ # _weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
39
40
  return _weighted_levenshtein_distance( # type: ignore # noqa: F405
40
41
  s1, s2, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
41
42
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ocr_stringdist
3
- Version: 0.0.4
3
+ Version: 0.0.5
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python
6
6
  Classifier: Operating System :: OS Independent
@@ -17,6 +17,8 @@ Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
17
17
 
18
18
  A Python library for string distance calculations that account for common OCR (optical character recognition) errors.
19
19
 
20
+ Documentation: https://niklasvonm.github.io/ocr-stringdist/
21
+
20
22
  [![PyPI](https://img.shields.io/badge/PyPI-Package-blue)](https://pypi.org/project/ocr-stringdist/)
21
23
  [![License](https://img.shields.io/badge/License-MIT-green)](LICENSE)
22
24
 
@@ -35,9 +37,11 @@ pip install ocr-stringdist
35
37
  ## Features
36
38
 
37
39
  - **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
40
+ - **Unicode Support**: Arbitrary Unicode strings can be compared.
41
+ - **Substitution of Multiple Characters**: Substitution costs can be defined not only for pairs of single characters but also for pairs of multi-character strings, for example the Korean syllable "이" for the two letters "OI".
38
42
  - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
39
43
  - **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
40
- - **Best Match Finder**: Utility function find_best_candidate to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones). Supports early stopping for performance optimization.
44
+ - **Best Match Finder**: Utility function `find_best_candidate` to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
41
45
 
42
46
  ## Usage
43
47
 
@@ -51,12 +55,12 @@ distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
51
55
  print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
52
56
 
53
57
  # Custom cost map
54
- custom_map = {("f", "t"): 0.2, ("m", "n"): 0.1}
58
+ custom_map = {("In", "h"): 0.5}
55
59
  distance = osd.weighted_levenshtein_distance(
56
- "first", "tirst",
60
+ "hi", "Ini",
57
61
  cost_map=custom_map,
58
62
  symmetric=True,
59
- default_cost=1.0
63
+ default_cost=1.0,
60
64
  )
61
65
  print(f"Distance with custom map: {distance}")
62
66
  ```
@@ -0,0 +1,9 @@
1
+ ocr_stringdist-0.0.5.dist-info/METADATA,sha256=R1JX6n_V46exR7NjMVdc4tachj3ceA-JsD-OMkQ1AfE,3549
2
+ ocr_stringdist-0.0.5.dist-info/WHEEL,sha256=eh90R9THiv1HYPhYUCnpm_RAErMfEQKvZWVMxF3uaCM,92
3
+ ocr_stringdist-0.0.5.dist-info/licenses/LICENSE,sha256=3cNRiJag5vI0KMMDNf0oiaY4vg43rLxRszbMJs1GBoU,1092
4
+ ocr_stringdist/default_ocr_distances.py,sha256=STNRMGWEYOCHo11uP51JUQfvNrSZleMCxt6wsPkctfg,925
5
+ ocr_stringdist/matching.py,sha256=KEzYBBEHZhfLA9eD3MxDaehKiD9lUb0RQq74u5qWpVw,3376
6
+ ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ ocr_stringdist/__init__.py,sha256=rEW1u0sWzXDSoziEJzfCgxbKQiYNmrLCdQzFfZ1mDtM,1557
8
+ ocr_stringdist/_rust_stringdist.cp313-win32.pyd,sha256=tC7HRCuACl-L1_RsgaJw93uaBTAtFHciXps3Q3HwDbU,208384
9
+ ocr_stringdist-0.0.5.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- ocr_stringdist-0.0.4.dist-info/METADATA,sha256=1lSh8HZ9TrBv0BWdBbMZk-_0qbE1tEEUd25xfAFnd7s,3320
2
- ocr_stringdist-0.0.4.dist-info/WHEEL,sha256=eh90R9THiv1HYPhYUCnpm_RAErMfEQKvZWVMxF3uaCM,92
3
- ocr_stringdist-0.0.4.dist-info/licenses/LICENSE,sha256=3cNRiJag5vI0KMMDNf0oiaY4vg43rLxRszbMJs1GBoU,1092
4
- ocr_stringdist/default_ocr_distances.py,sha256=STNRMGWEYOCHo11uP51JUQfvNrSZleMCxt6wsPkctfg,925
5
- ocr_stringdist/matching.py,sha256=KEzYBBEHZhfLA9eD3MxDaehKiD9lUb0RQq74u5qWpVw,3376
6
- ocr_stringdist/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- ocr_stringdist/__init__.py,sha256=Z6ZeTSfpKRaUM15FPW00MfLBKVDUCP21Xh5VLtnC4Tk,1471
8
- ocr_stringdist/_rust_stringdist.cp313-win32.pyd,sha256=_gG1gEjjgQmcDh0QaxCEI1OLMw5NrUngnvxckxjhl94,199680
9
- ocr_stringdist-0.0.4.dist-info/RECORD,,