ocr-stringdist 0.0.6__tar.gz → 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. ocr_stringdist-0.1.0/CHANGELOG.md +16 -0
  2. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/Cargo.lock +1 -1
  3. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/Cargo.toml +1 -1
  4. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/Justfile +3 -2
  5. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/PKG-INFO +1 -3
  6. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/README.md +0 -2
  7. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/docs/source/api/index.rst +4 -5
  8. ocr_stringdist-0.1.0/docs/source/changelog.rst +1 -0
  9. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/docs/source/conf.py +3 -0
  10. ocr_stringdist-0.1.0/docs/source/examples.rst +20 -0
  11. ocr_stringdist-0.1.0/docs/source/getting-started.rst +10 -0
  12. ocr_stringdist-0.1.0/docs/source/index.rst +36 -0
  13. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/examples/batch_processing.py +2 -14
  14. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/examples/weighted_levenshtein.py +4 -5
  15. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/pyproject.toml +1 -0
  16. ocr_stringdist-0.1.0/python/ocr_stringdist/__init__.py +10 -0
  17. ocr_stringdist-0.1.0/python/ocr_stringdist/levenshtein.py +118 -0
  18. ocr_stringdist-0.1.0/src/cost_map.rs +306 -0
  19. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/src/lib.rs +5 -1
  20. ocr_stringdist-0.1.0/src/rust_stringdist.rs +135 -0
  21. ocr_stringdist-0.1.0/src/types.rs +13 -0
  22. ocr_stringdist-0.1.0/src/weighted_levenshtein.rs +992 -0
  23. ocr_stringdist-0.0.6/tests/test_batch_functions.py → ocr_stringdist-0.1.0/tests/test_batch_weighted_levenshtein.py +7 -11
  24. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/tests/test_matching.py +3 -1
  25. ocr_stringdist-0.1.0/tests/test_weighted_levenshtein.py +545 -0
  26. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/uv.lock +90 -0
  27. ocr_stringdist-0.0.6/docs/source/index.rst +0 -10
  28. ocr_stringdist-0.0.6/python/ocr_stringdist/__init__.py +0 -98
  29. ocr_stringdist-0.0.6/src/rust_stringdist.rs +0 -56
  30. ocr_stringdist-0.0.6/src/weighted_levenshtein.rs +0 -352
  31. ocr_stringdist-0.0.6/tests/test_ocr_stringdist.py +0 -112
  32. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/.github/workflows/CI.yml +0 -0
  33. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/.github/workflows/docs.yml +0 -0
  34. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/.gitignore +0 -0
  35. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/LICENSE +0 -0
  36. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/docs/Makefile +0 -0
  37. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/docs/make.bat +0 -0
  38. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/mypy.ini +0 -0
  39. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/python/ocr_stringdist/default_ocr_distances.py +0 -0
  40. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/python/ocr_stringdist/matching.py +0 -0
  41. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/python/ocr_stringdist/py.typed +0 -0
  42. {ocr_stringdist-0.0.6 → ocr_stringdist-0.1.0}/ruff.toml +0 -0
@@ -0,0 +1,16 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.1.0] - 2025-04-26
9
+
10
+ ### Added
11
+
12
+ - Custom insertion and deletion costs for weighted Levenshtein distance.
13
+
14
+ ### Changed
15
+
16
+ - Breaking changes to the signatures of the Levenshtein distance functions.
@@ -74,7 +74,7 @@ dependencies = [
74
74
 
75
75
  [[package]]
76
76
  name = "ocr_stringdist"
77
- version = "0.0.6"
77
+ version = "0.1.0"
78
78
  dependencies = [
79
79
  "pyo3",
80
80
  "rayon",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "ocr_stringdist"
3
- version = "0.0.6"
3
+ version = "0.1.0"
4
4
  edition = "2021"
5
5
  description = "String distances considering OCR errors."
6
6
  authors = ["Niklas von Moers <niklasvmoers@protonmail.com>"]
@@ -5,10 +5,11 @@ venv:
5
5
 
6
6
  pytest:
7
7
  uv run maturin develop
8
- uv run pytest
8
+ uv run pytest --cov=python/ocr_stringdist tests
9
9
 
10
10
  test:
11
- cargo test
11
+ cargo llvm-cov
12
+ #cargo test
12
13
 
13
14
  mypy:
14
15
  uv run mypy .
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ocr_stringdist
3
- Version: 0.0.6
3
+ Version: 0.1.0
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python
6
6
  Classifier: Operating System :: OS Independent
@@ -40,7 +40,6 @@ pip install ocr-stringdist
40
40
  - **Unicode Support**: Arbitrary unicode strings can be compared.
41
41
  - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
42
42
  - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
43
- - **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
44
43
  - **Best Match Finder**: Utility function `find_best_candidate` to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
45
44
 
46
45
  ## Usage
@@ -60,7 +59,6 @@ distance = osd.weighted_levenshtein_distance(
60
59
  "hi", "Ini",
61
60
  cost_map=custom_map,
62
61
  symmetric=True,
63
- max_token_characters=2,
64
62
  )
65
63
  print(f"Distance with custom map: {distance}")
66
64
  ```
@@ -25,7 +25,6 @@ pip install ocr-stringdist
25
25
  - **Unicode Support**: Arbitrary unicode strings can be compared.
26
26
  - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
27
27
  - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
28
- - **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
29
28
  - **Best Match Finder**: Utility function `find_best_candidate` to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
30
29
 
31
30
  ## Usage
@@ -45,7 +44,6 @@ distance = osd.weighted_levenshtein_distance(
45
44
  "hi", "Ini",
46
45
  cost_map=custom_map,
47
46
  symmetric=True,
48
- max_token_characters=2,
49
47
  )
50
48
  print(f"Distance with custom map: {distance}")
51
49
  ```
@@ -3,11 +3,10 @@
3
3
  API Reference
4
4
  =============
5
5
 
6
- This page contains the auto-generated API reference documentation.
7
-
8
- .. autofunction:: ocr_stringdist.weighted_levenshtein_distance
9
-
10
- .. autofunction:: ocr_stringdist.batch_weighted_levenshtein_distance
6
+ .. automodule:: ocr_stringdist.levenshtein
7
+ :members:
8
+ :undoc-members:
9
+ :show-inheritance:
11
10
 
12
11
  .. automodule:: ocr_stringdist.matching
13
12
  :members:
@@ -0,0 +1 @@
1
+ .. mdinclude:: ../../CHANGELOG.md
@@ -4,6 +4,7 @@
4
4
  # https://www.sphinx-doc.org/en/master/usage/configuration.html
5
5
 
6
6
 
7
+ import importlib.metadata
7
8
  import os
8
9
  import sys
9
10
 
@@ -17,6 +18,8 @@ sys.path.insert(0, os.path.abspath("../../python"))
17
18
  project = "OCR-StringDist"
18
19
  copyright = "2025, Niklas von Moers"
19
20
  author = "Niklas von Moers"
21
+ release = importlib.metadata.version("ocr_stringdist")
22
+ version = release
20
23
 
21
24
  # -- General configuration ---------------------------------------------------
22
25
  # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
@@ -0,0 +1,20 @@
1
+ ==========
2
+ Examples
3
+ ==========
4
+
5
+ .. code-block:: python
6
+
7
+ import ocr_stringdist as osd
8
+
9
+ # Using default OCR distance map
10
+ distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
11
+ print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
12
+
13
+ # Custom cost map
14
+ custom_map = {("In", "h"): 0.5}
15
+ distance = osd.weighted_levenshtein_distance(
16
+ "hi", "Ini",
17
+ substitution_costs=custom_map,
17
+ symmetric_substitution=True,
19
+ )
20
+ print(f"Distance with custom map: {distance}")
@@ -0,0 +1,10 @@
1
+ =================
2
+ Getting Started
3
+ =================
4
+
5
+ Installation
6
+ ============
7
+
8
+ .. code-block:: console
9
+
10
+ pip install ocr-stringdist
@@ -0,0 +1,36 @@
1
+ ================
2
+ OCR-StringDist
3
+ ================
4
+
5
+ A Python library for string distance calculations that account for common OCR (optical character recognition) errors, written in Rust.
6
+
7
+ :Repository: https://github.com/niklasvonm/ocr-stringdist
8
+ :Current version: |release|
9
+
10
+ .. image:: https://img.shields.io/badge/PyPI-Package-blue
11
+ :target: https://pypi.org/project/ocr-stringdist/
12
+ :alt: PyPI
13
+
14
+ .. image:: https://img.shields.io/badge/License-MIT-green
15
+ :target: LICENSE
16
+ :alt: License
17
+
18
+ Features
19
+ ========
20
+
21
+ - **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution, insertion and deletion costs, so that characters commonly confused by OCR models can be penalized less; includes efficient batch processing.
22
+ - **Unicode Support**: Arbitrary unicode strings can be compared.
23
+ - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
24
+ - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
25
+ - **Best Match Finder**: Utility function ``find_best_candidate`` to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
26
+
27
+ Contents
28
+ ========
29
+
30
+ .. toctree::
31
+ :maxdepth: 1
32
+
33
+ getting-started
34
+ examples
35
+ api/index
36
+ changelog
@@ -8,8 +8,6 @@ from typing import Any, Callable
8
8
 
9
9
  import ocr_stringdist as osd
10
10
 
11
- MAX_TOKEN_CHARACTERS = 1
12
-
13
11
 
14
12
  def benchmark(func: Callable, *args: Any, **kwargs: Any) -> tuple[Any, float]: # type: ignore
15
13
  """Run a function and return the execution time in seconds."""
@@ -32,12 +30,7 @@ def compare_methods() -> None:
32
30
 
33
31
  # Standard loop approach
34
32
  _, time_loop = benchmark(
35
- lambda: [
36
- osd.weighted_levenshtein_distance(
37
- source, cand, max_token_characters=MAX_TOKEN_CHARACTERS
38
- )
39
- for cand in candidates
40
- ]
33
+ lambda: [osd.weighted_levenshtein_distance(source, cand) for cand in candidates]
41
34
  )
42
35
  print(
43
36
  f"Loop of single calls: {time_loop:.6f} seconds "
@@ -45,12 +38,7 @@ def compare_methods() -> None:
45
38
  )
46
39
 
47
40
  # Batch approach
48
- _, time_batch = benchmark(
49
- osd.batch_weighted_levenshtein_distance,
50
- source,
51
- candidates,
52
- max_token_characters=MAX_TOKEN_CHARACTERS,
53
- )
41
+ _, time_batch = benchmark(osd.batch_weighted_levenshtein_distance, source, candidates)
54
42
  print(
55
43
  f"Batch function: {time_batch:.6f} seconds "
56
44
  f"({1000 * time_batch / len(candidates):.6f}ms each)"
@@ -24,7 +24,6 @@ ic(
24
24
  "이탈리",
25
25
  "OI탈리", # Korean syllables may be confused with multiple Latin letters at once
26
26
  {("이", "OI"): 0.5},
27
- max_token_characters=2,
28
27
  ),
29
28
  )
30
29
 
@@ -32,13 +31,13 @@ ic(
32
31
  weighted_levenshtein_distance(
33
32
  "ABCDE",
34
33
  "XBCDE",
35
- cost_map={},
36
- default_cost=0.8, # Lower default substitution cost (default is 1.0)
34
+ substitution_costs={},
35
+ default_substitution_cost=0.8, # Lower default substitution cost (default is 1.0)
37
36
  )
38
37
  )
39
38
 
40
- ic(weighted_levenshtein_distance("A", "B", {("A", "B"): 0.0}, symmetric=False))
41
- ic(weighted_levenshtein_distance("A", "B", {("B", "A"): 0.0}, symmetric=False))
39
+ ic(weighted_levenshtein_distance("A", "B", {("A", "B"): 0.0}, symmetric_substitution=False))
40
+ ic(weighted_levenshtein_distance("A", "B", {("B", "A"): 0.0}, symmetric_substitution=False))
42
41
 
43
42
  ic(
44
43
  find_best_candidate(
@@ -27,6 +27,7 @@ dev = [
27
27
  "maturin>=1.8.3",
28
28
  "mypy>=1.15.0",
29
29
  "pytest>=8.3.5",
30
+ "pytest-cov>=6.1.1",
30
31
  "ruff>=0.11.6",
31
32
  "wheel>=0.45.1",
32
33
  ]
@@ -0,0 +1,10 @@
1
+ from .default_ocr_distances import ocr_distance_map
2
+ from .levenshtein import batch_weighted_levenshtein_distance, weighted_levenshtein_distance
3
+ from .matching import find_best_candidate
4
+
5
+ __all__ = [
6
+ "ocr_distance_map",
7
+ "weighted_levenshtein_distance",
8
+ "batch_weighted_levenshtein_distance",
9
+ "find_best_candidate",
10
+ ]
@@ -0,0 +1,118 @@
1
+ from typing import Optional
2
+
3
+ from ._rust_stringdist import * # noqa: F403
4
+ from .default_ocr_distances import ocr_distance_map
5
+
6
+
7
+ def weighted_levenshtein_distance(
8
+ s1: str,
9
+ s2: str,
10
+ /,
11
+ substitution_costs: Optional[dict[tuple[str, str], float]] = None,
12
+ insertion_costs: Optional[dict[str, float]] = None,
13
+ deletion_costs: Optional[dict[str, float]] = None,
14
+ *,
15
+ symmetric_substitution: bool = True,
16
+ default_substitution_cost: float = 1.0,
17
+ default_insertion_cost: float = 1.0,
18
+ default_deletion_cost: float = 1.0,
19
+ ) -> float:
20
+ """
21
+ Levenshtein distance with custom substitution, insertion and deletion costs.
22
+
23
+ The default `substitution_costs` considers common OCR errors, see
24
+ :py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
25
+
26
+ :param s1: First string (interpreted as the string read via OCR)
27
+ :param s2: Second string
28
+ :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
29
+ substitution costs. Only one direction needs to be configured unless
30
+ `symmetric_substitution` is False.
31
+ Note that the runtime scales with the length of the longest substitution token.
32
+ Defaults to `ocr_stringdist.ocr_distance_map`.
33
+ :param insertion_costs: Dictionary mapping strings to their insertion costs.
34
+ :param deletion_costs: Dictionary mapping strings to their deletion costs.
35
+ :param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
36
+ symmetric? Defaults to True.
37
+ :param default_substitution_cost: The default substitution cost for character pairs not found
38
+ in `substitution_costs`.
39
+ :param default_insertion_cost: The default insertion cost for characters not found in
40
+ `insertion_costs`.
41
+ :param default_deletion_cost: The default deletion cost for characters not found in
42
+ `deletion_costs`.
43
+ """
44
+ if substitution_costs is None:
45
+ substitution_costs = ocr_distance_map
46
+ if insertion_costs is None:
47
+ insertion_costs = {}
48
+ if deletion_costs is None:
49
+ deletion_costs = {}
50
+ # _weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
51
+ return _weighted_levenshtein_distance( # type: ignore # noqa: F405
52
+ s1,
53
+ s2,
54
+ substitution_costs=substitution_costs,
55
+ insertion_costs=insertion_costs,
56
+ deletion_costs=deletion_costs,
57
+ symmetric_substitution=symmetric_substitution,
58
+ default_substitution_cost=default_substitution_cost,
59
+ default_insertion_cost=default_insertion_cost,
60
+ default_deletion_cost=default_deletion_cost,
61
+ )
62
+
63
+
64
+ def batch_weighted_levenshtein_distance(
65
+ s: str,
66
+ candidates: list[str],
67
+ /,
68
+ substitution_costs: Optional[dict[tuple[str, str], float]] = None,
69
+ insertion_costs: Optional[dict[str, float]] = None,
70
+ deletion_costs: Optional[dict[str, float]] = None,
71
+ *,
72
+ symmetric_substitution: bool = True,
73
+ default_substitution_cost: float = 1.0,
74
+ default_insertion_cost: float = 1.0,
75
+ default_deletion_cost: float = 1.0,
76
+ ) -> list[float]:
77
+ """
78
+ Calculate weighted Levenshtein distances between a string and multiple candidates.
79
+
80
+ This is more efficient than calling :func:`weighted_levenshtein_distance` multiple times.
81
+
82
+ :param s: The string to compare (interpreted as the string read via OCR)
83
+ :param candidates: List of candidate strings to compare against
84
+ :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
85
+ substitution costs. Only one direction needs to be configured unless
86
+ `symmetric_substitution` is False.
87
+ Note that the runtime scales with the length of the longest substitution token.
88
+ Defaults to `ocr_stringdist.ocr_distance_map`.
89
+ :param insertion_costs: Dictionary mapping strings to their insertion costs.
90
+ :param deletion_costs: Dictionary mapping strings to their deletion costs.
91
+ :param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
92
+ symmetric? Defaults to True.
93
+ :param default_substitution_cost: The default substitution cost for character pairs not found
94
+ in `substitution_costs`.
95
+ :param default_insertion_cost: The default insertion cost for characters not found in
96
+ `insertion_costs`.
97
+ :param default_deletion_cost: The default deletion cost for characters not found in
98
+ `deletion_costs`.
99
+ :return: A list of distances corresponding to each candidate
100
+ """
101
+ if substitution_costs is None:
102
+ substitution_costs = ocr_distance_map
103
+ if insertion_costs is None:
104
+ insertion_costs = {}
105
+ if deletion_costs is None:
106
+ deletion_costs = {}
107
+ # _batch_weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
108
+ return _batch_weighted_levenshtein_distance( # type: ignore # noqa: F405
109
+ s,
110
+ candidates,
111
+ substitution_costs=substitution_costs,
112
+ insertion_costs=insertion_costs,
113
+ deletion_costs=deletion_costs,
114
+ symmetric_substitution=symmetric_substitution,
115
+ default_substitution_cost=default_substitution_cost,
116
+ default_insertion_cost=default_insertion_cost,
117
+ default_deletion_cost=default_deletion_cost,
118
+ )