ocr-stringdist 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/CHANGELOG.md +12 -0
  2. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/Cargo.lock +1 -1
  3. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/Cargo.toml +1 -1
  4. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/Justfile +1 -1
  5. ocr_stringdist-0.2.2/PKG-INFO +102 -0
  6. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/README.md +37 -17
  7. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/docs/source/examples.rst +3 -3
  8. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/docs/source/getting-started.rst +2 -2
  9. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/docs/source/index.rst +2 -0
  10. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/pyproject.toml +4 -1
  11. ocr_stringdist-0.2.0/PKG-INFO +0 -9
  12. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/.github/workflows/CI.yml +0 -0
  13. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/.github/workflows/docs.yml +0 -0
  14. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/.gitignore +0 -0
  15. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/LICENSE +0 -0
  16. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/docs/Makefile +0 -0
  17. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/docs/make.bat +0 -0
  18. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/docs/source/api/index.rst +0 -0
  19. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/docs/source/changelog.rst +0 -0
  20. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/docs/source/conf.py +0 -0
  21. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/examples/batch_processing.py +0 -0
  22. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/examples/explain_distance.py +0 -0
  23. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/examples/weighted_levenshtein.py +0 -0
  24. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/mypy.ini +0 -0
  25. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/python/ocr_stringdist/__init__.py +0 -0
  26. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/python/ocr_stringdist/default_ocr_distances.py +0 -0
  27. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/python/ocr_stringdist/levenshtein.py +0 -0
  28. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/python/ocr_stringdist/matching.py +0 -0
  29. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/python/ocr_stringdist/py.typed +0 -0
  30. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/ruff.toml +0 -0
  31. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/src/cost_map.rs +0 -0
  32. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/src/explanation.rs +0 -0
  33. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/src/lib.rs +0 -0
  34. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/src/rust_stringdist.rs +0 -0
  35. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/src/types.rs +0 -0
  36. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/src/weighted_levenshtein.rs +0 -0
  37. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/tests/test_batch_weighted_levenshtein.py +0 -0
  38. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/tests/test_explain_weighted_levenshtein.py +0 -0
  39. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/tests/test_matching.py +0 -0
  40. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/tests/test_weighted_levenshtein.py +0 -0
  41. {ocr_stringdist-0.2.0 → ocr_stringdist-0.2.2}/uv.lock +0 -0
@@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.2.2] - 2025-09-01
9
+
10
+ ### Changed
11
+
12
+ - Improve documentation.
13
+
14
+ ## [0.2.1] - 2025-08-31
15
+
16
+ ### Fixed
17
+
18
+ - Documentation for PyPI.
19
+
8
20
  ## [0.2.0] - 2025-08-31
9
21
 
10
22
  ### Added
@@ -74,7 +74,7 @@ dependencies = [
74
74
 
75
75
  [[package]]
76
76
  name = "ocr_stringdist"
77
- version = "0.2.0"
77
+ version = "0.2.2"
78
78
  dependencies = [
79
79
  "pyo3",
80
80
  "rayon",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "ocr_stringdist"
3
- version = "0.2.0"
3
+ version = "0.2.2"
4
4
  edition = "2021"
5
5
  description = "String distances considering OCR errors."
6
6
  authors = ["Niklas von Moers <niklasvmoers@protonmail.com>"]
@@ -1,7 +1,7 @@
1
1
  venv:
2
2
  rm -rf .venv
3
3
  uv venv
4
- uv sync
4
+ uv sync --all-groups
5
5
 
6
6
  pytest:
7
7
  uv run maturin develop
@@ -0,0 +1,102 @@
1
+ Metadata-Version: 2.4
2
+ Name: ocr-stringdist
3
+ Version: 0.2.2
4
+ Classifier: Programming Language :: Rust
5
+ Classifier: Programming Language :: Python
6
+ Classifier: Operating System :: OS Independent
7
+ License-File: LICENSE
8
+ Requires-Python: >=3.9
9
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
10
+ Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
11
+ Project-URL: documentation, https://niklasvonm.github.io/ocr-stringdist/
12
+
13
+ # OCR-StringDist
14
+
15
+ A Python library for fast string distance calculations that account for common OCR (optical character recognition) errors.
16
+
17
+ Documentation: https://niklasvonm.github.io/ocr-stringdist/
18
+
19
+ [![PyPI](https://img.shields.io/badge/PyPI-Package-blue)](https://pypi.org/project/ocr-stringdist/)
20
+ [![License](https://img.shields.io/badge/License-MIT-green)](LICENSE)
21
+
22
+ ## Overview
23
+
24
+ Standard string distances (like Levenshtein) treat all character substitutions equally. This is suboptimal for text read from images via OCR, where errors like `O` vs `0` are far more common than, say, `O` vs `X`.
25
+
26
+ OCR-StringDist uses a **weighted Levenshtein distance**, assigning lower costs to common OCR errors.
27
+
28
+ **Example:** Matching against the correct word `CODE`:
29
+
30
+ * **Standard Levenshtein:**
31
+ * $d(\text{"CODE"}, \text{"C0DE"}) = 1$ (O → 0)
32
+ * $d(\text{"CODE"}, \text{"CXDE"}) = 1$ (O → X)
33
+ * Result: Both appear equally likely/distant.
34
+
35
+ * **OCR-StringDist (Weighted):**
36
+ * $d(\text{"CODE"}, \text{"C0DE"}) \approx 0.1$ (common error, low cost)
37
+ * $d(\text{"CODE"}, \text{"CXDE"}) = 1.0$ (unlikely error, high cost)
38
+ * Result: Correctly identifies `C0DE` as a much closer match.
39
+
40
+ This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes, database entries).
41
+
42
+ ## Installation
43
+
44
+ ```bash
45
+ pip install ocr-stringdist
46
+ ```
47
+
48
+ ## Features
49
+
50
+ - **High Performance**: The core logic is implemented in Rust with speed in mind.
51
+ - **Weighted Levenshtein Distance**: Calculates Levenshtein distance with customizable costs for substitutions, insertions, and deletions. Includes an efficient batch version (`batch_weighted_levenshtein_distance`) for comparing one string against many candidates.
52
+ - **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
53
+ - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
54
+ - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
55
+ - **Unicode Support**: Works with arbitrary Unicode strings.
56
+ - **Best Match Finder**: Includes a utility function `find_best_candidate` to efficiently find the best match from a list based on _any_ distance function.
57
+
58
+ ## Usage
59
+
60
+ ### Basic Usage
61
+
62
+ ```python
63
+ from ocr_stringdist import WeightedLevenshtein
64
+
65
+ # Default substitution costs are ocr_stringdist.ocr_distance_map.
66
+ wl = WeightedLevenshtein()
67
+
68
+ print(wl.distance("CXDE", "CODE")) # == 1
69
+ print(wl.distance("C0DE", "CODE")) # < 1
70
+ ```
71
+
72
+ ### Explain the Edit Path
73
+
74
+ ```python
75
+ edit_path = wl.explain("C0DE", "CODE")
76
+ print(edit_path)
77
+ # [EditOperation(op_type='substitute', source_token='0', target_token='O', cost=0.1)]
78
+ ```
79
+
80
+ ### Fast Batch Calculations
81
+
82
+ Quickly compare a string to a list of candidates.
83
+
84
+ ```python
85
+ distances: list[float] = wl.batch_distance("CODE", ["CXDE", "C0DE"])
86
+ # [1.0, 0.1]
87
+ ```
88
+
89
+ ### Multi-character Substitutions
90
+
91
+ ```python
92
+ # Custom costs with multi-character substitution
93
+ wl = WeightedLevenshtein(substitution_costs={("In", "h"): 0.5})
94
+
95
+ print(wl.distance("hi", "Ini")) # 0.5
96
+ ```
97
+
98
+
99
+ ## Acknowledgements
100
+
101
+ This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), which provides the base implementations of the algorithms used here.
102
+
@@ -27,8 +27,6 @@ OCR-StringDist uses a **weighted Levenshtein distance**, assigning lower costs t
27
27
 
28
28
  This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes, database entries).
29
29
 
30
- > **Note:** This project is in early development. APIs may change in future releases.
31
-
32
30
  ## Installation
33
31
 
34
32
  ```bash
@@ -37,7 +35,9 @@ pip install ocr-stringdist
37
35
 
38
36
  ## Features
39
37
 
38
+ - **High Performance**: The core logic is implemented in Rust with speed in mind.
40
39
  - **Weighted Levenshtein Distance**: Calculates Levenshtein distance with customizable costs for substitutions, insertions, and deletions. Includes an efficient batch version (`batch_weighted_levenshtein_distance`) for comparing one string against many candidates.
40
+ - **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
41
41
  - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
42
42
  - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
43
43
  - **Unicode Support**: Works with arbitrary Unicode strings.
@@ -45,25 +45,45 @@ pip install ocr-stringdist
45
45
 
46
46
  ## Usage
47
47
 
48
- ### Weighted Levenshtein Distance
48
+ ### Basic Usage
49
+
50
+ ```python
51
+ from ocr_stringdist import WeightedLevenshtein
52
+
53
+ # Default substitution costs are ocr_stringdist.ocr_distance_map.
54
+ wl = WeightedLevenshtein()
55
+
56
+ print(wl.distance("CXDE", "CODE")) # == 1
57
+ print(wl.distance("C0DE", "CODE")) # < 1
58
+ ```
59
+
60
+ ### Explain the Edit Path
61
+
62
+ ```python
63
+ edit_path = wl.explain("C0DE", "CODE")
64
+ print(edit_path)
65
+ # [EditOperation(op_type='substitute', source_token='0', target_token='O', cost=0.1)]
66
+ ```
67
+
68
+ ### Fast Batch Calculations
69
+
70
+ Quickly compare a string to a list of candidates.
49
71
 
50
72
  ```python
51
- import ocr_stringdist as osd
52
-
53
- # Using default OCR distance map
54
- distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
55
- print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
56
-
57
- # Custom cost map
58
- substitution_costs = {("In", "h"): 0.5}
59
- distance = osd.weighted_levenshtein_distance(
60
- "hi", "Ini",
61
- substitution_costs=substitution_costs,
62
- symmetric_substitution=True,
63
- )
64
- print(f"Distance with custom map: {distance}")
73
+ distances: list[float] = wl.batch_distance("CODE", ["CXDE", "C0DE"])
74
+ # [1.0, 0.1]
65
75
  ```
66
76
 
77
+ ### Multi-character Substitutions
78
+
79
+ ```python
80
+ # Custom costs with multi-character substitution
81
+ wl = WeightedLevenshtein(substitution_costs={("In", "h"): 0.5})
82
+
83
+ print(wl.distance("hi", "Ini")) # 0.5
84
+ ```
85
+
86
+
67
87
  ## Acknowledgements
68
88
 
69
89
  This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), which provides the base implementations of the algorithms used here.
@@ -9,11 +9,11 @@ Using the default pre-defined map for common OCR errors:
9
9
 
10
10
  .. code-block:: python
11
11
 
12
- import ocr_stringdist as osd
12
+ from ocr_stringdist import WeightedLevenshtein
13
13
 
14
14
  # Compare "OCR5" and "OCRS"
15
15
  # The default ocr_distance_map gives 'S' <-> '5' a cost of 0.3
16
- distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
16
+ distance: float = WeightedLevenshtein().distance("OCR5", "OCRS")
17
17
  print(f"Distance between 'OCR5' and 'OCRS' (default map): {distance}")
18
18
  # Output: Distance between 'OCR5' and 'OCRS' (default map): 0.3
19
19
 
@@ -63,7 +63,7 @@ This is a primary use case: finding the best match for an OCR string from a list
63
63
 
64
64
  # Method 2: Using WeightedLevenshtein.batch_distance
65
65
  # Generally more efficient when comparing against many candidates.
66
- distances = wl.batch_distance(ocr_output, possible_cities)
66
+ distances: list[float] = wl.batch_distance(ocr_output, possible_cities)
67
67
 
68
68
  min_dist_batch = min(distances)
69
69
  best_candidate_batch = possible_cities[distances.index(min_dist_batch)]
@@ -16,11 +16,11 @@ After installation, you can quickly calculate an OCR-aware string distance:
16
16
 
17
17
  .. code-block:: python
18
18
 
19
- import ocr_stringdist as osd
19
+ from ocr_stringdist import WeightedLevenshtein
20
20
 
21
21
  # Calculate distance using the default OCR error costs
22
22
  # ("O" vs "0" has a low cost)
23
- distance = osd.weighted_levenshtein_distance("HELLO", "HELL0")
23
+ distance = WeightedLevenshtein().distance("HELLO", "HELL0")
24
24
 
25
25
  print(f"The OCR-aware distance is: {distance}")
26
26
 
@@ -39,7 +39,9 @@ This makes it ideal for matching potentially incorrect OCR output against known
39
39
  Features
40
40
  ========
41
41
 
42
+ - **High Performance**: The core logic is implemented in Rust with speed in mind.
42
43
  - **Weighted Levenshtein Distance**: Calculates Levenshtein distance with customizable costs for substitutions, insertions, and deletions. Includes an efficient batch version (`batch_weighted_levenshtein_distance`) for comparing one string against many candidates.
44
+ - **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
43
45
  - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
44
46
  - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
45
47
  - **Unicode Support**: Works with arbitrary Unicode strings.
@@ -3,8 +3,9 @@ requires = ["maturin>=0.14,<2"]
3
3
  build-backend = "maturin"
4
4
 
5
5
  [project]
6
- name = "ocr_stringdist"
6
+ name = "ocr-stringdist"
7
7
  dynamic = ["version"]
8
+ readme = "README.md"
8
9
  requires-python = ">=3.9"
9
10
  classifiers = [
10
11
  "Programming Language :: Rust",
@@ -14,6 +15,7 @@ classifiers = [
14
15
 
15
16
  [project.urls]
16
17
  repository = "https://github.com/NiklasvonM/ocr-stringdist"
18
+ documentation = "https://niklasvonm.github.io/ocr-stringdist/"
17
19
 
18
20
 
19
21
  [tool.maturin]
@@ -37,3 +39,4 @@ docs = [
37
39
  "sphinx-mdinclude>=0.6.2",
38
40
  "sphinx-rtd-theme>=3.0.2",
39
41
  ]
42
+ temp = []
@@ -1,9 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: ocr_stringdist
3
- Version: 0.2.0
4
- Classifier: Programming Language :: Rust
5
- Classifier: Programming Language :: Python
6
- Classifier: Operating System :: OS Independent
7
- License-File: LICENSE
8
- Requires-Python: >=3.9
9
- Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
File without changes
File without changes
File without changes
File without changes