ocr-stringdist 0.2.1__tar.gz → 0.2.2__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/CHANGELOG.md +12 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/Cargo.lock +1 -1
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/Cargo.toml +1 -1
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/Justfile +1 -1
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/PKG-INFO +40 -19
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/README.md +37 -17
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/docs/source/examples.rst +3 -3
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/docs/source/getting-started.rst +2 -2
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/docs/source/index.rst +2 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/pyproject.toml +3 -1
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/.github/workflows/CI.yml +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/.github/workflows/docs.yml +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/.gitignore +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/LICENSE +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/docs/Makefile +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/docs/make.bat +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/docs/source/api/index.rst +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/docs/source/changelog.rst +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/docs/source/conf.py +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/examples/batch_processing.py +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/examples/explain_distance.py +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/examples/weighted_levenshtein.py +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/mypy.ini +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/python/ocr_stringdist/__init__.py +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/python/ocr_stringdist/default_ocr_distances.py +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/python/ocr_stringdist/levenshtein.py +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/python/ocr_stringdist/matching.py +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/python/ocr_stringdist/py.typed +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/ruff.toml +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/src/cost_map.rs +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/src/explanation.rs +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/src/lib.rs +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/src/rust_stringdist.rs +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/src/types.rs +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/src/weighted_levenshtein.rs +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/tests/test_batch_weighted_levenshtein.py +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/tests/test_explain_weighted_levenshtein.py +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/tests/test_matching.py +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/tests/test_weighted_levenshtein.py +0 -0
- {ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/uv.lock +0 -0
@@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file.
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
7
7
|
|
8
|
+
## [0.2.2] - 2025-09-01
|
9
|
+
|
10
|
+
### Changed
|
11
|
+
|
12
|
+
- Improve documentation.
|
13
|
+
|
14
|
+
## [0.2.1] - 2025-08-31
|
15
|
+
|
16
|
+
### Fixed
|
17
|
+
|
18
|
+
- Documentation for PyPI
|
19
|
+
|
8
20
|
## [0.2.0] - 2025-08-31
|
9
21
|
|
10
22
|
### Added
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
|
-
Name:
|
3
|
-
Version: 0.2.1
|
2
|
+
Name: ocr-stringdist
|
3
|
+
Version: 0.2.2
|
4
4
|
Classifier: Programming Language :: Rust
|
5
5
|
Classifier: Programming Language :: Python
|
6
6
|
Classifier: Operating System :: OS Independent
|
@@ -8,6 +8,7 @@ License-File: LICENSE
|
|
8
8
|
Requires-Python: >=3.9
|
9
9
|
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
10
10
|
Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
|
11
|
+
Project-URL: documentation, https://niklasvonm.github.io/ocr-stringdist/
|
11
12
|
|
12
13
|
# OCR-StringDist
|
13
14
|
|
@@ -38,8 +39,6 @@ OCR-StringDist uses a **weighted Levenshtein distance**, assigning lower costs t
|
|
38
39
|
|
39
40
|
This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes, database entries).
|
40
41
|
|
41
|
-
> **Note:** This project is in early development. APIs may change in future releases.
|
42
|
-
|
43
42
|
## Installation
|
44
43
|
|
45
44
|
```bash
|
@@ -48,7 +47,9 @@ pip install ocr-stringdist
|
|
48
47
|
|
49
48
|
## Features
|
50
49
|
|
50
|
+
- **High Performance**: The core logic is implemented in Rust with speed in mind.
|
51
51
|
- **Weighted Levenshtein Distance**: Calculates Levenshtein distance with customizable costs for substitutions, insertions, and deletions. Includes an efficient batch version (`batch_weighted_levenshtein_distance`) for comparing one string against many candidates.
|
52
|
+
- **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
|
52
53
|
- **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
|
53
54
|
- **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
|
54
55
|
- **Unicode Support**: Works with arbitrary Unicode strings.
|
@@ -56,25 +57,45 @@ pip install ocr-stringdist
|
|
56
57
|
|
57
58
|
## Usage
|
58
59
|
|
59
|
-
###
|
60
|
+
### Basic Usage
|
61
|
+
|
62
|
+
```python
|
63
|
+
from ocr_stringdist import WeightedLevenshtein
|
64
|
+
|
65
|
+
# Default substitution costs are ocr_stringdist.ocr_distance_map.
|
66
|
+
wl = WeightedLevenshtein()
|
67
|
+
|
68
|
+
print(wl.distance("CXDE", "CODE")) # == 1
|
69
|
+
print(wl.distance("C0DE", "CODE")) # < 1
|
70
|
+
```
|
71
|
+
|
72
|
+
### Explain the Edit Path
|
73
|
+
|
74
|
+
```python
|
75
|
+
edit_path = wl.explain("C0DE", "CODE")
|
76
|
+
print(edit_path)
|
77
|
+
# [EditOperation(op_type='substitute', source_token='0', target_token='O', cost=0.1)]
|
78
|
+
```
|
79
|
+
|
80
|
+
### Fast Batch Calculations
|
81
|
+
|
82
|
+
Quickly compare a string to a list of candidates.
|
60
83
|
|
61
84
|
```python
|
62
|
-
|
63
|
-
|
64
|
-
# Using default OCR distance map
|
65
|
-
distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
|
66
|
-
print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
|
67
|
-
|
68
|
-
# Custom cost map
|
69
|
-
substitution_costs = {("In", "h"): 0.5}
|
70
|
-
distance = osd.weighted_levenshtein_distance(
|
71
|
-
"hi", "Ini",
|
72
|
-
substitution_costs=substitution_costs,
|
73
|
-
symmetric_substitution=True,
|
74
|
-
)
|
75
|
-
print(f"Distance with custom map: {distance}")
|
85
|
+
distances: list[float] = wl.batch_distance("CODE", ["CXDE", "C0DE"])
|
86
|
+
# [1.0, 0.1]
|
76
87
|
```
|
77
88
|
|
89
|
+
### Multi-character Substitutions
|
90
|
+
|
91
|
+
```python
|
92
|
+
# Custom costs with multi-character substitution
|
93
|
+
wl = WeightedLevenshtein(substitution_costs={("In", "h"): 0.5})
|
94
|
+
|
95
|
+
print(wl.distance("hi", "Ini")) # 0.5
|
96
|
+
```
|
97
|
+
|
98
|
+
|
78
99
|
## Acknowledgements
|
79
100
|
|
80
101
|
This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), providing the base implementations of the algorithms used here.
|
@@ -27,8 +27,6 @@ OCR-StringDist uses a **weighted Levenshtein distance**, assigning lower costs t
|
|
27
27
|
|
28
28
|
This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes, database entries).
|
29
29
|
|
30
|
-
> **Note:** This project is in early development. APIs may change in future releases.
|
31
|
-
|
32
30
|
## Installation
|
33
31
|
|
34
32
|
```bash
|
@@ -37,7 +35,9 @@ pip install ocr-stringdist
|
|
37
35
|
|
38
36
|
## Features
|
39
37
|
|
38
|
+
- **High Performance**: The core logic is implemented in Rust with speed in mind.
|
40
39
|
- **Weighted Levenshtein Distance**: Calculates Levenshtein distance with customizable costs for substitutions, insertions, and deletions. Includes an efficient batch version (`batch_weighted_levenshtein_distance`) for comparing one string against many candidates.
|
40
|
+
- **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
|
41
41
|
- **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
|
42
42
|
- **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
|
43
43
|
- **Unicode Support**: Works with arbitrary Unicode strings.
|
@@ -45,25 +45,45 @@ pip install ocr-stringdist
|
|
45
45
|
|
46
46
|
## Usage
|
47
47
|
|
48
|
-
###
|
48
|
+
### Basic Usage
|
49
|
+
|
50
|
+
```python
|
51
|
+
from ocr_stringdist import WeightedLevenshtein
|
52
|
+
|
53
|
+
# Default substitution costs are ocr_stringdist.ocr_distance_map.
|
54
|
+
wl = WeightedLevenshtein()
|
55
|
+
|
56
|
+
print(wl.distance("CXDE", "CODE")) # == 1
|
57
|
+
print(wl.distance("C0DE", "CODE")) # < 1
|
58
|
+
```
|
59
|
+
|
60
|
+
### Explain the Edit Path
|
61
|
+
|
62
|
+
```python
|
63
|
+
edit_path = wl.explain("C0DE", "CODE")
|
64
|
+
print(edit_path)
|
65
|
+
# [EditOperation(op_type='substitute', source_token='0', target_token='O', cost=0.1)]
|
66
|
+
```
|
67
|
+
|
68
|
+
### Fast Batch Calculations
|
69
|
+
|
70
|
+
Quickly compare a string to a list of candidates.
|
49
71
|
|
50
72
|
```python
|
51
|
-
|
52
|
-
|
53
|
-
# Using default OCR distance map
|
54
|
-
distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
|
55
|
-
print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
|
56
|
-
|
57
|
-
# Custom cost map
|
58
|
-
substitution_costs = {("In", "h"): 0.5}
|
59
|
-
distance = osd.weighted_levenshtein_distance(
|
60
|
-
"hi", "Ini",
|
61
|
-
substitution_costs=substitution_costs,
|
62
|
-
symmetric_substitution=True,
|
63
|
-
)
|
64
|
-
print(f"Distance with custom map: {distance}")
|
73
|
+
distances: list[float] = wl.batch_distance("CODE", ["CXDE", "C0DE"])
|
74
|
+
# [1.0, 0.1]
|
65
75
|
```
|
66
76
|
|
77
|
+
### Multi-character Substitutions
|
78
|
+
|
79
|
+
```python
|
80
|
+
# Custom costs with multi-character substitution
|
81
|
+
wl = WeightedLevenshtein(substitution_costs={("In", "h"): 0.5})
|
82
|
+
|
83
|
+
print(wl.distance("hi", "Ini")) # 0.5
|
84
|
+
```
|
85
|
+
|
86
|
+
|
67
87
|
## Acknowledgements
|
68
88
|
|
69
89
|
This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), providing the base implementations of the algorithms used here.
|
@@ -9,11 +9,11 @@ Using the default pre-defined map for common OCR errors:
|
|
9
9
|
|
10
10
|
.. code-block:: python
|
11
11
|
|
12
|
-
|
12
|
+
from ocr_stringdist import WeightedLevenshtein
|
13
13
|
|
14
14
|
# Compare "OCR5" and "OCRS"
|
15
15
|
# The default ocr_distance_map gives 'S' <-> '5' a cost of 0.3
|
16
|
-
distance =
|
16
|
+
distance: float = WeightedLevenshtein().distance("OCR5", "OCRS")
|
17
17
|
print(f"Distance between 'OCR5' and 'OCRS' (default map): {distance}")
|
18
18
|
# Output: Distance between 'OCR5' and 'OCRS' (default map): 0.3
|
19
19
|
|
@@ -63,7 +63,7 @@ This is a primary use case: finding the best match for an OCR string from a list
|
|
63
63
|
|
64
64
|
# Method 2: Using WeightedLevenshtein.batch_distance
|
65
65
|
# Generally more efficient when comparing against many candidates.
|
66
|
-
distances = wl.batch_distance(ocr_output, possible_cities)
|
66
|
+
distances: list[float] = wl.batch_distance(ocr_output, possible_cities)
|
67
67
|
|
68
68
|
min_dist_batch = min(distances)
|
69
69
|
best_candidate_batch = possible_cities[distances.index(min_dist_batch)]
|
@@ -16,11 +16,11 @@ After installation, you can quickly calculate an OCR-aware string distance:
|
|
16
16
|
|
17
17
|
.. code-block:: python
|
18
18
|
|
19
|
-
|
19
|
+
from ocr_stringdist import WeightedLevenshtein
|
20
20
|
|
21
21
|
# Calculate distance using the default OCR error costs
|
22
22
|
# ("O" vs "0" has a low cost)
|
23
|
-
distance =
|
23
|
+
distance = WeightedLevenshtein().distance("HELLO", "HELL0")
|
24
24
|
|
25
25
|
print(f"The OCR-aware distance is: {distance}")
|
26
26
|
|
@@ -39,7 +39,9 @@ This makes it ideal for matching potentially incorrect OCR output against known
|
|
39
39
|
Features
|
40
40
|
========
|
41
41
|
|
42
|
+
- **High Performance**: The core logic is implemented in Rust with speed in mind.
|
42
43
|
- **Weighted Levenshtein Distance**: Calculates Levenshtein distance with customizable costs for substitutions, insertions, and deletions. Includes an efficient batch version (`batch_weighted_levenshtein_distance`) for comparing one string against many candidates.
|
44
|
+
- **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
|
43
45
|
- **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
|
44
46
|
- **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
|
45
47
|
- **Unicode Support**: Works with arbitrary Unicode strings.
|
@@ -3,7 +3,7 @@ requires = ["maturin>=0.14,<2"]
|
|
3
3
|
build-backend = "maturin"
|
4
4
|
|
5
5
|
[project]
|
6
|
-
name = "
|
6
|
+
name = "ocr-stringdist"
|
7
7
|
dynamic = ["version"]
|
8
8
|
readme = "README.md"
|
9
9
|
requires-python = ">=3.9"
|
@@ -15,6 +15,7 @@ classifiers = [
|
|
15
15
|
|
16
16
|
[project.urls]
|
17
17
|
repository = "https://github.com/NiklasvonM/ocr-stringdist"
|
18
|
+
documentation = "https://niklasvonm.github.io/ocr-stringdist/"
|
18
19
|
|
19
20
|
|
20
21
|
[tool.maturin]
|
@@ -38,3 +39,4 @@ docs = [
|
|
38
39
|
"sphinx-mdinclude>=0.6.2",
|
39
40
|
"sphinx-rtd-theme>=3.0.2",
|
40
41
|
]
|
42
|
+
temp = []
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{ocr_stringdist-0.2.1 → ocr_stringdist-0.2.2}/python/ocr_stringdist/default_ocr_distances.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|