ocr-stringdist 0.0.1__tar.gz → 0.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ocr_stringdist-0.0.1 → ocr_stringdist-0.0.3}/.gitignore +1 -0
- {ocr_stringdist-0.0.1 → ocr_stringdist-0.0.3}/Cargo.lock +1 -2
- {ocr_stringdist-0.0.1 → ocr_stringdist-0.0.3}/Cargo.toml +1 -2
- ocr_stringdist-0.0.3/LICENSE +21 -0
- ocr_stringdist-0.0.3/PKG-INFO +64 -0
- ocr_stringdist-0.0.3/README.md +48 -0
- ocr_stringdist-0.0.3/example.py +41 -0
- ocr_stringdist-0.0.3/python/ocr_stringdist/__init__.py +41 -0
- ocr_stringdist-0.0.3/python/ocr_stringdist/default_ocr_distances.py +38 -0
- ocr_stringdist-0.0.3/src/lib.rs +8 -0
- {ocr_stringdist-0.0.1 → ocr_stringdist-0.0.3}/src/rust_stringdist.rs +5 -12
- {ocr_stringdist-0.0.1 → ocr_stringdist-0.0.3}/src/weighted_levenshtein.rs +7 -71
- ocr_stringdist-0.0.3/tests/test_ocr_stringdist.py +5 -0
- ocr_stringdist-0.0.1/PKG-INFO +0 -16
- ocr_stringdist-0.0.1/README.md +0 -1
- ocr_stringdist-0.0.1/example.py +0 -50
- ocr_stringdist-0.0.1/python/ocr_stringdist/__init__.py +0 -1
- ocr_stringdist-0.0.1/python/ocr_stringdist/__init__.pyi +0 -8
- ocr_stringdist-0.0.1/src/lib.rs +0 -11
- ocr_stringdist-0.0.1/tests/test_ocr_stringdist.py +0 -5
- {ocr_stringdist-0.0.1 → ocr_stringdist-0.0.3}/.github/workflows/CI.yml +0 -0
- {ocr_stringdist-0.0.1 → ocr_stringdist-0.0.3}/Justfile +0 -0
- {ocr_stringdist-0.0.1 → ocr_stringdist-0.0.3}/pyproject.toml +0 -0
- {ocr_stringdist-0.0.1 → ocr_stringdist-0.0.3}/python/ocr_stringdist/py.typed +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "ocr_stringdist"
|
3
|
-
version = "0.0.1"
|
3
|
+
version = "0.0.3"
|
4
4
|
edition = "2021"
|
5
5
|
description = "String distances considering OCR errors."
|
6
6
|
authors = ["Niklas von Moers <niklasvmoers@protonmail.com>"]
|
@@ -16,7 +16,6 @@ crate-type = ["cdylib"]
|
|
16
16
|
[dependencies]
|
17
17
|
pyo3 = { version = "0.24.0", features = [] }
|
18
18
|
ahash = "^0.8"
|
19
|
-
once_cell = "1.21.3"
|
20
19
|
smallvec = "1.15.0"
|
21
20
|
|
22
21
|
[features]
|
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2025 Niklas von Moers
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
@@ -0,0 +1,64 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: ocr_stringdist
|
3
|
+
Version: 0.0.3
|
4
|
+
Classifier: Programming Language :: Rust
|
5
|
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
6
|
+
Classifier: Operating System :: OS Independent
|
7
|
+
License-File: LICENSE
|
8
|
+
Summary: String distances considering OCR errors.
|
9
|
+
Author: Niklas von Moers <niklasvmoers@protonmail.com>
|
10
|
+
Author-email: Niklas von Moers <niklasvmoers@protonmail.com>
|
11
|
+
License: MIT
|
12
|
+
Requires-Python: >=3.9
|
13
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
14
|
+
Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
|
15
|
+
|
16
|
+
# OCR-StringDist
|
17
|
+
|
18
|
+
A Python library for string distance calculations that account for common OCR (optical character recognition) errors.
|
19
|
+
|
20
|
+
[PyPI](https://pypi.org/project/ocr-stringdist/)
|
21
|
+
[License: MIT](LICENSE)
|
22
|
+
|
23
|
+
## Overview
|
24
|
+
|
25
|
+
OCR-StringDist provides specialized string distance algorithms that account for optical character recognition (OCR) errors. Unlike traditional string comparison algorithms, OCR-StringDist considers common OCR confusions (like "0" vs "O", "6" vs "G", etc.) when calculating distances between strings.
|
26
|
+
|
27
|
+
> **Note:** This project is in early development. APIs may change in future releases.
|
28
|
+
|
29
|
+
## Installation
|
30
|
+
|
31
|
+
```bash
|
32
|
+
pip install ocr-stringdist
|
33
|
+
```
|
34
|
+
|
35
|
+
## Features
|
36
|
+
|
37
|
+
- **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
|
38
|
+
- **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
|
39
|
+
- **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
|
40
|
+
|
41
|
+
## Usage
|
42
|
+
|
43
|
+
```python
|
44
|
+
import ocr_stringdist as osd
|
45
|
+
|
46
|
+
# Using default OCR distance map
|
47
|
+
distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
|
48
|
+
print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
|
49
|
+
|
50
|
+
# Custom cost map
|
51
|
+
custom_map = {("f", "t"): 0.2, ("m", "n"): 0.1}
|
52
|
+
distance = osd.weighted_levenshtein_distance(
|
53
|
+
"first", "tirst",
|
54
|
+
cost_map=custom_map,
|
55
|
+
symmetric=True,
|
56
|
+
default_cost=1.0
|
57
|
+
)
|
58
|
+
print(f"Distance with custom map: {distance}")
|
59
|
+
```
|
60
|
+
|
61
|
+
## Acknowledgements
|
62
|
+
|
63
|
+
This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), providing the base implementations of the algorithms used here.
|
64
|
+
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# OCR-StringDist
|
2
|
+
|
3
|
+
A Python library for string distance calculations that account for common OCR (optical character recognition) errors.
|
4
|
+
|
5
|
+
[PyPI](https://pypi.org/project/ocr-stringdist/)
|
6
|
+
[License: MIT](LICENSE)
|
7
|
+
|
8
|
+
## Overview
|
9
|
+
|
10
|
+
OCR-StringDist provides specialized string distance algorithms that account for optical character recognition (OCR) errors. Unlike traditional string comparison algorithms, OCR-StringDist considers common OCR confusions (like "0" vs "O", "6" vs "G", etc.) when calculating distances between strings.
|
11
|
+
|
12
|
+
> **Note:** This project is in early development. APIs may change in future releases.
|
13
|
+
|
14
|
+
## Installation
|
15
|
+
|
16
|
+
```bash
|
17
|
+
pip install ocr-stringdist
|
18
|
+
```
|
19
|
+
|
20
|
+
## Features
|
21
|
+
|
22
|
+
- **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
|
23
|
+
- **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
|
24
|
+
- **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
|
25
|
+
|
26
|
+
## Usage
|
27
|
+
|
28
|
+
```python
|
29
|
+
import ocr_stringdist as osd
|
30
|
+
|
31
|
+
# Using default OCR distance map
|
32
|
+
distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
|
33
|
+
print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
|
34
|
+
|
35
|
+
# Custom cost map
|
36
|
+
custom_map = {("f", "t"): 0.2, ("m", "n"): 0.1}
|
37
|
+
distance = osd.weighted_levenshtein_distance(
|
38
|
+
"first", "tirst",
|
39
|
+
cost_map=custom_map,
|
40
|
+
symmetric=True,
|
41
|
+
default_cost=1.0
|
42
|
+
)
|
43
|
+
print(f"Distance with custom map: {distance}")
|
44
|
+
```
|
45
|
+
|
46
|
+
## Acknowledgements
|
47
|
+
|
48
|
+
This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), providing the base implementations of the algorithms used here.
|
@@ -0,0 +1,41 @@
|
|
1
|
+
from ocr_stringdist import weighted_levenshtein_distance
|
2
|
+
from icecream import ic
|
3
|
+
|
4
|
+
ic(
|
5
|
+
weighted_levenshtein_distance(
|
6
|
+
"12345G",
|
7
|
+
"123456",
|
8
|
+
# Default costs
|
9
|
+
),
|
10
|
+
)
|
11
|
+
|
12
|
+
ic(
|
13
|
+
weighted_levenshtein_distance(
|
14
|
+
"12345G",
|
15
|
+
"123456",
|
16
|
+
{("G", "6"): 0.1}, # Custom cost_map
|
17
|
+
)
|
18
|
+
)
|
19
|
+
|
20
|
+
ic(
|
21
|
+
weighted_levenshtein_distance(
|
22
|
+
"ABCDE",
|
23
|
+
"XBCDE",
|
24
|
+
cost_map={},
|
25
|
+
default_cost=0.8, # Lower default substitution cost (default is 1.0)
|
26
|
+
)
|
27
|
+
)
|
28
|
+
|
29
|
+
ic(
|
30
|
+
weighted_levenshtein_distance(
|
31
|
+
"RO8ERT",
|
32
|
+
"R0BERT",
|
33
|
+
{("O", "0"): 0.1, ("B", "8"): 0.2},
|
34
|
+
)
|
35
|
+
)
|
36
|
+
|
37
|
+
|
38
|
+
ic(weighted_levenshtein_distance("A", "B", {("A", "B"): 0.0}, symmetric=False))
|
39
|
+
ic(weighted_levenshtein_distance("A", "B", {("B", "A"): 0.0}, symmetric=False))
|
40
|
+
ic(weighted_levenshtein_distance("B", "A", {("B", "A"): 0.0}, symmetric=False))
|
41
|
+
ic(weighted_levenshtein_distance("B", "A", {("A", "B"): 0.0}, symmetric=False))
|
@@ -0,0 +1,41 @@
|
|
1
|
+
from typing import Optional
|
2
|
+
|
3
|
+
from ._rust_stringdist import * # noqa: F403
|
4
|
+
|
5
|
+
from .default_ocr_distances import ocr_distance_map
|
6
|
+
|
7
|
+
|
8
|
+
__all__ = [
|
9
|
+
"ocr_distance_map",
|
10
|
+
"weighted_levenshtein_distance", # noqa: F405
|
11
|
+
]
|
12
|
+
|
13
|
+
|
14
|
+
def weighted_levenshtein_distance(
|
15
|
+
s1: str,
|
16
|
+
s2: str,
|
17
|
+
/,
|
18
|
+
cost_map: Optional[dict[tuple[str, str], float]] = None,
|
19
|
+
*,
|
20
|
+
symmetric: bool = True,
|
21
|
+
default_cost: float = 1.0,
|
22
|
+
) -> float:
|
23
|
+
"""
|
24
|
+
Levenshtein distance with custom substitution costs.
|
25
|
+
Insertion/deletion costs are 1.
|
26
|
+
|
27
|
+
The default `cost_map` considers common OCR errors, see `ocr_stringdist.ocr_distance_map`.
|
28
|
+
|
29
|
+
:param s1: First string
|
30
|
+
:param s2: Second string
|
31
|
+
:param cost_map: Dictionary mapping tuples of characters to their substitution cost.
|
32
|
+
Only one direction needs to be configured unless `symmetric` is False.
|
33
|
+
Defaults to `ocr_stringdist.ocr_distance_map`.
|
34
|
+
:param symmetric: Should the keys of `cost_map` be considered to be symmetric? Defaults to True.
|
35
|
+
:param default_cost: The default substitution cost for character pairs not found in `cost_map`.
|
36
|
+
"""
|
37
|
+
if cost_map is None:
|
38
|
+
cost_map = ocr_distance_map
|
39
|
+
return _weighted_levenshtein_distance( # noqa: F405
|
40
|
+
s1, s2, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
|
41
|
+
)
|
@@ -0,0 +1,38 @@
|
|
1
|
+
ocr_distance_map: dict[tuple[str, str], float] = {
|
2
|
+
("O", "0"): 0.1,
|
3
|
+
("l", "1"): 0.1,
|
4
|
+
("I", "1"): 0.15,
|
5
|
+
("o", "0"): 0.2,
|
6
|
+
("B", "8"): 0.25,
|
7
|
+
("S", "5"): 0.3,
|
8
|
+
("G", "6"): 0.3,
|
9
|
+
("Z", "2"): 0.3,
|
10
|
+
("C", "c"): 0.3,
|
11
|
+
("é", "e"): 0.3,
|
12
|
+
("Ä", "A"): 0.4,
|
13
|
+
("Ö", "O"): 0.4,
|
14
|
+
("Ü", "U"): 0.4,
|
15
|
+
("c", "e"): 0.4,
|
16
|
+
("a", "o"): 0.4,
|
17
|
+
("u", "v"): 0.4,
|
18
|
+
("i", "l"): 0.4,
|
19
|
+
("s", "5"): 0.4,
|
20
|
+
("m", "n"): 0.5,
|
21
|
+
("f", "s"): 0.5,
|
22
|
+
(".", ","): 0.5,
|
23
|
+
("2", "Z"): 0.5,
|
24
|
+
("t", "f"): 0.6,
|
25
|
+
("r", "n"): 0.6,
|
26
|
+
("-", "_"): 0.6,
|
27
|
+
("ß", "B"): 0.6,
|
28
|
+
("h", "b"): 0.7,
|
29
|
+
("v", "y"): 0.7,
|
30
|
+
("i", "j"): 0.7,
|
31
|
+
("é", "á"): 0.7,
|
32
|
+
("E", "F"): 0.8,
|
33
|
+
}
|
34
|
+
"""
|
35
|
+
Pre-defined distance map between characters, considering common OCR errors.
|
36
|
+
The distances are between 0 and 1.
|
37
|
+
This map is intended to be used with `symmetric=True`.
|
38
|
+
"""
|
@@ -1,23 +1,17 @@
|
|
1
|
-
use crate::custom_levenshtein_distance as _weighted_lev;
|
2
1
|
use crate::custom_levenshtein_distance_with_cost_map as _weighted_lev_with_map;
|
3
2
|
use crate::OcrCostMap;
|
4
3
|
use pyo3::prelude::*;
|
5
4
|
use pyo3::types::PyDict;
|
6
5
|
use std::collections::HashMap;
|
7
6
|
|
8
|
-
// Calculates the Levenshtein distance between two strings.
|
9
|
-
#[pyfunction]
|
10
|
-
fn ocr_weighted_levenshtein_distance(a: &str, b: &str) -> PyResult<f64> {
|
11
|
-
Ok(_weighted_lev(a, b))
|
12
|
-
}
|
13
|
-
|
14
7
|
// Calculates the weighted Levenshtein distance with a custom cost map from Python.
|
15
8
|
#[pyfunction]
|
16
|
-
#[pyo3(signature = (a, b, cost_map, default_cost = None))]
|
17
|
-
fn custom_weighted_levenshtein_distance(
|
9
|
+
#[pyo3(signature = (a, b, cost_map, symmetric = true, default_cost = 1.0))]
|
10
|
+
fn _weighted_levenshtein_distance(
|
18
11
|
a: &str,
|
19
12
|
b: &str,
|
20
13
|
cost_map: &Bound<'_, PyDict>,
|
14
|
+
symmetric: bool,
|
21
15
|
default_cost: Option<f64>,
|
22
16
|
) -> PyResult<f64> {
|
23
17
|
let default_cost_value = default_cost.unwrap_or(1.0);
|
@@ -38,14 +32,13 @@ fn custom_weighted_levenshtein_distance(
|
|
38
32
|
}
|
39
33
|
|
40
34
|
// Create a custom cost map and calculate the distance
|
41
|
-
let custom_cost_map = OcrCostMap::new(char_costs, default_cost_value);
|
35
|
+
let custom_cost_map = OcrCostMap::new(char_costs, default_cost_value, symmetric);
|
42
36
|
Ok(_weighted_lev_with_map(a, b, &custom_cost_map))
|
43
37
|
}
|
44
38
|
|
45
39
|
/// A Python module implemented in Rust.
|
46
40
|
#[pymodule]
|
47
41
|
pub fn _rust_stringdist(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
|
48
|
-
m.add_function(wrap_pyfunction!(ocr_weighted_levenshtein_distance, m)?)?;
|
49
|
-
m.add_function(wrap_pyfunction!(custom_weighted_levenshtein_distance, m)?)?;
|
42
|
+
m.add_function(wrap_pyfunction!(_weighted_levenshtein_distance, m)?)?;
|
50
43
|
Ok(())
|
51
44
|
}
|
@@ -1,4 +1,3 @@
|
|
1
|
-
use once_cell::sync::Lazy;
|
2
1
|
use smallvec::SmallVec;
|
3
2
|
use std::collections::HashMap;
|
4
3
|
|
@@ -14,15 +13,18 @@ pub struct OcrCostMap {
|
|
14
13
|
impl OcrCostMap {
|
15
14
|
/// Creates a new OcrCostMap with specified costs.
|
16
15
|
/// Ensures symmetry by adding both (a, b) and (b, a) if only one is provided.
|
16
|
+
/// If symmetric, keys are inserted in both directions.
|
17
17
|
pub fn new(
|
18
18
|
custom_costs_input: HashMap<(char, char), f64>,
|
19
19
|
default_substitution_cost: f64,
|
20
|
+
symmetric: bool,
|
20
21
|
) -> Self {
|
21
22
|
let mut costs = HashMap::with_capacity(custom_costs_input.len() * 2); // Pre-allocate
|
22
23
|
for ((c1, c2), cost) in custom_costs_input {
|
23
|
-
// Ensure symmetry and avoid overwriting if both orders are present
|
24
24
|
costs.entry((c1, c2)).or_insert(cost);
|
25
|
-
|
25
|
+
if symmetric {
|
26
|
+
costs.entry((c2, c1)).or_insert(cost);
|
27
|
+
}
|
26
28
|
}
|
27
29
|
|
28
30
|
OcrCostMap {
|
@@ -32,7 +34,7 @@ impl OcrCostMap {
|
|
32
34
|
}
|
33
35
|
|
34
36
|
/// Gets the substitution cost between two characters.
|
35
|
-
/// Checks the custom map
|
37
|
+
/// Checks the custom map first, then falls back to the
|
36
38
|
/// default substitution cost configured within this map instance.
|
37
39
|
pub fn get_substitution_cost(&self, c1: char, c2: char) -> f64 {
|
38
40
|
if c1 == c2 {
|
@@ -48,35 +50,6 @@ impl OcrCostMap {
|
|
48
50
|
}
|
49
51
|
}
|
50
52
|
|
51
|
-
impl Default for OcrCostMap {
|
52
|
-
fn default() -> Self {
|
53
|
-
DEFAULT_OCR_COST_MAP.clone()
|
54
|
-
}
|
55
|
-
}
|
56
|
-
|
57
|
-
// --- Default OCR Map Initialization (Immutable HashMap) ---
|
58
|
-
|
59
|
-
// Define the costs as a static array of tuples
|
60
|
-
const DEFAULT_OCR_PAIRS: &[((char, char), f64)] = &[
|
61
|
-
(('G', '6'), 0.2),
|
62
|
-
(('O', '0'), 0.2),
|
63
|
-
(('o', '0'), 0.2),
|
64
|
-
(('l', '1'), 0.2),
|
65
|
-
(('I', '1'), 0.2),
|
66
|
-
(('2', 'Z'), 0.2),
|
67
|
-
(('B', '8'), 0.2),
|
68
|
-
(('S', '5'), 0.3),
|
69
|
-
(('s', '5'), 0.3),
|
70
|
-
(('E', 'F'), 0.8),
|
71
|
-
];
|
72
|
-
|
73
|
-
// Use Lazy and collect from the static array for initialization
|
74
|
-
static DEFAULT_OCR_COST_MAP: Lazy<OcrCostMap> = Lazy::new(|| {
|
75
|
-
// Collect the static array into a HashMap directly
|
76
|
-
let ocr_costs: HashMap<(char, char), f64> = DEFAULT_OCR_PAIRS.iter().copied().collect();
|
77
|
-
OcrCostMap::new(ocr_costs, 1.0)
|
78
|
-
});
|
79
|
-
|
80
53
|
// Helper to create a range vector with f64 values
|
81
54
|
fn range_vec_f64(size: usize) -> SmallVec<[f64; 16]> {
|
82
55
|
let mut vec = SmallVec::with_capacity(size);
|
@@ -86,11 +59,6 @@ fn range_vec_f64(size: usize) -> SmallVec<[f64; 16]> {
|
|
86
59
|
vec
|
87
60
|
}
|
88
61
|
|
89
|
-
/// Calculates Levenshtein distance between two vectors using custom costs.
|
90
|
-
pub fn vec_custom_levenshtein_distance(v1: &[char], v2: &[char]) -> f64 {
|
91
|
-
vec_custom_levenshtein_distance_with_cost_map(v1, v2, &OcrCostMap::default())
|
92
|
-
}
|
93
|
-
|
94
62
|
/// Calculates Levenshtein distance between two vectors using a specified cost map.
|
95
63
|
pub fn vec_custom_levenshtein_distance_with_cost_map(
|
96
64
|
v1: &[char],
|
@@ -131,18 +99,6 @@ pub fn vec_custom_levenshtein_distance_with_cost_map(
|
|
131
99
|
cur[cols - 1]
|
132
100
|
}
|
133
101
|
|
134
|
-
/// Calculates custom Levenshtein distance between two strings using OCR cost map.
|
135
|
-
pub fn custom_levenshtein_distance(s1: &str, s2: &str) -> f64 {
|
136
|
-
if s1 == s2 {
|
137
|
-
return 0.0;
|
138
|
-
}
|
139
|
-
|
140
|
-
let v1: Vec<char> = s1.chars().collect();
|
141
|
-
let v2: Vec<char> = s2.chars().collect();
|
142
|
-
|
143
|
-
vec_custom_levenshtein_distance(&v1, &v2)
|
144
|
-
}
|
145
|
-
|
146
102
|
/// Calculates custom Levenshtein distance between two strings using a provided cost map.
|
147
103
|
pub fn custom_levenshtein_distance_with_cost_map(s1: &str, s2: &str, cost_map: &OcrCostMap) -> f64 {
|
148
104
|
if s1 == s2 {
|
@@ -169,31 +125,11 @@ mod test {
|
|
169
125
|
);
|
170
126
|
}
|
171
127
|
|
172
|
-
#[test]
|
173
|
-
fn test_custom_levenshtein_simple() {
|
174
|
-
assert_approx_eq(custom_levenshtein_distance("abc", "axc"), 1.0, 1e-9);
|
175
|
-
assert_approx_eq(custom_levenshtein_distance("abc", "ac"), 1.0, 1e-9);
|
176
|
-
assert_approx_eq(custom_levenshtein_distance("ac", "abc"), 1.0, 1e-9);
|
177
|
-
}
|
178
|
-
|
179
|
-
#[test]
|
180
|
-
fn test_custom_levenshtein_ocr_pairs() {
|
181
|
-
assert_approx_eq(custom_levenshtein_distance("ABCDEFG", "ABCDEF6"), 0.2, 1e-9);
|
182
|
-
|
183
|
-
assert_approx_eq(custom_levenshtein_distance("ABCDEF6", "ABCDEFG"), 0.2, 1e-9);
|
184
|
-
|
185
|
-
assert_approx_eq(
|
186
|
-
custom_levenshtein_distance("ABCDEFG", "ABCDEF6X"),
|
187
|
-
0.2 + 1.0,
|
188
|
-
1e-9,
|
189
|
-
);
|
190
|
-
}
|
191
|
-
|
192
128
|
#[test]
|
193
129
|
fn test_custom_levenshtein_with_custom_map() {
|
194
130
|
let mut custom_costs = HashMap::new();
|
195
131
|
custom_costs.insert(('a', 'b'), 0.1);
|
196
|
-
let cost_map = OcrCostMap::new(custom_costs, 1.0);
|
132
|
+
let cost_map = OcrCostMap::new(custom_costs, 1.0, true);
|
197
133
|
|
198
134
|
assert_approx_eq(
|
199
135
|
custom_levenshtein_distance_with_cost_map("abc", "bbc", &cost_map),
|
ocr_stringdist-0.0.1/PKG-INFO
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: ocr_stringdist
|
3
|
-
Version: 0.0.1
|
4
|
-
Classifier: Programming Language :: Rust
|
5
|
-
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
6
|
-
Classifier: Operating System :: OS Independent
|
7
|
-
Summary: String distances considering OCR errors.
|
8
|
-
Author: Niklas von Moers <niklasvmoers@protonmail.com>
|
9
|
-
Author-email: Niklas von Moers <niklasvmoers@protonmail.com>
|
10
|
-
License: MIT
|
11
|
-
Requires-Python: >=3.9
|
12
|
-
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
13
|
-
Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
|
14
|
-
|
15
|
-
# OCR-Stringdist
|
16
|
-
|
ocr_stringdist-0.0.1/README.md
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
# OCR-Stringdist
|
ocr_stringdist-0.0.1/example.py
DELETED
@@ -1,50 +0,0 @@
|
|
1
|
-
from ocr_stringdist import (
|
2
|
-
ocr_weighted_levenshtein_distance,
|
3
|
-
custom_weighted_levenshtein_distance,
|
4
|
-
)
|
5
|
-
|
6
|
-
# Example with default OCR cost map
|
7
|
-
print("Using default OCR cost map:")
|
8
|
-
default_result = ocr_weighted_levenshtein_distance("12345G", "123456")
|
9
|
-
print(f"Distance between '12345G' and '123456': {default_result}")
|
10
|
-
|
11
|
-
# Example with custom cost map
|
12
|
-
custom_cost_map: dict[tuple[str, str], float] = {
|
13
|
-
("G", "6"): 0.1, # Make G/6 even more similar (default is 0.2)
|
14
|
-
("A", "B"): 0.3, # Make A/B somewhat similar
|
15
|
-
("X", "Y"): 0.5, # Make X/Y moderately similar
|
16
|
-
}
|
17
|
-
|
18
|
-
print("\nUsing custom cost map:")
|
19
|
-
custom_result = custom_weighted_levenshtein_distance(
|
20
|
-
"12345G", "123456", custom_cost_map
|
21
|
-
)
|
22
|
-
print(f"Distance between '12345G' and '123456' with custom map: {custom_result}")
|
23
|
-
|
24
|
-
# Example with custom default cost
|
25
|
-
print("\nUsing custom default cost:")
|
26
|
-
custom_default_result = custom_weighted_levenshtein_distance(
|
27
|
-
"ABCDE",
|
28
|
-
"XBCDE",
|
29
|
-
cost_map={("A", "X"): 0.5},
|
30
|
-
default_cost=0.8, # Lower default substitution cost (default is 1.0)
|
31
|
-
)
|
32
|
-
print(
|
33
|
-
f"Distance between 'ABCDE' and 'XBCDE' with custom default cost: {custom_default_result}"
|
34
|
-
)
|
35
|
-
|
36
|
-
# More complex example - comparing names with custom costs for similar looking characters
|
37
|
-
name_cost_map = {
|
38
|
-
("O", "0"): 0.1, # Letter O and number 0
|
39
|
-
("l", "1"): 0.1, # Lowercase L and number 1
|
40
|
-
("I", "1"): 0.1, # Uppercase I and number 1
|
41
|
-
("S", "5"): 0.2, # Letter S and number 5
|
42
|
-
("Z", "2"): 0.2, # Letter Z and number 2
|
43
|
-
("B", "8"): 0.2, # Letter B and number 8
|
44
|
-
}
|
45
|
-
|
46
|
-
print("\nComparing names with OCR-like errors:")
|
47
|
-
name1 = "ROBERT"
|
48
|
-
name2 = "R0BERT" # Using 0 instead of O
|
49
|
-
distance = custom_weighted_levenshtein_distance(name1, name2, name_cost_map)
|
50
|
-
print(f"Distance between '{name1}' and '{name2}': {distance}")
|
@@ -1 +0,0 @@
|
|
1
|
-
from ._rust_stringdist import *
|
@@ -1,8 +0,0 @@
|
|
1
|
-
def levenshtein_distance(s1: str, s2: str) -> int: ...
|
2
|
-
def ocr_weighted_levenshtein_distance(s1: str, s2: str) -> float: ...
|
3
|
-
def custom_weighted_levenshtein_distance(
|
4
|
-
s1: str,
|
5
|
-
s2: str,
|
6
|
-
cost_map: dict[tuple[str, str], float],
|
7
|
-
default_cost: float | None = None
|
8
|
-
) -> float: ...
|
ocr_stringdist-0.0.1/src/lib.rs
DELETED
@@ -1,11 +0,0 @@
|
|
1
|
-
mod weighted_levenshtein;
|
2
|
-
|
3
|
-
pub use weighted_levenshtein::{
|
4
|
-
custom_levenshtein_distance, custom_levenshtein_distance_with_cost_map,
|
5
|
-
vec_custom_levenshtein_distance, vec_custom_levenshtein_distance_with_cost_map, OcrCostMap,
|
6
|
-
};
|
7
|
-
|
8
|
-
#[cfg(feature = "python")]
|
9
|
-
mod rust_stringdist;
|
10
|
-
#[cfg(feature = "python")]
|
11
|
-
pub use rust_stringdist::_rust_stringdist;
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|