ocr-stringdist 0.0.6__tar.gz → 0.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/Cargo.lock +1 -1
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/Cargo.toml +1 -1
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/PKG-INFO +1 -2
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/README.md +0 -1
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/docs/source/api/index.rst +4 -3
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/examples/batch_processing.py +2 -14
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/examples/weighted_levenshtein.py +0 -1
- ocr_stringdist-0.0.7/python/ocr_stringdist/__init__.py +10 -0
- ocr_stringdist-0.0.6/python/ocr_stringdist/__init__.py → ocr_stringdist-0.0.7/python/ocr_stringdist/levenshtein.py +6 -33
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/src/lib.rs +2 -0
- ocr_stringdist-0.0.7/src/longest_tokens.rs +48 -0
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/src/rust_stringdist.rs +9 -9
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/src/weighted_levenshtein.rs +1 -1
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/tests/test_batch_functions.py +7 -17
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/tests/test_ocr_stringdist.py +7 -13
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/.github/workflows/CI.yml +0 -0
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/.github/workflows/docs.yml +0 -0
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/.gitignore +0 -0
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/Justfile +0 -0
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/LICENSE +0 -0
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/docs/Makefile +0 -0
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/docs/make.bat +0 -0
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/docs/source/conf.py +0 -0
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/docs/source/index.rst +0 -0
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/mypy.ini +0 -0
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/pyproject.toml +0 -0
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/python/ocr_stringdist/default_ocr_distances.py +0 -0
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/python/ocr_stringdist/matching.py +0 -0
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/python/ocr_stringdist/py.typed +0 -0
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/ruff.toml +0 -0
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/tests/test_matching.py +0 -0
- {ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/uv.lock +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: ocr_stringdist
|
3
|
-
Version: 0.0.6
|
3
|
+
Version: 0.0.7
|
4
4
|
Classifier: Programming Language :: Rust
|
5
5
|
Classifier: Programming Language :: Python
|
6
6
|
Classifier: Operating System :: OS Independent
|
@@ -60,7 +60,6 @@ distance = osd.weighted_levenshtein_distance(
|
|
60
60
|
"hi", "Ini",
|
61
61
|
cost_map=custom_map,
|
62
62
|
symmetric=True,
|
63
|
-
max_token_characters=2,
|
64
63
|
)
|
65
64
|
print(f"Distance with custom map: {distance}")
|
66
65
|
```
|
@@ -5,9 +5,10 @@ API Reference
|
|
5
5
|
|
6
6
|
This page contains the auto-generated API reference documentation.
|
7
7
|
|
8
|
-
..
|
9
|
-
|
10
|
-
|
8
|
+
.. automodule:: ocr_stringdist.levenshtein
|
9
|
+
:members:
|
10
|
+
:undoc-members:
|
11
|
+
:show-inheritance:
|
11
12
|
|
12
13
|
.. automodule:: ocr_stringdist.matching
|
13
14
|
:members:
|
@@ -8,8 +8,6 @@ from typing import Any, Callable
|
|
8
8
|
|
9
9
|
import ocr_stringdist as osd
|
10
10
|
|
11
|
-
MAX_TOKEN_CHARACTERS = 1
|
12
|
-
|
13
11
|
|
14
12
|
def benchmark(func: Callable, *args: Any, **kwargs: Any) -> tuple[Any, float]: # type: ignore
|
15
13
|
"""Run a function and return the execution time in seconds."""
|
@@ -32,12 +30,7 @@ def compare_methods() -> None:
|
|
32
30
|
|
33
31
|
# Standard loop approach
|
34
32
|
_, time_loop = benchmark(
|
35
|
-
lambda: [
|
36
|
-
osd.weighted_levenshtein_distance(
|
37
|
-
source, cand, max_token_characters=MAX_TOKEN_CHARACTERS
|
38
|
-
)
|
39
|
-
for cand in candidates
|
40
|
-
]
|
33
|
+
lambda: [osd.weighted_levenshtein_distance(source, cand) for cand in candidates]
|
41
34
|
)
|
42
35
|
print(
|
43
36
|
f"Loop of single calls: {time_loop:.6f} seconds "
|
@@ -45,12 +38,7 @@ def compare_methods() -> None:
|
|
45
38
|
)
|
46
39
|
|
47
40
|
# Batch approach
|
48
|
-
_, time_batch = benchmark(
|
49
|
-
osd.batch_weighted_levenshtein_distance,
|
50
|
-
source,
|
51
|
-
candidates,
|
52
|
-
max_token_characters=MAX_TOKEN_CHARACTERS,
|
53
|
-
)
|
41
|
+
_, time_batch = benchmark(osd.batch_weighted_levenshtein_distance, source, candidates)
|
54
42
|
print(
|
55
43
|
f"Batch function: {time_batch:.6f} seconds "
|
56
44
|
f"({1000 * time_batch / len(candidates):.6f}ms each)"
|
@@ -0,0 +1,10 @@
|
|
1
|
+
from .default_ocr_distances import ocr_distance_map
|
2
|
+
from .levenshtein import batch_weighted_levenshtein_distance, weighted_levenshtein_distance
|
3
|
+
from .matching import find_best_candidate
|
4
|
+
|
5
|
+
__all__ = [
|
6
|
+
"ocr_distance_map",
|
7
|
+
"weighted_levenshtein_distance",
|
8
|
+
"batch_weighted_levenshtein_distance",
|
9
|
+
"find_best_candidate",
|
10
|
+
]
|
@@ -2,14 +2,6 @@ from typing import Optional
|
|
2
2
|
|
3
3
|
from ._rust_stringdist import * # noqa: F403
|
4
4
|
from .default_ocr_distances import ocr_distance_map
|
5
|
-
from .matching import find_best_candidate
|
6
|
-
|
7
|
-
__all__ = [
|
8
|
-
"ocr_distance_map",
|
9
|
-
"weighted_levenshtein_distance",
|
10
|
-
"batch_weighted_levenshtein_distance",
|
11
|
-
"find_best_candidate",
|
12
|
-
]
|
13
5
|
|
14
6
|
|
15
7
|
def weighted_levenshtein_distance(
|
@@ -20,7 +12,6 @@ def weighted_levenshtein_distance(
|
|
20
12
|
*,
|
21
13
|
symmetric: bool = True,
|
22
14
|
default_cost: float = 1.0,
|
23
|
-
max_token_characters: int = 1,
|
24
15
|
) -> float:
|
25
16
|
"""
|
26
17
|
Levenshtein distance with custom substitution costs.
|
@@ -34,26 +25,16 @@ def weighted_levenshtein_distance(
|
|
34
25
|
:param cost_map: Dictionary mapping tuples of strings ("substitution tokens") to their
|
35
26
|
substitution costs.
|
36
27
|
Only one direction needs to be configured unless `symmetric` is False.
|
37
|
-
Note that
|
38
|
-
have more than one character, for example when substituting "w" for "vv".
|
28
|
+
Note that the runtime scales in the length of the longest substitution token.
|
39
29
|
Defaults to `ocr_stringdist.ocr_distance_map`.
|
40
30
|
:param symmetric: Should the keys of `cost_map` be considered to be symmetric? Defaults to True.
|
41
31
|
:param default_cost: The default substitution cost for character pairs not found in `cost_map`.
|
42
|
-
:param max_token_characters: A positive integer, indicating the maximum number of characters a
|
43
|
-
substitution token in `cost_map` may have. The default 1 indicates
|
44
|
-
that only single characters can be substituted for each other.
|
45
|
-
Higher values lead to slower calculations.
|
46
32
|
"""
|
47
33
|
if cost_map is None:
|
48
34
|
cost_map = ocr_distance_map
|
49
35
|
# _weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
|
50
36
|
return _weighted_levenshtein_distance( # type: ignore # noqa: F405
|
51
|
-
s1,
|
52
|
-
s2,
|
53
|
-
cost_map=cost_map,
|
54
|
-
symmetric=symmetric,
|
55
|
-
default_cost=default_cost,
|
56
|
-
max_token_characters=max_token_characters,
|
37
|
+
s1, s2, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
|
57
38
|
)
|
58
39
|
|
59
40
|
|
@@ -65,7 +46,6 @@ def batch_weighted_levenshtein_distance(
|
|
65
46
|
*,
|
66
47
|
symmetric: bool = True,
|
67
48
|
default_cost: float = 1.0,
|
68
|
-
max_token_characters: int = 1,
|
69
49
|
) -> list[float]:
|
70
50
|
"""
|
71
51
|
Calculate weighted Levenshtein distances between a string and multiple candidates.
|
@@ -74,25 +54,18 @@ def batch_weighted_levenshtein_distance(
|
|
74
54
|
|
75
55
|
:param s: The string to compare
|
76
56
|
:param candidates: List of candidate strings to compare against
|
77
|
-
:param cost_map: Dictionary mapping tuples of
|
57
|
+
:param cost_map: Dictionary mapping tuples of strings ("substitution tokens") to their
|
58
|
+
substitution costs.
|
78
59
|
Only one direction needs to be configured unless `symmetric` is False.
|
60
|
+
Note that the runtime scales in the length of the longest substitution token.
|
79
61
|
Defaults to `ocr_stringdist.ocr_distance_map`.
|
80
62
|
:param symmetric: Should the keys of `cost_map` be considered to be symmetric? Defaults to True.
|
81
63
|
:param default_cost: The default substitution cost for character pairs not found in `cost_map`.
|
82
|
-
:param max_token_characters: A positive integer, indicating the maximum number of characters a
|
83
|
-
substitution token in `cost_map` may have. The default 1 indicates
|
84
|
-
that only single characters can be substituted for each other.
|
85
|
-
Higher values lead to slower calculations.
|
86
64
|
:return: A list of distances corresponding to each candidate
|
87
65
|
"""
|
88
66
|
if cost_map is None:
|
89
67
|
cost_map = ocr_distance_map
|
90
68
|
# _batch_weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
|
91
69
|
return _batch_weighted_levenshtein_distance( # type: ignore # noqa: F405
|
92
|
-
s,
|
93
|
-
candidates,
|
94
|
-
cost_map=cost_map,
|
95
|
-
symmetric=symmetric,
|
96
|
-
default_cost=default_cost,
|
97
|
-
max_token_characters=max_token_characters,
|
70
|
+
s, candidates, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
|
98
71
|
)
|
@@ -0,0 +1,48 @@
|
|
1
|
+
use std::collections::HashMap;
|
2
|
+
|
3
|
+
/// Calculates the length of the longest string found within the key tuples of a HashMap.
|
4
|
+
pub fn longest_key_string_length<V>(map: &HashMap<(String, String), V>) -> usize {
|
5
|
+
map.keys()
|
6
|
+
.flat_map(|(s1, s2)| [s1.len(), s2.len()].into_iter())
|
7
|
+
.max()
|
8
|
+
.unwrap_or(1)
|
9
|
+
}
|
10
|
+
|
11
|
+
#[cfg(test)]
|
12
|
+
mod tests {
|
13
|
+
use super::*;
|
14
|
+
|
15
|
+
#[test]
|
16
|
+
fn test_longest_key_string_length_basic() {
|
17
|
+
let mut map = HashMap::new();
|
18
|
+
map.insert(("apple".to_string(), "banana".to_string()), 1); // 5, 6
|
19
|
+
map.insert(("kiwi".to_string(), "grapefruit".to_string()), 2); // 4, 10
|
20
|
+
map.insert(("short".to_string(), "tiny".to_string()), 3); // 5, 4
|
21
|
+
|
22
|
+
assert_eq!(longest_key_string_length(&map), 10); // "grapefruit"
|
23
|
+
}
|
24
|
+
|
25
|
+
#[test]
|
26
|
+
fn test_longest_key_string_length_first_element() {
|
27
|
+
let mut map = HashMap::new();
|
28
|
+
map.insert(("a_very_long_string".to_string(), "short".to_string()), 1); // 18, 5
|
29
|
+
map.insert(("medium".to_string(), "small".to_string()), 2); // 6, 5
|
30
|
+
|
31
|
+
assert_eq!(longest_key_string_length(&map), 18);
|
32
|
+
}
|
33
|
+
|
34
|
+
#[test]
|
35
|
+
fn test_longest_key_string_length_empty_map() {
|
36
|
+
let map: HashMap<(String, String), bool> = HashMap::new();
|
37
|
+
assert_eq!(longest_key_string_length(&map), 1);
|
38
|
+
}
|
39
|
+
|
40
|
+
#[test]
|
41
|
+
fn test_longest_key_string_length_empty_strings() {
|
42
|
+
let mut map = HashMap::new();
|
43
|
+
map.insert(("".to_string(), "".to_string()), 1);
|
44
|
+
map.insert(("a".to_string(), "".to_string()), 2);
|
45
|
+
|
46
|
+
assert_eq!(longest_key_string_length(&map), 1);
|
47
|
+
}
|
48
|
+
}
|
@@ -1,4 +1,5 @@
|
|
1
1
|
use crate::custom_levenshtein_distance_with_cost_map as _weighted_lev_with_map;
|
2
|
+
use crate::longest_key_string_length;
|
2
3
|
use crate::OcrCostMap;
|
3
4
|
use pyo3::prelude::*;
|
4
5
|
use pyo3::types::PyDict;
|
@@ -6,42 +7,41 @@ use rayon::prelude::*;
|
|
6
7
|
|
7
8
|
// Calculates the weighted Levenshtein distance with a custom cost map from Python.
|
8
9
|
#[pyfunction]
|
9
|
-
#[pyo3(signature = (a, b, cost_map, symmetric = true, default_cost = 1.0
|
10
|
+
#[pyo3(signature = (a, b, cost_map, symmetric = true, default_cost = 1.0))]
|
10
11
|
fn _weighted_levenshtein_distance(
|
11
12
|
a: &str,
|
12
13
|
b: &str,
|
13
14
|
cost_map: &Bound<'_, PyDict>,
|
14
15
|
symmetric: bool,
|
15
16
|
default_cost: f64,
|
16
|
-
max_token_characters: usize,
|
17
17
|
) -> PyResult<f64> {
|
18
|
+
let ocr_cost_map = OcrCostMap::from_py_dict(cost_map, default_cost, symmetric);
|
19
|
+
let max_token_characters = longest_key_string_length(&ocr_cost_map.costs);
|
18
20
|
Ok(_weighted_lev_with_map(
|
19
21
|
a,
|
20
22
|
b,
|
21
|
-
&
|
23
|
+
&ocr_cost_map,
|
22
24
|
max_token_characters,
|
23
25
|
))
|
24
26
|
}
|
25
27
|
|
26
28
|
// Calculates the weighted Levenshtein distance between a string and a list of candidates.
|
27
29
|
#[pyfunction]
|
28
|
-
#[pyo3(signature = (s, candidates, cost_map, symmetric = true, default_cost = 1.0
|
30
|
+
#[pyo3(signature = (s, candidates, cost_map, symmetric = true, default_cost = 1.0))]
|
29
31
|
fn _batch_weighted_levenshtein_distance(
|
30
32
|
s: &str,
|
31
33
|
candidates: Vec<String>,
|
32
34
|
cost_map: &Bound<'_, PyDict>,
|
33
35
|
symmetric: bool,
|
34
36
|
default_cost: f64,
|
35
|
-
max_token_characters: usize,
|
36
37
|
) -> PyResult<Vec<f64>> {
|
37
|
-
let custom_cost_map = OcrCostMap::from_py_dict(cost_map, default_cost, symmetric);
|
38
|
+
let ocr_cost_map = OcrCostMap::from_py_dict(cost_map, default_cost, symmetric);
|
39
|
+
let max_token_characters = longest_key_string_length(&ocr_cost_map.costs);
|
38
40
|
|
39
41
|
// Calculate distances for each candidate in parallel
|
40
42
|
let distances: Vec<f64> = candidates
|
41
43
|
.par_iter()
|
42
|
-
.map(|candidate| {
|
43
|
-
_weighted_lev_with_map(s, candidate, &custom_cost_map, max_token_characters)
|
44
|
-
})
|
44
|
+
.map(|candidate| _weighted_lev_with_map(s, candidate, &ocr_cost_map, max_token_characters))
|
45
45
|
.collect();
|
46
46
|
|
47
47
|
Ok(distances)
|
@@ -6,7 +6,7 @@ use pyo3::prelude::*;
|
|
6
6
|
#[derive(Clone, Debug)]
|
7
7
|
pub struct OcrCostMap {
|
8
8
|
/// Maps pairs of strings to their specific substitution cost.
|
9
|
-
costs: HashMap<(String, String), f64>,
|
9
|
+
pub costs: HashMap<(String, String), f64>,
|
10
10
|
/// Default cost for substitutions not found in the map.
|
11
11
|
default_substitution_cost: f64,
|
12
12
|
}
|
@@ -40,14 +40,12 @@ def test_batch_vs_individual(
|
|
40
40
|
"""Test that batch results match individual function calls."""
|
41
41
|
# Individual results
|
42
42
|
individual_results = [
|
43
|
-
weighted_levenshtein_distance(source, candidate, cost_map=cost_map, max_token_characters=2)
|
43
|
+
weighted_levenshtein_distance(source, candidate, cost_map=cost_map)
|
44
44
|
for candidate in candidates
|
45
45
|
]
|
46
46
|
|
47
47
|
# Batch results
|
48
|
-
batch_results = batch_weighted_levenshtein_distance(
|
49
|
-
source, candidates, cost_map=cost_map, max_token_characters=2
|
50
|
-
)
|
48
|
+
batch_results = batch_weighted_levenshtein_distance(source, candidates, cost_map=cost_map)
|
51
49
|
|
52
50
|
# Compare results
|
53
51
|
for ind, batch in zip(individual_results, batch_results):
|
@@ -79,9 +77,7 @@ def test_batch_finds_best_match(
|
|
79
77
|
) -> None:
|
80
78
|
"""Test that batch processing correctly identifies the best match."""
|
81
79
|
# Using OCR cost map
|
82
|
-
distances = batch_weighted_levenshtein_distance(
|
83
|
-
source, candidates, cost_map=OCR_COST_MAP, max_token_characters=2
|
84
|
-
)
|
80
|
+
distances = batch_weighted_levenshtein_distance(source, candidates, cost_map=OCR_COST_MAP)
|
85
81
|
print(f"------------------------------------distances: {distances}")
|
86
82
|
|
87
83
|
# Find the index with minimum distance
|
@@ -101,14 +97,12 @@ def test_batch_finds_best_match(
|
|
101
97
|
)
|
102
98
|
def test_custom_cost_map(test_string: str, expected_distance: float) -> None:
|
103
99
|
"""Test using a custom cost map for specific substitution costs."""
|
104
|
-
result = weighted_levenshtein_distance(
|
105
|
-
"hello", test_string, cost_map=OCR_COST_MAP, max_token_characters=2
|
106
|
-
)
|
100
|
+
result = weighted_levenshtein_distance("hello", test_string, cost_map=OCR_COST_MAP)
|
107
101
|
assert result == pytest.approx(expected_distance)
|
108
102
|
|
109
103
|
# Check that batch processing gives the same result
|
110
104
|
batch_result = batch_weighted_levenshtein_distance(
|
111
|
-
"hello", [test_string], cost_map=OCR_COST_MAP
|
105
|
+
"hello", [test_string], cost_map=OCR_COST_MAP
|
112
106
|
)[0]
|
113
107
|
assert batch_result == pytest.approx(expected_distance)
|
114
108
|
|
@@ -126,15 +120,11 @@ def test_empty_vs_default_cost_map(
|
|
126
120
|
) -> None:
|
127
121
|
"""Test that empty cost maps produce different results than default cost maps."""
|
128
122
|
# With empty cost map (all costs are 1.0)
|
129
|
-
default_result = batch_weighted_levenshtein_distance(
|
130
|
-
string1, [string2], cost_map={}, max_token_characters=2
|
131
|
-
)
|
123
|
+
default_result = batch_weighted_levenshtein_distance(string1, [string2], cost_map={})
|
132
124
|
assert default_result[0] == pytest.approx(default_map_distance)
|
133
125
|
|
134
126
|
# With custom cost map (OCR-specific costs)
|
135
|
-
custom_result = batch_weighted_levenshtein_distance(
|
136
|
-
string1, [string2], cost_map=OCR_COST_MAP, max_token_characters=2
|
137
|
-
)
|
127
|
+
custom_result = batch_weighted_levenshtein_distance(string1, [string2], cost_map=OCR_COST_MAP)
|
138
128
|
assert custom_result[0] == pytest.approx(custom_map_distance)
|
139
129
|
|
140
130
|
# Custom map should give lower distance for OCR errors
|
@@ -35,9 +35,7 @@ from ocr_stringdist import weighted_levenshtein_distance
|
|
35
35
|
def test_weighted_levenshtein_distance(
|
36
36
|
s1: str, s2: str, cost_map: dict[tuple[str, str], float], expected: float
|
37
37
|
) -> None:
|
38
|
-
assert weighted_levenshtein_distance(
|
39
|
-
s1, s2, cost_map=cost_map, max_token_characters=3
|
40
|
-
) == pytest.approx(expected)
|
38
|
+
assert weighted_levenshtein_distance(s1, s2, cost_map=cost_map) == pytest.approx(expected)
|
41
39
|
|
42
40
|
|
43
41
|
def test_complex_ocr_substitutions() -> None:
|
@@ -57,12 +55,8 @@ def test_complex_ocr_substitutions() -> None:
|
|
57
55
|
original = "The man ran down the hill at 10 km/h."
|
58
56
|
ocr_result = "Tine rnan ram dovvn tine Ini11 at 1O krn/In."
|
59
57
|
|
60
|
-
distance = weighted_levenshtein_distance(
|
61
|
-
|
62
|
-
)
|
63
|
-
standard_distance = weighted_levenshtein_distance(
|
64
|
-
original, ocr_result, cost_map={}, max_token_characters=3
|
65
|
-
)
|
58
|
+
distance = weighted_levenshtein_distance(original, ocr_result, cost_map=ocr_cost_map)
|
59
|
+
standard_distance = weighted_levenshtein_distance(original, ocr_result, cost_map={})
|
66
60
|
assert standard_distance > distance
|
67
61
|
|
68
62
|
|
@@ -83,7 +77,7 @@ def test_asymmetric_substitution_costs(s1: str, s2: str, expected: float) -> Non
|
|
83
77
|
("S", "5"): 0.6,
|
84
78
|
}
|
85
79
|
assert weighted_levenshtein_distance(
|
86
|
-
s1, s2, cost_map=asymmetric_cost_map, symmetric=False
|
80
|
+
s1, s2, cost_map=asymmetric_cost_map, symmetric=False
|
87
81
|
) == pytest.approx(expected)
|
88
82
|
|
89
83
|
|
@@ -107,6 +101,6 @@ def test_nested_substitution_patterns(s1: str, s2: str, expected: float) -> None
|
|
107
101
|
("abc", "d"): 0.3,
|
108
102
|
("d", "abc"): 0.3,
|
109
103
|
}
|
110
|
-
assert weighted_levenshtein_distance(
|
111
|
-
|
112
|
-
)
|
104
|
+
assert weighted_levenshtein_distance(s1, s2, cost_map=nested_cost_map) == pytest.approx(
|
105
|
+
expected
|
106
|
+
)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{ocr_stringdist-0.0.6 → ocr_stringdist-0.0.7}/python/ocr_stringdist/default_ocr_distances.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|