ocr-stringdist 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/CHANGELOG.md +7 -0
  2. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/Cargo.lock +1 -1
  3. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/Cargo.toml +1 -1
  4. ocr_stringdist-0.2.0/PKG-INFO +9 -0
  5. ocr_stringdist-0.2.0/README.md +69 -0
  6. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/docs/source/api/index.rst +5 -3
  7. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/docs/source/conf.py +1 -0
  8. ocr_stringdist-0.2.0/docs/source/examples.rst +98 -0
  9. ocr_stringdist-0.2.0/docs/source/getting-started.rst +27 -0
  10. ocr_stringdist-0.2.0/docs/source/index.rst +57 -0
  11. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/examples/batch_processing.py +5 -4
  12. ocr_stringdist-0.2.0/examples/explain_distance.py +23 -0
  13. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/pyproject.toml +1 -0
  14. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/python/ocr_stringdist/__init__.py +8 -1
  15. ocr_stringdist-0.2.0/python/ocr_stringdist/levenshtein.py +242 -0
  16. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/ruff.toml +3 -1
  17. ocr_stringdist-0.2.0/src/explanation.rs +28 -0
  18. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/src/lib.rs +4 -1
  19. ocr_stringdist-0.2.0/src/rust_stringdist.rs +235 -0
  20. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/src/weighted_levenshtein.rs +265 -275
  21. ocr_stringdist-0.2.0/tests/test_explain_weighted_levenshtein.py +51 -0
  22. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/tests/test_weighted_levenshtein.py +11 -1
  23. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/uv.lock +16 -0
  24. ocr_stringdist-0.1.0/PKG-INFO +0 -85
  25. ocr_stringdist-0.1.0/README.md +0 -69
  26. ocr_stringdist-0.1.0/docs/source/examples.rst +0 -20
  27. ocr_stringdist-0.1.0/docs/source/getting-started.rst +0 -10
  28. ocr_stringdist-0.1.0/docs/source/index.rst +0 -36
  29. ocr_stringdist-0.1.0/python/ocr_stringdist/levenshtein.py +0 -118
  30. ocr_stringdist-0.1.0/src/rust_stringdist.rs +0 -135
  31. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/.github/workflows/CI.yml +0 -0
  32. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/.github/workflows/docs.yml +0 -0
  33. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/.gitignore +0 -0
  34. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/Justfile +0 -0
  35. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/LICENSE +0 -0
  36. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/docs/Makefile +0 -0
  37. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/docs/make.bat +0 -0
  38. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/docs/source/changelog.rst +0 -0
  39. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/examples/weighted_levenshtein.py +0 -0
  40. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/mypy.ini +0 -0
  41. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/python/ocr_stringdist/default_ocr_distances.py +0 -0
  42. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/python/ocr_stringdist/matching.py +0 -0
  43. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/python/ocr_stringdist/py.typed +0 -0
  44. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/src/cost_map.rs +0 -0
  45. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/src/types.rs +0 -0
  46. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/tests/test_batch_weighted_levenshtein.py +0 -0
  47. {ocr_stringdist-0.1.0 → ocr_stringdist-0.2.0}/tests/test_matching.py +0 -0
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.2.0] - 2025-08-31
9
+
10
+ ### Added
11
+
12
+ - `WeightedLevenshtein` class for reusable configuration.
13
+ - Explanation of edit operations via `WeightedLevenshtein.explain` and `explain_weighted_levenshtein`.
14
+
8
15
  ## [0.1.0] - 2025-04-26
9
16
 
10
17
  ### Added
@@ -74,7 +74,7 @@ dependencies = [
74
74
 
75
75
  [[package]]
76
76
  name = "ocr_stringdist"
77
- version = "0.1.0"
77
+ version = "0.2.0"
78
78
  dependencies = [
79
79
  "pyo3",
80
80
  "rayon",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "ocr_stringdist"
3
- version = "0.1.0"
3
+ version = "0.2.0"
4
4
  edition = "2021"
5
5
  description = "String distances considering OCR errors."
6
6
  authors = ["Niklas von Moers <niklasvmoers@protonmail.com>"]
@@ -0,0 +1,9 @@
1
+ Metadata-Version: 2.4
2
+ Name: ocr_stringdist
3
+ Version: 0.2.0
4
+ Classifier: Programming Language :: Rust
5
+ Classifier: Programming Language :: Python
6
+ Classifier: Operating System :: OS Independent
7
+ License-File: LICENSE
8
+ Requires-Python: >=3.9
9
+ Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
@@ -0,0 +1,69 @@
1
+ # OCR-StringDist
2
+
3
+ A Python library for fast string distance calculations that account for common OCR (optical character recognition) errors.
4
+
5
+ Documentation: https://niklasvonm.github.io/ocr-stringdist/
6
+
7
+ [![PyPI](https://img.shields.io/badge/PyPI-Package-blue)](https://pypi.org/project/ocr-stringdist/)
8
+ [![License](https://img.shields.io/badge/License-MIT-green)](LICENSE)
9
+
10
+ ## Overview
11
+
12
+ Standard string distances (like Levenshtein) treat all character substitutions equally. This is suboptimal for text read from images via OCR, where errors like `O` vs `0` are far more common than, say, `O` vs `X`.
13
+
14
+ OCR-StringDist uses a **weighted Levenshtein distance**, assigning lower costs to common OCR errors.
15
+
16
+ **Example:** Matching against the correct word `CODE`:
17
+
18
+ * **Standard Levenshtein:**
19
+ * $d(\text{"CODE"}, \text{"C0DE"}) = 1$ (O → 0)
20
+ * $d(\text{"CODE"}, \text{"CXDE"}) = 1$ (O → X)
21
+ * Result: Both appear equally likely/distant.
22
+
23
+ * **OCR-StringDist (Weighted):**
24
+ * $d(\text{"CODE"}, \text{"C0DE"}) \approx 0.1$ (common error, low cost)
25
+ * $d(\text{"CODE"}, \text{"CXDE"}) = 1.0$ (unlikely error, high cost)
26
+ * Result: Correctly identifies `C0DE` as a much closer match.
27
+
28
+ This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes, database entries).
29
+
30
+ > **Note:** This project is in early development. APIs may change in future releases.
31
+
32
+ ## Installation
33
+
34
+ ```bash
35
+ pip install ocr-stringdist
36
+ ```
37
+
38
+ ## Features
39
+
40
+ - **Weighted Levenshtein Distance**: Calculates Levenshtein distance with customizable costs for substitutions, insertions, and deletions. Includes an efficient batch version (`batch_weighted_levenshtein_distance`) for comparing one string against many candidates.
41
+ - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
42
+ - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
43
+ - **Unicode Support**: Works with arbitrary Unicode strings.
44
+ - **Best Match Finder**: Includes a utility function `find_best_candidate` to efficiently find the best match from a list based on _any_ distance function.
45
+
46
+ ## Usage
47
+
48
+ ### Weighted Levenshtein Distance
49
+
50
+ ```python
51
+ import ocr_stringdist as osd
52
+
53
+ # Using default OCR distance map
54
+ distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
55
+ print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
56
+
57
+ # Custom cost map
58
+ substitution_costs = {("In", "h"): 0.5}
59
+ distance = osd.weighted_levenshtein_distance(
60
+ "hi", "Ini",
61
+ substitution_costs=substitution_costs,
62
+ symmetric_substitution=True,
63
+ )
64
+ print(f"Distance with custom map: {distance}")
65
+ ```
66
+
67
+ ## Acknowledgements
68
+
69
+ This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), which provided the base implementations of the algorithms used here.
@@ -3,10 +3,12 @@
3
3
  API Reference
4
4
  =============
5
5
 
6
- .. automodule:: ocr_stringdist.levenshtein
6
+ .. autoclass:: ocr_stringdist.WeightedLevenshtein
7
7
  :members:
8
- :undoc-members:
9
- :show-inheritance:
8
+
9
+ .. autofunction:: ocr_stringdist.weighted_levenshtein_distance
10
+ .. autofunction:: ocr_stringdist.batch_weighted_levenshtein_distance
11
+ .. autofunction:: ocr_stringdist.explain_weighted_levenshtein
10
12
 
11
13
  .. automodule:: ocr_stringdist.matching
12
14
  :members:
@@ -30,6 +30,7 @@ extensions: list[str] = [
30
30
  "sphinx.ext.intersphinx", # Link to other projects' documentation
31
31
  "sphinx.ext.viewcode", # Add links to source code
32
32
  "sphinx_mdinclude", # Include Markdown
33
+ "sphinx_copybutton", # Add "copy" button to code blocks
33
34
  ]
34
35
 
35
36
  templates_path = ["_templates"]
@@ -0,0 +1,98 @@
1
+ ================
2
+ Usage Examples
3
+ ================
4
+
5
+ Basic Distance Calculation
6
+ ==========================
7
+
8
+ Using the default pre-defined map for common OCR errors:
9
+
10
+ .. code-block:: python
11
+
12
+ import ocr_stringdist as osd
13
+
14
+ # Compare "OCR5" and "OCRS"
15
+ # The default ocr_distance_map gives 'S' <-> '5' a cost of 0.3
16
+ distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
17
+ print(f"Distance between 'OCR5' and 'OCRS' (default map): {distance}")
18
+ # Output: Distance between 'OCR5' and 'OCRS' (default map): 0.3
19
+
20
+ Using Custom Costs
21
+ ==================
22
+
23
+ Define your own substitution costs:
24
+
25
+ .. code-block:: python
26
+
27
+ from ocr_stringdist import WeightedLevenshtein
28
+
29
+ # Define a custom cost for substituting "rn" with "m"
30
+ wl = WeightedLevenshtein(substitution_costs={("rn", "m"): 0.5})
31
+
32
+ distance = wl.distance("Churn Bucket", "Chum Bucket")
33
+ print(f"Distance using custom map: {distance}") # 0.5
34
+
35
+
36
+ Matching OCR Output Against Candidates
37
+ ======================================
38
+
39
+ This is a primary use case: finding the best match for an OCR string from a list of known possibilities.
40
+
41
+ .. code-block:: python
42
+
43
+ import ocr_stringdist as osd
44
+
45
+ ocr_output = "Harnburg" # OCR potentially misread 'm' as 'rn'
46
+ possible_cities = ["Harburg", "Hamburg", "Hannover", "Berlin"]
47
+
48
+ # Define costs relevant to the potential error
49
+ wl = osd.WeightedLevenshtein(substitution_costs={("rn", "m"): 0.2})
50
+
51
+ # Method 1: Using find_best_candidate
52
+ best_match_finder, min_distance_finder = osd.find_best_candidate(
53
+ ocr_output,
54
+ possible_cities,
55
+ distance_fun=wl.distance,
56
+ )
57
+ print(
58
+ f"(find_best_candidate) Best match for '{ocr_output}': '{best_match_finder}' "
59
+ f"(Distance: {min_distance_finder:.2f})"
60
+ )
61
+ # Output: (find_best_candidate) Best match for 'Harnburg': 'Hamburg' (Distance: 0.20)
62
+
63
+
64
+ # Method 2: Using WeightedLevenshtein.batch_distance
65
+ # Generally more efficient when comparing against many candidates.
66
+ distances = wl.batch_distance(ocr_output, possible_cities)
67
+
68
+ min_dist_batch = min(distances)
69
+ best_candidate_batch = possible_cities[distances.index(min_dist_batch)]
70
+
71
+ print(
72
+ f"(Batch) Best match for '{ocr_output}': '{best_candidate_batch}' "
73
+ f"(Distance: {min_dist_batch:.2f})"
74
+ )
75
+ # Output: (Batch) Best match for 'Harnburg': 'Hamburg' (Distance: 0.20)
76
+
77
+ Explaining Edit Operations
78
+ ==========================
79
+
80
+ You can get a detailed list of edit operations needed to transform one string into another.
81
+
82
+ .. code-block:: python
83
+
84
+ from ocr_stringdist import WeightedLevenshtein
85
+
86
+ wl = WeightedLevenshtein(substitution_costs={("日月", "明"): 0.4, ("末", "未"): 0.3})
87
+
88
+ s1 = "末日月" # mò rì yuè
89
+ s2 = "未明" # wèi míng
90
+
91
+ operations = wl.explain(s1, s2)
92
+ print(operations)
93
+
94
+ # Output:
95
+ # [
96
+ # EditOperation(op_type='substitute', source_token='末', target_token='未', cost=0.3),
97
+ # EditOperation(op_type='substitute', source_token='日月', target_token='明', cost=0.4)
98
+ # ]
@@ -0,0 +1,27 @@
1
+ =================
2
+ Getting Started
3
+ =================
4
+
5
+ Installation
6
+ ============
7
+
8
+ .. code-block:: console
9
+
10
+ pip install ocr-stringdist
11
+
12
+ Quick Example
13
+ =============
14
+
15
+ After installation, you can quickly calculate an OCR-aware string distance:
16
+
17
+ .. code-block:: python
18
+
19
+ import ocr_stringdist as osd
20
+
21
+ # Calculate distance using the default OCR error costs
22
+ # ("O" vs "0" has a low cost)
23
+ distance = osd.weighted_levenshtein_distance("HELLO", "HELL0")
24
+
25
+ print(f"The OCR-aware distance is: {distance}")
26
+
27
+ This uses the built-in :data:`ocr_distance_map` which assigns lower costs to common OCR character confusions. See the :doc:`examples` and :doc:`api/index` for more details and customization options.
@@ -0,0 +1,57 @@
1
+ ================
2
+ OCR-StringDist
3
+ ================
4
+
5
+ A Python library for fast string distance calculations that account for common OCR (optical character recognition) errors.
6
+
7
+ :Repository: https://github.com/NiklasvonM/ocr-stringdist
8
+ :Current version: |release|
9
+
10
+ .. image:: https://img.shields.io/badge/PyPI-Package-blue
11
+ :target: https://pypi.org/project/ocr-stringdist/
12
+ :alt: PyPI
13
+
14
+ .. image:: https://img.shields.io/badge/License-MIT-green
15
+ :target: LICENSE
16
+ :alt: License
17
+
18
+ Motivation
19
+ ==========
20
+
21
+ Standard string distances (like Levenshtein) treat all character substitutions equally. This is suboptimal for text read from images via OCR, where errors like `O` vs `0` are far more common than, say, `O` vs `X`.
22
+
23
+ OCR-StringDist uses a **weighted Levenshtein distance**, assigning lower costs to common OCR errors.
24
+
25
+ **Example:** Matching against the correct word `CODE`:
26
+
27
+ * **Standard Levenshtein:**
28
+ * :math:`d(\text{"C0DE"}, \text{"CODE"}) = 1` (0 → O)
29
+ * :math:`d(\text{"CXDE"}, \text{"CODE"}) = 1` (X → O)
30
+ * Result: Both appear equally likely/distant.
31
+
32
+ * **OCR-StringDist (Weighted):**
33
+ * :math:`d(\text{"C0DE"}, \text{"CODE"}) \approx 0.1` (common error, low cost)
34
+ * :math:`d(\text{"CXDE"}, \text{"CODE"}) = 1.0` (unlikely error, high cost)
35
+ * Result: Correctly identifies `C0DE` as a much closer match.
36
+
37
+ This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes, database entries).
38
+
39
+ Features
40
+ ========
41
+
42
+ - **Weighted Levenshtein Distance**: Calculates Levenshtein distance with customizable costs for substitutions, insertions, and deletions. Includes an efficient batch version (`batch_weighted_levenshtein_distance`) for comparing one string against many candidates.
43
+ - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
44
+ - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
45
+ - **Unicode Support**: Works with arbitrary Unicode strings.
46
+ - **Best Match Finder**: Includes a utility function `find_best_candidate` to efficiently find the best match from a list based on _any_ distance function.
47
+
48
+ Contents
49
+ ========
50
+
51
+ .. toctree::
52
+ :maxdepth: 1
53
+
54
+ getting-started
55
+ examples
56
+ api/index
57
+ changelog
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env python3
2
2
  """
3
- Example demonstrating the usage of the batch processing functions from ocr_stringdist.
3
+ Example demonstrating the usage of the batch processing functions.
4
4
  """
5
5
 
6
6
  import time
@@ -28,9 +28,11 @@ def compare_methods() -> None:
28
28
  print("\nSingle string against multiple candidates:")
29
29
  print("-" * 50)
30
30
 
31
+ weighted_levenshtein = osd.WeightedLevenshtein()
32
+
31
33
  # Standard loop approach
32
34
  _, time_loop = benchmark(
33
- lambda: [osd.weighted_levenshtein_distance(source, cand) for cand in candidates]
35
+ lambda: [weighted_levenshtein.distance(source, cand) for cand in candidates]
34
36
  )
35
37
  print(
36
38
  f"Loop of single calls: {time_loop:.6f} seconds "
@@ -38,7 +40,7 @@ def compare_methods() -> None:
38
40
  )
39
41
 
40
42
  # Batch approach
41
- _, time_batch = benchmark(osd.batch_weighted_levenshtein_distance, source, candidates)
43
+ _, time_batch = benchmark(weighted_levenshtein.batch_distance, source, candidates)
42
44
  print(
43
45
  f"Batch function: {time_batch:.6f} seconds "
44
46
  f"({1000 * time_batch / len(candidates):.6f}ms each)"
@@ -47,7 +49,6 @@ def compare_methods() -> None:
47
49
 
48
50
 
49
51
  def main() -> None:
50
- """Main function."""
51
52
  print("Demonstrating batch processing functions from ocr_stringdist\n")
52
53
 
53
54
  # Run the benchmarks
@@ -0,0 +1,23 @@
1
+ from ocr_stringdist import explain_weighted_levenshtein
2
+
3
+ print(
4
+ explain_weighted_levenshtein(
5
+ "Churn Buckets",
6
+ "Chum Bucket",
7
+ substitution_costs={("rn", "m"): 0.5},
8
+ )
9
+ )
10
+ # [
11
+ # EditOperation(
12
+ # op_type='substitute',
13
+ # source_token='rn',
14
+ # target_token='m',
15
+ # cost=0.5
16
+ # ),
17
+ # EditOperation(
18
+ # op_type='delete',
19
+ # source_token='s',
20
+ # target_token=None,
21
+ # cost=1.0
22
+ # ),
23
+ # ]
@@ -33,6 +33,7 @@ dev = [
33
33
  ]
34
34
  docs = [
35
35
  "sphinx>=7.4.7",
36
+ "sphinx-copybutton>=0.5.2",
36
37
  "sphinx-mdinclude>=0.6.2",
37
38
  "sphinx-rtd-theme>=3.0.2",
38
39
  ]
@@ -1,10 +1,17 @@
1
1
  from .default_ocr_distances import ocr_distance_map
2
- from .levenshtein import batch_weighted_levenshtein_distance, weighted_levenshtein_distance
2
+ from .levenshtein import (
3
+ WeightedLevenshtein,
4
+ batch_weighted_levenshtein_distance,
5
+ explain_weighted_levenshtein,
6
+ weighted_levenshtein_distance,
7
+ )
3
8
  from .matching import find_best_candidate
4
9
 
5
10
  __all__ = [
6
11
  "ocr_distance_map",
12
+ "WeightedLevenshtein",
7
13
  "weighted_levenshtein_distance",
8
14
  "batch_weighted_levenshtein_distance",
15
+ "explain_weighted_levenshtein",
9
16
  "find_best_candidate",
10
17
  ]
@@ -0,0 +1,242 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Literal, Optional
5
+
6
+ from ._rust_stringdist import (
7
+ _batch_weighted_levenshtein_distance,
8
+ _explain_weighted_levenshtein_distance,
9
+ _weighted_levenshtein_distance,
10
+ )
11
+ from .default_ocr_distances import ocr_distance_map
12
+
13
+ OperationType = Literal["substitute", "insert", "delete"]
14
+
15
+
16
+ @dataclass(frozen=True)
17
+ class EditOperation:
18
+ """
19
+ Represents a single edit operation (substitution, insertion, or deletion).
20
+ """
21
+
22
+ op_type: OperationType
23
+ source_token: Optional[str]
24
+ target_token: Optional[str]
25
+ cost: float
26
+
27
+
28
+ class WeightedLevenshtein:
29
+ """
30
+ Calculates Levenshtein distance with custom, configurable costs.
31
+
32
+ This class is initialized with cost dictionaries and settings that define
33
+ how the distance is measured. Once created, its methods can be used to
34
+ efficiently compute distances and explain the edit operations.
35
+
36
+ :param substitution_costs: Maps (str, str) tuples to their substitution cost.
37
+ Defaults to costs based on common OCR errors.
38
+ :param insertion_costs: Maps a character to its insertion cost.
39
+ :param deletion_costs: Maps a character to its deletion cost.
40
+ :param symmetric_substitution: If True, substitution costs are bidirectional.
41
+ :param default_substitution_cost: Default cost for substitutions not in the map.
42
+ :param default_insertion_cost: Default cost for insertions not in the map.
43
+ :param default_deletion_cost: Default cost for deletions not in the map.
44
+ """
45
+
46
+ substitution_costs: dict[tuple[str, str], float]
47
+ insertion_costs: dict[str, float]
48
+ deletion_costs: dict[str, float]
49
+ symmetric_substitution: bool
50
+ default_substitution_cost: float
51
+ default_insertion_cost: float
52
+ default_deletion_cost: float
53
+
54
+ def __init__(
55
+ self,
56
+ substitution_costs: Optional[dict[tuple[str, str], float]] = None,
57
+ insertion_costs: Optional[dict[str, float]] = None,
58
+ deletion_costs: Optional[dict[str, float]] = None,
59
+ *,
60
+ symmetric_substitution: bool = True,
61
+ default_substitution_cost: float = 1.0,
62
+ default_insertion_cost: float = 1.0,
63
+ default_deletion_cost: float = 1.0,
64
+ ) -> None:
65
+ self.substitution_costs = (
66
+ ocr_distance_map if substitution_costs is None else substitution_costs
67
+ )
68
+ self.insertion_costs = {} if insertion_costs is None else insertion_costs
69
+ self.deletion_costs = {} if deletion_costs is None else deletion_costs
70
+ self.symmetric_substitution = symmetric_substitution
71
+ self.default_substitution_cost = default_substitution_cost
72
+ self.default_insertion_cost = default_insertion_cost
73
+ self.default_deletion_cost = default_deletion_cost
74
+
75
+ @classmethod
76
+ def unweighted(cls) -> WeightedLevenshtein:
77
+ """Creates an instance with all operations having equal cost of 1.0."""
78
+ return cls(substitution_costs={}, insertion_costs={}, deletion_costs={})
79
+
80
+ def distance(self, s1: str, s2: str) -> float:
81
+ """Calculates the weighted Levenshtein distance between two strings."""
82
+ return _weighted_levenshtein_distance(s1, s2, **self.__dict__) # type: ignore[no-any-return]
83
+
84
+ def explain(self, s1: str, s2: str) -> list[EditOperation]:
85
+ """Returns the list of edit operations to transform s1 into s2."""
86
+ raw_path = _explain_weighted_levenshtein_distance(s1, s2, **self.__dict__)
87
+ return [EditOperation(*op) for op in raw_path]
88
+
89
+ def batch_distance(self, s: str, candidates: list[str]) -> list[float]:
90
+ """Calculates distances between a string and a list of candidates."""
91
+ return _batch_weighted_levenshtein_distance(s, candidates, **self.__dict__) # type: ignore[no-any-return]
92
+
93
+
94
+ def weighted_levenshtein_distance(
95
+ s1: str,
96
+ s2: str,
97
+ /,
98
+ substitution_costs: Optional[dict[tuple[str, str], float]] = None,
99
+ insertion_costs: Optional[dict[str, float]] = None,
100
+ deletion_costs: Optional[dict[str, float]] = None,
101
+ *,
102
+ symmetric_substitution: bool = True,
103
+ default_substitution_cost: float = 1.0,
104
+ default_insertion_cost: float = 1.0,
105
+ default_deletion_cost: float = 1.0,
106
+ ) -> float:
107
+ """
108
+ Levenshtein distance with custom substitution, insertion and deletion costs.
109
+
110
+ See also :meth:`WeightedLevenshtein.distance`.
111
+
112
+ The default `substitution_costs` considers common OCR errors, see
113
+ :py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
114
+
115
+ :param s1: First string (interpreted as the string read via OCR)
116
+ :param s2: Second string
117
+ :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
118
+ substitution costs. Only one direction needs to be configured unless
119
+ `symmetric_substitution` is False.
120
+ Note that the runtime scales in the length of the longest substitution token.
121
+ Defaults to `ocr_stringdist.ocr_distance_map`.
122
+ :param insertion_costs: Dictionary mapping strings to their insertion costs.
123
+ :param deletion_costs: Dictionary mapping strings to their deletion costs.
124
+ :param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
125
+ symmetric? Defaults to True.
126
+ :param default_substitution_cost: The default substitution cost for character pairs not found
127
+ in `substitution_costs`.
128
+ :param default_insertion_cost: The default insertion cost for characters not found in
129
+ `insertion_costs`.
130
+ :param default_deletion_cost: The default deletion cost for characters not found in
131
+ `deletion_costs`.
132
+ """
133
+ return WeightedLevenshtein(
134
+ substitution_costs=substitution_costs,
135
+ insertion_costs=insertion_costs,
136
+ deletion_costs=deletion_costs,
137
+ symmetric_substitution=symmetric_substitution,
138
+ default_substitution_cost=default_substitution_cost,
139
+ default_insertion_cost=default_insertion_cost,
140
+ default_deletion_cost=default_deletion_cost,
141
+ ).distance(s1, s2)
142
+
143
+
144
+ def batch_weighted_levenshtein_distance(
145
+ s: str,
146
+ candidates: list[str],
147
+ /,
148
+ substitution_costs: Optional[dict[tuple[str, str], float]] = None,
149
+ insertion_costs: Optional[dict[str, float]] = None,
150
+ deletion_costs: Optional[dict[str, float]] = None,
151
+ *,
152
+ symmetric_substitution: bool = True,
153
+ default_substitution_cost: float = 1.0,
154
+ default_insertion_cost: float = 1.0,
155
+ default_deletion_cost: float = 1.0,
156
+ ) -> list[float]:
157
+ """
158
+ Calculate weighted Levenshtein distances between a string and multiple candidates.
159
+
160
+ See also :meth:`WeightedLevenshtein.batch_distance`.
161
+
162
+ This is more efficient than calling :func:`weighted_levenshtein_distance` multiple times.
163
+
164
+ :param s: The string to compare (interpreted as the string read via OCR)
165
+ :param candidates: List of candidate strings to compare against
166
+ :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
167
+ substitution costs. Only one direction needs to be configured unless
168
+ `symmetric_substitution` is False.
169
+ Note that the runtime scales in the length of the longest substitution token.
170
+ Defaults to `ocr_stringdist.ocr_distance_map`.
171
+ :param insertion_costs: Dictionary mapping strings to their insertion costs.
172
+ :param deletion_costs: Dictionary mapping strings to their deletion costs.
173
+ :param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
174
+ symmetric? Defaults to True.
175
+ :param default_substitution_cost: The default substitution cost for character pairs not found
176
+ in `substitution_costs`.
177
+ :param default_insertion_cost: The default insertion cost for characters not found in
178
+ `insertion_costs`.
179
+ :param default_deletion_cost: The default deletion cost for characters not found in
180
+ `deletion_costs`.
181
+ :return: A list of distances corresponding to each candidate
182
+ """
183
+ return WeightedLevenshtein(
184
+ substitution_costs=substitution_costs,
185
+ insertion_costs=insertion_costs,
186
+ deletion_costs=deletion_costs,
187
+ symmetric_substitution=symmetric_substitution,
188
+ default_substitution_cost=default_substitution_cost,
189
+ default_insertion_cost=default_insertion_cost,
190
+ default_deletion_cost=default_deletion_cost,
191
+ ).batch_distance(s, candidates)
192
+
193
+
194
+ def explain_weighted_levenshtein(
195
+ s1: str,
196
+ s2: str,
197
+ /,
198
+ substitution_costs: Optional[dict[tuple[str, str], float]] = None,
199
+ insertion_costs: Optional[dict[str, float]] = None,
200
+ deletion_costs: Optional[dict[str, float]] = None,
201
+ *,
202
+ symmetric_substitution: bool = True,
203
+ default_substitution_cost: float = 1.0,
204
+ default_insertion_cost: float = 1.0,
205
+ default_deletion_cost: float = 1.0,
206
+ ) -> list[EditOperation]:
207
+ """
208
+ Computes the path of operations associated with the custom Levenshtein distance.
209
+
210
+ See also :meth:`WeightedLevenshtein.explain`.
211
+
212
+ The default `substitution_costs` considers common OCR errors, see
213
+ :py:data:`ocr_stringdist.default_ocr_distances.ocr_distance_map`.
214
+
215
+ :param s1: First string (interpreted as the string read via OCR)
216
+ :param s2: Second string
217
+ :param substitution_costs: Dictionary mapping tuples of strings ("substitution tokens") to their
218
+ substitution costs. Only one direction needs to be configured unless
219
+ `symmetric_substitution` is False.
220
+ Note that the runtime scales in the length of the longest substitution token.
221
+ Defaults to `ocr_stringdist.ocr_distance_map`.
222
+ :param insertion_costs: Dictionary mapping strings to their insertion costs.
223
+ :param deletion_costs: Dictionary mapping strings to their deletion costs.
224
+ :param symmetric_substitution: Should the keys of `substitution_costs` be considered to be
225
+ symmetric? Defaults to True.
226
+ :param default_substitution_cost: The default substitution cost for character pairs not found
227
+ in `substitution_costs`.
228
+ :param default_insertion_cost: The default insertion cost for characters not found in
229
+ `insertion_costs`.
230
+ :param default_deletion_cost: The default deletion cost for characters not found in
231
+ `deletion_costs`.
232
+ :return: List of :class:`EditOperation` instances.
233
+ """
234
+ return WeightedLevenshtein(
235
+ substitution_costs=substitution_costs,
236
+ insertion_costs=insertion_costs,
237
+ deletion_costs=deletion_costs,
238
+ symmetric_substitution=symmetric_substitution,
239
+ default_substitution_cost=default_substitution_cost,
240
+ default_insertion_cost=default_insertion_cost,
241
+ default_deletion_cost=default_deletion_cost,
242
+ ).explain(s1, s2)
@@ -51,7 +51,9 @@ select = [
51
51
  # refurb
52
52
  "FURB",
53
53
  ]
54
- ignore = []
54
+ ignore = [
55
+ "UP007", # Allow Optional[X] for older Python versions.
56
+ ]
55
57
 
56
58
  # Allow fix for all enabled rules (when `--fix`) is provided.
57
59
  fixable = ["ALL"]