ocr-stringdist 0.0.4__tar.gz → 0.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/.github/workflows/CI.yml +0 -1
- ocr_stringdist-0.0.5/.github/workflows/docs.yml +70 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/Cargo.lock +1 -65
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/Cargo.toml +1 -3
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/Justfile +1 -1
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/PKG-INFO +9 -5
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/README.md +8 -4
- ocr_stringdist-0.0.5/docs/Makefile +20 -0
- ocr_stringdist-0.0.5/docs/make.bat +35 -0
- ocr_stringdist-0.0.5/docs/source/api/index.rst +18 -0
- ocr_stringdist-0.0.5/docs/source/conf.py +40 -0
- ocr_stringdist-0.0.5/docs/source/index.rst +10 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/example.py +9 -12
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/pyproject.toml +5 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/python/ocr_stringdist/__init__.py +1 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/src/rust_stringdist.rs +3 -8
- ocr_stringdist-0.0.5/src/weighted_levenshtein.rs +322 -0
- ocr_stringdist-0.0.5/tests/test_ocr_stringdist.py +106 -0
- ocr_stringdist-0.0.5/uv.lock +801 -0
- ocr_stringdist-0.0.4/src/weighted_levenshtein.rs +0 -140
- ocr_stringdist-0.0.4/tests/test_ocr_stringdist.py +0 -5
- ocr_stringdist-0.0.4/uv.lock +0 -290
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/.gitignore +0 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/LICENSE +0 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/mypy.ini +0 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/python/ocr_stringdist/default_ocr_distances.py +0 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/python/ocr_stringdist/matching.py +0 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/python/ocr_stringdist/py.typed +0 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/ruff.toml +0 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/src/lib.rs +0 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.5}/tests/test_matching.py +0 -0
@@ -0,0 +1,70 @@
|
|
1
|
+
name: Deploy Documentation to Pages
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
branches:
|
6
|
+
- main
|
7
|
+
workflow_dispatch: # Allows manual triggering from the Actions tab
|
8
|
+
|
9
|
+
# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
|
10
|
+
permissions:
|
11
|
+
contents: read
|
12
|
+
pages: write
|
13
|
+
id-token: write
|
14
|
+
|
15
|
+
# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
|
16
|
+
# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
|
17
|
+
concurrency:
|
18
|
+
group: "pages"
|
19
|
+
cancel-in-progress: false
|
20
|
+
|
21
|
+
jobs:
|
22
|
+
build:
|
23
|
+
runs-on: ubuntu-latest
|
24
|
+
steps:
|
25
|
+
- name: Checkout repository
|
26
|
+
uses: actions/checkout@v4
|
27
|
+
|
28
|
+
- name: Set up Python
|
29
|
+
uses: actions/setup-python@v5
|
30
|
+
with:
|
31
|
+
python-version: '3.12'
|
32
|
+
|
33
|
+
- name: Install uv
|
34
|
+
run: curl -LsSf https://astral.sh/uv/install.sh | sh
|
35
|
+
|
36
|
+
- name: Create virtual environment
|
37
|
+
run: uv venv
|
38
|
+
|
39
|
+
- name: Install dependencies
|
40
|
+
run: uv sync --group docs
|
41
|
+
|
42
|
+
- name: Build Sphinx documentation
|
43
|
+
run: |
|
44
|
+
uv run make -C docs html
|
45
|
+
# Add a .nojekyll file to the build output directory to prevent
|
46
|
+
# GitHub Pages from ignoring files that start with an underscore
|
47
|
+
# (like Sphinx's _static and _images directories).
|
48
|
+
touch docs/build/html/.nojekyll
|
49
|
+
|
50
|
+
- name: Setup Pages
|
51
|
+
uses: actions/configure-pages@v4
|
52
|
+
|
53
|
+
- name: Upload artifact
|
54
|
+
uses: actions/upload-pages-artifact@v3
|
55
|
+
with:
|
56
|
+
# Upload entire directory. GitHub Pages expects index.html at the root.
|
57
|
+
path: './docs/build/html'
|
58
|
+
|
59
|
+
deploy:
|
60
|
+
environment:
|
61
|
+
name: github-pages
|
62
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
63
|
+
runs-on: ubuntu-latest
|
64
|
+
needs: build
|
65
|
+
steps:
|
66
|
+
- name: Deploy to GitHub Pages
|
67
|
+
id: deployment
|
68
|
+
uses: actions/deploy-pages@v4
|
69
|
+
# This action automatically downloads the artifact uploaded by
|
70
|
+
# upload-pages-artifact and deploys it to GitHub Pages.
|
@@ -2,19 +2,6 @@
|
|
2
2
|
# It is not intended for manual editing.
|
3
3
|
version = 3
|
4
4
|
|
5
|
-
[[package]]
|
6
|
-
name = "ahash"
|
7
|
-
version = "0.8.11"
|
8
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
9
|
-
checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
|
10
|
-
dependencies = [
|
11
|
-
"cfg-if",
|
12
|
-
"getrandom",
|
13
|
-
"once_cell",
|
14
|
-
"version_check",
|
15
|
-
"zerocopy",
|
16
|
-
]
|
17
|
-
|
18
5
|
[[package]]
|
19
6
|
name = "autocfg"
|
20
7
|
version = "1.4.0"
|
@@ -27,17 +14,6 @@ version = "1.0.0"
|
|
27
14
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
28
15
|
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
29
16
|
|
30
|
-
[[package]]
|
31
|
-
name = "getrandom"
|
32
|
-
version = "0.2.15"
|
33
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
34
|
-
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
|
35
|
-
dependencies = [
|
36
|
-
"cfg-if",
|
37
|
-
"libc",
|
38
|
-
"wasi",
|
39
|
-
]
|
40
|
-
|
41
17
|
[[package]]
|
42
18
|
name = "heck"
|
43
19
|
version = "0.5.0"
|
@@ -67,11 +43,9 @@ dependencies = [
|
|
67
43
|
|
68
44
|
[[package]]
|
69
45
|
name = "ocr_stringdist"
|
70
|
-
version = "0.0.4"
|
46
|
+
version = "0.0.5"
|
71
47
|
dependencies = [
|
72
|
-
"ahash",
|
73
48
|
"pyo3",
|
74
|
-
"smallvec",
|
75
49
|
]
|
76
50
|
|
77
51
|
[[package]]
|
@@ -167,12 +141,6 @@ dependencies = [
|
|
167
141
|
"proc-macro2",
|
168
142
|
]
|
169
143
|
|
170
|
-
[[package]]
|
171
|
-
name = "smallvec"
|
172
|
-
version = "1.15.0"
|
173
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
174
|
-
checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9"
|
175
|
-
|
176
144
|
[[package]]
|
177
145
|
name = "syn"
|
178
146
|
version = "2.0.100"
|
@@ -201,35 +169,3 @@ name = "unindent"
|
|
201
169
|
version = "0.2.4"
|
202
170
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
203
171
|
checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
|
204
|
-
|
205
|
-
[[package]]
|
206
|
-
name = "version_check"
|
207
|
-
version = "0.9.5"
|
208
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
209
|
-
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
210
|
-
|
211
|
-
[[package]]
|
212
|
-
name = "wasi"
|
213
|
-
version = "0.11.0+wasi-snapshot-preview1"
|
214
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
215
|
-
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
216
|
-
|
217
|
-
[[package]]
|
218
|
-
name = "zerocopy"
|
219
|
-
version = "0.7.35"
|
220
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
221
|
-
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
|
222
|
-
dependencies = [
|
223
|
-
"zerocopy-derive",
|
224
|
-
]
|
225
|
-
|
226
|
-
[[package]]
|
227
|
-
name = "zerocopy-derive"
|
228
|
-
version = "0.7.35"
|
229
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
230
|
-
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
|
231
|
-
dependencies = [
|
232
|
-
"proc-macro2",
|
233
|
-
"quote",
|
234
|
-
"syn",
|
235
|
-
]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "ocr_stringdist"
|
3
|
-
version = "0.0.4"
|
3
|
+
version = "0.0.5"
|
4
4
|
edition = "2021"
|
5
5
|
description = "String distances considering OCR errors."
|
6
6
|
authors = ["Niklas von Moers <niklasvmoers@protonmail.com>"]
|
@@ -15,8 +15,6 @@ crate-type = ["cdylib"]
|
|
15
15
|
|
16
16
|
[dependencies]
|
17
17
|
pyo3 = { version = "0.24.0", features = [] }
|
18
|
-
ahash = "^0.8"
|
19
|
-
smallvec = "1.15.0"
|
20
18
|
|
21
19
|
[features]
|
22
20
|
python = []
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: ocr_stringdist
|
3
|
-
Version: 0.0.4
|
3
|
+
Version: 0.0.5
|
4
4
|
Classifier: Programming Language :: Rust
|
5
5
|
Classifier: Programming Language :: Python
|
6
6
|
Classifier: Operating System :: OS Independent
|
@@ -17,6 +17,8 @@ Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
|
|
17
17
|
|
18
18
|
A Python library for string distance calculations that account for common OCR (optical character recognition) errors.
|
19
19
|
|
20
|
+
Documentation: https://niklasvonm.github.io/ocr-stringdist/
|
21
|
+
|
20
22
|
[](https://pypi.org/project/ocr-stringdist/)
|
21
23
|
[](LICENSE)
|
22
24
|
|
@@ -35,9 +37,11 @@ pip install ocr-stringdist
|
|
35
37
|
## Features
|
36
38
|
|
37
39
|
- **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
|
40
|
+
- **Unicode Support**: Arbitrary unicode strings can be compared.
|
41
|
+
- **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
|
38
42
|
- **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
|
39
43
|
- **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
|
40
|
-
- **Best Match Finder**: Utility function find_best_candidate to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
|
44
|
+
- **Best Match Finder**: Utility function `find_best_candidate` to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
|
41
45
|
|
42
46
|
## Usage
|
43
47
|
|
@@ -51,12 +55,12 @@ distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
|
|
51
55
|
print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
|
52
56
|
|
53
57
|
# Custom cost map
|
54
|
-
custom_map = {("
|
58
|
+
custom_map = {("In", "h"): 0.5}
|
55
59
|
distance = osd.weighted_levenshtein_distance(
|
56
|
-
"
|
60
|
+
"hi", "Ini",
|
57
61
|
cost_map=custom_map,
|
58
62
|
symmetric=True,
|
59
|
-
default_cost=1.0
|
63
|
+
default_cost=1.0,
|
60
64
|
)
|
61
65
|
print(f"Distance with custom map: {distance}")
|
62
66
|
```
|
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
A Python library for string distance calculations that account for common OCR (optical character recognition) errors.
|
4
4
|
|
5
|
+
Documentation: https://niklasvonm.github.io/ocr-stringdist/
|
6
|
+
|
5
7
|
[](https://pypi.org/project/ocr-stringdist/)
|
6
8
|
[](LICENSE)
|
7
9
|
|
@@ -20,9 +22,11 @@ pip install ocr-stringdist
|
|
20
22
|
## Features
|
21
23
|
|
22
24
|
- **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
|
25
|
+
- **Unicode Support**: Arbitrary unicode strings can be compared.
|
26
|
+
- **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
|
23
27
|
- **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
|
24
28
|
- **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
|
25
|
-
- **Best Match Finder**: Utility function find_best_candidate to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
|
29
|
+
- **Best Match Finder**: Utility function `find_best_candidate` to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
|
26
30
|
|
27
31
|
## Usage
|
28
32
|
|
@@ -36,12 +40,12 @@ distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
|
|
36
40
|
print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
|
37
41
|
|
38
42
|
# Custom cost map
|
39
|
-
custom_map = {("
|
43
|
+
custom_map = {("In", "h"): 0.5}
|
40
44
|
distance = osd.weighted_levenshtein_distance(
|
41
|
-
"
|
45
|
+
"hi", "Ini",
|
42
46
|
cost_map=custom_map,
|
43
47
|
symmetric=True,
|
44
|
-
default_cost=1.0
|
48
|
+
default_cost=1.0,
|
45
49
|
)
|
46
50
|
print(f"Distance with custom map: {distance}")
|
47
51
|
```
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# Minimal makefile for Sphinx documentation
|
2
|
+
#
|
3
|
+
|
4
|
+
# You can set these variables from the command line, and also
|
5
|
+
# from the environment for the first two.
|
6
|
+
SPHINXOPTS ?=
|
7
|
+
SPHINXBUILD ?= sphinx-build
|
8
|
+
SOURCEDIR = source
|
9
|
+
BUILDDIR = build
|
10
|
+
|
11
|
+
# Put it first so that "make" without argument is like "make help".
|
12
|
+
help:
|
13
|
+
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
14
|
+
|
15
|
+
.PHONY: help Makefile
|
16
|
+
|
17
|
+
# Catch-all target: route all unknown targets to Sphinx using the new
|
18
|
+
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
19
|
+
%: Makefile
|
20
|
+
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
@@ -0,0 +1,35 @@
|
|
1
|
+
@ECHO OFF
|
2
|
+
|
3
|
+
pushd %~dp0
|
4
|
+
|
5
|
+
REM Command file for Sphinx documentation
|
6
|
+
|
7
|
+
if "%SPHINXBUILD%" == "" (
|
8
|
+
set SPHINXBUILD=sphinx-build
|
9
|
+
)
|
10
|
+
set SOURCEDIR=source
|
11
|
+
set BUILDDIR=build
|
12
|
+
|
13
|
+
%SPHINXBUILD% >NUL 2>NUL
|
14
|
+
if errorlevel 9009 (
|
15
|
+
echo.
|
16
|
+
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
17
|
+
echo.installed, then set the SPHINXBUILD environment variable to point
|
18
|
+
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
19
|
+
echo.may add the Sphinx directory to PATH.
|
20
|
+
echo.
|
21
|
+
echo.If you don't have Sphinx installed, grab it from
|
22
|
+
echo.https://www.sphinx-doc.org/
|
23
|
+
exit /b 1
|
24
|
+
)
|
25
|
+
|
26
|
+
if "%1" == "" goto help
|
27
|
+
|
28
|
+
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
29
|
+
goto end
|
30
|
+
|
31
|
+
:help
|
32
|
+
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
33
|
+
|
34
|
+
:end
|
35
|
+
popd
|
@@ -0,0 +1,18 @@
|
|
1
|
+
.. _api_reference:
|
2
|
+
|
3
|
+
API Reference
|
4
|
+
=============
|
5
|
+
|
6
|
+
This page contains the auto-generated API reference documentation.
|
7
|
+
|
8
|
+
.. autofunction:: ocr_stringdist.__init__.weighted_levenshtein_distance
|
9
|
+
|
10
|
+
.. automodule:: ocr_stringdist.matching
|
11
|
+
:members:
|
12
|
+
:undoc-members:
|
13
|
+
:show-inheritance:
|
14
|
+
|
15
|
+
.. automodule:: ocr_stringdist.default_ocr_distances
|
16
|
+
:members:
|
17
|
+
:undoc-members:
|
18
|
+
:show-inheritance:
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# Configuration file for the Sphinx documentation builder.
|
2
|
+
#
|
3
|
+
# For the full list of built-in configuration values, see the documentation:
|
4
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
5
|
+
|
6
|
+
|
7
|
+
import os
|
8
|
+
import sys
|
9
|
+
|
10
|
+
# source code is in project_root/python/ocr_stringdist
|
11
|
+
sys.path.insert(0, os.path.abspath("../../python"))
|
12
|
+
|
13
|
+
|
14
|
+
# -- Project information -----------------------------------------------------
|
15
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
|
16
|
+
|
17
|
+
project = "OCR-StringDist"
|
18
|
+
copyright = "2025, Niklas von Moers"
|
19
|
+
author = "Niklas von Moers"
|
20
|
+
|
21
|
+
# -- General configuration ---------------------------------------------------
|
22
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
23
|
+
|
24
|
+
extensions: list[str] = [
|
25
|
+
"sphinx.ext.autodoc", # Core library to pull documentation from docstrings
|
26
|
+
"sphinx.ext.napoleon", # Support for Google and NumPy style docstrings
|
27
|
+
"sphinx.ext.intersphinx", # Link to other projects' documentation
|
28
|
+
"sphinx.ext.viewcode", # Add links to source code
|
29
|
+
"sphinx_mdinclude", # Include Markdown
|
30
|
+
]
|
31
|
+
|
32
|
+
templates_path = ["_templates"]
|
33
|
+
exclude_patterns: list[str] = []
|
34
|
+
|
35
|
+
|
36
|
+
# -- Options for HTML output -------------------------------------------------
|
37
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
|
38
|
+
|
39
|
+
html_theme = "sphinx_rtd_theme"
|
40
|
+
html_static_path: list[str] = ["_static"]
|
@@ -17,29 +17,26 @@ ic(
|
|
17
17
|
)
|
18
18
|
)
|
19
19
|
|
20
|
+
# Substitution of multiple characters at once is supported.
|
20
21
|
ic(
|
21
22
|
weighted_levenshtein_distance(
|
22
|
-
"
|
23
|
-
"
|
24
|
-
|
25
|
-
|
26
|
-
)
|
23
|
+
"이탈리",
|
24
|
+
"OI탈리", # Korean syllables may be confused with multiple Latin letters at once
|
25
|
+
{("이", "OI"): 0.5},
|
26
|
+
),
|
27
27
|
)
|
28
28
|
|
29
29
|
ic(
|
30
30
|
weighted_levenshtein_distance(
|
31
|
-
"
|
32
|
-
"
|
33
|
-
{
|
31
|
+
"ABCDE",
|
32
|
+
"XBCDE",
|
33
|
+
cost_map={},
|
34
|
+
default_cost=0.8, # Lower default substitution cost (default is 1.0)
|
34
35
|
)
|
35
36
|
)
|
36
37
|
|
37
|
-
|
38
38
|
ic(weighted_levenshtein_distance("A", "B", {("A", "B"): 0.0}, symmetric=False))
|
39
39
|
ic(weighted_levenshtein_distance("A", "B", {("B", "A"): 0.0}, symmetric=False))
|
40
|
-
ic(weighted_levenshtein_distance("B", "A", {("B", "A"): 0.0}, symmetric=False))
|
41
|
-
ic(weighted_levenshtein_distance("B", "A", {("A", "B"): 0.0}, symmetric=False))
|
42
|
-
|
43
40
|
|
44
41
|
ic(
|
45
42
|
find_best_candidate(
|
@@ -36,6 +36,7 @@ def weighted_levenshtein_distance(
|
|
36
36
|
"""
|
37
37
|
if cost_map is None:
|
38
38
|
cost_map = ocr_distance_map
|
39
|
+
# _weighted_levenshtein_distance is written in Rust, see src/rust_stringdist.rs.
|
39
40
|
return _weighted_levenshtein_distance( # type: ignore # noqa: F405
|
40
41
|
s1, s2, cost_map=cost_map, symmetric=symmetric, default_cost=default_cost
|
41
42
|
)
|
@@ -15,24 +15,19 @@ fn _weighted_levenshtein_distance(
|
|
15
15
|
default_cost: Option<f64>,
|
16
16
|
) -> PyResult<f64> {
|
17
17
|
let default_cost_value = default_cost.unwrap_or(1.0);
|
18
|
-
let mut
|
18
|
+
let mut substitution_costs: HashMap<(String, String), f64> = HashMap::new();
|
19
19
|
|
20
20
|
// Convert Python dictionary to Rust HashMap
|
21
21
|
for (key, value) in cost_map.iter() {
|
22
22
|
if let Ok(key_tuple) = key.extract::<(String, String)>() {
|
23
23
|
if let Ok(cost) = value.extract::<f64>() {
|
24
|
-
|
25
|
-
if let (Some(c1), Some(c2)) =
|
26
|
-
(key_tuple.0.chars().next(), key_tuple.1.chars().next())
|
27
|
-
{
|
28
|
-
char_costs.insert((c1, c2), cost);
|
29
|
-
}
|
24
|
+
substitution_costs.insert((key_tuple.0, key_tuple.1), cost);
|
30
25
|
}
|
31
26
|
}
|
32
27
|
}
|
33
28
|
|
34
29
|
// Create a custom cost map and calculate the distance
|
35
|
-
let custom_cost_map = OcrCostMap::new(
|
30
|
+
let custom_cost_map = OcrCostMap::new(substitution_costs, default_cost_value, symmetric);
|
36
31
|
Ok(_weighted_lev_with_map(a, b, &custom_cost_map))
|
37
32
|
}
|
38
33
|
|