ocr-stringdist 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocr_stringdist-0.0.1/.github/workflows/CI.yml +200 -0
- ocr_stringdist-0.0.1/.gitignore +180 -0
- ocr_stringdist-0.0.1/Cargo.lock +236 -0
- ocr_stringdist-0.0.1/Cargo.toml +23 -0
- ocr_stringdist-0.0.1/Justfile +12 -0
- ocr_stringdist-0.0.1/PKG-INFO +16 -0
- ocr_stringdist-0.0.1/README.md +1 -0
- ocr_stringdist-0.0.1/example.py +50 -0
- ocr_stringdist-0.0.1/pyproject.toml +22 -0
- ocr_stringdist-0.0.1/python/ocr_stringdist/__init__.py +1 -0
- ocr_stringdist-0.0.1/python/ocr_stringdist/__init__.pyi +8 -0
- ocr_stringdist-0.0.1/python/ocr_stringdist/py.typed +0 -0
- ocr_stringdist-0.0.1/src/lib.rs +11 -0
- ocr_stringdist-0.0.1/src/rust_stringdist.rs +51 -0
- ocr_stringdist-0.0.1/src/weighted_levenshtein.rs +204 -0
- ocr_stringdist-0.0.1/tests/test_ocr_stringdist.py +5 -0
@@ -0,0 +1,200 @@
|
|
1
|
+
# This file was edited manually to add project-specific jobs (e.g. lint_and_test).
|
2
|
+
# The original was autogenerated by maturin v0.14.15
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
branches:
|
6
|
+
- "main"
|
7
|
+
tags:
|
8
|
+
- "*"
|
9
|
+
pull_request:
|
10
|
+
workflow_dispatch:
|
11
|
+
|
12
|
+
permissions:
|
13
|
+
contents: read
|
14
|
+
|
15
|
+
jobs:
|
16
|
+
lint_and_test:
|
17
|
+
runs-on: ubuntu-latest
|
18
|
+
strategy:
|
19
|
+
matrix:
|
20
|
+
python-version: ["3.9", "3.13", "pypy3.11"]
|
21
|
+
steps:
|
22
|
+
- uses: actions/checkout@v4
|
23
|
+
with:
|
24
|
+
submodules: recursive
|
25
|
+
- uses: actions/setup-python@v5
|
26
|
+
with:
|
27
|
+
python-version: ${{ matrix.python-version }}
|
28
|
+
- name: Build wheels
|
29
|
+
uses: PyO3/maturin-action@v1
|
30
|
+
with:
|
31
|
+
target: ${{ matrix.target }}
|
32
|
+
args: --release --out dist -i ${{ matrix.python-version }}
|
33
|
+
sccache: "true"
|
34
|
+
- name: Install Just
|
35
|
+
uses: extractions/setup-just@v2
|
36
|
+
- name: Run Cargo Tests
|
37
|
+
run: |
|
38
|
+
cargo test
|
39
|
+
- name: Run pytest
|
40
|
+
run: |
|
41
|
+
# just venv pytest
|
42
|
+
rm -rf .venv
|
43
|
+
python3 -m venv .venv
|
44
|
+
. .venv/bin/activate
|
45
|
+
.venv/bin/pip install wheel pytest maturin
|
46
|
+
maturin develop
|
47
|
+
.venv/bin/pytest
|
48
|
+
|
49
|
+
linux:
|
50
|
+
runs-on: ubuntu-latest
|
51
|
+
needs: lint_and_test
|
52
|
+
strategy:
|
53
|
+
matrix:
|
54
|
+
platform:
|
55
|
+
- target: x64
|
56
|
+
interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
|
57
|
+
- target: aarch64
|
58
|
+
interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
|
59
|
+
- target: armv7
|
60
|
+
interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
|
61
|
+
steps:
|
62
|
+
- uses: actions/checkout@v4
|
63
|
+
with:
|
64
|
+
submodules: recursive
|
65
|
+
- name: Build wheels
|
66
|
+
uses: PyO3/maturin-action@v1
|
67
|
+
with:
|
68
|
+
target: ${{ matrix.platform.target }}
|
69
|
+
args: --release --out dist -i ${{ matrix.platform.interpreter }}
|
70
|
+
sccache: "true"
|
71
|
+
manylinux: auto
|
72
|
+
- name: Upload wheels
|
73
|
+
uses: actions/upload-artifact@v4
|
74
|
+
with:
|
75
|
+
name: wheels-linux-${{ strategy.job-index }}
|
76
|
+
path: dist
|
77
|
+
musllinux:
|
78
|
+
runs-on: ubuntu-latest
|
79
|
+
needs: lint_and_test
|
80
|
+
strategy:
|
81
|
+
matrix:
|
82
|
+
platform:
|
83
|
+
- target: x86_64-unknown-linux-musl
|
84
|
+
arch: x86_64
|
85
|
+
interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
|
86
|
+
- target: i686-unknown-linux-musl
|
87
|
+
arch: x86
|
88
|
+
interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
|
89
|
+
- target: aarch64-unknown-linux-musl
|
90
|
+
arch: aarch64
|
91
|
+
interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
|
92
|
+
# all values: [x86_64, x86, aarch64, armhf, armv7, ppc64le, riscv64, s390x]
|
93
|
+
# { target: "armv7-unknown-linux-musleabihf", image_tag: "armv7" },
|
94
|
+
# { target: "powerpc64le-unknown-linux-musl", image_tag: "ppc64le" },
|
95
|
+
steps:
|
96
|
+
- uses: actions/checkout@v4
|
97
|
+
with:
|
98
|
+
submodules: recursive
|
99
|
+
- name: Setup QEMU
|
100
|
+
uses: docker/setup-qemu-action@v3
|
101
|
+
- name: Build wheels
|
102
|
+
uses: PyO3/maturin-action@v1
|
103
|
+
with:
|
104
|
+
target: ${{ matrix.platform.target }}
|
105
|
+
args: --release --out dist -i ${{ matrix.platform.interpreter }}
|
106
|
+
sccache: "true"
|
107
|
+
manylinux: musllinux_1_1
|
108
|
+
- name: Upload wheels
|
109
|
+
uses: actions/upload-artifact@v4
|
110
|
+
with:
|
111
|
+
name: wheels-musl-${{ strategy.job-index }}
|
112
|
+
path: dist
|
113
|
+
|
114
|
+
windows:
|
115
|
+
runs-on: windows-latest
|
116
|
+
needs: lint_and_test
|
117
|
+
strategy:
|
118
|
+
matrix:
|
119
|
+
target: [x64, x86]
|
120
|
+
interpreter: ["3.9", "3.10", "3.11", "3.12", "3.13"]
|
121
|
+
steps:
|
122
|
+
- uses: actions/checkout@v4
|
123
|
+
with:
|
124
|
+
submodules: recursive
|
125
|
+
- uses: actions/setup-python@v5
|
126
|
+
with:
|
127
|
+
python-version: ${{ matrix.interpreter }}
|
128
|
+
- name: Build wheels
|
129
|
+
uses: PyO3/maturin-action@v1
|
130
|
+
with:
|
131
|
+
target: ${{ matrix.target }}
|
132
|
+
args: --release --out dist -i ${{ matrix.interpreter }}
|
133
|
+
sccache: "true"
|
134
|
+
- name: Upload wheels
|
135
|
+
uses: actions/upload-artifact@v4
|
136
|
+
with:
|
137
|
+
path: dist
|
138
|
+
name: wheels-win-${{ strategy.job-index }}
|
139
|
+
|
140
|
+
macos:
|
141
|
+
runs-on: macos-latest
|
142
|
+
needs: lint_and_test
|
143
|
+
strategy:
|
144
|
+
matrix:
|
145
|
+
platform:
|
146
|
+
- target: x64
|
147
|
+
interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
|
148
|
+
- target: aarch64
|
149
|
+
interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
|
150
|
+
steps:
|
151
|
+
- uses: actions/checkout@v4
|
152
|
+
with:
|
153
|
+
submodules: recursive
|
154
|
+
- name: Build wheels
|
155
|
+
uses: PyO3/maturin-action@v1
|
156
|
+
with:
|
157
|
+
target: ${{ matrix.platform.target }}
|
158
|
+
args: --release --out dist -i ${{ matrix.platform.interpreter }}
|
159
|
+
sccache: "true"
|
160
|
+
- name: Upload wheels
|
161
|
+
uses: actions/upload-artifact@v4
|
162
|
+
with:
|
163
|
+
name: wheels-mac-${{ strategy.job-index }}
|
164
|
+
path: dist
|
165
|
+
|
166
|
+
sdist:
|
167
|
+
runs-on: ubuntu-latest
|
168
|
+
needs: lint_and_test
|
169
|
+
steps:
|
170
|
+
- uses: actions/checkout@v4
|
171
|
+
with:
|
172
|
+
submodules: recursive
|
173
|
+
- name: Build sdist
|
174
|
+
uses: PyO3/maturin-action@v1
|
175
|
+
with:
|
176
|
+
command: sdist
|
177
|
+
args: --out dist
|
178
|
+
- name: Upload sdist
|
179
|
+
uses: actions/upload-artifact@v4
|
180
|
+
with:
|
181
|
+
name: wheels-sdist-${{ strategy.job-index }}
|
182
|
+
path: dist
|
183
|
+
|
184
|
+
release:
|
185
|
+
name: Release
|
186
|
+
runs-on: ubuntu-latest
|
187
|
+
if: "startsWith(github.ref, 'refs/tags/')"
|
188
|
+
needs: [linux, windows, macos, sdist, musllinux]
|
189
|
+
steps:
|
190
|
+
- uses: actions/download-artifact@v4
|
191
|
+
with:
|
192
|
+
pattern: wheels-*
|
193
|
+
merge-multiple: true
|
194
|
+
- name: Publish to PyPI
|
195
|
+
uses: PyO3/maturin-action@v1
|
196
|
+
env:
|
197
|
+
MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
|
198
|
+
with:
|
199
|
+
command: upload
|
200
|
+
args: --skip-existing *
|
@@ -0,0 +1,180 @@
|
|
1
|
+
build/
|
2
|
+
dist/
|
3
|
+
*.so
|
4
|
+
*.swp
|
5
|
+
*.pyc
|
6
|
+
*.DS_Store
|
7
|
+
*~
|
8
|
+
.tox/
|
9
|
+
.coverage
|
10
|
+
htmlcov/
|
11
|
+
.ropeproject/
|
12
|
+
_build/
|
13
|
+
.ipynb_checkpoints/
|
14
|
+
.cache
|
15
|
+
wheelhouse/
|
16
|
+
site/
|
17
|
+
target/
|
18
|
+
Cargo.lock
|
19
|
+
.venv
|
20
|
+
|
21
|
+
# Byte-compiled / optimized / DLL files
|
22
|
+
__pycache__/
|
23
|
+
*.py[cod]
|
24
|
+
*$py.class
|
25
|
+
|
26
|
+
# C extensions
|
27
|
+
*.so
|
28
|
+
|
29
|
+
# Distribution / packaging
|
30
|
+
.Python
|
31
|
+
build/
|
32
|
+
develop-eggs/
|
33
|
+
dist/
|
34
|
+
downloads/
|
35
|
+
eggs/
|
36
|
+
.eggs/
|
37
|
+
lib/
|
38
|
+
lib64/
|
39
|
+
parts/
|
40
|
+
sdist/
|
41
|
+
var/
|
42
|
+
wheels/
|
43
|
+
share/python-wheels/
|
44
|
+
*.egg-info/
|
45
|
+
.installed.cfg
|
46
|
+
*.egg
|
47
|
+
MANIFEST
|
48
|
+
|
49
|
+
# PyInstaller
|
50
|
+
# Usually these files are written by a python script from a template
|
51
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
52
|
+
*.manifest
|
53
|
+
*.spec
|
54
|
+
|
55
|
+
# Installer logs
|
56
|
+
pip-log.txt
|
57
|
+
pip-delete-this-directory.txt
|
58
|
+
|
59
|
+
# Unit test / coverage reports
|
60
|
+
htmlcov/
|
61
|
+
.tox/
|
62
|
+
.nox/
|
63
|
+
.coverage
|
64
|
+
.coverage.*
|
65
|
+
.cache
|
66
|
+
nosetests.xml
|
67
|
+
coverage.xml
|
68
|
+
*.cover
|
69
|
+
*.py,cover
|
70
|
+
.hypothesis/
|
71
|
+
.pytest_cache/
|
72
|
+
cover/
|
73
|
+
|
74
|
+
# Translations
|
75
|
+
*.mo
|
76
|
+
*.pot
|
77
|
+
|
78
|
+
# Django stuff:
|
79
|
+
*.log
|
80
|
+
local_settings.py
|
81
|
+
db.sqlite3
|
82
|
+
db.sqlite3-journal
|
83
|
+
|
84
|
+
# Flask stuff:
|
85
|
+
instance/
|
86
|
+
.webassets-cache
|
87
|
+
|
88
|
+
# Scrapy stuff:
|
89
|
+
.scrapy
|
90
|
+
|
91
|
+
# Sphinx documentation
|
92
|
+
docs/_build/
|
93
|
+
|
94
|
+
# PyBuilder
|
95
|
+
.pybuilder/
|
96
|
+
target/
|
97
|
+
|
98
|
+
# Jupyter Notebook
|
99
|
+
.ipynb_checkpoints
|
100
|
+
|
101
|
+
# IPython
|
102
|
+
profile_default/
|
103
|
+
ipython_config.py
|
104
|
+
|
105
|
+
# pyenv
|
106
|
+
# For a library or package, you might want to ignore these files since the code is
|
107
|
+
# intended to run in multiple environments; otherwise, check them in:
|
108
|
+
# .python-version
|
109
|
+
|
110
|
+
# pipenv
|
111
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
112
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
113
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
114
|
+
# install all needed dependencies.
|
115
|
+
#Pipfile.lock
|
116
|
+
|
117
|
+
# poetry
|
118
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
119
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
120
|
+
# commonly ignored for libraries.
|
121
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
122
|
+
#poetry.lock
|
123
|
+
|
124
|
+
# pdm
|
125
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
126
|
+
#pdm.lock
|
127
|
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
128
|
+
# in version control.
|
129
|
+
# https://pdm.fming.dev/#use-with-ide
|
130
|
+
.pdm.toml
|
131
|
+
|
132
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
133
|
+
__pypackages__/
|
134
|
+
|
135
|
+
# Celery stuff
|
136
|
+
celerybeat-schedule
|
137
|
+
celerybeat.pid
|
138
|
+
|
139
|
+
# SageMath parsed files
|
140
|
+
*.sage.py
|
141
|
+
|
142
|
+
# Environments
|
143
|
+
.env
|
144
|
+
.venv
|
145
|
+
env/
|
146
|
+
venv/
|
147
|
+
ENV/
|
148
|
+
env.bak/
|
149
|
+
venv.bak/
|
150
|
+
|
151
|
+
# Spyder project settings
|
152
|
+
.spyderproject
|
153
|
+
.spyproject
|
154
|
+
|
155
|
+
# Rope project settings
|
156
|
+
.ropeproject
|
157
|
+
|
158
|
+
# mkdocs documentation
|
159
|
+
/site
|
160
|
+
|
161
|
+
# mypy
|
162
|
+
.mypy_cache/
|
163
|
+
.dmypy.json
|
164
|
+
dmypy.json
|
165
|
+
|
166
|
+
# Pyre type checker
|
167
|
+
.pyre/
|
168
|
+
|
169
|
+
# pytype static type analyzer
|
170
|
+
.pytype/
|
171
|
+
|
172
|
+
# Cython debug symbols
|
173
|
+
cython_debug/
|
174
|
+
|
175
|
+
# PyCharm
|
176
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
177
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
178
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
179
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
180
|
+
#.idea/
|
@@ -0,0 +1,236 @@
|
|
1
|
+
# This file is automatically @generated by Cargo.
|
2
|
+
# It is not intended for manual editing.
|
3
|
+
version = 4
|
4
|
+
|
5
|
+
[[package]]
|
6
|
+
name = "ahash"
|
7
|
+
version = "0.8.11"
|
8
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
9
|
+
checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
|
10
|
+
dependencies = [
|
11
|
+
"cfg-if",
|
12
|
+
"getrandom",
|
13
|
+
"once_cell",
|
14
|
+
"version_check",
|
15
|
+
"zerocopy",
|
16
|
+
]
|
17
|
+
|
18
|
+
[[package]]
|
19
|
+
name = "autocfg"
|
20
|
+
version = "1.4.0"
|
21
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
22
|
+
checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
|
23
|
+
|
24
|
+
[[package]]
|
25
|
+
name = "cfg-if"
|
26
|
+
version = "1.0.0"
|
27
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
28
|
+
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
29
|
+
|
30
|
+
[[package]]
|
31
|
+
name = "getrandom"
|
32
|
+
version = "0.2.15"
|
33
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
34
|
+
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
|
35
|
+
dependencies = [
|
36
|
+
"cfg-if",
|
37
|
+
"libc",
|
38
|
+
"wasi",
|
39
|
+
]
|
40
|
+
|
41
|
+
[[package]]
|
42
|
+
name = "heck"
|
43
|
+
version = "0.5.0"
|
44
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
45
|
+
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
46
|
+
|
47
|
+
[[package]]
|
48
|
+
name = "indoc"
|
49
|
+
version = "2.0.6"
|
50
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
51
|
+
checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd"
|
52
|
+
|
53
|
+
[[package]]
|
54
|
+
name = "libc"
|
55
|
+
version = "0.2.172"
|
56
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
57
|
+
checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"
|
58
|
+
|
59
|
+
[[package]]
|
60
|
+
name = "memoffset"
|
61
|
+
version = "0.9.1"
|
62
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
63
|
+
checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
|
64
|
+
dependencies = [
|
65
|
+
"autocfg",
|
66
|
+
]
|
67
|
+
|
68
|
+
[[package]]
|
69
|
+
name = "ocr_stringdist"
|
70
|
+
version = "0.0.1"
|
71
|
+
dependencies = [
|
72
|
+
"ahash",
|
73
|
+
"once_cell",
|
74
|
+
"pyo3",
|
75
|
+
"smallvec",
|
76
|
+
]
|
77
|
+
|
78
|
+
[[package]]
|
79
|
+
name = "once_cell"
|
80
|
+
version = "1.21.3"
|
81
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
82
|
+
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
|
83
|
+
|
84
|
+
[[package]]
|
85
|
+
name = "portable-atomic"
|
86
|
+
version = "1.11.0"
|
87
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
88
|
+
checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e"
|
89
|
+
|
90
|
+
[[package]]
|
91
|
+
name = "proc-macro2"
|
92
|
+
version = "1.0.95"
|
93
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
94
|
+
checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778"
|
95
|
+
dependencies = [
|
96
|
+
"unicode-ident",
|
97
|
+
]
|
98
|
+
|
99
|
+
[[package]]
|
100
|
+
name = "pyo3"
|
101
|
+
version = "0.24.1"
|
102
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
103
|
+
checksum = "17da310086b068fbdcefbba30aeb3721d5bb9af8db4987d6735b2183ca567229"
|
104
|
+
dependencies = [
|
105
|
+
"cfg-if",
|
106
|
+
"indoc",
|
107
|
+
"libc",
|
108
|
+
"memoffset",
|
109
|
+
"once_cell",
|
110
|
+
"portable-atomic",
|
111
|
+
"pyo3-build-config",
|
112
|
+
"pyo3-ffi",
|
113
|
+
"pyo3-macros",
|
114
|
+
"unindent",
|
115
|
+
]
|
116
|
+
|
117
|
+
[[package]]
|
118
|
+
name = "pyo3-build-config"
|
119
|
+
version = "0.24.1"
|
120
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
121
|
+
checksum = "e27165889bd793000a098bb966adc4300c312497ea25cf7a690a9f0ac5aa5fc1"
|
122
|
+
dependencies = [
|
123
|
+
"once_cell",
|
124
|
+
"target-lexicon",
|
125
|
+
]
|
126
|
+
|
127
|
+
[[package]]
|
128
|
+
name = "pyo3-ffi"
|
129
|
+
version = "0.24.1"
|
130
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
131
|
+
checksum = "05280526e1dbf6b420062f3ef228b78c0c54ba94e157f5cb724a609d0f2faabc"
|
132
|
+
dependencies = [
|
133
|
+
"libc",
|
134
|
+
"pyo3-build-config",
|
135
|
+
]
|
136
|
+
|
137
|
+
[[package]]
|
138
|
+
name = "pyo3-macros"
|
139
|
+
version = "0.24.1"
|
140
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
141
|
+
checksum = "5c3ce5686aa4d3f63359a5100c62a127c9f15e8398e5fdeb5deef1fed5cd5f44"
|
142
|
+
dependencies = [
|
143
|
+
"proc-macro2",
|
144
|
+
"pyo3-macros-backend",
|
145
|
+
"quote",
|
146
|
+
"syn",
|
147
|
+
]
|
148
|
+
|
149
|
+
[[package]]
|
150
|
+
name = "pyo3-macros-backend"
|
151
|
+
version = "0.24.1"
|
152
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
153
|
+
checksum = "f4cf6faa0cbfb0ed08e89beb8103ae9724eb4750e3a78084ba4017cbe94f3855"
|
154
|
+
dependencies = [
|
155
|
+
"heck",
|
156
|
+
"proc-macro2",
|
157
|
+
"pyo3-build-config",
|
158
|
+
"quote",
|
159
|
+
"syn",
|
160
|
+
]
|
161
|
+
|
162
|
+
[[package]]
|
163
|
+
name = "quote"
|
164
|
+
version = "1.0.40"
|
165
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
166
|
+
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
|
167
|
+
dependencies = [
|
168
|
+
"proc-macro2",
|
169
|
+
]
|
170
|
+
|
171
|
+
[[package]]
|
172
|
+
name = "smallvec"
|
173
|
+
version = "1.15.0"
|
174
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
175
|
+
checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9"
|
176
|
+
|
177
|
+
[[package]]
|
178
|
+
name = "syn"
|
179
|
+
version = "2.0.100"
|
180
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
181
|
+
checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0"
|
182
|
+
dependencies = [
|
183
|
+
"proc-macro2",
|
184
|
+
"quote",
|
185
|
+
"unicode-ident",
|
186
|
+
]
|
187
|
+
|
188
|
+
[[package]]
|
189
|
+
name = "target-lexicon"
|
190
|
+
version = "0.13.2"
|
191
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
192
|
+
checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a"
|
193
|
+
|
194
|
+
[[package]]
|
195
|
+
name = "unicode-ident"
|
196
|
+
version = "1.0.18"
|
197
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
198
|
+
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
|
199
|
+
|
200
|
+
[[package]]
|
201
|
+
name = "unindent"
|
202
|
+
version = "0.2.4"
|
203
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
204
|
+
checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
|
205
|
+
|
206
|
+
[[package]]
|
207
|
+
name = "version_check"
|
208
|
+
version = "0.9.5"
|
209
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
210
|
+
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
211
|
+
|
212
|
+
[[package]]
|
213
|
+
name = "wasi"
|
214
|
+
version = "0.11.0+wasi-snapshot-preview1"
|
215
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
216
|
+
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
217
|
+
|
218
|
+
[[package]]
|
219
|
+
name = "zerocopy"
|
220
|
+
version = "0.7.35"
|
221
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
222
|
+
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
|
223
|
+
dependencies = [
|
224
|
+
"zerocopy-derive",
|
225
|
+
]
|
226
|
+
|
227
|
+
[[package]]
|
228
|
+
name = "zerocopy-derive"
|
229
|
+
version = "0.7.35"
|
230
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
231
|
+
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
|
232
|
+
dependencies = [
|
233
|
+
"proc-macro2",
|
234
|
+
"quote",
|
235
|
+
"syn",
|
236
|
+
]
|
@@ -0,0 +1,23 @@
|
|
1
|
+
[package]
|
2
|
+
name = "ocr_stringdist"
|
3
|
+
version = "0.0.1"
|
4
|
+
edition = "2021"
|
5
|
+
description = "String distances considering OCR errors."
|
6
|
+
authors = ["Niklas von Moers <niklasvmoers@protonmail.com>"]
|
7
|
+
license = "MIT"
|
8
|
+
readme = "README.md"
|
9
|
+
|
10
|
+
|
11
|
+
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
12
|
+
[lib]
|
13
|
+
name = "ocr_stringdist"
|
14
|
+
crate-type = ["cdylib"]
|
15
|
+
|
16
|
+
[dependencies]
|
17
|
+
pyo3 = { version = "0.24.0", features = [] }
|
18
|
+
ahash = "^0.8"
|
19
|
+
once_cell = "1.21.3"
|
20
|
+
smallvec = "1.15.0"
|
21
|
+
|
22
|
+
[features]
|
23
|
+
python = []
|
@@ -0,0 +1,16 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: ocr_stringdist
|
3
|
+
Version: 0.0.1
|
4
|
+
Classifier: Programming Language :: Rust
|
5
|
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
6
|
+
Classifier: Operating System :: OS Independent
|
7
|
+
Summary: String distances considering OCR errors.
|
8
|
+
Author: Niklas von Moers <niklasvmoers@protonmail.com>
|
9
|
+
Author-email: Niklas von Moers <niklasvmoers@protonmail.com>
|
10
|
+
License: MIT
|
11
|
+
Requires-Python: >=3.9
|
12
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
13
|
+
Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
|
14
|
+
|
15
|
+
# OCR-Stringdist
|
16
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
# OCR-Stringdist
|
@@ -0,0 +1,50 @@
|
|
1
|
+
from ocr_stringdist import (
|
2
|
+
ocr_weighted_levenshtein_distance,
|
3
|
+
custom_weighted_levenshtein_distance,
|
4
|
+
)
|
5
|
+
|
6
|
+
# Example with default OCR cost map
|
7
|
+
print("Using default OCR cost map:")
|
8
|
+
default_result = ocr_weighted_levenshtein_distance("12345G", "123456")
|
9
|
+
print(f"Distance between '12345G' and '123456': {default_result}")
|
10
|
+
|
11
|
+
# Example with custom cost map
|
12
|
+
custom_cost_map: dict[tuple[str, str], float] = {
|
13
|
+
("G", "6"): 0.1, # Make G/6 even more similar (default is 0.2)
|
14
|
+
("A", "B"): 0.3, # Make A/B somewhat similar
|
15
|
+
("X", "Y"): 0.5, # Make X/Y moderately similar
|
16
|
+
}
|
17
|
+
|
18
|
+
print("\nUsing custom cost map:")
|
19
|
+
custom_result = custom_weighted_levenshtein_distance(
|
20
|
+
"12345G", "123456", custom_cost_map
|
21
|
+
)
|
22
|
+
print(f"Distance between '12345G' and '123456' with custom map: {custom_result}")
|
23
|
+
|
24
|
+
# Example with custom default cost.
# The strings differ in two positions so that both kinds of cost show up in the
# result: A/X is taken from the cost map (0.5), while E/F is not in the map and
# therefore falls back to default_cost (0.8).
print("\nUsing custom default cost:")
custom_default_result = custom_weighted_levenshtein_distance(
    "ABCDE",
    "XBCDF",
    cost_map={("A", "X"): 0.5},
    default_cost=0.8,  # Lower default substitution cost (default is 1.0)
)
print(
    f"Distance between 'ABCDE' and 'XBCDF' with custom default cost: {custom_default_result}"
)
|
35
|
+
|
36
|
+
# More complex example - comparing names with custom costs for similar looking characters
|
37
|
+
name_cost_map = {
|
38
|
+
("O", "0"): 0.1, # Letter O and number 0
|
39
|
+
("l", "1"): 0.1, # Lowercase L and number 1
|
40
|
+
("I", "1"): 0.1, # Uppercase I and number 1
|
41
|
+
("S", "5"): 0.2, # Letter S and number 5
|
42
|
+
("Z", "2"): 0.2, # Letter Z and number 2
|
43
|
+
("B", "8"): 0.2, # Letter B and number 8
|
44
|
+
}
|
45
|
+
|
46
|
+
print("\nComparing names with OCR-like errors:")
|
47
|
+
name1 = "ROBERT"
|
48
|
+
name2 = "R0BERT" # Using 0 instead of O
|
49
|
+
distance = custom_weighted_levenshtein_distance(name1, name2, name_cost_map)
|
50
|
+
print(f"Distance between '{name1}' and '{name2}': {distance}")
|
@@ -0,0 +1,22 @@
|
|
1
|
+
[build-system]
|
2
|
+
requires = ["maturin>=0.14,<2"]
|
3
|
+
build-backend = "maturin"
|
4
|
+
|
5
|
+
[project]
|
6
|
+
name = "ocr_stringdist"
|
7
|
+
dynamic = ["version"]
|
8
|
+
requires-python = ">=3.9"
|
9
|
+
classifiers = [
|
10
|
+
"Programming Language :: Rust",
|
11
|
+
"Programming Language :: Python :: Implementation :: PyPy",
|
12
|
+
"Operating System :: OS Independent",
|
13
|
+
]
|
14
|
+
|
15
|
+
[project.urls]
|
16
|
+
repository = "https://github.com/NiklasvonM/ocr-stringdist"
|
17
|
+
|
18
|
+
|
19
|
+
[tool.maturin]
|
20
|
+
features = ["pyo3/extension-module", "python"]
|
21
|
+
python-source = "python"
|
22
|
+
module-name = "ocr_stringdist._rust_stringdist"
|
@@ -0,0 +1 @@
|
|
1
|
+
from ._rust_stringdist import *
|
@@ -0,0 +1,8 @@
|
|
1
|
+
def ocr_weighted_levenshtein_distance(a: str, b: str) -> float:
    """Return the weighted Levenshtein distance between ``a`` and ``b``.

    Uses the built-in OCR cost map for substitutions; insertions and
    deletions cost 1.0 each.
    """
    ...

def custom_weighted_levenshtein_distance(
    a: str,
    b: str,
    cost_map: dict[tuple[str, str], float],
    default_cost: float | None = None,
) -> float:
    """Return the weighted Levenshtein distance using a caller-supplied cost map.

    ``cost_map`` maps pairs of strings to substitution costs. Only the first
    character of each string in a key is used, and a cost given for one
    direction also applies to the reverse direction. Substitutions that are
    not in the map cost ``default_cost`` (1.0 when ``None``).
    """
    ...
|
File without changes
|
@@ -0,0 +1,11 @@
|
|
1
|
+
mod weighted_levenshtein;
|
2
|
+
|
3
|
+
pub use weighted_levenshtein::{
|
4
|
+
custom_levenshtein_distance, custom_levenshtein_distance_with_cost_map,
|
5
|
+
vec_custom_levenshtein_distance, vec_custom_levenshtein_distance_with_cost_map, OcrCostMap,
|
6
|
+
};
|
7
|
+
|
8
|
+
#[cfg(feature = "python")]
|
9
|
+
mod rust_stringdist;
|
10
|
+
#[cfg(feature = "python")]
|
11
|
+
pub use rust_stringdist::_rust_stringdist;
|
@@ -0,0 +1,51 @@
|
|
1
|
+
use crate::custom_levenshtein_distance as _weighted_lev;
|
2
|
+
use crate::custom_levenshtein_distance_with_cost_map as _weighted_lev_with_map;
|
3
|
+
use crate::OcrCostMap;
|
4
|
+
use pyo3::prelude::*;
|
5
|
+
use pyo3::types::PyDict;
|
6
|
+
use std::collections::HashMap;
|
7
|
+
|
8
|
+
// Calculates the Levenshtein distance between two strings.
|
9
|
+
#[pyfunction]
|
10
|
+
fn ocr_weighted_levenshtein_distance(a: &str, b: &str) -> PyResult<f64> {
|
11
|
+
Ok(_weighted_lev(a, b))
|
12
|
+
}
|
13
|
+
|
14
|
+
// Calculates the weighted Levenshtein distance with a custom cost map from Python.
|
15
|
+
#[pyfunction]
|
16
|
+
#[pyo3(signature = (a, b, cost_map, default_cost = None))]
|
17
|
+
fn custom_weighted_levenshtein_distance(
|
18
|
+
a: &str,
|
19
|
+
b: &str,
|
20
|
+
cost_map: &Bound<'_, PyDict>,
|
21
|
+
default_cost: Option<f64>,
|
22
|
+
) -> PyResult<f64> {
|
23
|
+
let default_cost_value = default_cost.unwrap_or(1.0);
|
24
|
+
let mut char_costs: HashMap<(char, char), f64> = HashMap::new();
|
25
|
+
|
26
|
+
// Convert Python dictionary to Rust HashMap
|
27
|
+
for (key, value) in cost_map.iter() {
|
28
|
+
if let Ok(key_tuple) = key.extract::<(String, String)>() {
|
29
|
+
if let Ok(cost) = value.extract::<f64>() {
|
30
|
+
// Extract the first character from each string, if they exist
|
31
|
+
if let (Some(c1), Some(c2)) =
|
32
|
+
(key_tuple.0.chars().next(), key_tuple.1.chars().next())
|
33
|
+
{
|
34
|
+
char_costs.insert((c1, c2), cost);
|
35
|
+
}
|
36
|
+
}
|
37
|
+
}
|
38
|
+
}
|
39
|
+
|
40
|
+
// Create a custom cost map and calculate the distance
|
41
|
+
let custom_cost_map = OcrCostMap::new(char_costs, default_cost_value);
|
42
|
+
Ok(_weighted_lev_with_map(a, b, &custom_cost_map))
|
43
|
+
}
|
44
|
+
|
45
|
+
/// A Python module implemented in Rust.
|
46
|
+
#[pymodule]
|
47
|
+
// Module initialiser: adds the two distance functions to `m`. `_py` is not used.
pub fn _rust_stringdist(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
|
48
|
+
// Distance using the built-in OCR cost map.
m.add_function(wrap_pyfunction!(ocr_weighted_levenshtein_distance, m)?)?;
|
49
|
+
// Distance using a cost map supplied by the Python caller.
m.add_function(wrap_pyfunction!(custom_weighted_levenshtein_distance, m)?)?;
|
50
|
+
Ok(())
|
51
|
+
}
|
@@ -0,0 +1,204 @@
|
|
1
|
+
use once_cell::sync::Lazy;
|
2
|
+
use smallvec::SmallVec;
|
3
|
+
use std::collections::HashMap;
|
4
|
+
|
5
|
+
/// Substitution costs for pairs of characters, with a fallback for unlisted pairs.
#[derive(Clone, Debug)]
pub struct OcrCostMap {
    /// Maps pairs of characters to their specific substitution cost.
    /// Stores pairs symmetrically so that a lookup needs a single probe.
    costs: HashMap<(char, char), f64>,
    /// Default cost for substitutions not found in the map.
    default_substitution_cost: f64,
}

impl OcrCostMap {
    /// Creates a new `OcrCostMap` with the specified costs.
    ///
    /// Every pair in `custom_costs_input` keeps exactly the cost it was given. For a
    /// pair `(a, b)` whose reverse `(b, a)` is absent from the input, the reverse is
    /// added with the same cost. When both orders are supplied, each keeps its own
    /// cost, so the result does not depend on hash-map iteration order.
    pub fn new(
        custom_costs_input: HashMap<(char, char), f64>,
        default_substitution_cost: f64,
    ) -> Self {
        // Work out which mirrored pairs are missing before touching the map, so an
        // explicitly supplied reverse pair can never be shadowed by a mirror.
        let mirrored: Vec<((char, char), f64)> = custom_costs_input
            .iter()
            .filter(|&(&(c1, c2), _)| !custom_costs_input.contains_key(&(c2, c1)))
            .map(|(&(c1, c2), &cost)| ((c2, c1), cost))
            .collect();

        // Reuse the caller's map as storage and append the missing mirrors.
        let mut costs = custom_costs_input;
        costs.reserve(mirrored.len());
        costs.extend(mirrored);

        OcrCostMap {
            costs,
            default_substitution_cost,
        }
    }

    /// Gets the substitution cost between two characters.
    ///
    /// Identical characters cost 0.0. Otherwise the stored cost for `(c1, c2)` is
    /// returned, falling back to the default substitution cost configured for this
    /// map when the pair is not present.
    pub fn get_substitution_cost(&self, c1: char, c2: char) -> f64 {
        if c1 == c2 {
            return 0.0;
        }
        // Symmetry is handled by storage in `new`, so one lookup is enough.
        self.costs
            .get(&(c1, c2))
            .copied()
            .unwrap_or(self.default_substitution_cost)
    }
}
|
50
|
+
|
51
|
+
impl Default for OcrCostMap {
|
52
|
+
/// Returns a clone of the shared `DEFAULT_OCR_COST_MAP` static.
fn default() -> Self {
|
53
|
+
DEFAULT_OCR_COST_MAP.clone()
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
57
|
+
// --- Default OCR Map Initialization (Immutable HashMap) ---
|
58
|
+
|
59
|
+
// Default substitution costs as a constant slice of ((char, char), cost) tuples; each pair is listed in one direction only
|
60
|
+
const DEFAULT_OCR_PAIRS: &[((char, char), f64)] = &[
|
61
|
+
(('G', '6'), 0.2),
|
62
|
+
(('O', '0'), 0.2),
|
63
|
+
(('o', '0'), 0.2),
|
64
|
+
(('l', '1'), 0.2),
|
65
|
+
(('I', '1'), 0.2),
|
66
|
+
(('2', 'Z'), 0.2),
|
67
|
+
(('B', '8'), 0.2),
|
68
|
+
(('S', '5'), 0.3),
|
69
|
+
(('s', '5'), 0.3),
|
70
|
+
(('E', 'F'), 0.8),
|
71
|
+
];
|
72
|
+
|
73
|
+
// Use Lazy and collect from the static array for initialization
|
74
|
+
static DEFAULT_OCR_COST_MAP: Lazy<OcrCostMap> = Lazy::new(|| {
|
75
|
+
// Collect the static array into a HashMap directly
|
76
|
+
let ocr_costs: HashMap<(char, char), f64> = DEFAULT_OCR_PAIRS.iter().copied().collect();
|
77
|
+
OcrCostMap::new(ocr_costs, 1.0)
|
78
|
+
});
|
79
|
+
|
80
|
+
// Helper to create a range vector with f64 values
|
81
|
+
fn range_vec_f64(size: usize) -> SmallVec<[f64; 16]> {
|
82
|
+
let mut vec = SmallVec::with_capacity(size);
|
83
|
+
for i in 0..size {
|
84
|
+
vec.push(i as f64);
|
85
|
+
}
|
86
|
+
vec
|
87
|
+
}
|
88
|
+
|
89
|
+
/// Calculates Levenshtein distance between two vectors using custom costs.
|
90
|
+
pub fn vec_custom_levenshtein_distance(v1: &[char], v2: &[char]) -> f64 {
|
91
|
+
vec_custom_levenshtein_distance_with_cost_map(v1, v2, &OcrCostMap::default())
|
92
|
+
}
|
93
|
+
|
94
|
+
/// Calculates Levenshtein distance between two vectors using a specified cost map.
|
95
|
+
pub fn vec_custom_levenshtein_distance_with_cost_map(
|
96
|
+
v1: &[char],
|
97
|
+
v2: &[char],
|
98
|
+
cost_map: &OcrCostMap,
|
99
|
+
) -> f64 {
|
100
|
+
let rows = v1.len() + 1;
|
101
|
+
let cols = v2.len() + 1;
|
102
|
+
|
103
|
+
if rows == 1 {
|
104
|
+
return (cols - 1) as f64;
|
105
|
+
} else if cols == 1 {
|
106
|
+
return (rows - 1) as f64;
|
107
|
+
}
|
108
|
+
|
109
|
+
let mut cur: SmallVec<[f64; 16]> = range_vec_f64(cols);
|
110
|
+
|
111
|
+
for r in 1..rows {
|
112
|
+
let prev = cur.clone();
|
113
|
+
cur = SmallVec::from_elem(0.0, cols);
|
114
|
+
cur[0] = r as f64;
|
115
|
+
|
116
|
+
let item1 = v1[r - 1];
|
117
|
+
|
118
|
+
for c in 1..cols {
|
119
|
+
let item2 = v2[c - 1];
|
120
|
+
|
121
|
+
let deletion = prev[c] + 1.0;
|
122
|
+
let insertion = cur[c - 1] + 1.0;
|
123
|
+
|
124
|
+
// Use the provided cost map to get substitution cost
|
125
|
+
let substitution_cost = cost_map.get_substitution_cost(item1, item2);
|
126
|
+
let substitution = prev[c - 1] + substitution_cost;
|
127
|
+
|
128
|
+
cur[c] = deletion.min(insertion).min(substitution);
|
129
|
+
}
|
130
|
+
}
|
131
|
+
cur[cols - 1]
|
132
|
+
}
|
133
|
+
|
134
|
+
/// Calculates custom Levenshtein distance between two strings using OCR cost map.
|
135
|
+
pub fn custom_levenshtein_distance(s1: &str, s2: &str) -> f64 {
|
136
|
+
if s1 == s2 {
|
137
|
+
return 0.0;
|
138
|
+
}
|
139
|
+
|
140
|
+
let v1: Vec<char> = s1.chars().collect();
|
141
|
+
let v2: Vec<char> = s2.chars().collect();
|
142
|
+
|
143
|
+
vec_custom_levenshtein_distance(&v1, &v2)
|
144
|
+
}
|
145
|
+
|
146
|
+
/// Calculates custom Levenshtein distance between two strings using a provided cost map.
|
147
|
+
pub fn custom_levenshtein_distance_with_cost_map(s1: &str, s2: &str, cost_map: &OcrCostMap) -> f64 {
|
148
|
+
if s1 == s2 {
|
149
|
+
return 0.0;
|
150
|
+
}
|
151
|
+
|
152
|
+
let v1: Vec<char> = s1.chars().collect();
|
153
|
+
let v2: Vec<char> = s2.chars().collect();
|
154
|
+
|
155
|
+
vec_custom_levenshtein_distance_with_cost_map(&v1, &v2, cost_map)
|
156
|
+
}
|
157
|
+
|
158
|
+
#[cfg(test)]
mod test {
    use super::*;

    /// Asserts that `a` and `b` differ by less than `epsilon`.
    fn assert_approx_eq(a: f64, b: f64, epsilon: f64) {
        assert!(
            (a - b).abs() < epsilon,
            "Assertion failed: {} != {} within epsilon {}",
            a,
            b,
            epsilon
        );
    }

    #[test]
    fn test_custom_levenshtein_simple() {
        assert_approx_eq(custom_levenshtein_distance("abc", "axc"), 1.0, 1e-9);
        assert_approx_eq(custom_levenshtein_distance("abc", "ac"), 1.0, 1e-9);
        assert_approx_eq(custom_levenshtein_distance("ac", "abc"), 1.0, 1e-9);
    }

    #[test]
    fn test_custom_levenshtein_empty_and_identical() {
        // Identical inputs (including two empty strings) need no edits.
        assert_approx_eq(custom_levenshtein_distance("", ""), 0.0, 1e-9);
        assert_approx_eq(custom_levenshtein_distance("same", "same"), 0.0, 1e-9);
        // Against an empty string the distance is the other string's length.
        assert_approx_eq(custom_levenshtein_distance("", "abc"), 3.0, 1e-9);
        assert_approx_eq(custom_levenshtein_distance("abc", ""), 3.0, 1e-9);
    }

    #[test]
    fn test_custom_levenshtein_ocr_pairs() {
        assert_approx_eq(custom_levenshtein_distance("ABCDEFG", "ABCDEF6"), 0.2, 1e-9);

        assert_approx_eq(custom_levenshtein_distance("ABCDEF6", "ABCDEFG"), 0.2, 1e-9);

        assert_approx_eq(
            custom_levenshtein_distance("ABCDEFG", "ABCDEF6X"),
            0.2 + 1.0,
            1e-9,
        );
    }

    #[test]
    fn test_custom_levenshtein_with_custom_map() {
        let mut custom_costs = HashMap::new();
        custom_costs.insert(('a', 'b'), 0.1);
        let cost_map = OcrCostMap::new(custom_costs, 1.0);

        assert_approx_eq(
            custom_levenshtein_distance_with_cost_map("abc", "bbc", &cost_map),
            0.1,
            1e-9,
        );
    }

    #[test]
    fn test_custom_map_is_mirrored() {
        // Only ('a', 'b') is supplied; the reverse direction must cost the same.
        let mut custom_costs = HashMap::new();
        custom_costs.insert(('a', 'b'), 0.1);
        let cost_map = OcrCostMap::new(custom_costs, 1.0);

        assert_approx_eq(
            custom_levenshtein_distance_with_cost_map("bbc", "abc", &cost_map),
            0.1,
            1e-9,
        );
    }

    #[test]
    fn test_custom_default_substitution_cost() {
        // With an empty map every substitution uses the configured default.
        let cost_map = OcrCostMap::new(HashMap::new(), 0.5);

        assert_approx_eq(
            custom_levenshtein_distance_with_cost_map("abc", "xbc", &cost_map),
            0.5,
            1e-9,
        );
    }
}
|