ocr-stringdist 0.0.4__tar.gz → 0.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/.github/workflows/CI.yml +0 -1
- ocr_stringdist-0.0.6/.github/workflows/docs.yml +70 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/Cargo.lock +45 -57
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/Cargo.toml +2 -3
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/Justfile +7 -1
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/PKG-INFO +10 -6
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/README.md +9 -5
- ocr_stringdist-0.0.6/docs/Makefile +20 -0
- ocr_stringdist-0.0.6/docs/make.bat +35 -0
- ocr_stringdist-0.0.6/docs/source/api/index.rst +22 -0
- ocr_stringdist-0.0.6/docs/source/conf.py +40 -0
- ocr_stringdist-0.0.6/docs/source/index.rst +10 -0
- ocr_stringdist-0.0.6/examples/batch_processing.py +70 -0
- ocr_stringdist-0.0.4/example.py → ocr_stringdist-0.0.6/examples/weighted_levenshtein.py +11 -12
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/pyproject.toml +5 -0
- ocr_stringdist-0.0.6/python/ocr_stringdist/__init__.py +98 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/python/ocr_stringdist/default_ocr_distances.py +4 -0
- ocr_stringdist-0.0.6/src/rust_stringdist.rs +56 -0
- ocr_stringdist-0.0.6/src/weighted_levenshtein.rs +352 -0
- ocr_stringdist-0.0.6/tests/test_batch_functions.py +141 -0
- ocr_stringdist-0.0.6/tests/test_ocr_stringdist.py +112 -0
- ocr_stringdist-0.0.6/uv.lock +801 -0
- ocr_stringdist-0.0.4/python/ocr_stringdist/__init__.py +0 -41
- ocr_stringdist-0.0.4/src/rust_stringdist.rs +0 -44
- ocr_stringdist-0.0.4/src/weighted_levenshtein.rs +0 -140
- ocr_stringdist-0.0.4/tests/test_ocr_stringdist.py +0 -5
- ocr_stringdist-0.0.4/uv.lock +0 -290
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/.gitignore +0 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/LICENSE +0 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/mypy.ini +0 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/python/ocr_stringdist/matching.py +0 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/python/ocr_stringdist/py.typed +0 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/ruff.toml +0 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/src/lib.rs +0 -0
- {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/tests/test_matching.py +0 -0
@@ -0,0 +1,70 @@
|
|
1
|
+
name: Deploy Documentation to Pages
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
branches:
|
6
|
+
- main
|
7
|
+
workflow_dispatch: # Allows manual triggering from the Actions tab
|
8
|
+
|
9
|
+
# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
|
10
|
+
permissions:
|
11
|
+
contents: read
|
12
|
+
pages: write
|
13
|
+
id-token: write
|
14
|
+
|
15
|
+
# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
|
16
|
+
# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
|
17
|
+
concurrency:
|
18
|
+
group: "pages"
|
19
|
+
cancel-in-progress: false
|
20
|
+
|
21
|
+
jobs:
|
22
|
+
build:
|
23
|
+
runs-on: ubuntu-latest
|
24
|
+
steps:
|
25
|
+
- name: Checkout repository
|
26
|
+
uses: actions/checkout@v4
|
27
|
+
|
28
|
+
- name: Set up Python
|
29
|
+
uses: actions/setup-python@v5
|
30
|
+
with:
|
31
|
+
python-version: '3.12'
|
32
|
+
|
33
|
+
- name: Install uv
|
34
|
+
run: curl -LsSf https://astral.sh/uv/install.sh | sh
|
35
|
+
|
36
|
+
- name: Create virtual environment
|
37
|
+
run: uv venv
|
38
|
+
|
39
|
+
- name: Install dependencies
|
40
|
+
run: uv sync --group docs
|
41
|
+
|
42
|
+
- name: Build Sphinx documentation
|
43
|
+
run: |
|
44
|
+
uv run make -C docs html
|
45
|
+
# Add a .nojekyll file to the build output directory to prevent
|
46
|
+
# GitHub Pages from ignoring files that start with an underscore
|
47
|
+
# (like Sphinx's _static and _images directories).
|
48
|
+
touch docs/build/html/.nojekyll
|
49
|
+
|
50
|
+
- name: Setup Pages
|
51
|
+
uses: actions/configure-pages@v4
|
52
|
+
|
53
|
+
- name: Upload artifact
|
54
|
+
uses: actions/upload-pages-artifact@v3
|
55
|
+
with:
|
56
|
+
# Upload entire directory. GitHub Pages expects index.html at the root.
|
57
|
+
path: './docs/build/html'
|
58
|
+
|
59
|
+
deploy:
|
60
|
+
environment:
|
61
|
+
name: github-pages
|
62
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
63
|
+
runs-on: ubuntu-latest
|
64
|
+
needs: build
|
65
|
+
steps:
|
66
|
+
- name: Deploy to GitHub Pages
|
67
|
+
id: deployment
|
68
|
+
uses: actions/deploy-pages@v4
|
69
|
+
# This action automatically downloads the artifact uploaded by
|
70
|
+
# upload-pages-artifact and deploys it to GitHub Pages.
|
@@ -2,19 +2,6 @@
|
|
2
2
|
# It is not intended for manual editing.
|
3
3
|
version = 3
|
4
4
|
|
5
|
-
[[package]]
|
6
|
-
name = "ahash"
|
7
|
-
version = "0.8.11"
|
8
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
9
|
-
checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
|
10
|
-
dependencies = [
|
11
|
-
"cfg-if",
|
12
|
-
"getrandom",
|
13
|
-
"once_cell",
|
14
|
-
"version_check",
|
15
|
-
"zerocopy",
|
16
|
-
]
|
17
|
-
|
18
5
|
[[package]]
|
19
6
|
name = "autocfg"
|
20
7
|
version = "1.4.0"
|
@@ -28,16 +15,36 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
28
15
|
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
29
16
|
|
30
17
|
[[package]]
|
31
|
-
name = "
|
32
|
-
version = "0.
|
18
|
+
name = "crossbeam-deque"
|
19
|
+
version = "0.8.6"
|
33
20
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
34
|
-
checksum = "
|
21
|
+
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
|
35
22
|
dependencies = [
|
36
|
-
"
|
37
|
-
"
|
38
|
-
|
23
|
+
"crossbeam-epoch",
|
24
|
+
"crossbeam-utils",
|
25
|
+
]
|
26
|
+
|
27
|
+
[[package]]
|
28
|
+
name = "crossbeam-epoch"
|
29
|
+
version = "0.9.18"
|
30
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
31
|
+
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
32
|
+
dependencies = [
|
33
|
+
"crossbeam-utils",
|
39
34
|
]
|
40
35
|
|
36
|
+
[[package]]
|
37
|
+
name = "crossbeam-utils"
|
38
|
+
version = "0.8.21"
|
39
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
40
|
+
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
41
|
+
|
42
|
+
[[package]]
|
43
|
+
name = "either"
|
44
|
+
version = "1.15.0"
|
45
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
46
|
+
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
|
47
|
+
|
41
48
|
[[package]]
|
42
49
|
name = "heck"
|
43
50
|
version = "0.5.0"
|
@@ -67,11 +74,10 @@ dependencies = [
|
|
67
74
|
|
68
75
|
[[package]]
|
69
76
|
name = "ocr_stringdist"
|
70
|
-
version = "0.0.4"
|
77
|
+
version = "0.0.6"
|
71
78
|
dependencies = [
|
72
|
-
"ahash",
|
73
79
|
"pyo3",
|
74
|
-
"
|
80
|
+
"rayon",
|
75
81
|
]
|
76
82
|
|
77
83
|
[[package]]
|
@@ -168,10 +174,24 @@ dependencies = [
|
|
168
174
|
]
|
169
175
|
|
170
176
|
[[package]]
|
171
|
-
name = "
|
172
|
-
version = "1.
|
177
|
+
name = "rayon"
|
178
|
+
version = "1.10.0"
|
179
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
180
|
+
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
|
181
|
+
dependencies = [
|
182
|
+
"either",
|
183
|
+
"rayon-core",
|
184
|
+
]
|
185
|
+
|
186
|
+
[[package]]
|
187
|
+
name = "rayon-core"
|
188
|
+
version = "1.12.1"
|
173
189
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
174
|
-
checksum = "
|
190
|
+
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
|
191
|
+
dependencies = [
|
192
|
+
"crossbeam-deque",
|
193
|
+
"crossbeam-utils",
|
194
|
+
]
|
175
195
|
|
176
196
|
[[package]]
|
177
197
|
name = "syn"
|
@@ -201,35 +221,3 @@ name = "unindent"
|
|
201
221
|
version = "0.2.4"
|
202
222
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
203
223
|
checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
|
204
|
-
|
205
|
-
[[package]]
|
206
|
-
name = "version_check"
|
207
|
-
version = "0.9.5"
|
208
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
209
|
-
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
210
|
-
|
211
|
-
[[package]]
|
212
|
-
name = "wasi"
|
213
|
-
version = "0.11.0+wasi-snapshot-preview1"
|
214
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
215
|
-
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
216
|
-
|
217
|
-
[[package]]
|
218
|
-
name = "zerocopy"
|
219
|
-
version = "0.7.35"
|
220
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
221
|
-
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
|
222
|
-
dependencies = [
|
223
|
-
"zerocopy-derive",
|
224
|
-
]
|
225
|
-
|
226
|
-
[[package]]
|
227
|
-
name = "zerocopy-derive"
|
228
|
-
version = "0.7.35"
|
229
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
230
|
-
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
|
231
|
-
dependencies = [
|
232
|
-
"proc-macro2",
|
233
|
-
"quote",
|
234
|
-
"syn",
|
235
|
-
]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "ocr_stringdist"
|
3
|
-
version = "0.0.4"
|
3
|
+
version = "0.0.6"
|
4
4
|
edition = "2021"
|
5
5
|
description = "String distances considering OCR errors."
|
6
6
|
authors = ["Niklas von Moers <niklasvmoers@protonmail.com>"]
|
@@ -15,8 +15,7 @@ crate-type = ["cdylib"]
|
|
15
15
|
|
16
16
|
[dependencies]
|
17
17
|
pyo3 = { version = "0.24.0", features = [] }
|
18
|
-
|
19
|
-
smallvec = "1.15.0"
|
18
|
+
rayon = "1.10.0"
|
20
19
|
|
21
20
|
[features]
|
22
21
|
python = []
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: ocr_stringdist
|
3
|
-
Version: 0.0.4
|
3
|
+
Version: 0.0.6
|
4
4
|
Classifier: Programming Language :: Rust
|
5
5
|
Classifier: Programming Language :: Python
|
6
6
|
Classifier: Operating System :: OS Independent
|
@@ -17,6 +17,8 @@ Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
|
|
17
17
|
|
18
18
|
A Python library for string distance calculations that account for common OCR (optical character recognition) errors.
|
19
19
|
|
20
|
+
Documentation: https://niklasvonm.github.io/ocr-stringdist/
|
21
|
+
|
20
22
|
[](https://pypi.org/project/ocr-stringdist/)
|
21
23
|
[](LICENSE)
|
22
24
|
|
@@ -34,10 +36,12 @@ pip install ocr-stringdist
|
|
34
36
|
|
35
37
|
## Features
|
36
38
|
|
37
|
-
- **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
|
39
|
+
- **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models, including efficient batch processing.
|
40
|
+
- **Unicode Support**: Arbitrary unicode strings can be compared.
|
41
|
+
- **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
|
38
42
|
- **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
|
39
43
|
- **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
|
40
|
-
- **Best Match Finder**: Utility function find_best_candidate to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
|
44
|
+
- **Best Match Finder**: Utility function `find_best_candidate` to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
|
41
45
|
|
42
46
|
## Usage
|
43
47
|
|
@@ -51,12 +55,12 @@ distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
|
|
51
55
|
print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
|
52
56
|
|
53
57
|
# Custom cost map
|
54
|
-
custom_map = {("
|
58
|
+
custom_map = {("In", "h"): 0.5}
|
55
59
|
distance = osd.weighted_levenshtein_distance(
|
56
|
-
"
|
60
|
+
"hi", "Ini",
|
57
61
|
cost_map=custom_map,
|
58
62
|
symmetric=True,
|
59
|
-
|
63
|
+
max_token_characters=2,
|
60
64
|
)
|
61
65
|
print(f"Distance with custom map: {distance}")
|
62
66
|
```
|
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
A Python library for string distance calculations that account for common OCR (optical character recognition) errors.
|
4
4
|
|
5
|
+
Documentation: https://niklasvonm.github.io/ocr-stringdist/
|
6
|
+
|
5
7
|
[](https://pypi.org/project/ocr-stringdist/)
|
6
8
|
[](LICENSE)
|
7
9
|
|
@@ -19,10 +21,12 @@ pip install ocr-stringdist
|
|
19
21
|
|
20
22
|
## Features
|
21
23
|
|
22
|
-
- **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
|
24
|
+
- **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models, including efficient batch processing.
|
25
|
+
- **Unicode Support**: Arbitrary unicode strings can be compared.
|
26
|
+
- **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
|
23
27
|
- **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
|
24
28
|
- **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
|
25
|
-
- **Best Match Finder**: Utility function find_best_candidate to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
|
29
|
+
- **Best Match Finder**: Utility function `find_best_candidate` to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
|
26
30
|
|
27
31
|
## Usage
|
28
32
|
|
@@ -36,12 +40,12 @@ distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
|
|
36
40
|
print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
|
37
41
|
|
38
42
|
# Custom cost map
|
39
|
-
custom_map = {("
|
43
|
+
custom_map = {("In", "h"): 0.5}
|
40
44
|
distance = osd.weighted_levenshtein_distance(
|
41
|
-
"
|
45
|
+
"hi", "Ini",
|
42
46
|
cost_map=custom_map,
|
43
47
|
symmetric=True,
|
44
|
-
|
48
|
+
max_token_characters=2,
|
45
49
|
)
|
46
50
|
print(f"Distance with custom map: {distance}")
|
47
51
|
```
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# Minimal makefile for Sphinx documentation
|
2
|
+
#
|
3
|
+
|
4
|
+
# You can set these variables from the command line, and also
|
5
|
+
# from the environment for the first two.
|
6
|
+
SPHINXOPTS ?=
|
7
|
+
SPHINXBUILD ?= sphinx-build
|
8
|
+
SOURCEDIR = source
|
9
|
+
BUILDDIR = build
|
10
|
+
|
11
|
+
# Put it first so that "make" without argument is like "make help".
|
12
|
+
help:
|
13
|
+
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
14
|
+
|
15
|
+
.PHONY: help Makefile
|
16
|
+
|
17
|
+
# Catch-all target: route all unknown targets to Sphinx using the new
|
18
|
+
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
19
|
+
%: Makefile
|
20
|
+
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
@@ -0,0 +1,35 @@
|
|
1
|
+
@ECHO OFF
|
2
|
+
|
3
|
+
pushd %~dp0
|
4
|
+
|
5
|
+
REM Command file for Sphinx documentation
|
6
|
+
|
7
|
+
if "%SPHINXBUILD%" == "" (
|
8
|
+
set SPHINXBUILD=sphinx-build
|
9
|
+
)
|
10
|
+
set SOURCEDIR=source
|
11
|
+
set BUILDDIR=build
|
12
|
+
|
13
|
+
%SPHINXBUILD% >NUL 2>NUL
|
14
|
+
if errorlevel 9009 (
|
15
|
+
echo.
|
16
|
+
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
17
|
+
echo.installed, then set the SPHINXBUILD environment variable to point
|
18
|
+
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
19
|
+
echo.may add the Sphinx directory to PATH.
|
20
|
+
echo.
|
21
|
+
echo.If you don't have Sphinx installed, grab it from
|
22
|
+
echo.https://www.sphinx-doc.org/
|
23
|
+
exit /b 1
|
24
|
+
)
|
25
|
+
|
26
|
+
if "%1" == "" goto help
|
27
|
+
|
28
|
+
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
29
|
+
goto end
|
30
|
+
|
31
|
+
:help
|
32
|
+
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
33
|
+
|
34
|
+
:end
|
35
|
+
popd
|
@@ -0,0 +1,22 @@
|
|
1
|
+
.. _api_reference:
|
2
|
+
|
3
|
+
API Reference
|
4
|
+
=============
|
5
|
+
|
6
|
+
This page contains the auto-generated API reference documentation.
|
7
|
+
|
8
|
+
.. autofunction:: ocr_stringdist.weighted_levenshtein_distance
|
9
|
+
|
10
|
+
.. autofunction:: ocr_stringdist.batch_weighted_levenshtein_distance
|
11
|
+
|
12
|
+
.. automodule:: ocr_stringdist.matching
|
13
|
+
:members:
|
14
|
+
:undoc-members:
|
15
|
+
:show-inheritance:
|
16
|
+
|
17
|
+
.. autodata:: ocr_stringdist.default_ocr_distances.ocr_distance_map
|
18
|
+
:annotation:
|
19
|
+
.. literalinclude:: ../../../python/ocr_stringdist/default_ocr_distances.py
|
20
|
+
:language: python
|
21
|
+
:start-after: OCR_DISTANCE_MAP_START
|
22
|
+
:end-before: OCR_DISTANCE_MAP_END
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# Configuration file for the Sphinx documentation builder.
|
2
|
+
#
|
3
|
+
# For the full list of built-in configuration values, see the documentation:
|
4
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
5
|
+
|
6
|
+
|
7
|
+
import os
|
8
|
+
import sys
|
9
|
+
|
10
|
+
# source code is in project_root/python/ocr_stringdist
|
11
|
+
sys.path.insert(0, os.path.abspath("../../python"))
|
12
|
+
|
13
|
+
|
14
|
+
# -- Project information -----------------------------------------------------
|
15
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
|
16
|
+
|
17
|
+
project = "OCR-StringDist"
|
18
|
+
copyright = "2025, Niklas von Moers"
|
19
|
+
author = "Niklas von Moers"
|
20
|
+
|
21
|
+
# -- General configuration ---------------------------------------------------
|
22
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
23
|
+
|
24
|
+
extensions: list[str] = [
|
25
|
+
"sphinx.ext.autodoc", # Core library to pull documentation from docstrings
|
26
|
+
"sphinx.ext.napoleon", # Support for Google and NumPy style docstrings
|
27
|
+
"sphinx.ext.intersphinx", # Link to other projects' documentation
|
28
|
+
"sphinx.ext.viewcode", # Add links to source code
|
29
|
+
"sphinx_mdinclude", # Include Markdown
|
30
|
+
]
|
31
|
+
|
32
|
+
templates_path = ["_templates"]
|
33
|
+
exclude_patterns: list[str] = []
|
34
|
+
|
35
|
+
|
36
|
+
# -- Options for HTML output -------------------------------------------------
|
37
|
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
|
38
|
+
|
39
|
+
html_theme = "sphinx_rtd_theme"
|
40
|
+
html_static_path: list[str] = ["_static"]
|
@@ -0,0 +1,70 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
Example demonstrating the usage of the batch processing functions from ocr_stringdist.
|
4
|
+
"""
|
5
|
+
|
6
|
+
import time
|
7
|
+
from typing import Any, Callable
|
8
|
+
|
9
|
+
import ocr_stringdist as osd
|
10
|
+
|
11
|
+
MAX_TOKEN_CHARACTERS = 1
|
12
|
+
|
13
|
+
|
14
|
+
def benchmark(func: Callable, *args: Any, **kwargs: Any) -> tuple[Any, float]: # type: ignore
|
15
|
+
"""Run a function and return the execution time in seconds."""
|
16
|
+
start = time.time()
|
17
|
+
result = func(*args, **kwargs)
|
18
|
+
end = time.time()
|
19
|
+
return result, end - start
|
20
|
+
|
21
|
+
|
22
|
+
def compare_methods() -> None:
|
23
|
+
"""
|
24
|
+
Compare the performance of different methods for calculating Levenshtein distances.
|
25
|
+
"""
|
26
|
+
# Example data
|
27
|
+
source = "recognition"
|
28
|
+
candidates = ["recognition", "recogmtion", "recognltlon", "recogrtition", "recognitton"] * 1000
|
29
|
+
|
30
|
+
print("\nSingle string against multiple candidates:")
|
31
|
+
print("-" * 50)
|
32
|
+
|
33
|
+
# Standard loop approach
|
34
|
+
_, time_loop = benchmark(
|
35
|
+
lambda: [
|
36
|
+
osd.weighted_levenshtein_distance(
|
37
|
+
source, cand, max_token_characters=MAX_TOKEN_CHARACTERS
|
38
|
+
)
|
39
|
+
for cand in candidates
|
40
|
+
]
|
41
|
+
)
|
42
|
+
print(
|
43
|
+
f"Loop of single calls: {time_loop:.6f} seconds "
|
44
|
+
f"({1000 * time_loop / len(candidates):.6f}ms each)"
|
45
|
+
)
|
46
|
+
|
47
|
+
# Batch approach
|
48
|
+
_, time_batch = benchmark(
|
49
|
+
osd.batch_weighted_levenshtein_distance,
|
50
|
+
source,
|
51
|
+
candidates,
|
52
|
+
max_token_characters=MAX_TOKEN_CHARACTERS,
|
53
|
+
)
|
54
|
+
print(
|
55
|
+
f"Batch function: {time_batch:.6f} seconds "
|
56
|
+
f"({1000 * time_batch / len(candidates):.6f}ms each)"
|
57
|
+
)
|
58
|
+
print(f"Speedup: {time_loop / time_batch:.2f}x")
|
59
|
+
|
60
|
+
|
61
|
+
def main() -> None:
|
62
|
+
"""Main function."""
|
63
|
+
print("Demonstrating batch processing functions from ocr_stringdist\n")
|
64
|
+
|
65
|
+
# Run the benchmarks
|
66
|
+
compare_methods()
|
67
|
+
|
68
|
+
|
69
|
+
if __name__ == "__main__":
|
70
|
+
main()
|
@@ -1,3 +1,4 @@
|
|
1
|
+
#!/usr/bin/env python3
|
1
2
|
from icecream import ic
|
2
3
|
from ocr_stringdist import find_best_candidate, weighted_levenshtein_distance
|
3
4
|
|
@@ -17,29 +18,27 @@ ic(
|
|
17
18
|
)
|
18
19
|
)
|
19
20
|
|
21
|
+
# Substitution of multiple characters at once is supported.
|
20
22
|
ic(
|
21
23
|
weighted_levenshtein_distance(
|
22
|
-
"
|
23
|
-
"
|
24
|
-
|
25
|
-
|
26
|
-
)
|
24
|
+
"이탈리",
|
25
|
+
"OI탈리", # Korean syllables may be confused with multiple Latin letters at once
|
26
|
+
{("이", "OI"): 0.5},
|
27
|
+
max_token_characters=2,
|
28
|
+
),
|
27
29
|
)
|
28
30
|
|
29
31
|
ic(
|
30
32
|
weighted_levenshtein_distance(
|
31
|
-
"
|
32
|
-
"
|
33
|
-
{
|
33
|
+
"ABCDE",
|
34
|
+
"XBCDE",
|
35
|
+
cost_map={},
|
36
|
+
default_cost=0.8, # Lower default substitution cost (default is 1.0)
|
34
37
|
)
|
35
38
|
)
|
36
39
|
|
37
|
-
|
38
40
|
ic(weighted_levenshtein_distance("A", "B", {("A", "B"): 0.0}, symmetric=False))
|
39
41
|
ic(weighted_levenshtein_distance("A", "B", {("B", "A"): 0.0}, symmetric=False))
|
40
|
-
ic(weighted_levenshtein_distance("B", "A", {("B", "A"): 0.0}, symmetric=False))
|
41
|
-
ic(weighted_levenshtein_distance("B", "A", {("A", "B"): 0.0}, symmetric=False))
|
42
|
-
|
43
42
|
|
44
43
|
ic(
|
45
44
|
find_best_candidate(
|