ocr-stringdist 0.0.4__tar.gz → 0.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/.github/workflows/CI.yml +0 -1
  2. ocr_stringdist-0.0.6/.github/workflows/docs.yml +70 -0
  3. {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/Cargo.lock +45 -57
  4. {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/Cargo.toml +2 -3
  5. {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/Justfile +7 -1
  6. {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/PKG-INFO +10 -6
  7. {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/README.md +9 -5
  8. ocr_stringdist-0.0.6/docs/Makefile +20 -0
  9. ocr_stringdist-0.0.6/docs/make.bat +35 -0
  10. ocr_stringdist-0.0.6/docs/source/api/index.rst +22 -0
  11. ocr_stringdist-0.0.6/docs/source/conf.py +40 -0
  12. ocr_stringdist-0.0.6/docs/source/index.rst +10 -0
  13. ocr_stringdist-0.0.6/examples/batch_processing.py +70 -0
  14. ocr_stringdist-0.0.4/example.py → ocr_stringdist-0.0.6/examples/weighted_levenshtein.py +11 -12
  15. {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/pyproject.toml +5 -0
  16. ocr_stringdist-0.0.6/python/ocr_stringdist/__init__.py +98 -0
  17. {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/python/ocr_stringdist/default_ocr_distances.py +4 -0
  18. ocr_stringdist-0.0.6/src/rust_stringdist.rs +56 -0
  19. ocr_stringdist-0.0.6/src/weighted_levenshtein.rs +352 -0
  20. ocr_stringdist-0.0.6/tests/test_batch_functions.py +141 -0
  21. ocr_stringdist-0.0.6/tests/test_ocr_stringdist.py +112 -0
  22. ocr_stringdist-0.0.6/uv.lock +801 -0
  23. ocr_stringdist-0.0.4/python/ocr_stringdist/__init__.py +0 -41
  24. ocr_stringdist-0.0.4/src/rust_stringdist.rs +0 -44
  25. ocr_stringdist-0.0.4/src/weighted_levenshtein.rs +0 -140
  26. ocr_stringdist-0.0.4/tests/test_ocr_stringdist.py +0 -5
  27. ocr_stringdist-0.0.4/uv.lock +0 -290
  28. {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/.gitignore +0 -0
  29. {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/LICENSE +0 -0
  30. {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/mypy.ini +0 -0
  31. {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/python/ocr_stringdist/matching.py +0 -0
  32. {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/python/ocr_stringdist/py.typed +0 -0
  33. {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/ruff.toml +0 -0
  34. {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/src/lib.rs +0 -0
  35. {ocr_stringdist-0.0.4 → ocr_stringdist-0.0.6}/tests/test_matching.py +0 -0
@@ -1,4 +1,3 @@
1
- # This file was edited manually to add
2
1
  # The original was autogenerated by maturin v0.14.15
3
2
  on:
4
3
  push:
@@ -0,0 +1,70 @@
1
+ name: Deploy Documentation to Pages
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ workflow_dispatch: # Allows manual triggering from the Actions tab
8
+
9
+ # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
10
+ permissions:
11
+ contents: read
12
+ pages: write
13
+ id-token: write
14
+
15
+ # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
16
+ # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
17
+ concurrency:
18
+ group: "pages"
19
+ cancel-in-progress: false
20
+
21
+ jobs:
22
+ build:
23
+ runs-on: ubuntu-latest
24
+ steps:
25
+ - name: Checkout repository
26
+ uses: actions/checkout@v4
27
+
28
+ - name: Set up Python
29
+ uses: actions/setup-python@v5
30
+ with:
31
+ python-version: '3.12'
32
+
33
+ - name: Install uv
34
+ run: curl -LsSf https://astral.sh/uv/install.sh | sh
35
+
36
+ - name: Create virtual environment
37
+ run: uv venv
38
+
39
+ - name: Install dependencies
40
+ run: uv sync --group docs
41
+
42
+ - name: Build Sphinx documentation
43
+ run: |
44
+ uv run make -C docs html
45
+ # Add a .nojekyll file to the build output directory to prevent
46
+ # GitHub Pages from ignoring files that start with an underscore
47
+ # (like Sphinx's _static and _images directories).
48
+ touch docs/build/html/.nojekyll
49
+
50
+ - name: Setup Pages
51
+ uses: actions/configure-pages@v4
52
+
53
+ - name: Upload artifact
54
+ uses: actions/upload-pages-artifact@v3
55
+ with:
56
+ # Upload entire directory. GitHub Pages expects index.html at the root.
57
+ path: './docs/build/html'
58
+
59
+ deploy:
60
+ environment:
61
+ name: github-pages
62
+ url: ${{ steps.deployment.outputs.page_url }}
63
+ runs-on: ubuntu-latest
64
+ needs: build
65
+ steps:
66
+ - name: Deploy to GitHub Pages
67
+ id: deployment
68
+ uses: actions/deploy-pages@v4
69
+ # This action automatically downloads the artifact uploaded by
70
+ # upload-pages-artifact and deploys it to GitHub Pages.
@@ -2,19 +2,6 @@
2
2
  # It is not intended for manual editing.
3
3
  version = 3
4
4
 
5
- [[package]]
6
- name = "ahash"
7
- version = "0.8.11"
8
- source = "registry+https://github.com/rust-lang/crates.io-index"
9
- checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
10
- dependencies = [
11
- "cfg-if",
12
- "getrandom",
13
- "once_cell",
14
- "version_check",
15
- "zerocopy",
16
- ]
17
-
18
5
  [[package]]
19
6
  name = "autocfg"
20
7
  version = "1.4.0"
@@ -28,16 +15,36 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
28
15
  checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
29
16
 
30
17
  [[package]]
31
- name = "getrandom"
32
- version = "0.2.15"
18
+ name = "crossbeam-deque"
19
+ version = "0.8.6"
33
20
  source = "registry+https://github.com/rust-lang/crates.io-index"
34
- checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
21
+ checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
35
22
  dependencies = [
36
- "cfg-if",
37
- "libc",
38
- "wasi",
23
+ "crossbeam-epoch",
24
+ "crossbeam-utils",
25
+ ]
26
+
27
+ [[package]]
28
+ name = "crossbeam-epoch"
29
+ version = "0.9.18"
30
+ source = "registry+https://github.com/rust-lang/crates.io-index"
31
+ checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
32
+ dependencies = [
33
+ "crossbeam-utils",
39
34
  ]
40
35
 
36
+ [[package]]
37
+ name = "crossbeam-utils"
38
+ version = "0.8.21"
39
+ source = "registry+https://github.com/rust-lang/crates.io-index"
40
+ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
41
+
42
+ [[package]]
43
+ name = "either"
44
+ version = "1.15.0"
45
+ source = "registry+https://github.com/rust-lang/crates.io-index"
46
+ checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
47
+
41
48
  [[package]]
42
49
  name = "heck"
43
50
  version = "0.5.0"
@@ -67,11 +74,10 @@ dependencies = [
67
74
 
68
75
  [[package]]
69
76
  name = "ocr_stringdist"
70
- version = "0.0.4"
77
+ version = "0.0.6"
71
78
  dependencies = [
72
- "ahash",
73
79
  "pyo3",
74
- "smallvec",
80
+ "rayon",
75
81
  ]
76
82
 
77
83
  [[package]]
@@ -168,10 +174,24 @@ dependencies = [
168
174
  ]
169
175
 
170
176
  [[package]]
171
- name = "smallvec"
172
- version = "1.15.0"
177
+ name = "rayon"
178
+ version = "1.10.0"
179
+ source = "registry+https://github.com/rust-lang/crates.io-index"
180
+ checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
181
+ dependencies = [
182
+ "either",
183
+ "rayon-core",
184
+ ]
185
+
186
+ [[package]]
187
+ name = "rayon-core"
188
+ version = "1.12.1"
173
189
  source = "registry+https://github.com/rust-lang/crates.io-index"
174
- checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9"
190
+ checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
191
+ dependencies = [
192
+ "crossbeam-deque",
193
+ "crossbeam-utils",
194
+ ]
175
195
 
176
196
  [[package]]
177
197
  name = "syn"
@@ -201,35 +221,3 @@ name = "unindent"
201
221
  version = "0.2.4"
202
222
  source = "registry+https://github.com/rust-lang/crates.io-index"
203
223
  checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
204
-
205
- [[package]]
206
- name = "version_check"
207
- version = "0.9.5"
208
- source = "registry+https://github.com/rust-lang/crates.io-index"
209
- checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
210
-
211
- [[package]]
212
- name = "wasi"
213
- version = "0.11.0+wasi-snapshot-preview1"
214
- source = "registry+https://github.com/rust-lang/crates.io-index"
215
- checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
216
-
217
- [[package]]
218
- name = "zerocopy"
219
- version = "0.7.35"
220
- source = "registry+https://github.com/rust-lang/crates.io-index"
221
- checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
222
- dependencies = [
223
- "zerocopy-derive",
224
- ]
225
-
226
- [[package]]
227
- name = "zerocopy-derive"
228
- version = "0.7.35"
229
- source = "registry+https://github.com/rust-lang/crates.io-index"
230
- checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
231
- dependencies = [
232
- "proc-macro2",
233
- "quote",
234
- "syn",
235
- ]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "ocr_stringdist"
3
- version = "0.0.4"
3
+ version = "0.0.6"
4
4
  edition = "2021"
5
5
  description = "String distances considering OCR errors."
6
6
  authors = ["Niklas von Moers <niklasvmoers@protonmail.com>"]
@@ -15,8 +15,7 @@ crate-type = ["cdylib"]
15
15
 
16
16
  [dependencies]
17
17
  pyo3 = { version = "0.24.0", features = [] }
18
- ahash = "^0.8"
19
- smallvec = "1.15.0"
18
+ rayon = "1.10.0"
20
19
 
21
20
  [features]
22
21
  python = []
@@ -4,7 +4,7 @@ venv:
4
4
  uv sync
5
5
 
6
6
  pytest:
7
- maturin develop
7
+ uv run maturin develop
8
8
  uv run pytest
9
9
 
10
10
  test:
@@ -12,3 +12,9 @@ test:
12
12
 
13
13
  mypy:
14
14
  uv run mypy .
15
+
16
+ lint:
17
+ uv run ruff check . --fix
18
+
19
+ doc:
20
+ uv run make -C docs html
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ocr_stringdist
3
- Version: 0.0.4
3
+ Version: 0.0.6
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python
6
6
  Classifier: Operating System :: OS Independent
@@ -17,6 +17,8 @@ Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
17
17
 
18
18
  A Python library for string distance calculations that account for common OCR (optical character recognition) errors.
19
19
 
20
+ Documentation: https://niklasvonm.github.io/ocr-stringdist/
21
+
20
22
  [![PyPI](https://img.shields.io/badge/PyPI-Package-blue)](https://pypi.org/project/ocr-stringdist/)
21
23
  [![License](https://img.shields.io/badge/License-MIT-green)](LICENSE)
22
24
 
@@ -34,10 +36,12 @@ pip install ocr-stringdist
34
36
 
35
37
  ## Features
36
38
 
37
- - **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
39
+ - **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models, including efficient batch processing.
40
+ - **Unicode Support**: Arbitrary Unicode strings can be compared.
41
+ - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
38
42
  - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
39
43
  - **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
40
- - **Best Match Finder**: Utility function find_best_candidate to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones). Supports early stopping for performance optimization.
44
+ - **Best Match Finder**: Utility function `find_best_candidate` to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
41
45
 
42
46
  ## Usage
43
47
 
@@ -51,12 +55,12 @@ distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
51
55
  print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
52
56
 
53
57
  # Custom cost map
54
- custom_map = {("f", "t"): 0.2, ("m", "n"): 0.1}
58
+ custom_map = {("In", "h"): 0.5}
55
59
  distance = osd.weighted_levenshtein_distance(
56
- "first", "tirst",
60
+ "hi", "Ini",
57
61
  cost_map=custom_map,
58
62
  symmetric=True,
59
- default_cost=1.0
63
+ max_token_characters=2,
60
64
  )
61
65
  print(f"Distance with custom map: {distance}")
62
66
  ```
@@ -2,6 +2,8 @@
2
2
 
3
3
  A Python library for string distance calculations that account for common OCR (optical character recognition) errors.
4
4
 
5
+ Documentation: https://niklasvonm.github.io/ocr-stringdist/
6
+
5
7
  [![PyPI](https://img.shields.io/badge/PyPI-Package-blue)](https://pypi.org/project/ocr-stringdist/)
6
8
  [![License](https://img.shields.io/badge/License-MIT-green)](LICENSE)
7
9
 
@@ -19,10 +21,12 @@ pip install ocr-stringdist
19
21
 
20
22
  ## Features
21
23
 
22
- - **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
24
+ - **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models, including efficient batch processing.
25
+ - **Unicode Support**: Arbitrary Unicode strings can be compared.
26
+ - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
23
27
  - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
24
28
  - **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
25
- - **Best Match Finder**: Utility function find_best_candidate to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones). Supports early stopping for performance optimization.
29
+ - **Best Match Finder**: Utility function `find_best_candidate` to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
26
30
 
27
31
  ## Usage
28
32
 
@@ -36,12 +40,12 @@ distance = osd.weighted_levenshtein_distance("OCR5", "OCRS")
36
40
  print(f"Distance between 'OCR5' and 'OCRS': {distance}") # Will be less than 1.0
37
41
 
38
42
  # Custom cost map
39
- custom_map = {("f", "t"): 0.2, ("m", "n"): 0.1}
43
+ custom_map = {("In", "h"): 0.5}
40
44
  distance = osd.weighted_levenshtein_distance(
41
- "first", "tirst",
45
+ "hi", "Ini",
42
46
  cost_map=custom_map,
43
47
  symmetric=True,
44
- default_cost=1.0
48
+ max_token_characters=2,
45
49
  )
46
50
  print(f"Distance with custom map: {distance}")
47
51
  ```
@@ -0,0 +1,20 @@
1
+ # Minimal makefile for Sphinx documentation
2
+ #
3
+
4
+ # You can set these variables from the command line, and also
5
+ # from the environment for the first two.
6
+ SPHINXOPTS ?=
7
+ SPHINXBUILD ?= sphinx-build
8
+ SOURCEDIR = source
9
+ BUILDDIR = build
10
+
11
+ # Put it first so that "make" without argument is like "make help".
12
+ help:
13
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14
+
15
+ .PHONY: help Makefile
16
+
17
+ # Catch-all target: route all unknown targets to Sphinx using the new
18
+ # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19
+ %: Makefile
20
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@@ -0,0 +1,35 @@
1
+ @ECHO OFF
2
+
3
+ pushd %~dp0
4
+
5
+ REM Command file for Sphinx documentation
6
+
7
+ if "%SPHINXBUILD%" == "" (
8
+ set SPHINXBUILD=sphinx-build
9
+ )
10
+ set SOURCEDIR=source
11
+ set BUILDDIR=build
12
+
13
+ %SPHINXBUILD% >NUL 2>NUL
14
+ if errorlevel 9009 (
15
+ echo.
16
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17
+ echo.installed, then set the SPHINXBUILD environment variable to point
18
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
19
+ echo.may add the Sphinx directory to PATH.
20
+ echo.
21
+ echo.If you don't have Sphinx installed, grab it from
22
+ echo.https://www.sphinx-doc.org/
23
+ exit /b 1
24
+ )
25
+
26
+ if "%1" == "" goto help
27
+
28
+ %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29
+ goto end
30
+
31
+ :help
32
+ %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33
+
34
+ :end
35
+ popd
@@ -0,0 +1,22 @@
1
+ .. _api_reference:
2
+
3
+ API Reference
4
+ =============
5
+
6
+ This page contains the auto-generated API reference documentation.
7
+
8
+ .. autofunction:: ocr_stringdist.weighted_levenshtein_distance
9
+
10
+ .. autofunction:: ocr_stringdist.batch_weighted_levenshtein_distance
11
+
12
+ .. automodule:: ocr_stringdist.matching
13
+ :members:
14
+ :undoc-members:
15
+ :show-inheritance:
16
+
17
+ .. autodata:: ocr_stringdist.default_ocr_distances.ocr_distance_map
18
+ :annotation:
19
+ .. literalinclude:: ../../../python/ocr_stringdist/default_ocr_distances.py
20
+ :language: python
21
+ :start-after: OCR_DISTANCE_MAP_START
22
+ :end-before: OCR_DISTANCE_MAP_END
@@ -0,0 +1,40 @@
1
+ # Configuration file for the Sphinx documentation builder.
2
+ #
3
+ # For the full list of built-in configuration values, see the documentation:
4
+ # https://www.sphinx-doc.org/en/master/usage/configuration.html
5
+
6
+
7
+ import os
8
+ import sys
9
+
10
+ # source code is in project_root/python/ocr_stringdist
11
+ sys.path.insert(0, os.path.abspath("../../python"))
12
+
13
+
14
+ # -- Project information -----------------------------------------------------
15
+ # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
16
+
17
+ project = "OCR-StringDist"
18
+ copyright = "2025, Niklas von Moers"
19
+ author = "Niklas von Moers"
20
+
21
+ # -- General configuration ---------------------------------------------------
22
+ # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
23
+
24
+ extensions: list[str] = [
25
+ "sphinx.ext.autodoc", # Core library to pull documentation from docstrings
26
+ "sphinx.ext.napoleon", # Support for Google and NumPy style docstrings
27
+ "sphinx.ext.intersphinx", # Link to other projects' documentation
28
+ "sphinx.ext.viewcode", # Add links to source code
29
+ "sphinx_mdinclude", # Include Markdown
30
+ ]
31
+
32
+ templates_path = ["_templates"]
33
+ exclude_patterns: list[str] = []
34
+
35
+
36
+ # -- Options for HTML output -------------------------------------------------
37
+ # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
38
+
39
+ html_theme = "sphinx_rtd_theme"
40
+ html_static_path: list[str] = ["_static"]
@@ -0,0 +1,10 @@
1
+ .. OCR-StringDist documentation master file, created by
2
+ sphinx-quickstart on Sun Apr 20 10:40:20 2025.
3
+
4
+ .. mdinclude:: ../../README.md
5
+
6
+ .. toctree::
7
+ :maxdepth: 2
8
+ :caption: Contents:
9
+
10
+ api/index
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Example demonstrating the usage of the batch processing functions from ocr_stringdist.
4
+ """
5
+
6
+ import time
7
+ from typing import Any, Callable
8
+
9
+ import ocr_stringdist as osd
10
+
11
+ MAX_TOKEN_CHARACTERS = 1
12
+
13
+
14
+ def benchmark(func: Callable, *args: Any, **kwargs: Any) -> tuple[Any, float]: # type: ignore
15
+ """Run a function and return its result together with the execution time in seconds."""
16
+ start = time.time()
17
+ result = func(*args, **kwargs)
18
+ end = time.time()
19
+ return result, end - start
20
+
21
+
22
+ def compare_methods() -> None:
23
+ """
24
+ Compare the performance of different methods for calculating Levenshtein distances.
25
+ """
26
+ # Example data
27
+ source = "recognition"
28
+ candidates = ["recognition", "recogmtion", "recognltlon", "recogrtition", "recognitton"] * 1000
29
+
30
+ print("\nSingle string against multiple candidates:")
31
+ print("-" * 50)
32
+
33
+ # Standard loop approach
34
+ _, time_loop = benchmark(
35
+ lambda: [
36
+ osd.weighted_levenshtein_distance(
37
+ source, cand, max_token_characters=MAX_TOKEN_CHARACTERS
38
+ )
39
+ for cand in candidates
40
+ ]
41
+ )
42
+ print(
43
+ f"Loop of single calls: {time_loop:.6f} seconds "
44
+ f"({1000 * time_loop / len(candidates):.6f}ms each)"
45
+ )
46
+
47
+ # Batch approach
48
+ _, time_batch = benchmark(
49
+ osd.batch_weighted_levenshtein_distance,
50
+ source,
51
+ candidates,
52
+ max_token_characters=MAX_TOKEN_CHARACTERS,
53
+ )
54
+ print(
55
+ f"Batch function: {time_batch:.6f} seconds "
56
+ f"({1000 * time_batch / len(candidates):.6f}ms each)"
57
+ )
58
+ print(f"Speedup: {time_loop / time_batch:.2f}x")
59
+
60
+
61
+ def main() -> None:
62
+ """Main function."""
63
+ print("Demonstrating batch processing functions from ocr_stringdist\n")
64
+
65
+ # Run the benchmarks
66
+ compare_methods()
67
+
68
+
69
+ if __name__ == "__main__":
70
+ main()
@@ -1,3 +1,4 @@
1
+ #!/usr/bin/env python3
1
2
  from icecream import ic
2
3
  from ocr_stringdist import find_best_candidate, weighted_levenshtein_distance
3
4
 
@@ -17,29 +18,27 @@ ic(
17
18
  )
18
19
  )
19
20
 
21
+ # Substitution of multiple characters at once is supported.
20
22
  ic(
21
23
  weighted_levenshtein_distance(
22
- "ABCDE",
23
- "XBCDE",
24
- cost_map={},
25
- default_cost=0.8, # Lower default substitution cost (default is 1.0)
26
- )
24
+ "이탈리",
25
+ "OI탈리", # Korean syllables may be confused with multiple Latin letters at once
26
+ {("이", "OI"): 0.5},
27
+ max_token_characters=2,
28
+ ),
27
29
  )
28
30
 
29
31
  ic(
30
32
  weighted_levenshtein_distance(
31
- "RO8ERT",
32
- "R0BERT",
33
- {("O", "0"): 0.1, ("B", "8"): 0.2},
33
+ "ABCDE",
34
+ "XBCDE",
35
+ cost_map={},
36
+ default_cost=0.8, # Lower default substitution cost (default is 1.0)
34
37
  )
35
38
  )
36
39
 
37
-
38
40
  ic(weighted_levenshtein_distance("A", "B", {("A", "B"): 0.0}, symmetric=False))
39
41
  ic(weighted_levenshtein_distance("A", "B", {("B", "A"): 0.0}, symmetric=False))
40
- ic(weighted_levenshtein_distance("B", "A", {("B", "A"): 0.0}, symmetric=False))
41
- ic(weighted_levenshtein_distance("B", "A", {("A", "B"): 0.0}, symmetric=False))
42
-
43
42
 
44
43
  ic(
45
44
  find_best_candidate(
@@ -30,3 +30,8 @@ dev = [
30
30
  "ruff>=0.11.6",
31
31
  "wheel>=0.45.1",
32
32
  ]
33
+ docs = [
34
+ "sphinx>=7.4.7",
35
+ "sphinx-mdinclude>=0.6.2",
36
+ "sphinx-rtd-theme>=3.0.2",
37
+ ]