ocr-stringdist 0.2.2__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/.github/workflows/CI.yml +44 -28
  2. ocr_stringdist-1.0.0/CHANGELOG.md +61 -0
  3. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/Cargo.lock +1 -1
  4. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/Cargo.toml +2 -2
  5. ocr_stringdist-1.0.0/Justfile +41 -0
  6. ocr_stringdist-1.0.0/PKG-INFO +94 -0
  7. ocr_stringdist-1.0.0/README.md +80 -0
  8. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/docs/source/api/index.rst +8 -5
  9. ocr_stringdist-1.0.0/docs/source/cost_learning_model.rst +62 -0
  10. ocr_stringdist-1.0.0/docs/source/end_to_end_example.rst +127 -0
  11. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/docs/source/examples.rst +24 -42
  12. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/docs/source/index.rst +10 -8
  13. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/examples/explain_distance.py +2 -3
  14. ocr_stringdist-1.0.0/examples/learn_costs.py +24 -0
  15. ocr_stringdist-1.0.0/examples/weighted_levenshtein.py +29 -0
  16. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/pyproject.toml +2 -2
  17. ocr_stringdist-1.0.0/python/ocr_stringdist/__init__.py +11 -0
  18. ocr_stringdist-1.0.0/python/ocr_stringdist/edit_operation.py +16 -0
  19. ocr_stringdist-1.0.0/python/ocr_stringdist/learner.py +254 -0
  20. ocr_stringdist-1.0.0/python/ocr_stringdist/levenshtein.py +215 -0
  21. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/python/ocr_stringdist/matching.py +6 -6
  22. ocr_stringdist-1.0.0/python/ocr_stringdist/protocols.py +9 -0
  23. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0/python}/tests/test_batch_weighted_levenshtein.py +14 -19
  24. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0/python}/tests/test_explain_weighted_levenshtein.py +28 -2
  25. ocr_stringdist-1.0.0/python/tests/test_learner.py +288 -0
  26. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0/python}/tests/test_matching.py +6 -9
  27. ocr_stringdist-1.0.0/python/tests/test_protocols.py +6 -0
  28. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0/python}/tests/test_weighted_levenshtein.py +48 -52
  29. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/src/explanation.rs +4 -0
  30. ocr_stringdist-1.0.0/src/rust_stringdist.rs +538 -0
  31. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/src/weighted_levenshtein.rs +20 -2
  32. ocr_stringdist-0.2.2/CHANGELOG.md +0 -35
  33. ocr_stringdist-0.2.2/Justfile +0 -21
  34. ocr_stringdist-0.2.2/PKG-INFO +0 -102
  35. ocr_stringdist-0.2.2/README.md +0 -89
  36. ocr_stringdist-0.2.2/examples/weighted_levenshtein.py +0 -48
  37. ocr_stringdist-0.2.2/python/ocr_stringdist/__init__.py +0 -17
  38. ocr_stringdist-0.2.2/python/ocr_stringdist/levenshtein.py +0 -242
  39. ocr_stringdist-0.2.2/src/rust_stringdist.rs +0 -235
  40. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/.github/workflows/docs.yml +0 -0
  41. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/.gitignore +0 -0
  42. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/LICENSE +0 -0
  43. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/docs/Makefile +0 -0
  44. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/docs/make.bat +0 -0
  45. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/docs/source/changelog.rst +0 -0
  46. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/docs/source/conf.py +0 -0
  47. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/docs/source/getting-started.rst +0 -0
  48. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/examples/batch_processing.py +0 -0
  49. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/mypy.ini +0 -0
  50. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/python/ocr_stringdist/default_ocr_distances.py +0 -0
  51. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/python/ocr_stringdist/py.typed +0 -0
  52. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/ruff.toml +0 -0
  53. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/src/cost_map.rs +0 -0
  54. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/src/lib.rs +0 -0
  55. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/src/types.rs +0 -0
  56. {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/uv.lock +0 -0
@@ -13,10 +13,12 @@ permissions:
13
13
 
14
14
  jobs:
15
15
  lint_and_test:
16
+ name: Lint & Test
16
17
  runs-on: ubuntu-latest
17
18
  strategy:
19
+ fail-fast: false
18
20
  matrix:
19
- python-version: ["3.9", "3.13", "pypy3.11"]
21
+ python-version: ["3.9", "3.13"]
20
22
  steps:
21
23
  - uses: actions/checkout@v4
22
24
  with:
@@ -24,17 +26,8 @@ jobs:
24
26
  - uses: actions/setup-python@v5
25
27
  with:
26
28
  python-version: ${{ matrix.python-version }}
27
- - name: Build wheels
28
- uses: PyO3/maturin-action@v1
29
- with:
30
- target: ${{ matrix.target }}
31
- args: --release --out dist -i ${{ matrix.python-version }}
32
- sccache: "true"
33
- - name: Install Just
34
- uses: extractions/setup-just@v2
35
29
  - name: Run Cargo Tests
36
- run: |
37
- cargo test
30
+ run: cargo test --features python -- --nocapture --test-threads=1
38
31
  - name: Run pytest
39
32
  run: |
40
33
  # just venv pytest
@@ -43,20 +36,21 @@ jobs:
43
36
  . .venv/bin/activate
44
37
  .venv/bin/pip install wheel pytest maturin
45
38
  maturin develop
46
- .venv/bin/pytest
39
+ .venv/bin/pytest python/tests
47
40
 
48
41
  linux:
42
+ name: Build Wheels (Linux)
49
43
  runs-on: ubuntu-latest
50
44
  needs: lint_and_test
51
45
  strategy:
52
46
  matrix:
53
47
  platform:
54
48
  - target: x64
55
- interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
49
+ interpreter: 3.9 3.10 3.11 3.12 3.13
56
50
  - target: aarch64
57
- interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
51
+ interpreter: 3.9 3.10 3.11 3.12 3.13
58
52
  - target: armv7
59
- interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
53
+ interpreter: 3.9 3.10 3.11 3.12 3.13
60
54
  steps:
61
55
  - uses: actions/checkout@v4
62
56
  with:
@@ -71,9 +65,11 @@ jobs:
71
65
  - name: Upload wheels
72
66
  uses: actions/upload-artifact@v4
73
67
  with:
74
- name: wheels-linux-${{ strategy.job-index }}
68
+ name: wheels-linux-${{ matrix.platform.target }}
75
69
  path: dist
70
+
76
71
  musllinux:
72
+ name: Build Wheels (musllinux)
77
73
  runs-on: ubuntu-latest
78
74
  needs: lint_and_test
79
75
  strategy:
@@ -81,13 +77,13 @@ jobs:
81
77
  platform:
82
78
  - target: x86_64-unknown-linux-musl
83
79
  arch: x86_64
84
- interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
80
+ interpreter: 3.9 3.10 3.11 3.12 3.13
85
81
  - target: i686-unknown-linux-musl
86
82
  arch: x86
87
- interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
83
+ interpreter: 3.9 3.10 3.11 3.12 3.13
88
84
  - target: aarch64-unknown-linux-musl
89
85
  arch: aarch64
90
- interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
86
+ interpreter: 3.9 3.10 3.11 3.12 3.13
91
87
  # all values: [x86_64, x86, aarch64, armhf, armv7, ppc64le, riscv64, s390x]
92
88
  # { target: "armv7-unknown-linux-musleabihf", image_tag: "armv7" },
93
89
  # { target: "powerpc64le-unknown-linux-musl", image_tag: "ppc64le" },
@@ -107,10 +103,11 @@ jobs:
107
103
  - name: Upload wheels
108
104
  uses: actions/upload-artifact@v4
109
105
  with:
110
- name: wheels-musl-${{ strategy.job-index }}
106
+ name: wheels-musl-${{ matrix.platform.arch }}
111
107
  path: dist
112
108
 
113
109
  windows:
110
+ name: Build Wheels (Windows)
114
111
  runs-on: windows-latest
115
112
  needs: lint_and_test
116
113
  strategy:
@@ -124,6 +121,16 @@ jobs:
124
121
  - uses: actions/setup-python@v5
125
122
  with:
126
123
  python-version: ${{ matrix.interpreter }}
124
+ architecture: ${{ matrix.target }}
125
+ - name: Ensure pythonXY.lib exists (for PyO3 on Windows)
126
+ shell: pwsh
127
+ run: |
128
+ $py = "${{ matrix.interpreter }}"
129
+ $libPath = "${{ env.pythonLocation }}\\libs\\python$($py.Replace('.', '')).lib"
130
+ if (!(Test-Path $libPath)) {
131
+ Write-Host "pythonXY.lib missing, generating..."
132
+ & "${{ env.pythonLocation }}\\python.exe" -c "import distutils.sysconfig, shutil, sys; libdir = distutils.sysconfig.get_config_var('LIBDIR'); libname = distutils.sysconfig.get_config_var('LDLIBRARY'); src = libdir + '\\\\' + libname; dst = sys.prefix + '\\\\libs\\\\' + libname; shutil.copyfile(src, dst)"
133
+ }
127
134
  - name: Build wheels
128
135
  uses: PyO3/maturin-action@v1
129
136
  with:
@@ -133,19 +140,20 @@ jobs:
133
140
  - name: Upload wheels
134
141
  uses: actions/upload-artifact@v4
135
142
  with:
143
+ name: wheels-win-${{ matrix.target }}-${{ matrix.interpreter }}
136
144
  path: dist
137
- name: wheels-win-${{ strategy.job-index }}
138
145
 
139
146
  macos:
147
+ name: Build Wheels (macOS)
140
148
  runs-on: macos-latest
141
149
  needs: lint_and_test
142
150
  strategy:
143
151
  matrix:
144
152
  platform:
145
153
  - target: x64
146
- interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
154
+ interpreter: 3.9 3.10 3.11 3.12 3.13
147
155
  - target: aarch64
148
- interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
156
+ interpreter: 3.9 3.10 3.11 3.12 3.13
149
157
  steps:
150
158
  - uses: actions/checkout@v4
151
159
  with:
@@ -159,10 +167,11 @@ jobs:
159
167
  - name: Upload wheels
160
168
  uses: actions/upload-artifact@v4
161
169
  with:
162
- name: wheels-mac-${{ strategy.job-index }}
170
+ name: wheels-mac-${{ matrix.platform.target }}
163
171
  path: dist
164
172
 
165
173
  sdist:
174
+ name: Build Source Distribution
166
175
  runs-on: ubuntu-latest
167
176
  needs: lint_and_test
168
177
  steps:
@@ -177,23 +186,30 @@ jobs:
177
186
  - name: Upload sdist
178
187
  uses: actions/upload-artifact@v4
179
188
  with:
180
- name: wheels-sdist-${{ strategy.job-index }}
189
+ name: wheels-sdist
181
190
  path: dist
182
191
 
183
192
  release:
184
- name: Release
193
+ name: Release to PyPI
185
194
  runs-on: ubuntu-latest
186
195
  if: "startsWith(github.ref, 'refs/tags/')"
187
196
  needs: [linux, windows, macos, sdist, musllinux]
188
197
  steps:
189
- - uses: actions/download-artifact@v4
198
+ - name: Download all wheels and sdist
199
+ uses: actions/download-artifact@v4
190
200
  with:
191
201
  pattern: wheels-*
192
202
  merge-multiple: true
203
+ - name: Move packages to dist directory
204
+ run: |
205
+ mkdir -p dist
206
+ mv *.whl *.tar.gz dist/
207
+ echo "Moved packages to dist/:"
208
+ ls -l dist
193
209
  - name: Publish to PyPI
194
210
  uses: PyO3/maturin-action@v1
195
211
  env:
196
212
  MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
197
213
  with:
198
214
  command: upload
199
- args: --skip-existing *
215
+ args: --skip-existing dist/*
@@ -0,0 +1,61 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [1.0.0] - 2025-09-20
9
+
10
+ ### Changed
11
+
12
+ - Rename "Learner" to "CostLearner".
13
+ - Rework and fix the cost learning algorithm.
14
+ - Remove `with_cost_function` from `CostLearner`.
15
+ - Remove the functional interface in favour of the `WeightedLevenshtein` class.
16
+
17
+ ### Added
18
+
19
+ - Add `calculate_for_unseen` parameter to `CostLearner.fit()`.
20
+ - Add input validation in `WeightedLevenshtein.__init__`.
21
+ - Add `to_dict` and `from_dict` methods to `WeightedLevenshtein`.
22
+
23
+ ## [0.3.0] - 2025-09-14
24
+
25
+ ### Added
26
+
27
+ - Add the option to include the matched characters in the `explain` method via the `filter_matches` parameter.
28
+ - Add the option to learn the costs from a dataset of pairs (OCR result, ground truth) via the `WeightedLevenshtein.learn_from` method and the `Learner` class.
29
+
30
+ ### Changed
31
+
32
+ - Drop support for PyPy due to issues with PyO3.
33
+
34
+ ## [0.2.2] - 2025-09-01
35
+
36
+ ### Changed
37
+
38
+ - Improve documentation.
39
+
40
+ ## [0.2.1] - 2025-08-31
41
+
42
+ ### Fixed
43
+
44
+ - Documentation for PyPI
45
+
46
+ ## [0.2.0] - 2025-08-31
47
+
48
+ ### Added
49
+
50
+ - `WeightedLevenshtein` class for reusable configuration.
51
+ - Explanation of edit operations via `WeightedLevenshtein.explain` and `explain_weighted_levenshtein`.
52
+
53
+ ## [0.1.0] - 2025-04-26
54
+
55
+ ### Added
56
+
57
+ - Custom insertion and deletion costs for weighted Levenshtein distance.
58
+
59
+ ### Changed
60
+
61
+ - Breaking changes to the signatures of the Levenshtein distance functions.
@@ -74,7 +74,7 @@ dependencies = [
74
74
 
75
75
  [[package]]
76
76
  name = "ocr_stringdist"
77
- version = "0.2.2"
77
+ version = "1.0.0"
78
78
  dependencies = [
79
79
  "pyo3",
80
80
  "rayon",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "ocr_stringdist"
3
- version = "0.2.2"
3
+ version = "1.0.0"
4
4
  edition = "2021"
5
5
  description = "String distances considering OCR errors."
6
6
  authors = ["Niklas von Moers <niklasvmoers@protonmail.com>"]
@@ -14,7 +14,7 @@ name = "ocr_stringdist"
14
14
  crate-type = ["cdylib"]
15
15
 
16
16
  [dependencies]
17
- pyo3 = { version = "0.24.0", features = [] }
17
+ pyo3 = { version = "0.24.0", features = ["auto-initialize"] }
18
18
  rayon = "1.10.0"
19
19
 
20
20
  [features]
@@ -0,0 +1,41 @@
1
+ venv:
2
+ rm -rf .venv
3
+ uv venv
4
+ uv sync --all-groups
5
+
6
+ pytest:
7
+ uv run maturin develop
8
+ uv run pytest --cov=python/ocr_stringdist python/tests
9
+
10
+ test:
11
+ cargo llvm-cov --features python
12
+ #cargo test --features python
13
+
14
+ mypy:
15
+ uv run mypy .
16
+
17
+ lint:
18
+ uv run ruff check . --fix
19
+
20
+ doc:
21
+ uv run make -C docs html
22
+
23
+ # Usage: just release v1.0.0
24
+ # Make sure to update the version in Cargo.toml first.
25
+ release version:
26
+ # Fail if the current branch is not 'main'
27
+ @if [ "$(git symbolic-ref --short HEAD)" != "main" ]; then \
28
+ echo "Error: Must be on 'main' branch to release."; \
29
+ exit 1; \
30
+ fi
31
+
32
+ # Fail if the working directory is not clean
33
+ @if ! git diff --quiet --exit-code; then \
34
+ echo "Error: Working directory is not clean. Commit or stash changes before releasing."; \
35
+ exit 1; \
36
+ fi
37
+
38
+ git tag -a {{version}} -m "Release version {{version}}"
39
+ git push origin {{version}}
40
+
41
+ @echo "Successfully tagged and pushed version {{version}}"
@@ -0,0 +1,94 @@
1
+ Metadata-Version: 2.4
2
+ Name: ocr-stringdist
3
+ Version: 1.0.0
4
+ Classifier: Programming Language :: Rust
5
+ Classifier: Programming Language :: Python :: Implementation :: CPython
6
+ Classifier: License :: OSI Approved :: MIT License
7
+ Classifier: Operating System :: OS Independent
8
+ License-File: LICENSE
9
+ Requires-Python: >=3.9
10
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
11
+ Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
12
+ Project-URL: documentation, https://niklasvonm.github.io/ocr-stringdist/
13
+
14
+ # OCR-StringDist
15
+
16
+ A Python library to learn, model, explain and correct OCR errors using a fast string distance engine.
17
+
18
+ Documentation: https://niklasvonm.github.io/ocr-stringdist/
19
+
20
+ [![PyPI badge](https://badge.fury.io/py/ocr-stringdist.svg)](https://badge.fury.io/py/ocr-stringdist)
21
+ [![License](https://img.shields.io/badge/License-MIT-green)](LICENSE)
22
+
23
+ ## Overview
24
+
25
+ Standard string distances (like Levenshtein) treat all character substitutions equally. This is suboptimal for text read from images via OCR, where errors like `O` vs `0` are far more common than, say, `O` vs `X`.
26
+
27
+ OCR-StringDist provides a learnable **weighted Levenshtein distance**, implementing part of the **Noisy Channel model**.
28
+
29
+ **Example:** Matching against the correct word `CODE`:
30
+
31
+ * **Standard Levenshtein:**
32
+ * $d(\text{"CODE"}, \text{"C0DE"}) = 1$ (O → 0)
33
+ * $d(\text{"CODE"}, \text{"CXDE"}) = 1$ (O → X)
34
+ * Result: Both appear equally likely/distant.
35
+
36
+ * **OCR-StringDist (Channel Model):**
37
+ * $d(\text{"CODE"}, \text{"C0DE"}) \approx 0.1$ (common error, low cost)
38
+ * $d(\text{"CODE"}, \text{"CXDE"}) = 1.0$ (unlikely error, high cost)
39
+ * Result: Correctly identifies `C0DE` as a much closer match.
40
+
41
+ This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes). By combining this *channel model* with a *source model* (e.g., product code frequencies), you can build a complete and robust OCR correction system.
42
+
43
+ ## Installation
44
+
45
+ ```bash
46
+ pip install ocr-stringdist
47
+ ```
48
+
49
+ ## Features
50
+
51
+ - **Learnable Costs**: Automatically learn substitution, insertion, and deletion costs from a dataset of (OCR string, ground truth string) pairs.
52
+ - **Weighted Levenshtein Distance**: Models OCR error patterns by assigning custom costs to specific edit operations.
53
+ - **High Performance**: Core logic in Rust and a batch_distance function for efficiently comparing one string against thousands of candidates.
54
+ - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
55
+ - **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
56
+ - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
57
+ - **Full Unicode Support**: Works with arbitrary Unicode strings.
58
+
59
+ ## Core Workflow
60
+
61
+ The typical workflow involves
62
+ - learning costs from your data and then
63
+ - using the resulting model to find the best match from a list of candidates.
64
+
65
+ ```python
66
+ from ocr_stringdist import WeightedLevenshtein
67
+
68
+ # 1. LEARN costs from your own data
69
+ training_data = [
70
+ ("128", "123"),
71
+ ("567", "567"),
72
+ ]
73
+ wl = WeightedLevenshtein.learn_from(training_data)
74
+
75
+ # The engine has now learned that '8' -> '3' is a low-cost substitution
76
+ print(f"Learned cost for ('8', '3'): {wl.substitution_costs[('8', '3')]:.2f}")
77
+
78
+
79
+ # 2. MATCH new OCR output against a list of candidates
80
+ ocr_output = "Product Code 128"
81
+ candidates = [
82
+ "Product Code 123",
83
+ "Product Code 523", # '5' -> '1' is an unlikely error
84
+ ]
85
+
86
+ distances = wl.batch_distance(ocr_output, candidates)
87
+
88
+ # Find the best match
89
+ min_distance = min(distances)
90
+ best_match = candidates[distances.index(min_distance)]
91
+
92
+ print(f"Best match for '{ocr_output}': '{best_match}' (Cost: {min_distance:.2f})")
93
+ ```
94
+
@@ -0,0 +1,80 @@
1
+ # OCR-StringDist
2
+
3
+ A Python library to learn, model, explain and correct OCR errors using a fast string distance engine.
4
+
5
+ Documentation: https://niklasvonm.github.io/ocr-stringdist/
6
+
7
+ [![PyPI badge](https://badge.fury.io/py/ocr-stringdist.svg)](https://badge.fury.io/py/ocr-stringdist)
8
+ [![License](https://img.shields.io/badge/License-MIT-green)](LICENSE)
9
+
10
+ ## Overview
11
+
12
+ Standard string distances (like Levenshtein) treat all character substitutions equally. This is suboptimal for text read from images via OCR, where errors like `O` vs `0` are far more common than, say, `O` vs `X`.
13
+
14
+ OCR-StringDist provides a learnable **weighted Levenshtein distance**, implementing part of the **Noisy Channel model**.
15
+
16
+ **Example:** Matching against the correct word `CODE`:
17
+
18
+ * **Standard Levenshtein:**
19
+ * $d(\text{"CODE"}, \text{"C0DE"}) = 1$ (O → 0)
20
+ * $d(\text{"CODE"}, \text{"CXDE"}) = 1$ (O → X)
21
+ * Result: Both appear equally likely/distant.
22
+
23
+ * **OCR-StringDist (Channel Model):**
24
+ * $d(\text{"CODE"}, \text{"C0DE"}) \approx 0.1$ (common error, low cost)
25
+ * $d(\text{"CODE"}, \text{"CXDE"}) = 1.0$ (unlikely error, high cost)
26
+ * Result: Correctly identifies `C0DE` as a much closer match.
27
+
28
+ This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes). By combining this *channel model* with a *source model* (e.g., product code frequencies), you can build a complete and robust OCR correction system.
29
+
30
+ ## Installation
31
+
32
+ ```bash
33
+ pip install ocr-stringdist
34
+ ```
35
+
36
+ ## Features
37
+
38
+ - **Learnable Costs**: Automatically learn substitution, insertion, and deletion costs from a dataset of (OCR string, ground truth string) pairs.
39
+ - **Weighted Levenshtein Distance**: Models OCR error patterns by assigning custom costs to specific edit operations.
40
+ - **High Performance**: Core logic in Rust and a batch_distance function for efficiently comparing one string against thousands of candidates.
41
+ - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
42
+ - **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
43
+ - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
44
+ - **Full Unicode Support**: Works with arbitrary Unicode strings.
45
+
46
+ ## Core Workflow
47
+
48
+ The typical workflow involves
49
+ - learning costs from your data and then
50
+ - using the resulting model to find the best match from a list of candidates.
51
+
52
+ ```python
53
+ from ocr_stringdist import WeightedLevenshtein
54
+
55
+ # 1. LEARN costs from your own data
56
+ training_data = [
57
+ ("128", "123"),
58
+ ("567", "567"),
59
+ ]
60
+ wl = WeightedLevenshtein.learn_from(training_data)
61
+
62
+ # The engine has now learned that '8' -> '3' is a low-cost substitution
63
+ print(f"Learned cost for ('8', '3'): {wl.substitution_costs[('8', '3')]:.2f}")
64
+
65
+
66
+ # 2. MATCH new OCR output against a list of candidates
67
+ ocr_output = "Product Code 128"
68
+ candidates = [
69
+ "Product Code 123",
70
+ "Product Code 523", # '5' -> '1' is an unlikely error
71
+ ]
72
+
73
+ distances = wl.batch_distance(ocr_output, candidates)
74
+
75
+ # Find the best match
76
+ min_distance = min(distances)
77
+ best_match = candidates[distances.index(min_distance)]
78
+
79
+ print(f"Best match for '{ocr_output}': '{best_match}' (Cost: {min_distance:.2f})")
80
+ ```
@@ -1,14 +1,17 @@
1
1
  .. _api_reference:
2
2
 
3
- API Reference
4
- =============
3
+ ===============
4
+ API Reference
5
+ ===============
5
6
 
6
7
  .. autoclass:: ocr_stringdist.WeightedLevenshtein
7
8
  :members:
8
9
 
9
- .. autofunction:: ocr_stringdist.weighted_levenshtein_distance
10
- .. autofunction:: ocr_stringdist.batch_weighted_levenshtein_distance
11
- .. autofunction:: ocr_stringdist.explain_weighted_levenshtein
10
+ .. autoclass:: ocr_stringdist.learner.CostLearner
11
+ :members:
12
+
13
+ .. autoclass:: ocr_stringdist.edit_operation.EditOperation
14
+ :members:
12
15
 
13
16
  .. automodule:: ocr_stringdist.matching
14
17
  :members:
@@ -0,0 +1,62 @@
1
+ =====================
2
+ Cost Learning Model
3
+ =====================
4
+
5
+ The ``CostLearner`` class calculates edit costs using a probabilistic model. The cost of an edit operation is defined by its **surprisal**: a measure of how unlikely that event is based on the training data. This value, derived from the negative log-likelihood :math:`-\log(P(e))`, quantifies the information contained in observing an event :math:`e`.
6
+
7
+ A common, high-probability error will have low surprisal and thus a low cost. A rare, low-probability error will have high surprisal and a high cost.
8
+
9
+ -------------------
10
+ Probabilistic Model
11
+ -------------------
12
+
13
+ The model estimates the probability of edit operations and transforms them into normalized, comparable costs. The smoothing parameter :math:`k` (set via ``with_smoothing()``) allows for a continuous transition between a maximum likelihood estimate and a smoothed Bayesian model.
14
+
15
+ General Notation
16
+ ~~~~~~~~~~~~~~~~
17
+
18
+ - :math:`c(e)`: The observed count of a specific event :math:`e`. For example, :math:`c(s \to t)` is the count of source character :math:`s` being substituted by target character :math:`t`.
19
+ - :math:`C(x)`: The total count of a specific context character :math:`x`. For example, :math:`C(s)` is the total number of times the source character :math:`s` appeared in the OCR outputs.
20
+ - :math:`V`: The total number of unique characters in the vocabulary.
21
+
22
+ Probability of an Edit Operation
23
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24
+
25
+ The model treats all edit operations within the same probabilistic framework. An insertion is modeled as a substitution from a ground-truth character to an "empty" character, and a deletion is a substitution from an OCR character to an empty character.
26
+
27
+ This means that for any given character (either from the source or the target), there are :math:`V+1` possible outcomes: a transformation into any of the :math:`V` vocabulary characters or a transformation into an empty character.
28
+
29
+ The smoothed conditional probability for any edit event :math:`e` given a context character :math:`x` (where :math:`x` is a source character for substitutions/deletions or a target character for insertions) is:
30
+
31
+ .. math:: P(e|x) = \frac{c(e) + k}{C(x) + k \cdot (V+1)}
32
+
33
+
34
+ Bayesian Interpretation
35
+ ~~~~~~~~~~~~~~~~~~~~~~~
36
+
37
+ When :math:`k > 0`, the parameter acts as the concentration parameter of a **symmetric Dirichlet prior distribution**. This represents a prior belief that every possible error is equally likely and has a "pseudo-count" of :math:`k`.
38
+
39
+ Normalization
40
+ ~~~~~~~~~~~~~
41
+
42
+ The costs are normalized by a ceiling :math:`Z` that depends on the size of the unified outcome space. It is the a priori surprisal of any single event, assuming a uniform probability distribution over all :math:`V+1` possible outcomes.
43
+
44
+ .. math:: Z = -\log(\frac{1}{V+1}) = \log(V+1)
45
+
46
+ This normalization contextualizes the cost relative to the complexity of the character set.
47
+
48
+ Final Cost
49
+ ~~~~~~~~~~
50
+
51
+ The final cost :math:`w(e)` is the base surprisal scaled by the normalization ceiling:
52
+
53
+ .. math:: w(e) = \frac{-\log(P(e|x))}{Z}
54
+
55
+ This cost is a relative measure. Costs can be greater than 1.0, which indicates the observed event was less probable than the uniform a priori assumption.
56
+
57
+ Asymptotic Properties
58
+ ~~~~~~~~~~~~~~~~~~~~~
59
+
60
+ As the amount of training data grows, the learned cost for an operation with a stable frequency ("share") converges to a fixed value - independent of :math:`k`:
61
+
62
+ .. math:: w(e) \approx \frac{-\log(\text{share})}{\log(V+1)}