ocr-stringdist 0.2.2__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/.github/workflows/CI.yml +44 -28
- ocr_stringdist-1.0.0/CHANGELOG.md +61 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/Cargo.lock +1 -1
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/Cargo.toml +2 -2
- ocr_stringdist-1.0.0/Justfile +41 -0
- ocr_stringdist-1.0.0/PKG-INFO +94 -0
- ocr_stringdist-1.0.0/README.md +80 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/docs/source/api/index.rst +8 -5
- ocr_stringdist-1.0.0/docs/source/cost_learning_model.rst +62 -0
- ocr_stringdist-1.0.0/docs/source/end_to_end_example.rst +127 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/docs/source/examples.rst +24 -42
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/docs/source/index.rst +10 -8
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/examples/explain_distance.py +2 -3
- ocr_stringdist-1.0.0/examples/learn_costs.py +24 -0
- ocr_stringdist-1.0.0/examples/weighted_levenshtein.py +29 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/pyproject.toml +2 -2
- ocr_stringdist-1.0.0/python/ocr_stringdist/__init__.py +11 -0
- ocr_stringdist-1.0.0/python/ocr_stringdist/edit_operation.py +16 -0
- ocr_stringdist-1.0.0/python/ocr_stringdist/learner.py +254 -0
- ocr_stringdist-1.0.0/python/ocr_stringdist/levenshtein.py +215 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/python/ocr_stringdist/matching.py +6 -6
- ocr_stringdist-1.0.0/python/ocr_stringdist/protocols.py +9 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0/python}/tests/test_batch_weighted_levenshtein.py +14 -19
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0/python}/tests/test_explain_weighted_levenshtein.py +28 -2
- ocr_stringdist-1.0.0/python/tests/test_learner.py +288 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0/python}/tests/test_matching.py +6 -9
- ocr_stringdist-1.0.0/python/tests/test_protocols.py +6 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0/python}/tests/test_weighted_levenshtein.py +48 -52
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/src/explanation.rs +4 -0
- ocr_stringdist-1.0.0/src/rust_stringdist.rs +538 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/src/weighted_levenshtein.rs +20 -2
- ocr_stringdist-0.2.2/CHANGELOG.md +0 -35
- ocr_stringdist-0.2.2/Justfile +0 -21
- ocr_stringdist-0.2.2/PKG-INFO +0 -102
- ocr_stringdist-0.2.2/README.md +0 -89
- ocr_stringdist-0.2.2/examples/weighted_levenshtein.py +0 -48
- ocr_stringdist-0.2.2/python/ocr_stringdist/__init__.py +0 -17
- ocr_stringdist-0.2.2/python/ocr_stringdist/levenshtein.py +0 -242
- ocr_stringdist-0.2.2/src/rust_stringdist.rs +0 -235
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/.github/workflows/docs.yml +0 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/.gitignore +0 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/LICENSE +0 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/docs/Makefile +0 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/docs/make.bat +0 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/docs/source/changelog.rst +0 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/docs/source/conf.py +0 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/docs/source/getting-started.rst +0 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/examples/batch_processing.py +0 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/mypy.ini +0 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/python/ocr_stringdist/default_ocr_distances.py +0 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/python/ocr_stringdist/py.typed +0 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/ruff.toml +0 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/src/cost_map.rs +0 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/src/lib.rs +0 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/src/types.rs +0 -0
- {ocr_stringdist-0.2.2 → ocr_stringdist-1.0.0}/uv.lock +0 -0
|
@@ -13,10 +13,12 @@ permissions:
|
|
|
13
13
|
|
|
14
14
|
jobs:
|
|
15
15
|
lint_and_test:
|
|
16
|
+
name: Lint & Test
|
|
16
17
|
runs-on: ubuntu-latest
|
|
17
18
|
strategy:
|
|
19
|
+
fail-fast: false
|
|
18
20
|
matrix:
|
|
19
|
-
python-version: ["3.9", "3.13"
|
|
21
|
+
python-version: ["3.9", "3.13"]
|
|
20
22
|
steps:
|
|
21
23
|
- uses: actions/checkout@v4
|
|
22
24
|
with:
|
|
@@ -24,17 +26,8 @@ jobs:
|
|
|
24
26
|
- uses: actions/setup-python@v5
|
|
25
27
|
with:
|
|
26
28
|
python-version: ${{ matrix.python-version }}
|
|
27
|
-
- name: Build wheels
|
|
28
|
-
uses: PyO3/maturin-action@v1
|
|
29
|
-
with:
|
|
30
|
-
target: ${{ matrix.target }}
|
|
31
|
-
args: --release --out dist -i ${{ matrix.python-version }}
|
|
32
|
-
sccache: "true"
|
|
33
|
-
- name: Install Just
|
|
34
|
-
uses: extractions/setup-just@v2
|
|
35
29
|
- name: Run Cargo Tests
|
|
36
|
-
run:
|
|
37
|
-
cargo test
|
|
30
|
+
run: cargo test --features python -- --nocapture --test-threads=1
|
|
38
31
|
- name: Run pytest
|
|
39
32
|
run: |
|
|
40
33
|
# just venv pytest
|
|
@@ -43,20 +36,21 @@ jobs:
|
|
|
43
36
|
. .venv/bin/activate
|
|
44
37
|
.venv/bin/pip install wheel pytest maturin
|
|
45
38
|
maturin develop
|
|
46
|
-
.venv/bin/pytest
|
|
39
|
+
.venv/bin/pytest python/tests
|
|
47
40
|
|
|
48
41
|
linux:
|
|
42
|
+
name: Build Wheels (Linux)
|
|
49
43
|
runs-on: ubuntu-latest
|
|
50
44
|
needs: lint_and_test
|
|
51
45
|
strategy:
|
|
52
46
|
matrix:
|
|
53
47
|
platform:
|
|
54
48
|
- target: x64
|
|
55
|
-
interpreter: 3.9 3.10 3.11 3.12 3.13
|
|
49
|
+
interpreter: 3.9 3.10 3.11 3.12 3.13
|
|
56
50
|
- target: aarch64
|
|
57
|
-
interpreter: 3.9 3.10 3.11 3.12 3.13
|
|
51
|
+
interpreter: 3.9 3.10 3.11 3.12 3.13
|
|
58
52
|
- target: armv7
|
|
59
|
-
interpreter: 3.9 3.10 3.11 3.12 3.13
|
|
53
|
+
interpreter: 3.9 3.10 3.11 3.12 3.13
|
|
60
54
|
steps:
|
|
61
55
|
- uses: actions/checkout@v4
|
|
62
56
|
with:
|
|
@@ -71,9 +65,11 @@ jobs:
|
|
|
71
65
|
- name: Upload wheels
|
|
72
66
|
uses: actions/upload-artifact@v4
|
|
73
67
|
with:
|
|
74
|
-
name: wheels-linux-${{
|
|
68
|
+
name: wheels-linux-${{ matrix.platform.target }}
|
|
75
69
|
path: dist
|
|
70
|
+
|
|
76
71
|
musllinux:
|
|
72
|
+
name: Build Wheels (musllinux)
|
|
77
73
|
runs-on: ubuntu-latest
|
|
78
74
|
needs: lint_and_test
|
|
79
75
|
strategy:
|
|
@@ -81,13 +77,13 @@ jobs:
|
|
|
81
77
|
platform:
|
|
82
78
|
- target: x86_64-unknown-linux-musl
|
|
83
79
|
arch: x86_64
|
|
84
|
-
interpreter: 3.9 3.10 3.11 3.12 3.13
|
|
80
|
+
interpreter: 3.9 3.10 3.11 3.12 3.13
|
|
85
81
|
- target: i686-unknown-linux-musl
|
|
86
82
|
arch: x86
|
|
87
|
-
interpreter: 3.9 3.10 3.11 3.12 3.13
|
|
83
|
+
interpreter: 3.9 3.10 3.11 3.12 3.13
|
|
88
84
|
- target: aarch64-unknown-linux-musl
|
|
89
85
|
arch: aarch64
|
|
90
|
-
interpreter: 3.9 3.10 3.11 3.12 3.13
|
|
86
|
+
interpreter: 3.9 3.10 3.11 3.12 3.13
|
|
91
87
|
# all values: [x86_64, x86, aarch64, armhf, armv7, ppc64le, riscv64, s390x]
|
|
92
88
|
# { target: "armv7-unknown-linux-musleabihf", image_tag: "armv7" },
|
|
93
89
|
# { target: "powerpc64le-unknown-linux-musl", image_tag: "ppc64le" },
|
|
@@ -107,10 +103,11 @@ jobs:
|
|
|
107
103
|
- name: Upload wheels
|
|
108
104
|
uses: actions/upload-artifact@v4
|
|
109
105
|
with:
|
|
110
|
-
name: wheels-musl-${{
|
|
106
|
+
name: wheels-musl-${{ matrix.platform.arch }}
|
|
111
107
|
path: dist
|
|
112
108
|
|
|
113
109
|
windows:
|
|
110
|
+
name: Build Wheels (Windows)
|
|
114
111
|
runs-on: windows-latest
|
|
115
112
|
needs: lint_and_test
|
|
116
113
|
strategy:
|
|
@@ -124,6 +121,16 @@ jobs:
|
|
|
124
121
|
- uses: actions/setup-python@v5
|
|
125
122
|
with:
|
|
126
123
|
python-version: ${{ matrix.interpreter }}
|
|
124
|
+
architecture: ${{ matrix.target }}
|
|
125
|
+
- name: Ensure pythonXY.lib exists (for PyO3 on Windows)
|
|
126
|
+
shell: pwsh
|
|
127
|
+
run: |
|
|
128
|
+
$py = "${{ matrix.interpreter }}"
|
|
129
|
+
$libPath = "${{ env.pythonLocation }}\\libs\\python$($py.Replace('.', '')).lib"
|
|
130
|
+
if (!(Test-Path $libPath)) {
|
|
131
|
+
Write-Host "pythonXY.lib missing, generating..."
|
|
132
|
+
& "${{ env.pythonLocation }}\\python.exe" -c "import distutils.sysconfig, shutil, sys; libdir = distutils.sysconfig.get_config_var('LIBDIR'); libname = distutils.sysconfig.get_config_var('LDLIBRARY'); src = libdir + '\\\\' + libname; dst = sys.prefix + '\\\\libs\\\\' + libname; shutil.copyfile(src, dst)"
|
|
133
|
+
}
|
|
127
134
|
- name: Build wheels
|
|
128
135
|
uses: PyO3/maturin-action@v1
|
|
129
136
|
with:
|
|
@@ -133,19 +140,20 @@ jobs:
|
|
|
133
140
|
- name: Upload wheels
|
|
134
141
|
uses: actions/upload-artifact@v4
|
|
135
142
|
with:
|
|
143
|
+
name: wheels-win-${{ matrix.target }}-${{ matrix.interpreter }}
|
|
136
144
|
path: dist
|
|
137
|
-
name: wheels-win-${{ strategy.job-index }}
|
|
138
145
|
|
|
139
146
|
macos:
|
|
147
|
+
name: Build Wheels (macOS)
|
|
140
148
|
runs-on: macos-latest
|
|
141
149
|
needs: lint_and_test
|
|
142
150
|
strategy:
|
|
143
151
|
matrix:
|
|
144
152
|
platform:
|
|
145
153
|
- target: x64
|
|
146
|
-
interpreter: 3.9 3.10 3.11 3.12 3.13
|
|
154
|
+
interpreter: 3.9 3.10 3.11 3.12 3.13
|
|
147
155
|
- target: aarch64
|
|
148
|
-
interpreter: 3.9 3.10 3.11 3.12 3.13
|
|
156
|
+
interpreter: 3.9 3.10 3.11 3.12 3.13
|
|
149
157
|
steps:
|
|
150
158
|
- uses: actions/checkout@v4
|
|
151
159
|
with:
|
|
@@ -159,10 +167,11 @@ jobs:
|
|
|
159
167
|
- name: Upload wheels
|
|
160
168
|
uses: actions/upload-artifact@v4
|
|
161
169
|
with:
|
|
162
|
-
name: wheels-mac-${{
|
|
170
|
+
name: wheels-mac-${{ matrix.platform.target }}
|
|
163
171
|
path: dist
|
|
164
172
|
|
|
165
173
|
sdist:
|
|
174
|
+
name: Build Source Distribution
|
|
166
175
|
runs-on: ubuntu-latest
|
|
167
176
|
needs: lint_and_test
|
|
168
177
|
steps:
|
|
@@ -177,23 +186,30 @@ jobs:
|
|
|
177
186
|
- name: Upload sdist
|
|
178
187
|
uses: actions/upload-artifact@v4
|
|
179
188
|
with:
|
|
180
|
-
name: wheels-sdist
|
|
189
|
+
name: wheels-sdist
|
|
181
190
|
path: dist
|
|
182
191
|
|
|
183
192
|
release:
|
|
184
|
-
name: Release
|
|
193
|
+
name: Release to PyPI
|
|
185
194
|
runs-on: ubuntu-latest
|
|
186
195
|
if: "startsWith(github.ref, 'refs/tags/')"
|
|
187
196
|
needs: [linux, windows, macos, sdist, musllinux]
|
|
188
197
|
steps:
|
|
189
|
-
-
|
|
198
|
+
- name: Download all wheels and sdist
|
|
199
|
+
uses: actions/download-artifact@v4
|
|
190
200
|
with:
|
|
191
201
|
pattern: wheels-*
|
|
192
202
|
merge-multiple: true
|
|
203
|
+
- name: Move packages to dist directory
|
|
204
|
+
run: |
|
|
205
|
+
mkdir -p dist
|
|
206
|
+
mv *.whl *.tar.gz dist/
|
|
207
|
+
echo "Moved packages to dist/:"
|
|
208
|
+
ls -l dist
|
|
193
209
|
- name: Publish to PyPI
|
|
194
210
|
uses: PyO3/maturin-action@v1
|
|
195
211
|
env:
|
|
196
212
|
MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
|
|
197
213
|
with:
|
|
198
214
|
command: upload
|
|
199
|
-
args: --skip-existing
|
|
215
|
+
args: --skip-existing dist/*
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [1.0.0] - 2025-09-20
|
|
9
|
+
|
|
10
|
+
### Changed
|
|
11
|
+
|
|
12
|
+
- Rename "Learner" to "CostLearner".
|
|
13
|
+
- Rework and fix the cost learning algorithm.
|
|
14
|
+
- Remove `with_cost_function` from `CostLearner`.
|
|
15
|
+
- Remove the functional interface in favour of `WeightedLevenshtein` class.
|
|
16
|
+
|
|
17
|
+
### Added
|
|
18
|
+
|
|
19
|
+
- Add `calculate_for_unseen` parameter to `CostLearner.fit()`.
|
|
20
|
+
- Add input validation in `WeightedLevenshtein.__init__`.
|
|
21
|
+
- Add `to_dict` and `from_dict` methods to `WeightedLevenshtein`.
|
|
22
|
+
|
|
23
|
+
## [0.3.0] - 2025-09-14
|
|
24
|
+
|
|
25
|
+
### Added
|
|
26
|
+
|
|
27
|
+
- Add the option to include the matched characters in the `explain` method via the `filter_matches` parameter.
|
|
28
|
+
- Add the option to learn the costs from a dataset of pairs (OCR result, ground truth) via the `WeightedLevenshtein.learn_from` method and the `Learner` class.
|
|
29
|
+
|
|
30
|
+
### Changed
|
|
31
|
+
|
|
32
|
+
- Drop support for PyPy due to issues with PyO3.
|
|
33
|
+
|
|
34
|
+
## [0.2.2] - 2025-09-01
|
|
35
|
+
|
|
36
|
+
### Changed
|
|
37
|
+
|
|
38
|
+
- Improve documentation.
|
|
39
|
+
|
|
40
|
+
## [0.2.1] - 2025-08-31
|
|
41
|
+
|
|
42
|
+
### Fixed
|
|
43
|
+
|
|
44
|
+
- Documentation for PyPI
|
|
45
|
+
|
|
46
|
+
## [0.2.0] - 2025-08-31
|
|
47
|
+
|
|
48
|
+
### Added
|
|
49
|
+
|
|
50
|
+
- `WeightedLevenshtein` class for reusable configuration.
|
|
51
|
+
- Explanation of edit operations via `WeightedLevenshtein.explain` and `explain_weighted_levenshtein`.
|
|
52
|
+
|
|
53
|
+
## [0.1.0] - 2025-04-26
|
|
54
|
+
|
|
55
|
+
### Added
|
|
56
|
+
|
|
57
|
+
- Custom insertion and deletion costs for weighted Levenshtein distance.
|
|
58
|
+
|
|
59
|
+
### Changed
|
|
60
|
+
|
|
61
|
+
- Breaking changes to Levenshtein distance functions signatures.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "ocr_stringdist"
|
|
3
|
-
version = "0.2.2"
|
|
3
|
+
version = "1.0.0"
|
|
4
4
|
edition = "2021"
|
|
5
5
|
description = "String distances considering OCR errors."
|
|
6
6
|
authors = ["Niklas von Moers <niklasvmoers@protonmail.com>"]
|
|
@@ -14,7 +14,7 @@ name = "ocr_stringdist"
|
|
|
14
14
|
crate-type = ["cdylib"]
|
|
15
15
|
|
|
16
16
|
[dependencies]
|
|
17
|
-
pyo3 = { version = "0.24.0", features = [] }
|
|
17
|
+
pyo3 = { version = "0.24.0", features = ["auto-initialize"] }
|
|
18
18
|
rayon = "1.10.0"
|
|
19
19
|
|
|
20
20
|
[features]
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
venv:
|
|
2
|
+
rm -rf .venv
|
|
3
|
+
uv venv
|
|
4
|
+
uv sync --all-groups
|
|
5
|
+
|
|
6
|
+
pytest:
|
|
7
|
+
uv run maturin develop
|
|
8
|
+
uv run pytest --cov=python/ocr_stringdist python/tests
|
|
9
|
+
|
|
10
|
+
test:
|
|
11
|
+
cargo llvm-cov --features python
|
|
12
|
+
#cargo test --features python
|
|
13
|
+
|
|
14
|
+
mypy:
|
|
15
|
+
uv run mypy .
|
|
16
|
+
|
|
17
|
+
lint:
|
|
18
|
+
uv run ruff check . --fix
|
|
19
|
+
|
|
20
|
+
doc:
|
|
21
|
+
uv run make -C docs html
|
|
22
|
+
|
|
23
|
+
# Usage: just release v1.0.0
|
|
24
|
+
# Make sure to update the version in Cargo.toml first.
|
|
25
|
+
release version:
|
|
26
|
+
# Fail if the current branch is not 'main'
|
|
27
|
+
@if [ "$(git symbolic-ref --short HEAD)" != "main" ]; then \
|
|
28
|
+
echo "Error: Must be on 'main' branch to release."; \
|
|
29
|
+
exit 1; \
|
|
30
|
+
fi
|
|
31
|
+
|
|
32
|
+
# Fail if the working directory is not clean
|
|
33
|
+
@if ! git diff --quiet --exit-code; then \
|
|
34
|
+
echo "Error: Working directory is not clean. Commit or stash changes before releasing."; \
|
|
35
|
+
exit 1; \
|
|
36
|
+
fi
|
|
37
|
+
|
|
38
|
+
git tag -a {{version}} -m "Release version {{version}}"
|
|
39
|
+
git push origin {{version}}
|
|
40
|
+
|
|
41
|
+
@echo "Successfully tagged and pushed version {{version}}"
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ocr-stringdist
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Classifier: Programming Language :: Rust
|
|
5
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
6
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
7
|
+
Classifier: Operating System :: OS Independent
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Python: >=3.9
|
|
10
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
11
|
+
Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
|
|
12
|
+
Project-URL: documentation, https://niklasvonm.github.io/ocr-stringdist/
|
|
13
|
+
|
|
14
|
+
# OCR-StringDist
|
|
15
|
+
|
|
16
|
+
A Python library to learn, model, explain and correct OCR errors using a fast string distance engine.
|
|
17
|
+
|
|
18
|
+
Documentation: https://niklasvonm.github.io/ocr-stringdist/
|
|
19
|
+
|
|
20
|
+
[PyPI version](https://badge.fury.io/py/ocr-stringdist)
|
|
21
|
+
[License: MIT](LICENSE)
|
|
22
|
+
|
|
23
|
+
## Overview
|
|
24
|
+
|
|
25
|
+
Standard string distances (like Levenshtein) treat all character substitutions equally. This is suboptimal for text read from images via OCR, where errors like `O` vs `0` are far more common than, say, `O` vs `X`.
|
|
26
|
+
|
|
27
|
+
OCR-StringDist provides a learnable **weighted Levenshtein distance**, implementing part of the **Noisy Channel model**.
|
|
28
|
+
|
|
29
|
+
**Example:** Matching against the correct word `CODE`:
|
|
30
|
+
|
|
31
|
+
* **Standard Levenshtein:**
|
|
32
|
+
* $d(\text{"CODE"}, \text{"C0DE"}) = 1$ (O → 0)
|
|
33
|
+
* $d(\text{"CODE"}, \text{"CXDE"}) = 1$ (O → X)
|
|
34
|
+
* Result: Both appear equally likely/distant.
|
|
35
|
+
|
|
36
|
+
* **OCR-StringDist (Channel Model):**
|
|
37
|
+
* $d(\text{"CODE"}, \text{"C0DE"}) \approx 0.1$ (common error, low cost)
|
|
38
|
+
* $d(\text{"CODE"}, \text{"CXDE"}) = 1.0$ (unlikely error, high cost)
|
|
39
|
+
* Result: Correctly identifies `C0DE` as a much closer match.
|
|
40
|
+
|
|
41
|
+
This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes). By combining this *channel model* with a *source model* (e.g., product code frequencies), you can build a complete and robust OCR correction system.
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install ocr-stringdist
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Features
|
|
50
|
+
|
|
51
|
+
- **Learnable Costs**: Automatically learn substitution, insertion, and deletion costs from a dataset of (OCR string, ground truth string) pairs.
|
|
52
|
+
- **Weighted Levenshtein Distance**: Models OCR error patterns by assigning custom costs to specific edit operations.
|
|
53
|
+
- **High Performance**: Core logic in Rust and a batch_distance function for efficiently comparing one string against thousands of candidates.
|
|
54
|
+
- **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
|
|
55
|
+
- **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
|
|
56
|
+
- **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
|
|
57
|
+
- **Full Unicode Support**: Works with arbitrary Unicode strings.
|
|
58
|
+
|
|
59
|
+
## Core Workflow
|
|
60
|
+
|
|
61
|
+
The typical workflow involves
|
|
62
|
+
- learning costs from your data and then
|
|
63
|
+
- using the resulting model to find the best match from a list of candidates.
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from ocr_stringdist import WeightedLevenshtein
|
|
67
|
+
|
|
68
|
+
# 1. LEARN costs from your own data
|
|
69
|
+
training_data = [
|
|
70
|
+
("128", "123"),
|
|
71
|
+
("567", "567"),
|
|
72
|
+
]
|
|
73
|
+
wl = WeightedLevenshtein.learn_from(training_data)
|
|
74
|
+
|
|
75
|
+
# The engine has now learned that '8' -> '3' is a low-cost substitution
|
|
76
|
+
print(f"Learned cost for ('8', '3'): {wl.substitution_costs[('8', '3')]:.2f}")
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# 2. MATCH new OCR output against a list of candidates
|
|
80
|
+
ocr_output = "Product Code 128"
|
|
81
|
+
candidates = [
|
|
82
|
+
"Product Code 123",
|
|
83
|
+
"Product Code 523", # '5' -> '1' is an unlikely error
|
|
84
|
+
]
|
|
85
|
+
|
|
86
|
+
distances = wl.batch_distance(ocr_output, candidates)
|
|
87
|
+
|
|
88
|
+
# Find the best match
|
|
89
|
+
min_distance = min(distances)
|
|
90
|
+
best_match = candidates[distances.index(min_distance)]
|
|
91
|
+
|
|
92
|
+
print(f"Best match for '{ocr_output}': '{best_match}' (Cost: {min_distance:.2f})")
|
|
93
|
+
```
|
|
94
|
+
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# OCR-StringDist
|
|
2
|
+
|
|
3
|
+
A Python library to learn, model, explain and correct OCR errors using a fast string distance engine.
|
|
4
|
+
|
|
5
|
+
Documentation: https://niklasvonm.github.io/ocr-stringdist/
|
|
6
|
+
|
|
7
|
+
[PyPI version](https://badge.fury.io/py/ocr-stringdist)
|
|
8
|
+
[License: MIT](LICENSE)
|
|
9
|
+
|
|
10
|
+
## Overview
|
|
11
|
+
|
|
12
|
+
Standard string distances (like Levenshtein) treat all character substitutions equally. This is suboptimal for text read from images via OCR, where errors like `O` vs `0` are far more common than, say, `O` vs `X`.
|
|
13
|
+
|
|
14
|
+
OCR-StringDist provides a learnable **weighted Levenshtein distance**, implementing part of the **Noisy Channel model**.
|
|
15
|
+
|
|
16
|
+
**Example:** Matching against the correct word `CODE`:
|
|
17
|
+
|
|
18
|
+
* **Standard Levenshtein:**
|
|
19
|
+
* $d(\text{"CODE"}, \text{"C0DE"}) = 1$ (O → 0)
|
|
20
|
+
* $d(\text{"CODE"}, \text{"CXDE"}) = 1$ (O → X)
|
|
21
|
+
* Result: Both appear equally likely/distant.
|
|
22
|
+
|
|
23
|
+
* **OCR-StringDist (Channel Model):**
|
|
24
|
+
* $d(\text{"CODE"}, \text{"C0DE"}) \approx 0.1$ (common error, low cost)
|
|
25
|
+
* $d(\text{"CODE"}, \text{"CXDE"}) = 1.0$ (unlikely error, high cost)
|
|
26
|
+
* Result: Correctly identifies `C0DE` as a much closer match.
|
|
27
|
+
|
|
28
|
+
This makes it ideal for matching potentially incorrect OCR output against known values (e.g., product codes). By combining this *channel model* with a *source model* (e.g., product code frequencies), you can build a complete and robust OCR correction system.
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install ocr-stringdist
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Features
|
|
37
|
+
|
|
38
|
+
- **Learnable Costs**: Automatically learn substitution, insertion, and deletion costs from a dataset of (OCR string, ground truth string) pairs.
|
|
39
|
+
- **Weighted Levenshtein Distance**: Models OCR error patterns by assigning custom costs to specific edit operations.
|
|
40
|
+
- **High Performance**: Core logic in Rust and a batch_distance function for efficiently comparing one string against thousands of candidates.
|
|
41
|
+
- **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
|
|
42
|
+
- **Explainable Edit Path**: Returns the optimal sequence of edit operations (substitutions, insertions, and deletions) used to transform one string into another.
|
|
43
|
+
- **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
|
|
44
|
+
- **Full Unicode Support**: Works with arbitrary Unicode strings.
|
|
45
|
+
|
|
46
|
+
## Core Workflow
|
|
47
|
+
|
|
48
|
+
The typical workflow involves
|
|
49
|
+
- learning costs from your data and then
|
|
50
|
+
- using the resulting model to find the best match from a list of candidates.
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
from ocr_stringdist import WeightedLevenshtein
|
|
54
|
+
|
|
55
|
+
# 1. LEARN costs from your own data
|
|
56
|
+
training_data = [
|
|
57
|
+
("128", "123"),
|
|
58
|
+
("567", "567"),
|
|
59
|
+
]
|
|
60
|
+
wl = WeightedLevenshtein.learn_from(training_data)
|
|
61
|
+
|
|
62
|
+
# The engine has now learned that '8' -> '3' is a low-cost substitution
|
|
63
|
+
print(f"Learned cost for ('8', '3'): {wl.substitution_costs[('8', '3')]:.2f}")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# 2. MATCH new OCR output against a list of candidates
|
|
67
|
+
ocr_output = "Product Code 128"
|
|
68
|
+
candidates = [
|
|
69
|
+
"Product Code 123",
|
|
70
|
+
"Product Code 523", # '5' -> '1' is an unlikely error
|
|
71
|
+
]
|
|
72
|
+
|
|
73
|
+
distances = wl.batch_distance(ocr_output, candidates)
|
|
74
|
+
|
|
75
|
+
# Find the best match
|
|
76
|
+
min_distance = min(distances)
|
|
77
|
+
best_match = candidates[distances.index(min_distance)]
|
|
78
|
+
|
|
79
|
+
print(f"Best match for '{ocr_output}': '{best_match}' (Cost: {min_distance:.2f})")
|
|
80
|
+
```
|
|
@@ -1,14 +1,17 @@
|
|
|
1
1
|
.. _api_reference:
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
3
|
+
===============
|
|
4
|
+
API Reference
|
|
5
|
+
===============
|
|
5
6
|
|
|
6
7
|
.. autoclass:: ocr_stringdist.WeightedLevenshtein
|
|
7
8
|
:members:
|
|
8
9
|
|
|
9
|
-
..
|
|
10
|
-
|
|
11
|
-
|
|
10
|
+
.. autoclass:: ocr_stringdist.learner.CostLearner
|
|
11
|
+
:members:
|
|
12
|
+
|
|
13
|
+
.. autoclass:: ocr_stringdist.edit_operation.EditOperation
|
|
14
|
+
:members:
|
|
12
15
|
|
|
13
16
|
.. automodule:: ocr_stringdist.matching
|
|
14
17
|
:members:
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
=====================
|
|
2
|
+
Cost Learning Model
|
|
3
|
+
=====================
|
|
4
|
+
|
|
5
|
+
The ``CostLearner`` class calculates edit costs using a probabilistic model. The cost of an edit operation is defined by its **surprisal**: a measure of how unlikely that event is based on the training data. This value, derived from the negative log-likelihood :math:`-\log(P(e))`, quantifies the information contained in observing an event :math:`e`.
|
|
6
|
+
|
|
7
|
+
A common, high-probability error will have low surprisal and thus a low cost. A rare, low-probability error will have high surprisal and a high cost.
|
|
8
|
+
|
|
9
|
+
-------------------
|
|
10
|
+
Probabilistic Model
|
|
11
|
+
-------------------
|
|
12
|
+
|
|
13
|
+
The model estimates the probability of edit operations and transforms them into normalized, comparable costs. The smoothing parameter :math:`k` (set via ``with_smoothing()``) allows for a continuous transition between a Maximum Likelihood Estimation and a smoothed Bayesian model.
|
|
14
|
+
|
|
15
|
+
General Notation
|
|
16
|
+
~~~~~~~~~~~~~~~~
|
|
17
|
+
|
|
18
|
+
- :math:`c(e)`: The observed count of a specific event :math:`e`. For example, :math:`c(s \to t)` is the count of source character :math:`s` being substituted by target character :math:`t`.
|
|
19
|
+
- :math:`C(x)`: The total count of a specific context character :math:`x`. For example, :math:`C(s)` is the total number of times the source character :math:`s` appeared in the OCR outputs.
|
|
20
|
+
- :math:`V`: The total number of unique characters in the vocabulary.
|
|
21
|
+
|
|
22
|
+
Probability of an Edit Operation
|
|
23
|
+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
24
|
+
|
|
25
|
+
The model treats all edit operations within the same probabilistic framework. An insertion is modeled as a substitution from a ground-truth character to an "empty" character, and a deletion is a substitution from an OCR character to an empty character.
|
|
26
|
+
|
|
27
|
+
This means that for any given character (either from the source or the target), there are :math:`V+1` possible outcomes: a transformation into any of the :math:`V` vocabulary characters or a transformation into an empty character.
|
|
28
|
+
|
|
29
|
+
The smoothed conditional probability for any edit event :math:`e` given a context character :math:`x` (where :math:`x` is a source character for substitutions/deletions or a target character for insertions) is:
|
|
30
|
+
|
|
31
|
+
.. math:: P(e|x) = \frac{c(e) + k}{C(x) + k \cdot (V+1)}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
Bayesian Interpretation
|
|
35
|
+
~~~~~~~~~~~~~~~~~~~~~~~
|
|
36
|
+
|
|
37
|
+
When :math:`k > 0`, the parameter acts as the concentration parameter of a **symmetric Dirichlet prior distribution**. This represents a prior belief that every possible error is equally likely and has a "pseudo-count" of :math:`k`.
|
|
38
|
+
|
|
39
|
+
Normalization
|
|
40
|
+
~~~~~~~~~~~~~
|
|
41
|
+
|
|
42
|
+
The costs are normalized by a ceiling :math:`Z` that depends on the size of the unified outcome space. It is the a priori surprisal of any single event, assuming a uniform probability distribution over all :math:`V+1` possible outcomes.
|
|
43
|
+
|
|
44
|
+
.. math:: Z = -\log(\frac{1}{V+1}) = \log(V+1)
|
|
45
|
+
|
|
46
|
+
This normalization contextualizes the cost relative to the complexity of the character set.
|
|
47
|
+
|
|
48
|
+
Final Cost
|
|
49
|
+
~~~~~~~~~~
|
|
50
|
+
|
|
51
|
+
The final cost :math:`w(e)` is the base surprisal scaled by the normalization ceiling:
|
|
52
|
+
|
|
53
|
+
.. math:: w(e) = \frac{-\log(P(e|x))}{Z}
|
|
54
|
+
|
|
55
|
+
This cost is a relative measure. Costs can be greater than 1.0, which indicates the observed event was less probable than the uniform a priori assumption.
|
|
56
|
+
|
|
57
|
+
Asymptotic Properties
|
|
58
|
+
~~~~~~~~~~~~~~~~~~~~~
|
|
59
|
+
|
|
60
|
+
As the amount of training data grows, the learned cost for an operation with a stable frequency ("share") converges to a fixed value - independent of :math:`k`:
|
|
61
|
+
|
|
62
|
+
.. math:: w(e) \approx \frac{-\log(\text{share})}{\log(V+1)}
|