ocr-stringdist 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,200 @@
1
+ # This file has been edited manually.
2
+ # The original was autogenerated by maturin v0.14.15
3
+ on:
4
+ push:
5
+ branches:
6
+ - "main"
7
+ tags:
8
+ - "*"
9
+ pull_request:
10
+ workflow_dispatch:
11
+
12
+ permissions:
13
+ contents: read
14
+
15
+ jobs:
16
+ lint_and_test:
17
+ runs-on: ubuntu-latest
18
+ strategy:
19
+ matrix:
20
+ python-version: ["3.9", "3.13", "pypy3.11"]
21
+ steps:
22
+ - uses: actions/checkout@v4
23
+ with:
24
+ submodules: recursive
25
+ - uses: actions/setup-python@v5
26
+ with:
27
+ python-version: ${{ matrix.python-version }}
28
+ - name: Build wheels
29
+ uses: PyO3/maturin-action@v1
30
+ with:
31
+ target: ${{ matrix.target }}
32
+ args: --release --out dist -i ${{ matrix.python-version }}
33
+ sccache: "true"
34
+ - name: Install Just
35
+ uses: extractions/setup-just@v2
36
+ - name: Run Cargo Tests
37
+ run: |
38
+ cargo test
39
+ - name: Run pytest
40
+ run: |
41
+ # just venv pytest
42
+ rm -rf .venv
43
+ python3 -m venv .venv
44
+ . .venv/bin/activate
45
+ .venv/bin/pip install wheel pytest maturin
46
+ maturin develop
47
+ .venv/bin/pytest
48
+
49
+ linux:
50
+ runs-on: ubuntu-latest
51
+ needs: lint_and_test
52
+ strategy:
53
+ matrix:
54
+ platform:
55
+ - target: x64
56
+ interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
57
+ - target: aarch64
58
+ interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
59
+ - target: armv7
60
+ interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
61
+ steps:
62
+ - uses: actions/checkout@v4
63
+ with:
64
+ submodules: recursive
65
+ - name: Build wheels
66
+ uses: PyO3/maturin-action@v1
67
+ with:
68
+ target: ${{ matrix.platform.target }}
69
+ args: --release --out dist -i ${{ matrix.platform.interpreter }}
70
+ sccache: "true"
71
+ manylinux: auto
72
+ - name: Upload wheels
73
+ uses: actions/upload-artifact@v4
74
+ with:
75
+ name: wheels-linux-${{ strategy.job-index }}
76
+ path: dist
77
+ musllinux:
78
+ runs-on: ubuntu-latest
79
+ needs: lint_and_test
80
+ strategy:
81
+ matrix:
82
+ platform:
83
+ - target: x86_64-unknown-linux-musl
84
+ arch: x86_64
85
+ interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
86
+ - target: i686-unknown-linux-musl
87
+ arch: x86
88
+ interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
89
+ - target: aarch64-unknown-linux-musl
90
+ arch: aarch64
91
+ interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
92
+ # all values: [x86_64, x86, aarch64, armhf, armv7, ppc64le, riscv64, s390x]
93
+ # { target: "armv7-unknown-linux-musleabihf", image_tag: "armv7" },
94
+ # { target: "powerpc64le-unknown-linux-musl", image_tag: "ppc64le" },
95
+ steps:
96
+ - uses: actions/checkout@v4
97
+ with:
98
+ submodules: recursive
99
+ - name: Setup QEMU
100
+ uses: docker/setup-qemu-action@v3
101
+ - name: Build wheels
102
+ uses: PyO3/maturin-action@v1
103
+ with:
104
+ target: ${{ matrix.platform.target }}
105
+ args: --release --out dist -i ${{ matrix.platform.interpreter }}
106
+ sccache: "true"
107
+ manylinux: musllinux_1_1
108
+ - name: Upload wheels
109
+ uses: actions/upload-artifact@v4
110
+ with:
111
+ name: wheels-musl-${{ strategy.job-index }}
112
+ path: dist
113
+
114
+ windows:
115
+ runs-on: windows-latest
116
+ needs: lint_and_test
117
+ strategy:
118
+ matrix:
119
+ target: [x64, x86]
120
+ interpreter: ["3.9", "3.10", "3.11", "3.12", "3.13"]
121
+ steps:
122
+ - uses: actions/checkout@v4
123
+ with:
124
+ submodules: recursive
125
+ - uses: actions/setup-python@v5
126
+ with:
127
+ python-version: ${{ matrix.interpreter }}
128
+ - name: Build wheels
129
+ uses: PyO3/maturin-action@v1
130
+ with:
131
+ target: ${{ matrix.target }}
132
+ args: --release --out dist -i ${{ matrix.interpreter }}
133
+ sccache: "true"
134
+ - name: Upload wheels
135
+ uses: actions/upload-artifact@v4
136
+ with:
137
+ path: dist
138
+ name: wheels-win-${{ strategy.job-index }}
139
+
140
+ macos:
141
+ runs-on: macos-latest
142
+ needs: lint_and_test
143
+ strategy:
144
+ matrix:
145
+ platform:
146
+ - target: x64
147
+ interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
148
+ - target: aarch64
149
+ interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
150
+ steps:
151
+ - uses: actions/checkout@v4
152
+ with:
153
+ submodules: recursive
154
+ - name: Build wheels
155
+ uses: PyO3/maturin-action@v1
156
+ with:
157
+ target: ${{ matrix.platform.target }}
158
+ args: --release --out dist -i ${{ matrix.platform.interpreter }}
159
+ sccache: "true"
160
+ - name: Upload wheels
161
+ uses: actions/upload-artifact@v4
162
+ with:
163
+ name: wheels-mac-${{ strategy.job-index }}
164
+ path: dist
165
+
166
+ sdist:
167
+ runs-on: ubuntu-latest
168
+ needs: lint_and_test
169
+ steps:
170
+ - uses: actions/checkout@v4
171
+ with:
172
+ submodules: recursive
173
+ - name: Build sdist
174
+ uses: PyO3/maturin-action@v1
175
+ with:
176
+ command: sdist
177
+ args: --out dist
178
+ - name: Upload sdist
179
+ uses: actions/upload-artifact@v4
180
+ with:
181
+ name: wheels-sdist-${{ strategy.job-index }}
182
+ path: dist
183
+
184
+ release:
185
+ name: Release
186
+ runs-on: ubuntu-latest
187
+ if: "startsWith(github.ref, 'refs/tags/')"
188
+ needs: [linux, windows, macos, sdist, musllinux]
189
+ steps:
190
+ - uses: actions/download-artifact@v4
191
+ with:
192
+ pattern: wheels-*
193
+ merge-multiple: true
194
+ - name: Publish to PyPI
195
+ uses: PyO3/maturin-action@v1
196
+ env:
197
+ MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
198
+ with:
199
+ command: upload
200
+ args: --skip-existing *
@@ -0,0 +1,180 @@
1
+ build/
2
+ dist/
3
+ *.so
4
+ *.swp
5
+ *.pyc
6
+ *.DS_Store
7
+ *~
8
+ .tox/
9
+ .coverage
10
+ htmlcov/
11
+ .ropeproject/
12
+ _build/
13
+ .ipynb_checkpoints/
14
+ .cache
15
+ wheelhouse/
16
+ site/
17
+ target/
18
+ Cargo.lock
19
+ .venv
20
+
21
+ # Byte-compiled / optimized / DLL files
22
+ __pycache__/
23
+ *.py[cod]
24
+ *$py.class
25
+
26
+ # C extensions
27
+ *.so
28
+
29
+ # Distribution / packaging
30
+ .Python
31
+ build/
32
+ develop-eggs/
33
+ dist/
34
+ downloads/
35
+ eggs/
36
+ .eggs/
37
+ lib/
38
+ lib64/
39
+ parts/
40
+ sdist/
41
+ var/
42
+ wheels/
43
+ share/python-wheels/
44
+ *.egg-info/
45
+ .installed.cfg
46
+ *.egg
47
+ MANIFEST
48
+
49
+ # PyInstaller
50
+ # Usually these files are written by a python script from a template
51
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
52
+ *.manifest
53
+ *.spec
54
+
55
+ # Installer logs
56
+ pip-log.txt
57
+ pip-delete-this-directory.txt
58
+
59
+ # Unit test / coverage reports
60
+ htmlcov/
61
+ .tox/
62
+ .nox/
63
+ .coverage
64
+ .coverage.*
65
+ .cache
66
+ nosetests.xml
67
+ coverage.xml
68
+ *.cover
69
+ *.py,cover
70
+ .hypothesis/
71
+ .pytest_cache/
72
+ cover/
73
+
74
+ # Translations
75
+ *.mo
76
+ *.pot
77
+
78
+ # Django stuff:
79
+ *.log
80
+ local_settings.py
81
+ db.sqlite3
82
+ db.sqlite3-journal
83
+
84
+ # Flask stuff:
85
+ instance/
86
+ .webassets-cache
87
+
88
+ # Scrapy stuff:
89
+ .scrapy
90
+
91
+ # Sphinx documentation
92
+ docs/_build/
93
+
94
+ # PyBuilder
95
+ .pybuilder/
96
+ target/
97
+
98
+ # Jupyter Notebook
99
+ .ipynb_checkpoints
100
+
101
+ # IPython
102
+ profile_default/
103
+ ipython_config.py
104
+
105
+ # pyenv
106
+ # For a library or package, you might want to ignore these files since the code is
107
+ # intended to run in multiple environments; otherwise, check them in:
108
+ # .python-version
109
+
110
+ # pipenv
111
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
112
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
113
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
114
+ # install all needed dependencies.
115
+ #Pipfile.lock
116
+
117
+ # poetry
118
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
119
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
120
+ # commonly ignored for libraries.
121
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
122
+ #poetry.lock
123
+
124
+ # pdm
125
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
126
+ #pdm.lock
127
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
128
+ # in version control.
129
+ # https://pdm.fming.dev/#use-with-ide
130
+ .pdm.toml
131
+
132
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
133
+ __pypackages__/
134
+
135
+ # Celery stuff
136
+ celerybeat-schedule
137
+ celerybeat.pid
138
+
139
+ # SageMath parsed files
140
+ *.sage.py
141
+
142
+ # Environments
143
+ .env
144
+ .venv
145
+ env/
146
+ venv/
147
+ ENV/
148
+ env.bak/
149
+ venv.bak/
150
+
151
+ # Spyder project settings
152
+ .spyderproject
153
+ .spyproject
154
+
155
+ # Rope project settings
156
+ .ropeproject
157
+
158
+ # mkdocs documentation
159
+ /site
160
+
161
+ # mypy
162
+ .mypy_cache/
163
+ .dmypy.json
164
+ dmypy.json
165
+
166
+ # Pyre type checker
167
+ .pyre/
168
+
169
+ # pytype static type analyzer
170
+ .pytype/
171
+
172
+ # Cython debug symbols
173
+ cython_debug/
174
+
175
+ # PyCharm
176
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
177
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
178
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
179
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
180
+ #.idea/
@@ -0,0 +1,236 @@
1
+ # This file is automatically @generated by Cargo.
2
+ # It is not intended for manual editing.
3
+ version = 4
4
+
5
+ [[package]]
6
+ name = "ahash"
7
+ version = "0.8.11"
8
+ source = "registry+https://github.com/rust-lang/crates.io-index"
9
+ checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
10
+ dependencies = [
11
+ "cfg-if",
12
+ "getrandom",
13
+ "once_cell",
14
+ "version_check",
15
+ "zerocopy",
16
+ ]
17
+
18
+ [[package]]
19
+ name = "autocfg"
20
+ version = "1.4.0"
21
+ source = "registry+https://github.com/rust-lang/crates.io-index"
22
+ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
23
+
24
+ [[package]]
25
+ name = "cfg-if"
26
+ version = "1.0.0"
27
+ source = "registry+https://github.com/rust-lang/crates.io-index"
28
+ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
29
+
30
+ [[package]]
31
+ name = "getrandom"
32
+ version = "0.2.15"
33
+ source = "registry+https://github.com/rust-lang/crates.io-index"
34
+ checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
35
+ dependencies = [
36
+ "cfg-if",
37
+ "libc",
38
+ "wasi",
39
+ ]
40
+
41
+ [[package]]
42
+ name = "heck"
43
+ version = "0.5.0"
44
+ source = "registry+https://github.com/rust-lang/crates.io-index"
45
+ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
46
+
47
+ [[package]]
48
+ name = "indoc"
49
+ version = "2.0.6"
50
+ source = "registry+https://github.com/rust-lang/crates.io-index"
51
+ checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd"
52
+
53
+ [[package]]
54
+ name = "libc"
55
+ version = "0.2.172"
56
+ source = "registry+https://github.com/rust-lang/crates.io-index"
57
+ checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"
58
+
59
+ [[package]]
60
+ name = "memoffset"
61
+ version = "0.9.1"
62
+ source = "registry+https://github.com/rust-lang/crates.io-index"
63
+ checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
64
+ dependencies = [
65
+ "autocfg",
66
+ ]
67
+
68
+ [[package]]
69
+ name = "ocr_stringdist"
70
+ version = "0.0.1"
71
+ dependencies = [
72
+ "ahash",
73
+ "once_cell",
74
+ "pyo3",
75
+ "smallvec",
76
+ ]
77
+
78
+ [[package]]
79
+ name = "once_cell"
80
+ version = "1.21.3"
81
+ source = "registry+https://github.com/rust-lang/crates.io-index"
82
+ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
83
+
84
+ [[package]]
85
+ name = "portable-atomic"
86
+ version = "1.11.0"
87
+ source = "registry+https://github.com/rust-lang/crates.io-index"
88
+ checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e"
89
+
90
+ [[package]]
91
+ name = "proc-macro2"
92
+ version = "1.0.95"
93
+ source = "registry+https://github.com/rust-lang/crates.io-index"
94
+ checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778"
95
+ dependencies = [
96
+ "unicode-ident",
97
+ ]
98
+
99
+ [[package]]
100
+ name = "pyo3"
101
+ version = "0.24.1"
102
+ source = "registry+https://github.com/rust-lang/crates.io-index"
103
+ checksum = "17da310086b068fbdcefbba30aeb3721d5bb9af8db4987d6735b2183ca567229"
104
+ dependencies = [
105
+ "cfg-if",
106
+ "indoc",
107
+ "libc",
108
+ "memoffset",
109
+ "once_cell",
110
+ "portable-atomic",
111
+ "pyo3-build-config",
112
+ "pyo3-ffi",
113
+ "pyo3-macros",
114
+ "unindent",
115
+ ]
116
+
117
+ [[package]]
118
+ name = "pyo3-build-config"
119
+ version = "0.24.1"
120
+ source = "registry+https://github.com/rust-lang/crates.io-index"
121
+ checksum = "e27165889bd793000a098bb966adc4300c312497ea25cf7a690a9f0ac5aa5fc1"
122
+ dependencies = [
123
+ "once_cell",
124
+ "target-lexicon",
125
+ ]
126
+
127
+ [[package]]
128
+ name = "pyo3-ffi"
129
+ version = "0.24.1"
130
+ source = "registry+https://github.com/rust-lang/crates.io-index"
131
+ checksum = "05280526e1dbf6b420062f3ef228b78c0c54ba94e157f5cb724a609d0f2faabc"
132
+ dependencies = [
133
+ "libc",
134
+ "pyo3-build-config",
135
+ ]
136
+
137
+ [[package]]
138
+ name = "pyo3-macros"
139
+ version = "0.24.1"
140
+ source = "registry+https://github.com/rust-lang/crates.io-index"
141
+ checksum = "5c3ce5686aa4d3f63359a5100c62a127c9f15e8398e5fdeb5deef1fed5cd5f44"
142
+ dependencies = [
143
+ "proc-macro2",
144
+ "pyo3-macros-backend",
145
+ "quote",
146
+ "syn",
147
+ ]
148
+
149
+ [[package]]
150
+ name = "pyo3-macros-backend"
151
+ version = "0.24.1"
152
+ source = "registry+https://github.com/rust-lang/crates.io-index"
153
+ checksum = "f4cf6faa0cbfb0ed08e89beb8103ae9724eb4750e3a78084ba4017cbe94f3855"
154
+ dependencies = [
155
+ "heck",
156
+ "proc-macro2",
157
+ "pyo3-build-config",
158
+ "quote",
159
+ "syn",
160
+ ]
161
+
162
+ [[package]]
163
+ name = "quote"
164
+ version = "1.0.40"
165
+ source = "registry+https://github.com/rust-lang/crates.io-index"
166
+ checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
167
+ dependencies = [
168
+ "proc-macro2",
169
+ ]
170
+
171
+ [[package]]
172
+ name = "smallvec"
173
+ version = "1.15.0"
174
+ source = "registry+https://github.com/rust-lang/crates.io-index"
175
+ checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9"
176
+
177
+ [[package]]
178
+ name = "syn"
179
+ version = "2.0.100"
180
+ source = "registry+https://github.com/rust-lang/crates.io-index"
181
+ checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0"
182
+ dependencies = [
183
+ "proc-macro2",
184
+ "quote",
185
+ "unicode-ident",
186
+ ]
187
+
188
+ [[package]]
189
+ name = "target-lexicon"
190
+ version = "0.13.2"
191
+ source = "registry+https://github.com/rust-lang/crates.io-index"
192
+ checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a"
193
+
194
+ [[package]]
195
+ name = "unicode-ident"
196
+ version = "1.0.18"
197
+ source = "registry+https://github.com/rust-lang/crates.io-index"
198
+ checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
199
+
200
+ [[package]]
201
+ name = "unindent"
202
+ version = "0.2.4"
203
+ source = "registry+https://github.com/rust-lang/crates.io-index"
204
+ checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
205
+
206
+ [[package]]
207
+ name = "version_check"
208
+ version = "0.9.5"
209
+ source = "registry+https://github.com/rust-lang/crates.io-index"
210
+ checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
211
+
212
+ [[package]]
213
+ name = "wasi"
214
+ version = "0.11.0+wasi-snapshot-preview1"
215
+ source = "registry+https://github.com/rust-lang/crates.io-index"
216
+ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
217
+
218
+ [[package]]
219
+ name = "zerocopy"
220
+ version = "0.7.35"
221
+ source = "registry+https://github.com/rust-lang/crates.io-index"
222
+ checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
223
+ dependencies = [
224
+ "zerocopy-derive",
225
+ ]
226
+
227
+ [[package]]
228
+ name = "zerocopy-derive"
229
+ version = "0.7.35"
230
+ source = "registry+https://github.com/rust-lang/crates.io-index"
231
+ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
232
+ dependencies = [
233
+ "proc-macro2",
234
+ "quote",
235
+ "syn",
236
+ ]
@@ -0,0 +1,23 @@
1
+ [package]
2
+ name = "ocr_stringdist"
3
+ version = "0.0.1"
4
+ edition = "2021"
5
+ description = "String distances considering OCR errors."
6
+ authors = ["Niklas von Moers <niklasvmoers@protonmail.com>"]
7
+ license = "MIT"
8
+ readme = "README.md"
9
+
10
+
11
+ # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
12
+ [lib]
13
+ name = "ocr_stringdist"
14
+ crate-type = ["cdylib"]
15
+
16
+ [dependencies]
17
+ pyo3 = { version = "0.24.0", features = [] }
18
+ ahash = "^0.8"
19
+ once_cell = "1.21.3"
20
+ smallvec = "1.15.0"
21
+
22
+ [features]
23
+ python = []
@@ -0,0 +1,12 @@
1
+ pytest:
2
+ maturin develop
3
+ .venv/bin/pytest
4
+
5
+ test: pytest
6
+ cargo test
7
+
8
+ venv:
9
+ rm -rf .venv
10
+ python3 -m venv .venv
11
+ . .venv/bin/activate
12
+ .venv/bin/pip install wheel pytest maturin
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.4
2
+ Name: ocr_stringdist
3
+ Version: 0.0.1
4
+ Classifier: Programming Language :: Rust
5
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
6
+ Classifier: Operating System :: OS Independent
7
+ Summary: String distances considering OCR errors.
8
+ Author: Niklas von Moers <niklasvmoers@protonmail.com>
9
+ Author-email: Niklas von Moers <niklasvmoers@protonmail.com>
10
+ License: MIT
11
+ Requires-Python: >=3.9
12
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
13
+ Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist
14
+
15
+ # OCR-Stringdist
16
+
@@ -0,0 +1 @@
1
+ # OCR-Stringdist
@@ -0,0 +1,50 @@
1
+ from ocr_stringdist import (
2
+ ocr_weighted_levenshtein_distance,
3
+ custom_weighted_levenshtein_distance,
4
+ )
5
+
6
+ # Example with default OCR cost map
7
+ print("Using default OCR cost map:")
8
+ default_result = ocr_weighted_levenshtein_distance("12345G", "123456")
9
+ print(f"Distance between '12345G' and '123456': {default_result}")
10
+
11
+ # Example with custom cost map
12
+ custom_cost_map: dict[tuple[str, str], float] = {
13
+ ("G", "6"): 0.1, # Make G/6 even more similar (default is 0.2)
14
+ ("A", "B"): 0.3, # Make A/B somewhat similar
15
+ ("X", "Y"): 0.5, # Make X/Y moderately similar
16
+ }
17
+
18
+ print("\nUsing custom cost map:")
19
+ custom_result = custom_weighted_levenshtein_distance(
20
+ "12345G", "123456", custom_cost_map
21
+ )
22
+ print(f"Distance between '12345G' and '123456' with custom map: {custom_result}")
23
+
24
+ # Example with custom default cost
25
+ print("\nUsing custom default cost:")
26
+ custom_default_result = custom_weighted_levenshtein_distance(
27
+ "ABCDE",
28
+ "XBCDE",
29
+ cost_map={("A", "X"): 0.5},
30
+ default_cost=0.8, # Lower default substitution cost (default is 1.0)
31
+ )
32
+ print(
33
+ f"Distance between 'ABCDE' and 'XBCDE' with custom default cost: {custom_default_result}"
34
+ )
35
+
36
+ # More complex example - comparing names with custom costs for similar looking characters
37
+ name_cost_map = {
38
+ ("O", "0"): 0.1, # Letter O and number 0
39
+ ("l", "1"): 0.1, # Lowercase L and number 1
40
+ ("I", "1"): 0.1, # Uppercase I and number 1
41
+ ("S", "5"): 0.2, # Letter S and number 5
42
+ ("Z", "2"): 0.2, # Letter Z and number 2
43
+ ("B", "8"): 0.2, # Letter B and number 8
44
+ }
45
+
46
+ print("\nComparing names with OCR-like errors:")
47
+ name1 = "ROBERT"
48
+ name2 = "R0BERT" # Using 0 instead of O
49
+ distance = custom_weighted_levenshtein_distance(name1, name2, name_cost_map)
50
+ print(f"Distance between '{name1}' and '{name2}': {distance}")
@@ -0,0 +1,22 @@
1
+ [build-system]
2
+ requires = ["maturin>=0.14,<2"]
3
+ build-backend = "maturin"
4
+
5
+ [project]
6
+ name = "ocr_stringdist"
7
+ dynamic = ["version"]
8
+ requires-python = ">=3.9"
9
+ classifiers = [
10
+ "Programming Language :: Rust",
11
+ "Programming Language :: Python :: Implementation :: PyPy",
12
+ "Operating System :: OS Independent",
13
+ ]
14
+
15
+ [project.urls]
16
+ repository = "https://github.com/NiklasvonM/ocr-stringdist"
17
+
18
+
19
+ [tool.maturin]
20
+ features = ["pyo3/extension-module", "python"]
21
+ python-source = "python"
22
+ module-name = "ocr_stringdist._rust_stringdist"
@@ -0,0 +1 @@
1
+ from ._rust_stringdist import *
@@ -0,0 +1,8 @@
1
+ def levenshtein_distance(s1: str, s2: str) -> int: ...
2
+ def ocr_weighted_levenshtein_distance(s1: str, s2: str) -> float: ...
3
+ def custom_weighted_levenshtein_distance(
4
+ s1: str,
5
+ s2: str,
6
+ cost_map: dict[tuple[str, str], float],
7
+ default_cost: float | None = None
8
+ ) -> float: ...
File without changes
@@ -0,0 +1,11 @@
1
+ mod weighted_levenshtein;
2
+
3
+ pub use weighted_levenshtein::{
4
+ custom_levenshtein_distance, custom_levenshtein_distance_with_cost_map,
5
+ vec_custom_levenshtein_distance, vec_custom_levenshtein_distance_with_cost_map, OcrCostMap,
6
+ };
7
+
8
+ #[cfg(feature = "python")]
9
+ mod rust_stringdist;
10
+ #[cfg(feature = "python")]
11
+ pub use rust_stringdist::_rust_stringdist;
@@ -0,0 +1,51 @@
1
+ use crate::custom_levenshtein_distance as _weighted_lev;
2
+ use crate::custom_levenshtein_distance_with_cost_map as _weighted_lev_with_map;
3
+ use crate::OcrCostMap;
4
+ use pyo3::prelude::*;
5
+ use pyo3::types::PyDict;
6
+ use std::collections::HashMap;
7
+
8
+ // Calculates the Levenshtein distance between two strings.
9
+ #[pyfunction]
10
+ fn ocr_weighted_levenshtein_distance(a: &str, b: &str) -> PyResult<f64> {
11
+ Ok(_weighted_lev(a, b))
12
+ }
13
+
14
+ // Calculates the weighted Levenshtein distance with a custom cost map from Python.
15
+ #[pyfunction]
16
+ #[pyo3(signature = (a, b, cost_map, default_cost = None))]
17
+ fn custom_weighted_levenshtein_distance(
18
+ a: &str,
19
+ b: &str,
20
+ cost_map: &Bound<'_, PyDict>,
21
+ default_cost: Option<f64>,
22
+ ) -> PyResult<f64> {
23
+ let default_cost_value = default_cost.unwrap_or(1.0);
24
+ let mut char_costs: HashMap<(char, char), f64> = HashMap::new();
25
+
26
+ // Convert Python dictionary to Rust HashMap
27
+ for (key, value) in cost_map.iter() {
28
+ if let Ok(key_tuple) = key.extract::<(String, String)>() {
29
+ if let Ok(cost) = value.extract::<f64>() {
30
+ // Extract the first character from each string, if they exist
31
+ if let (Some(c1), Some(c2)) =
32
+ (key_tuple.0.chars().next(), key_tuple.1.chars().next())
33
+ {
34
+ char_costs.insert((c1, c2), cost);
35
+ }
36
+ }
37
+ }
38
+ }
39
+
40
+ // Create a custom cost map and calculate the distance
41
+ let custom_cost_map = OcrCostMap::new(char_costs, default_cost_value);
42
+ Ok(_weighted_lev_with_map(a, b, &custom_cost_map))
43
+ }
44
+
45
+ /// A Python module implemented in Rust.
46
+ #[pymodule]
47
+ pub fn _rust_stringdist(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
48
+ m.add_function(wrap_pyfunction!(ocr_weighted_levenshtein_distance, m)?)?;
49
+ m.add_function(wrap_pyfunction!(custom_weighted_levenshtein_distance, m)?)?;
50
+ Ok(())
51
+ }
@@ -0,0 +1,204 @@
1
+ use once_cell::sync::Lazy;
2
+ use smallvec::SmallVec;
3
+ use std::collections::HashMap;
4
+
5
/// Substitution-cost table for a weighted Levenshtein distance.
///
/// Costs are stored for both orderings of a character pair so that a lookup
/// never has to try the reversed pair. Pairs without an entry fall back to
/// `default_substitution_cost`.
#[derive(Clone, Debug)]
pub struct OcrCostMap {
    /// Maps pairs of characters to their specific substitution cost.
    costs: HashMap<(char, char), f64>,
    /// Default cost for substitutions not found in the map.
    default_substitution_cost: f64,
}

impl OcrCostMap {
    /// Creates a new `OcrCostMap` from the given costs.
    ///
    /// Every pair explicitly present in `custom_costs_input` keeps exactly the
    /// cost it was given. If only one ordering of a pair is supplied, the
    /// reversed ordering is added with the same cost. If both orderings are
    /// supplied, each keeps its own cost.
    pub fn new(
        custom_costs_input: HashMap<(char, char), f64>,
        default_substitution_cost: f64,
    ) -> Self {
        let mut costs = HashMap::with_capacity(custom_costs_input.len() * 2);

        // Pass 1: record every explicit entry. Doing this before any mirroring
        // guarantees that a mirrored value can never shadow an explicit one,
        // whatever order the input map happens to iterate in.
        costs.extend(
            custom_costs_input
                .iter()
                .map(|(&pair, &cost)| (pair, cost)),
        );

        // Pass 2: fill in the reversed pair where the caller did not supply it.
        for ((c1, c2), cost) in custom_costs_input {
            costs.entry((c2, c1)).or_insert(cost);
        }

        OcrCostMap {
            costs,
            default_substitution_cost,
        }
    }

    /// Gets the substitution cost between two characters.
    ///
    /// Identical characters cost 0.0. Otherwise the stored cost for
    /// `(c1, c2)` is returned, or the configured default substitution cost
    /// when the pair has no entry.
    pub fn get_substitution_cost(&self, c1: char, c2: char) -> f64 {
        if c1 == c2 {
            return 0.0;
        }
        self.costs
            .get(&(c1, c2))
            .copied()
            .unwrap_or(self.default_substitution_cost)
    }
}
50
+
51
+ impl Default for OcrCostMap {
52
+ fn default() -> Self {
53
+ DEFAULT_OCR_COST_MAP.clone()
54
+ }
55
+ }
56
+
57
+ // --- Default OCR Map Initialization (Immutable HashMap) ---
58
+
59
+ // Define the costs as a static array of tuples
60
+ const DEFAULT_OCR_PAIRS: &[((char, char), f64)] = &[
61
+ (('G', '6'), 0.2),
62
+ (('O', '0'), 0.2),
63
+ (('o', '0'), 0.2),
64
+ (('l', '1'), 0.2),
65
+ (('I', '1'), 0.2),
66
+ (('2', 'Z'), 0.2),
67
+ (('B', '8'), 0.2),
68
+ (('S', '5'), 0.3),
69
+ (('s', '5'), 0.3),
70
+ (('E', 'F'), 0.8),
71
+ ];
72
+
73
+ // Use Lazy and collect from the static array for initialization
74
+ static DEFAULT_OCR_COST_MAP: Lazy<OcrCostMap> = Lazy::new(|| {
75
+ // Collect the static array into a HashMap directly
76
+ let ocr_costs: HashMap<(char, char), f64> = DEFAULT_OCR_PAIRS.iter().copied().collect();
77
+ OcrCostMap::new(ocr_costs, 1.0)
78
+ });
79
+
80
+ // Helper to create a range vector with f64 values
81
+ fn range_vec_f64(size: usize) -> SmallVec<[f64; 16]> {
82
+ let mut vec = SmallVec::with_capacity(size);
83
+ for i in 0..size {
84
+ vec.push(i as f64);
85
+ }
86
+ vec
87
+ }
88
+
89
+ /// Calculates Levenshtein distance between two vectors using custom costs.
90
+ pub fn vec_custom_levenshtein_distance(v1: &[char], v2: &[char]) -> f64 {
91
+ vec_custom_levenshtein_distance_with_cost_map(v1, v2, &OcrCostMap::default())
92
+ }
93
+
94
+ /// Calculates Levenshtein distance between two vectors using a specified cost map.
95
+ pub fn vec_custom_levenshtein_distance_with_cost_map(
96
+ v1: &[char],
97
+ v2: &[char],
98
+ cost_map: &OcrCostMap,
99
+ ) -> f64 {
100
+ let rows = v1.len() + 1;
101
+ let cols = v2.len() + 1;
102
+
103
+ if rows == 1 {
104
+ return (cols - 1) as f64;
105
+ } else if cols == 1 {
106
+ return (rows - 1) as f64;
107
+ }
108
+
109
+ let mut cur: SmallVec<[f64; 16]> = range_vec_f64(cols);
110
+
111
+ for r in 1..rows {
112
+ let prev = cur.clone();
113
+ cur = SmallVec::from_elem(0.0, cols);
114
+ cur[0] = r as f64;
115
+
116
+ let item1 = v1[r - 1];
117
+
118
+ for c in 1..cols {
119
+ let item2 = v2[c - 1];
120
+
121
+ let deletion = prev[c] + 1.0;
122
+ let insertion = cur[c - 1] + 1.0;
123
+
124
+ // Use the provided cost map to get substitution cost
125
+ let substitution_cost = cost_map.get_substitution_cost(item1, item2);
126
+ let substitution = prev[c - 1] + substitution_cost;
127
+
128
+ cur[c] = deletion.min(insertion).min(substitution);
129
+ }
130
+ }
131
+ cur[cols - 1]
132
+ }
133
+
134
+ /// Calculates custom Levenshtein distance between two strings using OCR cost map.
135
+ pub fn custom_levenshtein_distance(s1: &str, s2: &str) -> f64 {
136
+ if s1 == s2 {
137
+ return 0.0;
138
+ }
139
+
140
+ let v1: Vec<char> = s1.chars().collect();
141
+ let v2: Vec<char> = s2.chars().collect();
142
+
143
+ vec_custom_levenshtein_distance(&v1, &v2)
144
+ }
145
+
146
+ /// Calculates custom Levenshtein distance between two strings using a provided cost map.
147
+ pub fn custom_levenshtein_distance_with_cost_map(s1: &str, s2: &str, cost_map: &OcrCostMap) -> f64 {
148
+ if s1 == s2 {
149
+ return 0.0;
150
+ }
151
+
152
+ let v1: Vec<char> = s1.chars().collect();
153
+ let v2: Vec<char> = s2.chars().collect();
154
+
155
+ vec_custom_levenshtein_distance_with_cost_map(&v1, &v2, cost_map)
156
+ }
157
+
158
#[cfg(test)]
mod test {
    use super::*;

    /// Asserts that `a` and `b` differ by less than `epsilon`.
    fn assert_approx_eq(a: f64, b: f64, epsilon: f64) {
        assert!(
            (a - b).abs() < epsilon,
            "Assertion failed: {} != {} within epsilon {}",
            a,
            b,
            epsilon
        );
    }

    #[test]
    fn test_custom_levenshtein_simple() {
        assert_approx_eq(custom_levenshtein_distance("abc", "axc"), 1.0, 1e-9);
        assert_approx_eq(custom_levenshtein_distance("abc", "ac"), 1.0, 1e-9);
        assert_approx_eq(custom_levenshtein_distance("ac", "abc"), 1.0, 1e-9);
    }

    #[test]
    fn test_custom_levenshtein_identical_and_empty() {
        // Identical inputs, including two empty strings, have distance zero.
        assert_approx_eq(custom_levenshtein_distance("", ""), 0.0, 1e-9);
        assert_approx_eq(custom_levenshtein_distance("abc", "abc"), 0.0, 1e-9);

        // Against an empty string the distance is the other string's length.
        assert_approx_eq(custom_levenshtein_distance("", "abc"), 3.0, 1e-9);
        assert_approx_eq(custom_levenshtein_distance("abc", ""), 3.0, 1e-9);
    }

    #[test]
    fn test_custom_levenshtein_ocr_pairs() {
        assert_approx_eq(custom_levenshtein_distance("ABCDEFG", "ABCDEF6"), 0.2, 1e-9);

        assert_approx_eq(custom_levenshtein_distance("ABCDEF6", "ABCDEFG"), 0.2, 1e-9);

        assert_approx_eq(
            custom_levenshtein_distance("ABCDEFG", "ABCDEF6X"),
            0.2 + 1.0,
            1e-9,
        );
    }

    #[test]
    fn test_custom_levenshtein_with_custom_map() {
        let mut custom_costs = HashMap::new();
        custom_costs.insert(('a', 'b'), 0.1);
        let cost_map = OcrCostMap::new(custom_costs, 1.0);

        assert_approx_eq(
            custom_levenshtein_distance_with_cost_map("abc", "bbc", &cost_map),
            0.1,
            1e-9,
        );
    }

    #[test]
    fn test_custom_levenshtein_default_cost_fallback() {
        // With no specific entries, every substitution uses the default cost.
        let cost_map = OcrCostMap::new(HashMap::new(), 0.5);

        assert_approx_eq(
            custom_levenshtein_distance_with_cost_map("abc", "xbc", &cost_map),
            0.5,
            1e-9,
        );
    }
}
@@ -0,0 +1,5 @@
1
+ from ocr_stringdist import ocr_weighted_levenshtein_distance
2
+
3
+
4
+ def test_ocr_weighted_levenshtein_distance() -> None:
5
+ assert ocr_weighted_levenshtein_distance("a", "b") == 1.0