ocr-stringdist 0.0.3__tar.gz → 0.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ocr_stringdist-0.0.3 → ocr_stringdist-0.0.5}/.github/workflows/CI.yml +0 -1
- ocr_stringdist-0.0.5/.github/workflows/docs.yml +70 -0
- {ocr_stringdist-0.0.3 → ocr_stringdist-0.0.5}/.gitignore +0 -1
- {ocr_stringdist-0.0.3 → ocr_stringdist-0.0.5}/Cargo.lock +2 -66
- {ocr_stringdist-0.0.3 → ocr_stringdist-0.0.5}/Cargo.toml +1 -3
- ocr_stringdist-0.0.5/Justfile +14 -0
- {ocr_stringdist-0.0.3 → ocr_stringdist-0.0.5}/PKG-INFO +28 -5
- {ocr_stringdist-0.0.3 → ocr_stringdist-0.0.5}/README.md +26 -3
- ocr_stringdist-0.0.5/docs/Makefile +20 -0
- ocr_stringdist-0.0.5/docs/make.bat +35 -0
- ocr_stringdist-0.0.5/docs/source/api/index.rst +18 -0
- ocr_stringdist-0.0.5/docs/source/conf.py +40 -0
- ocr_stringdist-0.0.5/docs/source/index.rst +10 -0
- {ocr_stringdist-0.0.3 → ocr_stringdist-0.0.5}/example.py +17 -11
- ocr_stringdist-0.0.5/mypy.ini +137 -0
- {ocr_stringdist-0.0.3 → ocr_stringdist-0.0.5}/pyproject.toml +16 -1
- {ocr_stringdist-0.0.3 → ocr_stringdist-0.0.5}/python/ocr_stringdist/__init__.py +4 -3
- ocr_stringdist-0.0.5/python/ocr_stringdist/matching.py +83 -0
- ocr_stringdist-0.0.5/ruff.toml +88 -0
- {ocr_stringdist-0.0.3 → ocr_stringdist-0.0.5}/src/rust_stringdist.rs +3 -8
- ocr_stringdist-0.0.5/src/weighted_levenshtein.rs +322 -0
- ocr_stringdist-0.0.5/tests/test_matching.py +39 -0
- ocr_stringdist-0.0.5/tests/test_ocr_stringdist.py +106 -0
- ocr_stringdist-0.0.5/uv.lock +801 -0
- ocr_stringdist-0.0.3/Justfile +0 -12
- ocr_stringdist-0.0.3/src/weighted_levenshtein.rs +0 -140
- ocr_stringdist-0.0.3/tests/test_ocr_stringdist.py +0 -5
- {ocr_stringdist-0.0.3 → ocr_stringdist-0.0.5}/LICENSE +0 -0
- {ocr_stringdist-0.0.3 → ocr_stringdist-0.0.5}/python/ocr_stringdist/default_ocr_distances.py +0 -0
- {ocr_stringdist-0.0.3 → ocr_stringdist-0.0.5}/python/ocr_stringdist/py.typed +0 -0
- {ocr_stringdist-0.0.3 → ocr_stringdist-0.0.5}/src/lib.rs +0 -0
| @@ -0,0 +1,70 @@ | |
| 1 | 
            +
            name: Deploy Documentation to Pages
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            on:
         | 
| 4 | 
            +
              push:
         | 
| 5 | 
            +
                branches:
         | 
| 6 | 
            +
                  - main
         | 
| 7 | 
            +
              workflow_dispatch: # Allows manual triggering from the Actions tab
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
         | 
| 10 | 
            +
            permissions:
         | 
| 11 | 
            +
              contents: read
         | 
| 12 | 
            +
              pages: write
         | 
| 13 | 
            +
              id-token: write
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
         | 
| 16 | 
            +
            # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
         | 
| 17 | 
            +
            concurrency:
         | 
| 18 | 
            +
              group: "pages"
         | 
| 19 | 
            +
              cancel-in-progress: false
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            jobs:
         | 
| 22 | 
            +
              build:
         | 
| 23 | 
            +
                runs-on: ubuntu-latest
         | 
| 24 | 
            +
                steps:
         | 
| 25 | 
            +
                  - name: Checkout repository
         | 
| 26 | 
            +
                    uses: actions/checkout@v4
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                  - name: Set up Python
         | 
| 29 | 
            +
                    uses: actions/setup-python@v5
         | 
| 30 | 
            +
                    with:
         | 
| 31 | 
            +
                      python-version: '3.12'
         | 
| 32 | 
            +
             | 
| 33 | 
            +
                  - name: Install uv
         | 
| 34 | 
            +
                    run: curl -LsSf https://astral.sh/uv/install.sh | sh
         | 
| 35 | 
            +
             | 
| 36 | 
            +
                  - name: Create virtual environment
         | 
| 37 | 
            +
                    run: uv venv
         | 
| 38 | 
            +
             | 
| 39 | 
            +
                  - name: Install dependencies
         | 
| 40 | 
            +
                    run: uv sync --group docs
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                  - name: Build Sphinx documentation
         | 
| 43 | 
            +
                    run: |
         | 
| 44 | 
            +
                      uv run make -C docs html
         | 
| 45 | 
            +
                      # Add a .nojekyll file to the build output directory to prevent
         | 
| 46 | 
            +
                      # GitHub Pages from ignoring files that start with an underscore
         | 
| 47 | 
            +
                      # (like Sphinx's _static and _images directories).
         | 
| 48 | 
            +
                      touch docs/build/html/.nojekyll
         | 
| 49 | 
            +
             | 
| 50 | 
            +
                  - name: Setup Pages
         | 
| 51 | 
            +
                    uses: actions/configure-pages@v4
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                  - name: Upload artifact
         | 
| 54 | 
            +
                    uses: actions/upload-pages-artifact@v3
         | 
| 55 | 
            +
                    with:
         | 
| 56 | 
            +
                      # Upload entire directory. GitHub Pages expects index.html at the root.
         | 
| 57 | 
            +
                      path: './docs/build/html'
         | 
| 58 | 
            +
             | 
| 59 | 
            +
              deploy:
         | 
| 60 | 
            +
                environment:
         | 
| 61 | 
            +
                  name: github-pages
         | 
| 62 | 
            +
                  url: ${{ steps.deployment.outputs.page_url }}
         | 
| 63 | 
            +
                runs-on: ubuntu-latest
         | 
| 64 | 
            +
                needs: build
         | 
| 65 | 
            +
                steps:
         | 
| 66 | 
            +
                  - name: Deploy to GitHub Pages
         | 
| 67 | 
            +
                    id: deployment
         | 
| 68 | 
            +
                    uses: actions/deploy-pages@v4
         | 
| 69 | 
            +
                    # This action automatically downloads the artifact uploaded by
         | 
| 70 | 
            +
                    # upload-pages-artifact and deploys it to GitHub Pages.
         | 
| @@ -1,19 +1,6 @@ | |
| 1 1 | 
             
            # This file is automatically @generated by Cargo.
         | 
| 2 2 | 
             
            # It is not intended for manual editing.
         | 
| 3 | 
            -
            version =  | 
| 4 | 
            -
             | 
| 5 | 
            -
            [[package]]
         | 
| 6 | 
            -
            name = "ahash"
         | 
| 7 | 
            -
            version = "0.8.11"
         | 
| 8 | 
            -
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 9 | 
            -
            checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
         | 
| 10 | 
            -
            dependencies = [
         | 
| 11 | 
            -
             "cfg-if",
         | 
| 12 | 
            -
             "getrandom",
         | 
| 13 | 
            -
             "once_cell",
         | 
| 14 | 
            -
             "version_check",
         | 
| 15 | 
            -
             "zerocopy",
         | 
| 16 | 
            -
            ]
         | 
| 3 | 
            +
            version = 3
         | 
| 17 4 |  | 
| 18 5 | 
             
            [[package]]
         | 
| 19 6 | 
             
            name = "autocfg"
         | 
| @@ -27,17 +14,6 @@ version = "1.0.0" | |
| 27 14 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 28 15 | 
             
            checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
         | 
| 29 16 |  | 
| 30 | 
            -
            [[package]]
         | 
| 31 | 
            -
            name = "getrandom"
         | 
| 32 | 
            -
            version = "0.2.15"
         | 
| 33 | 
            -
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 34 | 
            -
            checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
         | 
| 35 | 
            -
            dependencies = [
         | 
| 36 | 
            -
             "cfg-if",
         | 
| 37 | 
            -
             "libc",
         | 
| 38 | 
            -
             "wasi",
         | 
| 39 | 
            -
            ]
         | 
| 40 | 
            -
             | 
| 41 17 | 
             
            [[package]]
         | 
| 42 18 | 
             
            name = "heck"
         | 
| 43 19 | 
             
            version = "0.5.0"
         | 
| @@ -67,11 +43,9 @@ dependencies = [ | |
| 67 43 |  | 
| 68 44 | 
             
            [[package]]
         | 
| 69 45 | 
             
            name = "ocr_stringdist"
         | 
| 70 | 
            -
            version = "0.0.3" | 
| 46 | 
            +
            version = "0.0.5"
         | 
| 71 47 | 
             
            dependencies = [
         | 
| 72 | 
            -
             "ahash",
         | 
| 73 48 | 
             
             "pyo3",
         | 
| 74 | 
            -
             "smallvec",
         | 
| 75 49 | 
             
            ]
         | 
| 76 50 |  | 
| 77 51 | 
             
            [[package]]
         | 
| @@ -167,12 +141,6 @@ dependencies = [ | |
| 167 141 | 
             
             "proc-macro2",
         | 
| 168 142 | 
             
            ]
         | 
| 169 143 |  | 
| 170 | 
            -
            [[package]]
         | 
| 171 | 
            -
            name = "smallvec"
         | 
| 172 | 
            -
            version = "1.15.0"
         | 
| 173 | 
            -
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 174 | 
            -
            checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9"
         | 
| 175 | 
            -
             | 
| 176 144 | 
             
            [[package]]
         | 
| 177 145 | 
             
            name = "syn"
         | 
| 178 146 | 
             
            version = "2.0.100"
         | 
| @@ -201,35 +169,3 @@ name = "unindent" | |
| 201 169 | 
             
            version = "0.2.4"
         | 
| 202 170 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 203 171 | 
             
            checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
         | 
| 204 | 
            -
             | 
| 205 | 
            -
            [[package]]
         | 
| 206 | 
            -
            name = "version_check"
         | 
| 207 | 
            -
            version = "0.9.5"
         | 
| 208 | 
            -
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 209 | 
            -
            checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
         | 
| 210 | 
            -
             | 
| 211 | 
            -
            [[package]]
         | 
| 212 | 
            -
            name = "wasi"
         | 
| 213 | 
            -
            version = "0.11.0+wasi-snapshot-preview1"
         | 
| 214 | 
            -
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 215 | 
            -
            checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
         | 
| 216 | 
            -
             | 
| 217 | 
            -
            [[package]]
         | 
| 218 | 
            -
            name = "zerocopy"
         | 
| 219 | 
            -
            version = "0.7.35"
         | 
| 220 | 
            -
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 221 | 
            -
            checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
         | 
| 222 | 
            -
            dependencies = [
         | 
| 223 | 
            -
             "zerocopy-derive",
         | 
| 224 | 
            -
            ]
         | 
| 225 | 
            -
             | 
| 226 | 
            -
            [[package]]
         | 
| 227 | 
            -
            name = "zerocopy-derive"
         | 
| 228 | 
            -
            version = "0.7.35"
         | 
| 229 | 
            -
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 230 | 
            -
            checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
         | 
| 231 | 
            -
            dependencies = [
         | 
| 232 | 
            -
             "proc-macro2",
         | 
| 233 | 
            -
             "quote",
         | 
| 234 | 
            -
             "syn",
         | 
| 235 | 
            -
            ]
         | 
| @@ -1,6 +1,6 @@ | |
| 1 1 | 
             
            [package]
         | 
| 2 2 | 
             
            name = "ocr_stringdist"
         | 
| 3 | 
            -
            version = "0.0.3" | 
| 3 | 
            +
            version = "0.0.5"
         | 
| 4 4 | 
             
            edition = "2021"
         | 
| 5 5 | 
             
            description = "String distances considering OCR errors."
         | 
| 6 6 | 
             
            authors = ["Niklas von Moers <niklasvmoers@protonmail.com>"]
         | 
| @@ -15,8 +15,6 @@ crate-type = ["cdylib"] | |
| 15 15 |  | 
| 16 16 | 
             
            [dependencies]
         | 
| 17 17 | 
             
            pyo3 = { version = "0.24.0", features = [] }
         | 
| 18 | 
            -
            ahash = "^0.8"
         | 
| 19 | 
            -
            smallvec = "1.15.0"
         | 
| 20 18 |  | 
| 21 19 | 
             
            [features]
         | 
| 22 20 | 
             
            python = []
         | 
| @@ -1,8 +1,8 @@ | |
| 1 1 | 
             
            Metadata-Version: 2.4
         | 
| 2 2 | 
             
            Name: ocr_stringdist
         | 
| 3 | 
            -
            Version: 0.0.3 | 
| 3 | 
            +
            Version: 0.0.5
         | 
| 4 4 | 
             
            Classifier: Programming Language :: Rust
         | 
| 5 | 
            -
            Classifier: Programming Language :: Python | 
| 5 | 
            +
            Classifier: Programming Language :: Python
         | 
| 6 6 | 
             
            Classifier: Operating System :: OS Independent
         | 
| 7 7 | 
             
            License-File: LICENSE
         | 
| 8 8 | 
             
            Summary: String distances considering OCR errors.
         | 
| @@ -17,6 +17,8 @@ Project-URL: repository, https://github.com/NiklasvonM/ocr-stringdist | |
| 17 17 |  | 
| 18 18 | 
             
            A Python library for string distance calculations that account for common OCR (optical character recognition) errors.
         | 
| 19 19 |  | 
| 20 | 
            +
            Documentation: https://niklasvonm.github.io/ocr-stringdist/
         | 
| 21 | 
            +
             | 
| 20 22 | 
             
            [](https://pypi.org/project/ocr-stringdist/)
         | 
| 21 23 | 
             
            [](LICENSE)
         | 
| 22 24 |  | 
| @@ -35,11 +37,16 @@ pip install ocr-stringdist | |
| 35 37 | 
             
            ## Features
         | 
| 36 38 |  | 
| 37 39 | 
             
            - **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
         | 
| 40 | 
            +
            - **Unicode Support**: Arbitrary Unicode strings can be compared.
         | 
| 41 | 
            +
            - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
         | 
| 38 42 | 
             
            - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
         | 
| 39 43 | 
             
            - **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
         | 
| 44 | 
            +
            - **Best Match Finder**: Utility function `find_best_candidate` to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
         | 
| 40 45 |  | 
| 41 46 | 
             
            ## Usage
         | 
| 42 47 |  | 
| 48 | 
            +
            ### Weighted Levenshtein Distance
         | 
| 49 | 
            +
             | 
| 43 50 | 
             
            ```python
         | 
| 44 51 | 
             
            import ocr_stringdist as osd
         | 
| 45 52 |  | 
| @@ -48,16 +55,32 @@ distance = osd.weighted_levenshtein_distance("OCR5", "OCRS") | |
| 48 55 | 
             
            print(f"Distance between 'OCR5' and 'OCRS': {distance}")  # Will be less than 1.0
         | 
| 49 56 |  | 
| 50 57 | 
             
            # Custom cost map
         | 
| 51 | 
            -
            custom_map = {(" | 
| 58 | 
            +
            custom_map = {("In", "h"): 0.5}
         | 
| 52 59 | 
             
            distance = osd.weighted_levenshtein_distance(
         | 
| 53 | 
            -
                " | 
| 60 | 
            +
                "hi", "Ini",
         | 
| 54 61 | 
             
                cost_map=custom_map,
         | 
| 55 62 | 
             
                symmetric=True,
         | 
| 56 | 
            -
                default_cost=1.0
         | 
| 63 | 
            +
                default_cost=1.0,
         | 
| 57 64 | 
             
            )
         | 
| 58 65 | 
             
            print(f"Distance with custom map: {distance}")
         | 
| 59 66 | 
             
            ```
         | 
| 60 67 |  | 
| 68 | 
            +
            ### Finding the Best Candidate
         | 
| 69 | 
            +
             | 
| 70 | 
            +
            ```python
         | 
| 71 | 
            +
            import ocr_stringdist as osd
         | 
| 72 | 
            +
             | 
| 73 | 
            +
            s = "apple"
         | 
| 74 | 
            +
            candidates = ["apply", "apples", "orange", "appIe"]  # 'appIe' has an OCR-like error
         | 
| 75 | 
            +
             | 
| 76 | 
            +
            def ocr_aware_distance(s1: str, s2: str) -> float:
         | 
| 77 | 
            +
                return osd.weighted_levenshtein_distance(s1, s2, cost_map={("l", "I"): 0.1})
         | 
| 78 | 
            +
             | 
| 79 | 
            +
            best_candidate, best_dist = osd.find_best_candidate(s, candidates, ocr_aware_distance)
         | 
| 80 | 
            +
            print(f"Best candidate for '{s}' is '{best_candidate}' with distance {best_dist}")
         | 
| 81 | 
            +
            # Output: Best candidate for 'apple' is 'appIe' with distance 0.1
         | 
| 82 | 
            +
            ```
         | 
| 83 | 
            +
             | 
| 61 84 | 
             
            ## Acknowledgements
         | 
| 62 85 |  | 
| 63 86 | 
             
            This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), which provides the base implementations of the algorithms used here.
         | 
| @@ -2,6 +2,8 @@ | |
| 2 2 |  | 
| 3 3 | 
             
            A Python library for string distance calculations that account for common OCR (optical character recognition) errors.
         | 
| 4 4 |  | 
| 5 | 
            +
            Documentation: https://niklasvonm.github.io/ocr-stringdist/
         | 
| 6 | 
            +
             | 
| 5 7 | 
             
            [](https://pypi.org/project/ocr-stringdist/)
         | 
| 6 8 | 
             
            [](LICENSE)
         | 
| 7 9 |  | 
| @@ -20,11 +22,16 @@ pip install ocr-stringdist | |
| 20 22 | 
             
            ## Features
         | 
| 21 23 |  | 
| 22 24 | 
             
            - **Weighted Levenshtein Distance**: An adaptation of the classic Levenshtein algorithm with custom substitution costs for character pairs that are commonly confused in OCR models.
         | 
| 25 | 
            +
            - **Unicode Support**: Arbitrary Unicode strings can be compared.
         | 
| 26 | 
            +
            - **Substitution of Multiple Characters**: Not just character pairs, but string pairs may be substituted, for example the Korean syllable "이" for the two letters "OI".
         | 
| 23 27 | 
             
            - **Pre-defined OCR Distance Map**: A built-in distance map for common OCR confusions (e.g., "0" vs "O", "1" vs "l", "5" vs "S").
         | 
| 24 28 | 
             
            - **Customizable Cost Maps**: Create your own substitution cost maps for specific OCR systems or domains.
         | 
| 29 | 
            +
            - **Best Match Finder**: Utility function `find_best_candidate` to efficiently find the best matching string from a collection of candidates using any specified distance function (including the library's OCR-aware ones).
         | 
| 25 30 |  | 
| 26 31 | 
             
            ## Usage
         | 
| 27 32 |  | 
| 33 | 
            +
            ### Weighted Levenshtein Distance
         | 
| 34 | 
            +
             | 
| 28 35 | 
             
            ```python
         | 
| 29 36 | 
             
            import ocr_stringdist as osd
         | 
| 30 37 |  | 
| @@ -33,16 +40,32 @@ distance = osd.weighted_levenshtein_distance("OCR5", "OCRS") | |
| 33 40 | 
             
            print(f"Distance between 'OCR5' and 'OCRS': {distance}")  # Will be less than 1.0
         | 
| 34 41 |  | 
| 35 42 | 
             
            # Custom cost map
         | 
| 36 | 
            -
            custom_map = {(" | 
| 43 | 
            +
            custom_map = {("In", "h"): 0.5}
         | 
| 37 44 | 
             
            distance = osd.weighted_levenshtein_distance(
         | 
| 38 | 
            -
                " | 
| 45 | 
            +
                "hi", "Ini",
         | 
| 39 46 | 
             
                cost_map=custom_map,
         | 
| 40 47 | 
             
                symmetric=True,
         | 
| 41 | 
            -
                default_cost=1.0
         | 
| 48 | 
            +
                default_cost=1.0,
         | 
| 42 49 | 
             
            )
         | 
| 43 50 | 
             
            print(f"Distance with custom map: {distance}")
         | 
| 44 51 | 
             
            ```
         | 
| 45 52 |  | 
| 53 | 
            +
            ### Finding the Best Candidate
         | 
| 54 | 
            +
             | 
| 55 | 
            +
            ```python
         | 
| 56 | 
            +
            import ocr_stringdist as osd
         | 
| 57 | 
            +
             | 
| 58 | 
            +
            s = "apple"
         | 
| 59 | 
            +
            candidates = ["apply", "apples", "orange", "appIe"]  # 'appIe' has an OCR-like error
         | 
| 60 | 
            +
             | 
| 61 | 
            +
            def ocr_aware_distance(s1: str, s2: str) -> float:
         | 
| 62 | 
            +
                return osd.weighted_levenshtein_distance(s1, s2, cost_map={("l", "I"): 0.1})
         | 
| 63 | 
            +
             | 
| 64 | 
            +
            best_candidate, best_dist = osd.find_best_candidate(s, candidates, ocr_aware_distance)
         | 
| 65 | 
            +
            print(f"Best candidate for '{s}' is '{best_candidate}' with distance {best_dist}")
         | 
| 66 | 
            +
            # Output: Best candidate for 'apple' is 'appIe' with distance 0.1
         | 
| 67 | 
            +
            ```
         | 
| 68 | 
            +
             | 
| 46 69 | 
             
            ## Acknowledgements
         | 
| 47 70 |  | 
| 48 71 | 
             
            This project is inspired by [jellyfish](https://github.com/jamesturk/jellyfish), which provides the base implementations of the algorithms used here.
         | 
| @@ -0,0 +1,20 @@ | |
| 1 | 
            +
            # Minimal makefile for Sphinx documentation
         | 
| 2 | 
            +
            #
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            # You can set these variables from the command line, and also
         | 
| 5 | 
            +
            # from the environment for the first two.
         | 
| 6 | 
            +
            SPHINXOPTS    ?=
         | 
| 7 | 
            +
            SPHINXBUILD   ?= sphinx-build
         | 
| 8 | 
            +
            SOURCEDIR     = source
         | 
| 9 | 
            +
            BUILDDIR      = build
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            # Put it first so that "make" without argument is like "make help".
         | 
| 12 | 
            +
            help:
         | 
| 13 | 
            +
            	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            .PHONY: help Makefile
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            # Catch-all target: route all unknown targets to Sphinx using the new
         | 
| 18 | 
            +
            # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
         | 
| 19 | 
            +
            %: Makefile
         | 
| 20 | 
            +
            	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
         | 
| @@ -0,0 +1,35 @@ | |
| 1 | 
            +
            @ECHO OFF
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            pushd %~dp0
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            REM Command file for Sphinx documentation
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            if "%SPHINXBUILD%" == "" (
         | 
| 8 | 
            +
            	set SPHINXBUILD=sphinx-build
         | 
| 9 | 
            +
            )
         | 
| 10 | 
            +
            set SOURCEDIR=source
         | 
| 11 | 
            +
            set BUILDDIR=build
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            %SPHINXBUILD% >NUL 2>NUL
         | 
| 14 | 
            +
            if errorlevel 9009 (
         | 
| 15 | 
            +
            	echo.
         | 
| 16 | 
            +
            	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
         | 
| 17 | 
            +
            	echo.installed, then set the SPHINXBUILD environment variable to point
         | 
| 18 | 
            +
            	echo.to the full path of the 'sphinx-build' executable. Alternatively you
         | 
| 19 | 
            +
            	echo.may add the Sphinx directory to PATH.
         | 
| 20 | 
            +
            	echo.
         | 
| 21 | 
            +
            	echo.If you don't have Sphinx installed, grab it from
         | 
| 22 | 
            +
            	echo.https://www.sphinx-doc.org/
         | 
| 23 | 
            +
            	exit /b 1
         | 
| 24 | 
            +
            )
         | 
| 25 | 
            +
             | 
| 26 | 
            +
            if "%1" == "" goto help
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
         | 
| 29 | 
            +
            goto end
         | 
| 30 | 
            +
             | 
| 31 | 
            +
            :help
         | 
| 32 | 
            +
            %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
         | 
| 33 | 
            +
             | 
| 34 | 
            +
            :end
         | 
| 35 | 
            +
            popd
         | 
| @@ -0,0 +1,18 @@ | |
| 1 | 
            +
            .. _api_reference:
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            API Reference
         | 
| 4 | 
            +
            =============
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            This page contains the auto-generated API reference documentation.
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            .. autofunction:: ocr_stringdist.__init__.weighted_levenshtein_distance
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            .. automodule:: ocr_stringdist.matching
         | 
| 11 | 
            +
               :members:
         | 
| 12 | 
            +
               :undoc-members:
         | 
| 13 | 
            +
               :show-inheritance:
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            .. automodule:: ocr_stringdist.default_ocr_distances
         | 
| 16 | 
            +
               :members:
         | 
| 17 | 
            +
               :undoc-members:
         | 
| 18 | 
            +
               :show-inheritance:
         | 
| @@ -0,0 +1,40 @@ | |
| 1 | 
            +
            # Configuration file for the Sphinx documentation builder.
         | 
| 2 | 
            +
            #
         | 
| 3 | 
            +
            # For the full list of built-in configuration values, see the documentation:
         | 
| 4 | 
            +
            # https://www.sphinx-doc.org/en/master/usage/configuration.html
         | 
| 5 | 
            +
             | 
| 6 | 
            +
             | 
| 7 | 
            +
            import os
         | 
| 8 | 
            +
            import sys
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            # source code is in project_root/python/ocr_stringdist
         | 
| 11 | 
            +
            sys.path.insert(0, os.path.abspath("../../python"))
         | 
| 12 | 
            +
             | 
| 13 | 
            +
             | 
| 14 | 
            +
            # -- Project information -----------------------------------------------------
         | 
| 15 | 
            +
            # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            project = "OCR-StringDist"
         | 
| 18 | 
            +
            copyright = "2025, Niklas von Moers"
         | 
| 19 | 
            +
            author = "Niklas von Moers"
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            # -- General configuration ---------------------------------------------------
         | 
| 22 | 
            +
            # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
         | 
| 23 | 
            +
             | 
| 24 | 
            +
            extensions: list[str] = [
         | 
| 25 | 
            +
                "sphinx.ext.autodoc",  # Core library to pull documentation from docstrings
         | 
| 26 | 
            +
                "sphinx.ext.napoleon",  # Support for Google and NumPy style docstrings
         | 
| 27 | 
            +
                "sphinx.ext.intersphinx",  # Link to other projects' documentation
         | 
| 28 | 
            +
                "sphinx.ext.viewcode",  # Add links to source code
         | 
| 29 | 
            +
                "sphinx_mdinclude",  # Include Markdown
         | 
| 30 | 
            +
            ]
         | 
| 31 | 
            +
             | 
| 32 | 
            +
            templates_path = ["_templates"]
         | 
| 33 | 
            +
            exclude_patterns: list[str] = []
         | 
| 34 | 
            +
             | 
| 35 | 
            +
             | 
| 36 | 
            +
            # -- Options for HTML output -------------------------------------------------
         | 
| 37 | 
            +
            # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
         | 
| 38 | 
            +
             | 
| 39 | 
            +
            html_theme = "sphinx_rtd_theme"
         | 
| 40 | 
            +
            html_static_path: list[str] = ["_static"]
         | 
| @@ -1,5 +1,5 @@ | |
| 1 | 
            -
            from ocr_stringdist import weighted_levenshtein_distance
         | 
| 2 1 | 
             
            from icecream import ic
         | 
| 2 | 
            +
            from ocr_stringdist import find_best_candidate, weighted_levenshtein_distance
         | 
| 3 3 |  | 
| 4 4 | 
             
            ic(
         | 
| 5 5 | 
             
                weighted_levenshtein_distance(
         | 
| @@ -17,6 +17,15 @@ ic( | |
| 17 17 | 
             
                )
         | 
| 18 18 | 
             
            )
         | 
| 19 19 |  | 
| 20 | 
            +
            # Substitution of multiple characters at once is supported.
         | 
| 21 | 
            +
            ic(
         | 
| 22 | 
            +
                weighted_levenshtein_distance(
         | 
| 23 | 
            +
                    "이탈리",
         | 
| 24 | 
            +
                    "OI탈리",  # Korean syllables may be confused with multiple Latin letters at once
         | 
| 25 | 
            +
                    {("이", "OI"): 0.5},
         | 
| 26 | 
            +
                ),
         | 
| 27 | 
            +
            )
         | 
| 28 | 
            +
             | 
| 20 29 | 
             
            ic(
         | 
| 21 30 | 
             
                weighted_levenshtein_distance(
         | 
| 22 31 | 
             
                    "ABCDE",
         | 
| @@ -26,16 +35,13 @@ ic( | |
| 26 35 | 
             
                )
         | 
| 27 36 | 
             
            )
         | 
| 28 37 |  | 
| 38 | 
            +
            ic(weighted_levenshtein_distance("A", "B", {("A", "B"): 0.0}, symmetric=False))
         | 
| 39 | 
            +
            ic(weighted_levenshtein_distance("A", "B", {("B", "A"): 0.0}, symmetric=False))
         | 
| 40 | 
            +
             | 
| 29 41 | 
             
            ic(
         | 
| 30 | 
            -
                 | 
| 31 | 
            -
                    " | 
| 32 | 
            -
                    " | 
| 33 | 
            -
                     | 
| 42 | 
            +
                find_best_candidate(
         | 
| 43 | 
            +
                    "apple",
         | 
| 44 | 
            +
                    ["apply", "apples", "orange", "appIe"],
         | 
| 45 | 
            +
                    lambda s1, s2: weighted_levenshtein_distance(s1, s2, {("l", "I"): 0.1}),
         | 
| 34 46 | 
             
                )
         | 
| 35 47 | 
             
            )
         | 
| 36 | 
            -
             | 
| 37 | 
            -
             | 
| 38 | 
            -
            ic(weighted_levenshtein_distance("A", "B", {("A", "B"): 0.0}, symmetric=False))
         | 
| 39 | 
            -
            ic(weighted_levenshtein_distance("A", "B", {("B", "A"): 0.0}, symmetric=False))
         | 
| 40 | 
            -
            ic(weighted_levenshtein_distance("B", "A", {("B", "A"): 0.0}, symmetric=False))
         | 
| 41 | 
            -
            ic(weighted_levenshtein_distance("B", "A", {("A", "B"): 0.0}, symmetric=False))
         | 
| @@ -0,0 +1,137 @@ | |
| 1 | 
            +
            ; Based on https://gist.github.com/CodeByAidan/adb2b9e188256def1fe35b932cba7eb8
         | 
| 2 | 
            +
            [mypy]
         | 
| 3 | 
            +
            check_untyped_defs = True
         | 
| 4 | 
            +
            disallow_any_generics = True
         | 
| 5 | 
            +
            disallow_any_unimported = True
         | 
| 6 | 
            +
            disallow_subclassing_any = True
         | 
| 7 | 
            +
            disallow_untyped_calls = True
         | 
| 8 | 
            +
            disallow_untyped_decorators = True
         | 
| 9 | 
            +
            disallow_untyped_defs = True
         | 
| 10 | 
            +
            ignore_missing_imports = True
         | 
| 11 | 
            +
            no_implicit_optional = True
         | 
| 12 | 
            +
            pretty = True
         | 
| 13 | 
            +
            show_column_numbers = True
         | 
| 14 | 
            +
            show_error_codes = True
         | 
| 15 | 
            +
            show_error_context = True
         | 
| 16 | 
            +
            strict_equality = True
         | 
| 17 | 
            +
            warn_return_any = True
         | 
| 18 | 
            +
            warn_unused_ignores = True
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            ; Everything below this line is just the defaults:
         | 
| 21 | 
            +
            ; -----------------------------------
         | 
| 22 | 
            +
            ; (a flag that is commented out with "=" and nothing after it
         | 
| 23 | 
            +
            ;  has no default value, or its value is user-specific)
         | 
| 24 | 
            +
            ;  e.g. ; mypy_path =
         | 
| 25 | 
            +
            ; -----------------------------------
         | 
| 26 | 
            +
            ; (a flag that is commented out with "=" and a value after it
         | 
| 27 | 
            +
            ;  shows the default value, which has been overridden by the
         | 
| 28 | 
            +
            ;  preferred setting at the top of this file)
         | 
| 29 | 
            +
            ;  e.g. ; ignore_missing_imports = False
         | 
| 30 | 
            +
             | 
| 31 | 
            +
            ; == Import discovery ==
         | 
| 32 | 
            +
            ; mypy_path =
         | 
| 33 | 
            +
            ; files =
         | 
| 34 | 
            +
            ; modules =
         | 
| 35 | 
            +
            ; packages =
         | 
| 36 | 
            +
            ; exclude =
         | 
| 37 | 
            +
            namespace_packages = True
         | 
| 38 | 
            +
            explicit_package_bases = False
         | 
| 39 | 
            +
            ; ignore_missing_imports = False
         | 
| 40 | 
            +
            follow_imports = normal
         | 
| 41 | 
            +
            follow_imports_for_stubs = False
         | 
| 42 | 
            +
            ; python_executable =
         | 
| 43 | 
            +
            no_site_packages = False
         | 
| 44 | 
            +
            no_silence_site_packages = False
         | 
| 45 | 
            +
             | 
| 46 | 
            +
            ; == Platform configuration ==
         | 
| 47 | 
            +
            ; python_version =
         | 
| 48 | 
            +
            ; platform =
         | 
| 49 | 
            +
            ; always_true =
         | 
| 50 | 
            +
            ; always_false =
         | 
| 51 | 
            +
             | 
| 52 | 
            +
            ; == Disallow dynamic typing ==
         | 
| 53 | 
            +
            ; disallow_any_unimported = False
         | 
| 54 | 
            +
            disallow_any_expr = False
         | 
| 55 | 
            +
            disallow_any_decorated = False
         | 
| 56 | 
            +
            disallow_any_explicit = False
         | 
| 57 | 
            +
            ; disallow_any_generics = False
         | 
| 58 | 
            +
            ; disallow_subclassing_any = False
         | 
| 59 | 
            +
             | 
| 60 | 
            +
            ; == Untyped definitions and calls ==
         | 
| 61 | 
            +
            ; disallow_untyped_calls = False
         | 
| 62 | 
            +
            ; untyped_calls_exclude =
         | 
| 63 | 
            +
            ; disallow_untyped_defs = False
         | 
| 64 | 
            +
            disallow_incomplete_defs = False
         | 
| 65 | 
            +
            ; check_untyped_defs = False
         | 
| 66 | 
            +
            ; disallow_untyped_decorators = False
         | 
| 67 | 
            +
             | 
| 68 | 
            +
            ; == None and Optional handling ==
         | 
| 69 | 
            +
            implicit_optional = False
         | 
| 70 | 
            +
            strict_optional = True
         | 
| 71 | 
            +
             | 
| 72 | 
            +
            ; == Configuring warnings ==
         | 
| 73 | 
            +
            warn_redundant_casts = False
         | 
| 74 | 
            +
            ; warn_unused_ignores = False
         | 
| 75 | 
            +
            warn_no_return = True
         | 
| 76 | 
            +
            ; warn_return_any = False
         | 
| 77 | 
            +
            warn_unreachable = False
         | 
| 78 | 
            +
             | 
| 79 | 
            +
            ; == Suppressing errors ==
         | 
| 80 | 
            +
            ignore_errors = False
         | 
| 81 | 
            +
             | 
| 82 | 
            +
            ; == Miscellaneous strictness flags ==
         | 
| 83 | 
            +
            allow_untyped_globals = False
         | 
| 84 | 
            +
            allow_redefinition = False
         | 
| 85 | 
            +
            local_partial_types = False
         | 
| 86 | 
            +
            ; disable_error_code =
         | 
| 87 | 
            +
            ; enable_error_code =
         | 
| 88 | 
            +
            implicit_reexport = True
         | 
| 89 | 
            +
            strict_concatenate = False
         | 
| 90 | 
            +
            ; strict_equality = False
         | 
| 91 | 
            +
            strict = False
         | 
| 92 | 
            +
             | 
| 93 | 
            +
            ; == Configuring error messages ==
         | 
| 94 | 
            +
            ; show_error_context = False
         | 
| 95 | 
            +
            ; show_column_numbers = False
         | 
| 96 | 
            +
            hide_error_codes = False
         | 
| 97 | 
            +
            ; pretty = False
         | 
| 98 | 
            +
            color_output = True
         | 
| 99 | 
            +
            error_summary = True
         | 
| 100 | 
            +
            show_absolute_path = False
         | 
| 101 | 
            +
            force_uppercase_builtins = False
         | 
| 102 | 
            +
            force_union_syntax = False
         | 
| 103 | 
            +
             | 
| 104 | 
            +
            ; == Incremental mode ==
         | 
| 105 | 
            +
            incremental = True
         | 
| 106 | 
            +
            cache_dir = .mypy_cache
         | 
| 107 | 
            +
            sqlite_cache = False
         | 
| 108 | 
            +
            cache_fine_grained = False
         | 
| 109 | 
            +
            skip_version_check = False
         | 
| 110 | 
            +
            skip_cache_mtime_checks = False
         | 
| 111 | 
            +
             | 
| 112 | 
            +
            ; == Advanced options ==
         | 
| 113 | 
            +
            ; plugins =
         | 
| 114 | 
            +
            pdb = False
         | 
| 115 | 
            +
            show_traceback = False
         | 
| 116 | 
            +
            raise_exceptions = False
         | 
| 117 | 
            +
            ; custom_typing_module =
         | 
| 118 | 
            +
            ; custom_typeshed_dir =
         | 
| 119 | 
            +
            warn_incomplete_stub = False
         | 
| 120 | 
            +
             | 
| 121 | 
            +
            ; == Report generation ==
         | 
| 122 | 
            +
            ; any_exprs_report =
         | 
| 123 | 
            +
            ; cobertura_xml_report = ; pip install mypy[reports]
         | 
| 124 | 
            +
            ; html_report = ; pip install mypy[reports]
         | 
| 125 | 
            +
            ; xslt_html_report = ; pip install mypy[reports]
         | 
| 126 | 
            +
            ; linecount_report =
         | 
| 127 | 
            +
            ; linecoverage_report =
         | 
| 128 | 
            +
            ; lineprecision_report =
         | 
| 129 | 
            +
            ; txt_report = ; pip install mypy[reports]
         | 
| 130 | 
            +
            ; xslt_txt_report = ; pip install mypy[reports]
         | 
| 131 | 
            +
            ; xml_report = ; pip install mypy[reports]
         | 
| 132 | 
            +
             | 
| 133 | 
            +
            ; == Miscellaneous ==
         | 
| 134 | 
            +
            ; junit_xml =
         | 
| 135 | 
            +
            scripts_are_modules = False
         | 
| 136 | 
            +
            warn_unused_configs = False
         | 
| 137 | 
            +
            verbosity = 0
         |