ocr-stringdist 1.0.0__tar.gz → 1.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/CHANGELOG.md +11 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/Cargo.lock +1 -1
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/Cargo.toml +1 -1
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/PKG-INFO +1 -1
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/source/examples.rst +2 -1
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/source/index.rst +1 -1
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/ocr_stringdist/__init__.py +2 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/ocr_stringdist/edit_operation.py +5 -2
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/ocr_stringdist/levenshtein.py +2 -1
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/tests/test_weighted_levenshtein.py +10 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/.github/workflows/CI.yml +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/.github/workflows/docs.yml +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/.gitignore +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/Justfile +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/LICENSE +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/README.md +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/Makefile +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/make.bat +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/source/api/index.rst +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/source/changelog.rst +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/source/conf.py +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/source/cost_learning_model.rst +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/source/end_to_end_example.rst +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/source/getting-started.rst +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/examples/batch_processing.py +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/examples/explain_distance.py +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/examples/learn_costs.py +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/examples/weighted_levenshtein.py +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/mypy.ini +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/pyproject.toml +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/ocr_stringdist/default_ocr_distances.py +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/ocr_stringdist/learner.py +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/ocr_stringdist/matching.py +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/ocr_stringdist/protocols.py +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/ocr_stringdist/py.typed +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/tests/test_batch_weighted_levenshtein.py +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/tests/test_explain_weighted_levenshtein.py +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/tests/test_learner.py +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/tests/test_matching.py +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/tests/test_protocols.py +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/ruff.toml +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/src/cost_map.rs +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/src/explanation.rs +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/src/lib.rs +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/src/rust_stringdist.rs +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/src/types.rs +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/src/weighted_levenshtein.rs +0 -0
- {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/uv.lock +0 -0
|
@@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.0.1] - 2025-09-21
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
|
|
12
|
+
- Fix critical bug in `WeightedLevenshtein.from_dict` when using insertion costs.
|
|
13
|
+
|
|
14
|
+
### Added
|
|
15
|
+
|
|
16
|
+
- `from ocr_stringdist import EditOperation`
|
|
17
|
+
- `EditOperation.to_dict()`
|
|
18
|
+
|
|
8
19
|
## [1.0.0] - 2025-09-20
|
|
9
20
|
|
|
10
21
|
### Changed
|
|
@@ -77,4 +77,5 @@ The custom costs can be learned from a dataset of pairs of (OCR output, ground t
|
|
|
77
77
|
distance = learned_wl.distance("Hay", "Hey")
|
|
78
78
|
print(f"Distance with learned costs: {distance}") # < 1.0
|
|
79
79
|
|
|
80
|
-
Note that this only supports learning from character-level edits
|
|
80
|
+
Note that, by default, this only supports learning from character-level edits.
|
|
81
|
+
To learn costs for multi-character tokens, provide an `initial_model` that is already configured with the specific multi-character edits to consider.
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
A Python library to learn, model, explain and correct OCR errors using a fast string distance engine.
|
|
6
6
|
|
|
7
|
-
:Repository: https://
|
|
7
|
+
:Repository: https://github.com/NiklasvonM/ocr-stringdist
|
|
8
8
|
:Current version: |release|
|
|
9
9
|
|
|
10
10
|
.. image:: https://img.shields.io/badge/PyPI-Package-blue
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
from .default_ocr_distances import ocr_distance_map
|
|
2
|
+
from .edit_operation import EditOperation
|
|
2
3
|
from .learner import CostLearner
|
|
3
4
|
from .levenshtein import WeightedLevenshtein
|
|
4
5
|
from .matching import find_best_candidate
|
|
5
6
|
|
|
6
7
|
__all__ = [
|
|
7
8
|
"ocr_distance_map",
|
|
9
|
+
"EditOperation",
|
|
8
10
|
"CostLearner",
|
|
9
11
|
"WeightedLevenshtein",
|
|
10
12
|
"find_best_candidate",
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
from typing import Literal, Optional
|
|
1
|
+
from dataclasses import asdict, dataclass
|
|
2
|
+
from typing import Any, Literal, Optional
|
|
3
3
|
|
|
4
4
|
OperationType = Literal["substitute", "insert", "delete", "match"]
|
|
5
5
|
|
|
@@ -14,3 +14,6 @@ class EditOperation:
|
|
|
14
14
|
source_token: Optional[str]
|
|
15
15
|
target_token: Optional[str]
|
|
16
16
|
cost: float
|
|
17
|
+
|
|
18
|
+
def to_dict(self) -> dict[str, Any]:
|
|
19
|
+
return asdict(self)
|
|
@@ -191,6 +191,7 @@ class WeightedLevenshtein:
|
|
|
191
191
|
For the counterpart, see :meth:`WeightedLevenshtein.to_dict`.
|
|
192
192
|
|
|
193
193
|
:param data: A dictionary with (not necessarily all of) the following keys:
|
|
194
|
+
|
|
194
195
|
- "substitution_costs": {"from": str, "to": str, "cost": float}
|
|
195
196
|
- "substitution_costs": dict[str, float]
|
|
196
197
|
- "deletion_costs": dict[str, float]
|
|
@@ -206,7 +207,7 @@ class WeightedLevenshtein:
|
|
|
206
207
|
|
|
207
208
|
return cls(
|
|
208
209
|
substitution_costs=sub_costs,
|
|
209
|
-
insertion_costs=data.get("
|
|
210
|
+
insertion_costs=data.get("insertion_costs"),
|
|
210
211
|
deletion_costs=data.get("deletion_costs"),
|
|
211
212
|
symmetric_substitution=data.get("symmetric_substitution", True),
|
|
212
213
|
default_substitution_cost=data.get("default_substitution_cost", 1.0),
|
|
@@ -549,3 +549,13 @@ def test_costs_above_default_cost() -> None:
|
|
|
549
549
|
)
|
|
550
550
|
actual_cost = wl.distance("a", "b")
|
|
551
551
|
assert actual_cost == configured_cost
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
def test_serialization() -> None:
|
|
555
|
+
wl_orig = WeightedLevenshtein(
|
|
556
|
+
substitution_costs={("a", "b"): 0.5},
|
|
557
|
+
insertion_costs={"a": 0.5},
|
|
558
|
+
deletion_costs={"b": 0.5},
|
|
559
|
+
)
|
|
560
|
+
wl_loaded = WeightedLevenshtein.from_dict(wl_orig.to_dict())
|
|
561
|
+
assert wl_loaded == wl_orig
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/ocr_stringdist/default_ocr_distances.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/tests/test_batch_weighted_levenshtein.py
RENAMED
|
File without changes
|
{ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/tests/test_explain_weighted_levenshtein.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|