ocr-stringdist 1.0.0__tar.gz → 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/CHANGELOG.md +11 -0
  2. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/Cargo.lock +1 -1
  3. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/Cargo.toml +1 -1
  4. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/PKG-INFO +1 -1
  5. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/source/examples.rst +2 -1
  6. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/source/index.rst +1 -1
  7. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/ocr_stringdist/__init__.py +2 -0
  8. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/ocr_stringdist/edit_operation.py +5 -2
  9. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/ocr_stringdist/levenshtein.py +2 -1
  10. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/tests/test_weighted_levenshtein.py +10 -0
  11. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/.github/workflows/CI.yml +0 -0
  12. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/.github/workflows/docs.yml +0 -0
  13. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/.gitignore +0 -0
  14. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/Justfile +0 -0
  15. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/LICENSE +0 -0
  16. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/README.md +0 -0
  17. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/Makefile +0 -0
  18. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/make.bat +0 -0
  19. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/source/api/index.rst +0 -0
  20. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/source/changelog.rst +0 -0
  21. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/source/conf.py +0 -0
  22. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/source/cost_learning_model.rst +0 -0
  23. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/source/end_to_end_example.rst +0 -0
  24. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/docs/source/getting-started.rst +0 -0
  25. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/examples/batch_processing.py +0 -0
  26. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/examples/explain_distance.py +0 -0
  27. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/examples/learn_costs.py +0 -0
  28. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/examples/weighted_levenshtein.py +0 -0
  29. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/mypy.ini +0 -0
  30. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/pyproject.toml +0 -0
  31. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/ocr_stringdist/default_ocr_distances.py +0 -0
  32. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/ocr_stringdist/learner.py +0 -0
  33. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/ocr_stringdist/matching.py +0 -0
  34. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/ocr_stringdist/protocols.py +0 -0
  35. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/ocr_stringdist/py.typed +0 -0
  36. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/tests/test_batch_weighted_levenshtein.py +0 -0
  37. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/tests/test_explain_weighted_levenshtein.py +0 -0
  38. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/tests/test_learner.py +0 -0
  39. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/tests/test_matching.py +0 -0
  40. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/python/tests/test_protocols.py +0 -0
  41. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/ruff.toml +0 -0
  42. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/src/cost_map.rs +0 -0
  43. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/src/explanation.rs +0 -0
  44. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/src/lib.rs +0 -0
  45. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/src/rust_stringdist.rs +0 -0
  46. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/src/types.rs +0 -0
  47. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/src/weighted_levenshtein.rs +0 -0
  48. {ocr_stringdist-1.0.0 → ocr_stringdist-1.0.1}/uv.lock +0 -0
@@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.0.1] - 2025-09-21
9
+
10
+ ### Fixed
11
+
12
+ - Fix critical bug in `WeightedLevenshtein.from_dict` when using insertion costs.
13
+
14
+ ### Added
15
+
16
+ - `from ocr_stringdist import EditOperation`
17
+ - `EditOperation.to_dict()`
18
+
8
19
  ## [1.0.0] - 2025-09-20
9
20
 
10
21
  ### Changed
@@ -74,7 +74,7 @@ dependencies = [
74
74
 
75
75
  [[package]]
76
76
  name = "ocr_stringdist"
77
- version = "1.0.0"
77
+ version = "1.0.1"
78
78
  dependencies = [
79
79
  "pyo3",
80
80
  "rayon",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "ocr_stringdist"
3
- version = "1.0.0"
3
+ version = "1.0.1"
4
4
  edition = "2021"
5
5
  description = "String distances considering OCR errors."
6
6
  authors = ["Niklas von Moers <niklasvmoers@protonmail.com>"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ocr-stringdist
3
- Version: 1.0.0
3
+ Version: 1.0.1
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python :: Implementation :: CPython
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -77,4 +77,5 @@ The custom costs can be learned from a dataset of pairs of (OCR output, ground t
77
77
  distance = learned_wl.distance("Hay", "Hey")
78
78
  print(f"Distance with learned costs: {distance}") # < 1.0
79
79
 
80
- Note that this only supports learning from character-level edits, not multi-character tokens.
80
+ Note that this by default only supports learning from character-level edits.
81
+ If multi-character tokens are to be considered, an `initial_model` that's already configured to know specific multi-character edits needs to be provided.
@@ -4,7 +4,7 @@
4
4
 
5
5
  A Python library to learn, model, explain and correct OCR errors using a fast string distance engine.
6
6
 
7
- :Repository: https://niklasvonm.github.io/ocr-stringdist/
7
+ :Repository: https://github.com/NiklasvonM/ocr-stringdist
8
8
  :Current version: |release|
9
9
 
10
10
  .. image:: https://img.shields.io/badge/PyPI-Package-blue
@@ -1,10 +1,12 @@
1
1
  from .default_ocr_distances import ocr_distance_map
2
+ from .edit_operation import EditOperation
2
3
  from .learner import CostLearner
3
4
  from .levenshtein import WeightedLevenshtein
4
5
  from .matching import find_best_candidate
5
6
 
6
7
  __all__ = [
7
8
  "ocr_distance_map",
9
+ "EditOperation",
8
10
  "CostLearner",
9
11
  "WeightedLevenshtein",
10
12
  "find_best_candidate",
@@ -1,5 +1,5 @@
1
- from dataclasses import dataclass
2
- from typing import Literal, Optional
1
+ from dataclasses import asdict, dataclass
2
+ from typing import Any, Literal, Optional
3
3
 
4
4
  OperationType = Literal["substitute", "insert", "delete", "match"]
5
5
 
@@ -14,3 +14,6 @@ class EditOperation:
14
14
  source_token: Optional[str]
15
15
  target_token: Optional[str]
16
16
  cost: float
17
+
18
+ def to_dict(self) -> dict[str, Any]:
19
+ return asdict(self)
@@ -191,6 +191,7 @@ class WeightedLevenshtein:
191
191
  For the counterpart, see :meth:`WeightedLevenshtein.to_dict`.
192
192
 
193
193
  :param data: A dictionary with (not necessarily all of) the following keys:
194
+
194
195
  - "substitution_costs": {"from": str, "to": str, "cost": float}
195
196
  - "insertion_costs": dict[str, float]
196
197
  - "deletion_costs": dict[str, float]
@@ -206,7 +207,7 @@ class WeightedLevenshtein:
206
207
 
207
208
  return cls(
208
209
  substitution_costs=sub_costs,
209
- insertion_costs=data.get("substitution_costs"),
210
+ insertion_costs=data.get("insertion_costs"),
210
211
  deletion_costs=data.get("deletion_costs"),
211
212
  symmetric_substitution=data.get("symmetric_substitution", True),
212
213
  default_substitution_cost=data.get("default_substitution_cost", 1.0),
@@ -549,3 +549,13 @@ def test_costs_above_default_cost() -> None:
549
549
  )
550
550
  actual_cost = wl.distance("a", "b")
551
551
  assert actual_cost == configured_cost
552
+
553
+
554
+ def test_serialization() -> None:
555
+ wl_orig = WeightedLevenshtein(
556
+ substitution_costs={("a", "b"): 0.5},
557
+ insertion_costs={"a": 0.5},
558
+ deletion_costs={"b": 0.5},
559
+ )
560
+ wl_loaded = WeightedLevenshtein.from_dict(wl_orig.to_dict())
561
+ assert wl_loaded == wl_orig
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes