onnxtr 0.6.3.tar.gz → 0.7.1.tar.gz

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (86)
  1. {onnxtr-0.6.3 → onnxtr-0.7.1}/PKG-INFO +5 -3
  2. {onnxtr-0.6.3 → onnxtr-0.7.1}/README.md +4 -2
  3. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/predictor/predictor.py +1 -1
  4. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/recognition/models/__init__.py +1 -0
  5. onnxtr-0.7.1/onnxtr/models/recognition/models/viptr.py +179 -0
  6. onnxtr-0.7.1/onnxtr/models/recognition/predictor/_utils.py +145 -0
  7. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/recognition/predictor/base.py +3 -3
  8. onnxtr-0.7.1/onnxtr/models/recognition/utils.py +93 -0
  9. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/recognition/zoo.py +10 -1
  10. onnxtr-0.7.1/onnxtr/utils/vocabs.py +1140 -0
  11. onnxtr-0.7.1/onnxtr/version.py +1 -0
  12. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr.egg-info/PKG-INFO +5 -3
  13. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr.egg-info/SOURCES.txt +1 -0
  14. {onnxtr-0.6.3 → onnxtr-0.7.1}/setup.py +1 -1
  15. onnxtr-0.6.3/onnxtr/models/recognition/predictor/_utils.py +0 -83
  16. onnxtr-0.6.3/onnxtr/models/recognition/utils.py +0 -84
  17. onnxtr-0.6.3/onnxtr/utils/vocabs.py +0 -155
  18. onnxtr-0.6.3/onnxtr/version.py +0 -1
  19. {onnxtr-0.6.3 → onnxtr-0.7.1}/LICENSE +0 -0
  20. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/__init__.py +0 -0
  21. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/contrib/__init__.py +0 -0
  22. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/contrib/artefacts.py +0 -0
  23. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/contrib/base.py +0 -0
  24. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/file_utils.py +0 -0
  25. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/io/__init__.py +0 -0
  26. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/io/elements.py +0 -0
  27. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/io/html.py +0 -0
  28. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/io/image.py +0 -0
  29. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/io/pdf.py +0 -0
  30. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/io/reader.py +0 -0
  31. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/__init__.py +0 -0
  32. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/_utils.py +0 -0
  33. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/builder.py +0 -0
  34. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/classification/__init__.py +0 -0
  35. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/classification/models/__init__.py +0 -0
  36. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/classification/models/mobilenet.py +0 -0
  37. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/classification/predictor/__init__.py +0 -0
  38. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/classification/predictor/base.py +0 -0
  39. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/classification/zoo.py +0 -0
  40. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/detection/__init__.py +0 -0
  41. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/detection/_utils/__init__.py +0 -0
  42. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/detection/_utils/base.py +0 -0
  43. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/detection/core.py +0 -0
  44. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/detection/models/__init__.py +0 -0
  45. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/detection/models/differentiable_binarization.py +0 -0
  46. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/detection/models/fast.py +0 -0
  47. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/detection/models/linknet.py +0 -0
  48. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/detection/postprocessor/__init__.py +0 -0
  49. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/detection/postprocessor/base.py +0 -0
  50. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/detection/predictor/__init__.py +0 -0
  51. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/detection/predictor/base.py +0 -0
  52. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/detection/zoo.py +0 -0
  53. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/engine.py +0 -0
  54. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/factory/__init__.py +0 -0
  55. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/factory/hub.py +0 -0
  56. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/predictor/__init__.py +0 -0
  57. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/predictor/base.py +0 -0
  58. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/preprocessor/__init__.py +0 -0
  59. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/preprocessor/base.py +0 -0
  60. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/recognition/__init__.py +0 -0
  61. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/recognition/core.py +0 -0
  62. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/recognition/models/crnn.py +0 -0
  63. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/recognition/models/master.py +0 -0
  64. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/recognition/models/parseq.py +0 -0
  65. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/recognition/models/sar.py +0 -0
  66. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/recognition/models/vitstr.py +0 -0
  67. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/recognition/predictor/__init__.py +0 -0
  68. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/zoo.py +0 -0
  69. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/py.typed +0 -0
  70. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/transforms/__init__.py +0 -0
  71. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/transforms/base.py +0 -0
  72. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/utils/__init__.py +0 -0
  73. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/utils/common_types.py +0 -0
  74. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/utils/data.py +0 -0
  75. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/utils/fonts.py +0 -0
  76. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/utils/geometry.py +0 -0
  77. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/utils/multithreading.py +0 -0
  78. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/utils/reconstitution.py +0 -0
  79. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/utils/repr.py +0 -0
  80. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/utils/visualization.py +0 -0
  81. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr.egg-info/dependency_links.txt +0 -0
  82. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr.egg-info/requires.txt +0 -0
  83. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr.egg-info/top_level.txt +0 -0
  84. {onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr.egg-info/zip-safe +0 -0
  85. {onnxtr-0.6.3 → onnxtr-0.7.1}/pyproject.toml +0 -0
  86. {onnxtr-0.6.3 → onnxtr-0.7.1}/setup.cfg +0 -0

{onnxtr-0.6.3 → onnxtr-0.7.1}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: onnxtr
- Version: 0.6.3
+ Version: 0.7.1
  Summary: Onnx Text Recognition (OnnxTR): docTR Onnx-Wrapper for high-performance OCR on documents.
  Author-email: Felix Dittrich <felixdittrich92@gmail.com>
  Maintainer: Felix Dittrich
@@ -292,8 +292,8 @@ Dynamic: license-file
  [![codecov](https://codecov.io/gh/felixdittrich92/OnnxTR/graph/badge.svg?token=WVFRCQBOLI)](https://codecov.io/gh/felixdittrich92/OnnxTR)
  [![Codacy Badge](https://app.codacy.com/project/badge/Grade/4fff4d764bb14fb8b4f4afeb9587231b)](https://app.codacy.com/gh/felixdittrich92/OnnxTR/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade)
  [![CodeFactor](https://www.codefactor.io/repository/github/felixdittrich92/onnxtr/badge)](https://www.codefactor.io/repository/github/felixdittrich92/onnxtr)
- [![Socket Badge](https://socket.dev/api/badge/pypi/package/onnxtr/0.6.2?artifact_id=tar-gz)](https://socket.dev/pypi/package/onnxtr/overview/0.6.2/tar-gz)
- [![Pypi](https://img.shields.io/badge/pypi-v0.6.3-blue.svg)](https://pypi.org/project/OnnxTR/)
+ [![Socket Badge](https://socket.dev/api/badge/pypi/package/onnxtr/0.7.1?artifact_id=tar-gz)](https://socket.dev/pypi/package/onnxtr/overview/0.7.1/tar-gz)
+ [![Pypi](https://img.shields.io/badge/pypi-v0.7.1-blue.svg)](https://pypi.org/project/OnnxTR/)
  [![Docker Images](https://img.shields.io/badge/Docker-4287f5?style=flat&logo=docker&logoColor=white)](https://github.com/felixdittrich92/OnnxTR/pkgs/container/onnxtr)
  [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Felix92/OnnxTR-OCR)
  ![PyPI - Downloads](https://img.shields.io/pypi/dm/onnxtr)
@@ -575,6 +575,7 @@ Credits where it's due: this repository provides ONNX models for the following a
  - MASTER: [MASTER: Multi-Aspect Non-local Network for Scene Text Recognition](https://arxiv.org/pdf/1910.02562.pdf).
  - ViTSTR: [Vision Transformer for Fast and Efficient Scene Text Recognition](https://arxiv.org/pdf/2105.08582.pdf).
  - PARSeq: [Scene Text Recognition with Permuted Autoregressive Sequence Models](https://arxiv.org/pdf/2207.06966).
+ - VIPTR: [A Vision Permutable Extractor for Fast and Efficient Scene Text Recognition](https://arxiv.org/abs/2401.10110).

  ```python
  predictor = ocr_predictor()
@@ -602,6 +603,7 @@ predictor.list_archs()
  'vitstr_small',
  'vitstr_base',
  'parseq'
+ 'viptr_tiny', # No 8-bit support
  ]
  }
  ```

{onnxtr-0.6.3 → onnxtr-0.7.1}/README.md

@@ -7,8 +7,8 @@
  [![codecov](https://codecov.io/gh/felixdittrich92/OnnxTR/graph/badge.svg?token=WVFRCQBOLI)](https://codecov.io/gh/felixdittrich92/OnnxTR)
  [![Codacy Badge](https://app.codacy.com/project/badge/Grade/4fff4d764bb14fb8b4f4afeb9587231b)](https://app.codacy.com/gh/felixdittrich92/OnnxTR/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade)
  [![CodeFactor](https://www.codefactor.io/repository/github/felixdittrich92/onnxtr/badge)](https://www.codefactor.io/repository/github/felixdittrich92/onnxtr)
- [![Socket Badge](https://socket.dev/api/badge/pypi/package/onnxtr/0.6.2?artifact_id=tar-gz)](https://socket.dev/pypi/package/onnxtr/overview/0.6.2/tar-gz)
- [![Pypi](https://img.shields.io/badge/pypi-v0.6.3-blue.svg)](https://pypi.org/project/OnnxTR/)
+ [![Socket Badge](https://socket.dev/api/badge/pypi/package/onnxtr/0.7.1?artifact_id=tar-gz)](https://socket.dev/pypi/package/onnxtr/overview/0.7.1/tar-gz)
+ [![Pypi](https://img.shields.io/badge/pypi-v0.7.1-blue.svg)](https://pypi.org/project/OnnxTR/)
  [![Docker Images](https://img.shields.io/badge/Docker-4287f5?style=flat&logo=docker&logoColor=white)](https://github.com/felixdittrich92/OnnxTR/pkgs/container/onnxtr)
  [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Felix92/OnnxTR-OCR)
  ![PyPI - Downloads](https://img.shields.io/pypi/dm/onnxtr)
@@ -290,6 +290,7 @@ Credits where it's due: this repository provides ONNX models for the following a
  - MASTER: [MASTER: Multi-Aspect Non-local Network for Scene Text Recognition](https://arxiv.org/pdf/1910.02562.pdf).
  - ViTSTR: [Vision Transformer for Fast and Efficient Scene Text Recognition](https://arxiv.org/pdf/2105.08582.pdf).
  - PARSeq: [Scene Text Recognition with Permuted Autoregressive Sequence Models](https://arxiv.org/pdf/2207.06966).
+ - VIPTR: [A Vision Permutable Extractor for Fast and Efficient Scene Text Recognition](https://arxiv.org/abs/2401.10110).

  ```python
  predictor = ocr_predictor()
@@ -317,6 +318,7 @@ predictor.list_archs()
  'vitstr_small',
  'vitstr_base',
  'parseq'
+ 'viptr_tiny', # No 8-bit support
  ]
  }
  ```
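
In practice, the new recognition architecture is selected by its arch name, just like the existing ones. A minimal end-to-end sketch (the `det_arch`/`reco_arch` keyword names and the `DocumentFile` helper follow the docTR-style API that OnnxTR mirrors; the input file name is hypothetical):

```python
from onnxtr.io import DocumentFile
from onnxtr.models import ocr_predictor

# Hypothetical input file; any image or PDF loader from onnxtr.io works the same way.
doc = DocumentFile.from_images(["sample_page.jpg"])

# Pick the recognition backbone added in 0.7.1 by its arch string.
predictor = ocr_predictor(det_arch="fast_base", reco_arch="viptr_tiny")
result = predictor(doc)
print(result.render())
```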

{onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/predictor/predictor.py

@@ -115,7 +115,7 @@ class OCRPredictor(NestedObject, _OCRPredictor):
  # Crop images
  crops, loc_preds = self._prepare_crops(
  pages,
- loc_preds, # type: ignore[arg-type]
+ loc_preds,
  channels_last=True,
  assume_straight_pages=self.assume_straight_pages,
  assume_horizontal=self._page_orientation_disabled,

{onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/recognition/models/__init__.py

@@ -3,3 +3,4 @@ from .sar import *
  from .master import *
  from .vitstr import *
  from .parseq import *
+ from .viptr import *

onnxtr-0.7.1/onnxtr/models/recognition/models/viptr.py (new file)

@@ -0,0 +1,179 @@
+ # Copyright (C) 2021-2025, Mindee | Felix Dittrich.
+
+ # This program is licensed under the Apache License 2.0.
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+
+ import logging
+ from copy import deepcopy
+ from itertools import groupby
+ from typing import Any
+
+ import numpy as np
+ from scipy.special import softmax
+
+ from onnxtr.utils import VOCABS
+
+ from ...engine import Engine, EngineConfig
+ from ..core import RecognitionPostProcessor
+
+ __all__ = ["VIPTR", "viptr_tiny"]
+
+ default_cfgs: dict[str, dict[str, Any]] = {
+     "viptr_tiny": {
+         "mean": (0.694, 0.695, 0.693),
+         "std": (0.299, 0.296, 0.301),
+         "input_shape": (3, 32, 128),
+         "vocab": VOCABS["french"],
+         "url": "https://github.com/felixdittrich92/OnnxTR/releases/download/v0.6.3/viptr_tiny-499b8015.onnx",
+         "url_8_bit": "https://github.com/felixdittrich92/OnnxTR/releases/download/v0.6.3/viptr_tiny-499b8015.onnx",
+     },
+ }
+
+
+ class VIPTRPostProcessor(RecognitionPostProcessor):
+     """Postprocess raw prediction of the model (logits) to a list of words using CTC decoding
+
+     Args:
+         vocab: string containing the ordered sequence of supported characters
+     """
+
+     def __init__(self, vocab):
+         self.vocab = vocab
+
+     def decode_sequence(self, sequence, vocab):
+         return "".join([vocab[int(char)] for char in sequence])
+
+     def ctc_best_path(
+         self,
+         logits,
+         vocab,
+         blank=0,
+     ):
+         """Implements best path decoding as shown by Graves (Dissertation, p63), highly inspired from
+         <https://github.com/githubharald/CTCDecoder>`_.
+
+         Args:
+             logits: model output, shape: N x T x C
+             vocab: vocabulary to use
+             blank: index of blank label
+
+         Returns:
+             A list of tuples: (word, confidence)
+         """
+         # Gather the most confident characters, and assign the smallest conf among those to the sequence prob
+         probs = softmax(logits, axis=-1).max(axis=-1).min(axis=1)
+
+         # collapse best path (using itertools.groupby), map to chars, join char list to string
+         words = [
+             self.decode_sequence([k for k, _ in groupby(seq.tolist()) if k != blank], vocab)
+             for seq in np.argmax(logits, axis=-1)
+         ]
+
+         return list(zip(words, probs.astype(float).tolist()))
+
+     def __call__(self, logits):
+         """Performs decoding of raw output with CTC and decoding of CTC predictions
+         with label_to_idx mapping dictionnary
+
+         Args:
+             logits: raw output of the model, shape (N, C + 1, seq_len)
+
+         Returns:
+             A tuple of 2 lists: a list of str (words) and a list of float (probs)
+
+         """
+         # Decode CTC
+         return self.ctc_best_path(logits=logits, vocab=self.vocab, blank=len(self.vocab))
+
+
+ class VIPTR(Engine):
+     """VIPTR Onnx loader
+
+     Args:
+         model_path: path or url to onnx model file
+         vocab: vocabulary used for encoding
+         engine_cfg: configuration for the inference engine
+         cfg: configuration dictionary
+         **kwargs: additional arguments to be passed to `Engine`
+     """
+
+     _children_names: list[str] = ["postprocessor"]
+
+     def __init__(
+         self,
+         model_path: str,
+         vocab: str,
+         engine_cfg: EngineConfig | None = None,
+         cfg: dict[str, Any] | None = None,
+         **kwargs: Any,
+     ) -> None:
+         super().__init__(url=model_path, engine_cfg=engine_cfg, **kwargs)
+
+         self.vocab = vocab
+         self.cfg = cfg
+
+         self.postprocessor = VIPTRPostProcessor(self.vocab)
+
+     def __call__(
+         self,
+         x: np.ndarray,
+         return_model_output: bool = False,
+     ) -> dict[str, Any]:
+         logits = self.run(x)
+
+         out: dict[str, Any] = {}
+         if return_model_output:
+             out["out_map"] = logits
+
+         # Post-process
+         out["preds"] = self.postprocessor(logits)
+
+         return out
+
+
+ def _viptr(
+     arch: str,
+     model_path: str,
+     load_in_8_bit: bool = False,
+     engine_cfg: EngineConfig | None = None,
+     **kwargs: Any,
+ ) -> VIPTR:
+     if load_in_8_bit:
+         logging.warning("VIPTR models do not support 8-bit quantization yet. Loading full precision model...")
+     kwargs["vocab"] = kwargs.get("vocab", default_cfgs[arch]["vocab"])
+
+     _cfg = deepcopy(default_cfgs[arch])
+     _cfg["vocab"] = kwargs["vocab"]
+     _cfg["input_shape"] = kwargs.get("input_shape", default_cfgs[arch]["input_shape"])
+     # Patch the url
+     model_path = default_cfgs[arch]["url_8_bit"] if load_in_8_bit and "http" in model_path else model_path
+
+     # Build the model
+     return VIPTR(model_path, cfg=_cfg, engine_cfg=engine_cfg, **kwargs)
+
+
+ def viptr_tiny(
+     model_path: str = default_cfgs["viptr_tiny"]["url"],
+     load_in_8_bit: bool = False,
+     engine_cfg: EngineConfig | None = None,
+     **kwargs: Any,
+ ) -> VIPTR:
+     """VIPTR as described in `"A Vision Permutable Extractor for Fast and Efficient
+     Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
+
+     >>> import numpy as np
+     >>> from onnxtr.models import viptr_tiny
+     >>> model = viptr_tiny()
+     >>> input_tensor = np.random.rand(1, 3, 32, 128)
+     >>> out = model(input_tensor)
+
+     Args:
+         model_path: path to onnx model file, defaults to url in default_cfgs
+         load_in_8_bit: whether to load the the 8-bit quantized model, defaults to False
+         engine_cfg: configuration for the inference engine
+         **kwargs: keyword arguments of the VIPTR architecture
+
+     Returns:
+         text recognition architecture
+     """
+     return _viptr("viptr_tiny", model_path, load_in_8_bit, engine_cfg, **kwargs)
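
The post-processor above is plain CTC best-path decoding: take the argmax class per time step, collapse repeats, and drop the blank label, which is indexed at `len(vocab)`. A toy sketch of that behavior on hand-built logits (the import path follows the new module's location; this is not an example from the package):

```python
import numpy as np
from onnxtr.models.recognition.models.viptr import VIPTRPostProcessor

vocab = "abc"  # 3 characters, so class index 3 acts as the CTC blank
postprocessor = VIPTRPostProcessor(vocab)

# One sequence of 5 time steps over 4 classes, shape (N, T, C + 1).
logits = np.full((1, 5, 4), -10.0, dtype=np.float32)
for t, cls in enumerate([0, 0, 3, 1, 1]):  # "aa<blank>bb" collapses to "ab"
    logits[0, t, cls] = 10.0

print(postprocessor(logits))  # ~[('ab', 1.0)]
```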

onnxtr-0.7.1/onnxtr/models/recognition/predictor/_utils.py (new file)

@@ -0,0 +1,145 @@
+ # Copyright (C) 2021-2025, Mindee | Felix Dittrich.
+
+ # This program is licensed under the Apache License 2.0.
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+
+
+ import math
+
+ import numpy as np
+
+ from ..utils import merge_multi_strings
+
+ __all__ = ["split_crops", "remap_preds"]
+
+
+ def split_crops(
+     crops: list[np.ndarray],
+     max_ratio: float,
+     target_ratio: int,
+     split_overlap_ratio: float,
+     channels_last: bool = True,
+ ) -> tuple[list[np.ndarray], list[int | tuple[int, int, float]], bool]:
+     """
+     Split crops horizontally if they exceed a given aspect ratio.
+
+     Args:
+         crops: List of image crops (H, W, C) if channels_last else (C, H, W).
+         max_ratio: Aspect ratio threshold above which crops are split.
+         target_ratio: Target aspect ratio after splitting (e.g., 4 for 128x32).
+         split_overlap_ratio: Desired overlap between splits (as a fraction of split width).
+         channels_last: Whether the crops are in channels-last format.
+
+     Returns:
+         A tuple containing:
+         - The new list of crops (possibly with splits),
+         - A mapping indicating how to reassemble predictions,
+         - A boolean indicating whether remapping is required.
+     """
+     if split_overlap_ratio <= 0.0 or split_overlap_ratio >= 1.0:
+         raise ValueError(f"Valid range for split_overlap_ratio is (0.0, 1.0), but is: {split_overlap_ratio}")
+
+     remap_required = False
+     new_crops: list[np.ndarray] = []
+     crop_map: list[int | tuple[int, int, float]] = []
+
+     for crop in crops:
+         h, w = crop.shape[:2] if channels_last else crop.shape[-2:]
+         aspect_ratio = w / h
+
+         if aspect_ratio > max_ratio:
+             split_width = max(1, math.ceil(h * target_ratio))
+             overlap_width = max(0, math.floor(split_width * split_overlap_ratio))
+
+             splits, last_overlap = _split_horizontally(crop, split_width, overlap_width, channels_last)
+
+             # Remove any empty splits
+             splits = [s for s in splits if all(dim > 0 for dim in s.shape)]
+             if splits:
+                 crop_map.append((len(new_crops), len(new_crops) + len(splits), last_overlap))
+                 new_crops.extend(splits)
+                 remap_required = True
+             else:
+                 # Fallback: treat it as a single crop
+                 crop_map.append(len(new_crops))
+                 new_crops.append(crop)
+         else:
+             crop_map.append(len(new_crops))
+             new_crops.append(crop)
+
+     return new_crops, crop_map, remap_required
+
+
+ def _split_horizontally(
+     image: np.ndarray, split_width: int, overlap_width: int, channels_last: bool
+ ) -> tuple[list[np.ndarray], float]:
+     """
+     Horizontally split a single image with overlapping regions.
+
+     Args:
+         image: The image to split (H, W, C) if channels_last else (C, H, W).
+         split_width: Width of each split.
+         overlap_width: Width of the overlapping region.
+         channels_last: Whether the image is in channels-last format.
+
+     Returns:
+         - A list of horizontal image slices.
+         - The actual overlap ratio of the last split.
+     """
+     image_width = image.shape[1] if channels_last else image.shape[-1]
+     if image_width <= split_width:
+         return [image], 0.0
+
+     # Compute start columns for each split
+     step = split_width - overlap_width
+     starts = list(range(0, image_width - split_width + 1, step))
+
+     # Ensure the last patch reaches the end of the image
+     if starts[-1] + split_width < image_width:
+         starts.append(image_width - split_width)
+
+     splits = []
+     for start_col in starts:
+         end_col = start_col + split_width
+         if channels_last:
+             split = image[:, start_col:end_col, :]
+         else:
+             split = image[:, :, start_col:end_col]
+         splits.append(split)
+
+     # Calculate the last overlap ratio, if only one split no overlap
+     last_overlap = 0
+     if len(starts) > 1:
+         last_overlap = (starts[-2] + split_width) - starts[-1]
+     last_overlap_ratio = last_overlap / split_width if split_width else 0.0
+
+     return splits, last_overlap_ratio
+
+
+ def remap_preds(
+     preds: list[tuple[str, float]],
+     crop_map: list[int | tuple[int, int, float]],
+     overlap_ratio: float,
+ ) -> list[tuple[str, float]]:
+     """
+     Reconstruct predictions from possibly split crops.
+
+     Args:
+         preds: List of (text, confidence) tuples from each crop.
+         crop_map: Map returned by `split_crops`.
+         overlap_ratio: Overlap ratio used during splitting.
+
+     Returns:
+         List of merged (text, confidence) tuples corresponding to original crops.
+     """
+     remapped = []
+     for item in crop_map:
+         if isinstance(item, int):
+             remapped.append(preds[item])
+         else:
+             start_idx, end_idx, last_overlap = item
+             text_parts, confidences = zip(*preds[start_idx:end_idx])
+             merged_text = merge_multi_strings(list(text_parts), overlap_ratio, last_overlap)
+             merged_conf = sum(confidences) / len(confidences) # average confidence
+             remapped.append((merged_text, merged_conf))
+     return remapped
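
These helpers replace the previous dilation-based splitting: a crop whose aspect ratio exceeds `max_ratio` is cut into overlapping slices roughly `h * target_ratio` pixels wide, and `crop_map` records which slices belong to which original crop so the per-slice predictions can be stitched back together. A rough usage sketch with the defaults used by `RecognitionPredictor` below (not taken from the package's tests):

```python
import numpy as np
from onnxtr.models.recognition.predictor._utils import split_crops, remap_preds

wide = np.zeros((32, 512, 3), dtype=np.uint8)    # aspect ratio 16 -> gets split
narrow = np.zeros((32, 96, 3), dtype=np.uint8)   # aspect ratio 3 -> kept as-is

crops, crop_map, remapped = split_crops(
    [wide, narrow],
    max_ratio=8,              # critical aspect ratio
    target_ratio=6,           # each slice is about 32 * 6 = 192 px wide
    split_overlap_ratio=0.5,  # neighboring slices overlap by half a slice
)
print(len(crops), crop_map, remapped)  # 6 crops, [(0, 5, ...), 5], True

# After recognition, slice predictions are merged back per original crop.
fake_preds = [("part", 0.9)] * len(crops)
print(remap_preds(fake_preds, crop_map, overlap_ratio=0.5))  # 2 merged predictions
```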

{onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/recognition/predictor/base.py

@@ -36,7 +36,7 @@ class RecognitionPredictor(NestedObject):
  self.model = model
  self.split_wide_crops = split_wide_crops
  self.critical_ar = 8 # Critical aspect ratio
- self.dil_factor = 1.4 # Dilation factor to overlap the crops
+ self.overlap_ratio = 0.5 # Ratio of overlap between neighboring crops
  self.target_ar = 6 # Target aspect ratio

  def __call__(
@@ -57,7 +57,7 @@ class RecognitionPredictor(NestedObject):
  crops, # type: ignore[arg-type]
  self.critical_ar,
  self.target_ar,
- self.dil_factor,
+ self.overlap_ratio,
  True,
  )
  if remapped:
@@ -74,6 +74,6 @@ class RecognitionPredictor(NestedObject):

  # Remap crops
  if self.split_wide_crops and remapped:
- out = remap_preds(out, crop_map, self.dil_factor)
+ out = remap_preds(out, crop_map, self.overlap_ratio)

  return out

onnxtr-0.7.1/onnxtr/models/recognition/utils.py (new file)

@@ -0,0 +1,93 @@
+ # Copyright (C) 2021-2025, Mindee | Felix Dittrich.
+
+ # This program is licensed under the Apache License 2.0.
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+
+
+ from rapidfuzz.distance import Hamming
+
+ __all__ = ["merge_strings", "merge_multi_strings"]
+
+
+ def merge_strings(a: str, b: str, overlap_ratio: float) -> str:
+     """Merges 2 character sequences in the best way to maximize the alignment of their overlapping characters.
+
+     Args:
+         a: first char seq, suffix should be similar to b's prefix.
+         b: second char seq, prefix should be similar to a's suffix.
+         overlap_ratio: estimated ratio of overlapping characters.
+
+     Returns:
+         A merged character sequence.
+
+     Example::
+         >>> from doctr.models.recognition.utils import merge_strings
+         >>> merge_strings('abcd', 'cdefgh', 0.5)
+         'abcdefgh'
+         >>> merge_strings('abcdi', 'cdefgh', 0.5)
+         'abcdefgh'
+     """
+     seq_len = min(len(a), len(b))
+     if seq_len <= 1: # One sequence is empty or will be after cropping in next step, return both to keep data
+         return a + b
+
+     a_crop, b_crop = a[:-1], b[1:] # Remove last letter of "a" and first of "b", because they might be cut off
+     max_overlap = min(len(a_crop), len(b_crop))
+
+     # Compute Hamming distances for all possible overlaps
+     scores = [Hamming.distance(a_crop[-i:], b_crop[:i], processor=None) for i in range(1, max_overlap + 1)]
+
+     # Find zero-score matches
+     zero_matches = [i for i, score in enumerate(scores) if score == 0]
+
+     expected_overlap = round(len(b) * overlap_ratio) - 3 # adjust for cropping and index
+
+     # Case 1: One perfect match - exactly one zero score - just merge there
+     if len(zero_matches) == 1:
+         i = zero_matches[0]
+         return a_crop + b_crop[i + 1 :]
+
+     # Case 2: Multiple perfect matches - likely due to repeated characters.
+     # Use the estimated overlap length to choose the match closest to the expected alignment.
+     elif len(zero_matches) > 1:
+         best_i = min(zero_matches, key=lambda x: abs(x - expected_overlap))
+         return a_crop + b_crop[best_i + 1 :]
+
+     # Case 3: Absence of zero scores indicates that the same character in the image was recognized differently OR that
+     # the overlap was too small and we just need to merge the crops fully
+     if expected_overlap < -1:
+         return a + b
+     elif expected_overlap < 0:
+         return a_crop + b_crop
+
+     # Find best overlap by minimizing Hamming distance + distance from expected overlap size
+     combined_scores = [score + abs(i - expected_overlap) for i, score in enumerate(scores)]
+     best_i = combined_scores.index(min(combined_scores))
+     return a_crop + b_crop[best_i + 1 :]
+
+
+ def merge_multi_strings(seq_list: list[str], overlap_ratio: float, last_overlap_ratio: float) -> str:
+     """
+     Merges consecutive string sequences with overlapping characters.
+
+     Args:
+         seq_list: list of sequences to merge. Sequences need to be ordered from left to right.
+         overlap_ratio: Estimated ratio of overlapping letters between neighboring strings.
+         last_overlap_ratio: Estimated ratio of overlapping letters for the last element in seq_list.
+
+     Returns:
+         A merged character sequence
+
+     Example::
+         >>> from doctr.models.recognition.utils import merge_multi_strings
+         >>> merge_multi_strings(['abc', 'bcdef', 'difghi', 'aijkl'], 0.5, 0.1)
+         'abcdefghijkl'
+     """
+     if not seq_list:
+         return ""
+     result = seq_list[0]
+     for i in range(1, len(seq_list)):
+         text_b = seq_list[i]
+         ratio = last_overlap_ratio if i == len(seq_list) - 1 else overlap_ratio
+         result = merge_strings(result, text_b, ratio)
+     return result
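
merge_strings drops the possibly cut-off boundary characters (the last of the left string, the first of the right), scores every candidate overlap with a Hamming distance, and merges at a perfect match when exactly one exists; otherwise the expected overlap length breaks ties or falls back to plain concatenation. Note that the docstring examples still import from `doctr`, while this module lives at onnxtr/models/recognition/utils.py. A small sketch with made-up strings (each merge below hits the single-perfect-match case, so the result does not depend on the overlap estimates):

```python
from onnxtr.models.recognition.utils import merge_strings, merge_multi_strings

# The two slice predictions overlap on "ding".
print(merge_strings("wording", "dingbat", 0.5))
# -> 'wordingbat'

# A chain of three slices; the last pair uses its own overlap estimate,
# just as remap_preds passes the final slice's actual overlap ratio.
print(merge_multi_strings(["wording", "dingbats", "atsea"], 0.5, 0.5))
# -> 'wordingbatsea'
```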

{onnxtr-0.6.3 → onnxtr-0.7.1}/onnxtr/models/recognition/zoo.py

@@ -22,6 +22,7 @@ ARCHS: list[str] = [
  "vitstr_small",
  "vitstr_base",
  "parseq",
+ "viptr_tiny",
  ]


@@ -35,7 +36,15 @@ def _predictor(
  _model = recognition.__dict__[arch](load_in_8_bit=load_in_8_bit, engine_cfg=engine_cfg)
  else:
  if not isinstance(
-     arch, (recognition.CRNN, recognition.SAR, recognition.MASTER, recognition.ViTSTR, recognition.PARSeq)
+     arch,
+     (
+         recognition.CRNN,
+         recognition.SAR,
+         recognition.MASTER,
+         recognition.ViTSTR,
+         recognition.PARSeq,
+         recognition.VIPTR,
+     ),
  ):
  raise ValueError(f"unknown architecture: {type(arch)}")
  _model = arch
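
With the zoo entry in place, viptr_tiny can also be used for standalone text recognition, either by arch string or by passing a constructed VIPTR instance (which the widened isinstance check above now accepts). A hedged sketch, assuming the recognition_predictor entry point mirrors the docTR-style API:

```python
import numpy as np
from onnxtr.models import recognition, recognition_predictor

# By arch string (resolved through ARCHS) or by model instance (checked via isinstance).
reco_by_name = recognition_predictor("viptr_tiny")
reco_by_instance = recognition_predictor(recognition.viptr_tiny())

word_crop = np.random.randint(0, 255, (32, 128, 3), dtype=np.uint8)
print(reco_by_name([word_crop]))  # list of (word, confidence) tuples, one per crop
```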