onnxtr 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. onnxtr/io/elements.py +17 -4
  2. onnxtr/io/pdf.py +6 -3
  3. onnxtr/models/__init__.py +1 -0
  4. onnxtr/models/_utils.py +57 -20
  5. onnxtr/models/builder.py +24 -9
  6. onnxtr/models/classification/models/mobilenet.py +12 -5
  7. onnxtr/models/classification/zoo.py +18 -6
  8. onnxtr/models/detection/_utils/__init__.py +1 -0
  9. onnxtr/models/detection/_utils/base.py +66 -0
  10. onnxtr/models/detection/models/differentiable_binarization.py +27 -12
  11. onnxtr/models/detection/models/fast.py +30 -9
  12. onnxtr/models/detection/models/linknet.py +24 -9
  13. onnxtr/models/detection/postprocessor/base.py +4 -3
  14. onnxtr/models/detection/predictor/base.py +15 -1
  15. onnxtr/models/detection/zoo.py +12 -3
  16. onnxtr/models/engine.py +73 -7
  17. onnxtr/models/predictor/base.py +65 -42
  18. onnxtr/models/predictor/predictor.py +22 -15
  19. onnxtr/models/recognition/models/crnn.py +24 -9
  20. onnxtr/models/recognition/models/master.py +14 -5
  21. onnxtr/models/recognition/models/parseq.py +14 -5
  22. onnxtr/models/recognition/models/sar.py +12 -5
  23. onnxtr/models/recognition/models/vitstr.py +18 -7
  24. onnxtr/models/recognition/zoo.py +9 -6
  25. onnxtr/models/zoo.py +16 -0
  26. onnxtr/py.typed +0 -0
  27. onnxtr/utils/geometry.py +33 -12
  28. onnxtr/version.py +1 -1
  29. {onnxtr-0.2.0.dist-info → onnxtr-0.3.0.dist-info}/METADATA +60 -21
  30. {onnxtr-0.2.0.dist-info → onnxtr-0.3.0.dist-info}/RECORD +34 -31
  31. {onnxtr-0.2.0.dist-info → onnxtr-0.3.0.dist-info}/WHEEL +1 -1
  32. {onnxtr-0.2.0.dist-info → onnxtr-0.3.0.dist-info}/top_level.txt +0 -1
  33. {onnxtr-0.2.0.dist-info → onnxtr-0.3.0.dist-info}/LICENSE +0 -0
  34. {onnxtr-0.2.0.dist-info → onnxtr-0.3.0.dist-info}/zip-safe +0 -0
@@ -8,10 +8,11 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
8
8
  import numpy as np
9
9
 
10
10
  from onnxtr.models.builder import DocumentBuilder
11
- from onnxtr.utils.geometry import extract_crops, extract_rcrops
11
+ from onnxtr.models.engine import EngineConfig
12
+ from onnxtr.utils.geometry import extract_crops, extract_rcrops, rotate_image
12
13
 
13
- from .._utils import rectify_crops, rectify_loc_preds
14
- from ..classification import crop_orientation_predictor
14
+ from .._utils import estimate_orientation, rectify_crops, rectify_loc_preds
15
+ from ..classification import crop_orientation_predictor, page_orientation_predictor
15
16
  from ..classification.predictor import OrientationPredictor
16
17
  from ..detection.zoo import ARCHS as DETECTION_ARCHS
17
18
  from ..recognition.zoo import ARCHS as RECOGNITION_ARCHS
@@ -31,11 +32,15 @@ class _OCRPredictor:
31
32
  accordingly. Doing so will improve performances for documents with page-uniform rotations.
32
33
  preserve_aspect_ratio: if True, resize preserving the aspect ratio (with padding)
33
34
  symmetric_pad: if True and preserve_aspect_ratio is True, pads the image symmetrically.
35
+ detect_orientation: if True, the estimated general page orientation will be added to the predictions for each
36
+ page. Doing so will slightly deteriorate the overall latency.
34
37
  load_in_8_bit: whether to load the 8-bit quantized model, defaults to False
38
+ clf_engine_cfg: configuration of the orientation classification engine
35
39
  **kwargs: keyword args of `DocumentBuilder`
36
40
  """
37
41
 
38
42
  crop_orientation_predictor: Optional[OrientationPredictor]
43
+ page_orientation_predictor: Optional[OrientationPredictor]
39
44
 
40
45
  def __init__(
41
46
  self,
@@ -43,19 +48,75 @@ class _OCRPredictor:
43
48
  straighten_pages: bool = False,
44
49
  preserve_aspect_ratio: bool = True,
45
50
  symmetric_pad: bool = True,
51
+ detect_orientation: bool = False,
46
52
  load_in_8_bit: bool = False,
53
+ clf_engine_cfg: EngineConfig = EngineConfig(),
47
54
  **kwargs: Any,
48
55
  ) -> None:
49
56
  self.assume_straight_pages = assume_straight_pages
50
57
  self.straighten_pages = straighten_pages
51
58
  self.crop_orientation_predictor = (
52
- None if assume_straight_pages else crop_orientation_predictor(load_in_8_bit=load_in_8_bit)
59
+ None
60
+ if assume_straight_pages
61
+ else crop_orientation_predictor(load_in_8_bit=load_in_8_bit, engine_cfg=clf_engine_cfg)
62
+ )
63
+ self.page_orientation_predictor = (
64
+ page_orientation_predictor(load_in_8_bit=load_in_8_bit, engine_cfg=clf_engine_cfg)
65
+ if detect_orientation or straighten_pages or not assume_straight_pages
66
+ else None
53
67
  )
54
68
  self.doc_builder = DocumentBuilder(**kwargs)
55
69
  self.preserve_aspect_ratio = preserve_aspect_ratio
56
70
  self.symmetric_pad = symmetric_pad
57
71
  self.hooks: List[Callable] = []
58
72
 
73
+ def _general_page_orientations(
74
+ self,
75
+ pages: List[np.ndarray],
76
+ ) -> List[Tuple[int, float]]:
77
+ _, classes, probs = zip(self.page_orientation_predictor(pages)) # type: ignore[misc]
78
+ # Flatten to list of tuples with (value, confidence)
79
+ page_orientations = [
80
+ (orientation, prob)
81
+ for page_classes, page_probs in zip(classes, probs)
82
+ for orientation, prob in zip(page_classes, page_probs)
83
+ ]
84
+ return page_orientations
85
+
86
+ def _get_orientations(
87
+ self, pages: List[np.ndarray], seg_maps: List[np.ndarray]
88
+ ) -> Tuple[List[Tuple[int, float]], List[int]]:
89
+ general_pages_orientations = self._general_page_orientations(pages)
90
+ origin_page_orientations = [
91
+ estimate_orientation(seq_map, general_orientation)
92
+ for seq_map, general_orientation in zip(seg_maps, general_pages_orientations)
93
+ ]
94
+ return general_pages_orientations, origin_page_orientations
95
+
96
+ def _straighten_pages(
97
+ self,
98
+ pages: List[np.ndarray],
99
+ seg_maps: List[np.ndarray],
100
+ general_pages_orientations: Optional[List[Tuple[int, float]]] = None,
101
+ origin_pages_orientations: Optional[List[int]] = None,
102
+ ) -> List[np.ndarray]:
103
+ general_pages_orientations = (
104
+ general_pages_orientations if general_pages_orientations else self._general_page_orientations(pages)
105
+ )
106
+ origin_pages_orientations = (
107
+ origin_pages_orientations
108
+ if origin_pages_orientations
109
+ else [
110
+ estimate_orientation(seq_map, general_orientation)
111
+ for seq_map, general_orientation in zip(seg_maps, general_pages_orientations)
112
+ ]
113
+ )
114
+ return [
115
+ # We expand if the page is wider than tall and the angle is 90 or -90
116
+ rotate_image(page, angle, expand=page.shape[1] > page.shape[0] and abs(angle) == 90)
117
+ for page, angle in zip(pages, origin_pages_orientations)
118
+ ]
119
+
59
120
  @staticmethod
60
121
  def _generate_crops(
61
122
  pages: List[np.ndarray],
@@ -110,44 +171,6 @@ class _OCRPredictor:
110
171
  ]
111
172
  return rect_crops, rect_loc_preds, crop_orientations # type: ignore[return-value]
112
173
 
113
- def _remove_padding(
114
- self,
115
- pages: List[np.ndarray],
116
- loc_preds: List[np.ndarray],
117
- ) -> List[np.ndarray]:
118
- if self.preserve_aspect_ratio:
119
- # Rectify loc_preds to remove padding
120
- rectified_preds = []
121
- for page, loc_pred in zip(pages, loc_preds):
122
- h, w = page.shape[0], page.shape[1]
123
- if h > w:
124
- # y unchanged, dilate x coord
125
- if self.symmetric_pad:
126
- if self.assume_straight_pages:
127
- loc_pred[:, [0, 2]] = np.clip((loc_pred[:, [0, 2]] - 0.5) * h / w + 0.5, 0, 1)
128
- else:
129
- loc_pred[:, :, 0] = np.clip((loc_pred[:, :, 0] - 0.5) * h / w + 0.5, 0, 1)
130
- else:
131
- if self.assume_straight_pages:
132
- loc_pred[:, [0, 2]] *= h / w
133
- else:
134
- loc_pred[:, :, 0] *= h / w
135
- elif w > h:
136
- # x unchanged, dilate y coord
137
- if self.symmetric_pad:
138
- if self.assume_straight_pages:
139
- loc_pred[:, [1, 3]] = np.clip((loc_pred[:, [1, 3]] - 0.5) * w / h + 0.5, 0, 1)
140
- else:
141
- loc_pred[:, :, 1] = np.clip((loc_pred[:, :, 1] - 0.5) * w / h + 0.5, 0, 1)
142
- else:
143
- if self.assume_straight_pages:
144
- loc_pred[:, [1, 3]] *= w / h
145
- else:
146
- loc_pred[:, :, 1] *= w / h
147
- rectified_preds.append(loc_pred)
148
- return rectified_preds
149
- return loc_preds
150
-
151
174
  @staticmethod
152
175
  def _process_predictions(
153
176
  loc_preds: List[np.ndarray],
@@ -8,10 +8,11 @@ from typing import Any, List
8
8
  import numpy as np
9
9
 
10
10
  from onnxtr.io.elements import Document
11
- from onnxtr.models._utils import estimate_orientation, get_language
11
+ from onnxtr.models._utils import get_language
12
12
  from onnxtr.models.detection.predictor import DetectionPredictor
13
+ from onnxtr.models.engine import EngineConfig
13
14
  from onnxtr.models.recognition.predictor import RecognitionPredictor
14
- from onnxtr.utils.geometry import rotate_image
15
+ from onnxtr.utils.geometry import detach_scores
15
16
  from onnxtr.utils.repr import NestedObject
16
17
 
17
18
  from .base import _OCRPredictor
@@ -35,6 +36,7 @@ class OCRPredictor(NestedObject, _OCRPredictor):
35
36
  page. Doing so will slightly deteriorate the overall latency.
36
37
  detect_language: if True, the language prediction will be added to the predictions for each
37
38
  page. Doing so will slightly deteriorate the overall latency.
39
+ clf_engine_cfg: configuration of the orientation classification engine
38
40
  **kwargs: keyword args of `DocumentBuilder`
39
41
  """
40
42
 
@@ -50,12 +52,20 @@ class OCRPredictor(NestedObject, _OCRPredictor):
50
52
  symmetric_pad: bool = True,
51
53
  detect_orientation: bool = False,
52
54
  detect_language: bool = False,
55
+ clf_engine_cfg: EngineConfig = EngineConfig(),
53
56
  **kwargs: Any,
54
57
  ) -> None:
55
58
  self.det_predictor = det_predictor
56
59
  self.reco_predictor = reco_predictor
57
60
  _OCRPredictor.__init__(
58
- self, assume_straight_pages, straighten_pages, preserve_aspect_ratio, symmetric_pad, **kwargs
61
+ self,
62
+ assume_straight_pages,
63
+ straighten_pages,
64
+ preserve_aspect_ratio,
65
+ symmetric_pad,
66
+ detect_orientation,
67
+ clf_engine_cfg=clf_engine_cfg,
68
+ **kwargs,
59
69
  )
60
70
  self.detect_orientation = detect_orientation
61
71
  self.detect_language = detect_language
@@ -80,26 +90,22 @@ class OCRPredictor(NestedObject, _OCRPredictor):
80
90
  for out_map in out_maps
81
91
  ]
82
92
  if self.detect_orientation:
83
- origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps]
93
+ general_pages_orientations, origin_pages_orientations = self._get_orientations(pages, seg_maps)
84
94
  orientations = [
85
- {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations
95
+ {"value": orientation_page, "confidence": None} for orientation_page in origin_pages_orientations
86
96
  ]
87
97
  else:
88
98
  orientations = None
99
+ general_pages_orientations = None
100
+ origin_pages_orientations = None
89
101
  if self.straighten_pages:
90
- origin_page_orientations = (
91
- origin_page_orientations
92
- if self.detect_orientation
93
- else [estimate_orientation(seq_map) for seq_map in seg_maps]
94
- )
95
- pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)]
102
+ pages = self._straighten_pages(pages, seg_maps, general_pages_orientations, origin_pages_orientations)
103
+
96
104
  # forward again to get predictions on straight pages
97
105
  loc_preds = self.det_predictor(pages, **kwargs) # type: ignore[assignment]
98
106
 
99
- loc_preds = [loc_pred[0] for loc_pred in loc_preds]
100
-
101
- # Rectify crops if aspect ratio
102
- loc_preds = self._remove_padding(pages, loc_preds)
107
+ # Detach objectness scores from loc_preds
108
+ loc_preds, objectness_scores = detach_scores(loc_preds) # type: ignore[arg-type]
103
109
 
104
110
  # Apply hooks to loc_preds if any
105
111
  for hook in self.hooks:
@@ -136,6 +142,7 @@ class OCRPredictor(NestedObject, _OCRPredictor):
136
142
  out = self.doc_builder(
137
143
  pages,
138
144
  boxes,
145
+ objectness_scores,
139
146
  text_preds,
140
147
  origin_page_shapes, # type: ignore[arg-type]
141
148
  crop_orientations,
@@ -12,7 +12,7 @@ from scipy.special import softmax
12
12
 
13
13
  from onnxtr.utils import VOCABS
14
14
 
15
- from ...engine import Engine
15
+ from ...engine import Engine, EngineConfig
16
16
  from ..core import RecognitionPostProcessor
17
17
 
18
18
  __all__ = ["CRNN", "crnn_vgg16_bn", "crnn_mobilenet_v3_small", "crnn_mobilenet_v3_large"]
@@ -113,6 +113,7 @@ class CRNN(Engine):
113
113
  ----
114
114
  model_path: path or url to onnx model file
115
115
  vocab: vocabulary used for encoding
116
+ engine_cfg: configuration for the inference engine
116
117
  cfg: configuration dictionary
117
118
  **kwargs: additional arguments to be passed to `Engine`
118
119
  """
@@ -123,10 +124,11 @@ class CRNN(Engine):
123
124
  self,
124
125
  model_path: str,
125
126
  vocab: str,
127
+ engine_cfg: EngineConfig = EngineConfig(),
126
128
  cfg: Optional[Dict[str, Any]] = None,
127
129
  **kwargs: Any,
128
130
  ) -> None:
129
- super().__init__(url=model_path, **kwargs)
131
+ super().__init__(url=model_path, engine_cfg=engine_cfg, **kwargs)
130
132
  self.vocab = vocab
131
133
  self.cfg = cfg
132
134
  self.postprocessor = CRNNPostProcessor(self.vocab)
@@ -152,6 +154,7 @@ def _crnn(
152
154
  arch: str,
153
155
  model_path: str,
154
156
  load_in_8_bit: bool = False,
157
+ engine_cfg: EngineConfig = EngineConfig(),
155
158
  **kwargs: Any,
156
159
  ) -> CRNN:
157
160
  kwargs["vocab"] = kwargs.get("vocab", default_cfgs[arch]["vocab"])
@@ -163,11 +166,14 @@ def _crnn(
163
166
  model_path = default_cfgs[arch]["url_8_bit"] if load_in_8_bit and "http" in model_path else model_path
164
167
 
165
168
  # Build the model
166
- return CRNN(model_path, cfg=_cfg, **kwargs)
169
+ return CRNN(model_path, cfg=_cfg, engine_cfg=engine_cfg, **kwargs)
167
170
 
168
171
 
169
172
  def crnn_vgg16_bn(
170
- model_path: str = default_cfgs["crnn_vgg16_bn"]["url"], load_in_8_bit: bool = False, **kwargs: Any
173
+ model_path: str = default_cfgs["crnn_vgg16_bn"]["url"],
174
+ load_in_8_bit: bool = False,
175
+ engine_cfg: EngineConfig = EngineConfig(),
176
+ **kwargs: Any,
171
177
  ) -> CRNN:
172
178
  """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based
173
179
  Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
@@ -182,17 +188,21 @@ def crnn_vgg16_bn(
182
188
  ----
183
189
  model_path: path to onnx model file, defaults to url in default_cfgs
184
190
  load_in_8_bit: whether to load the 8-bit quantized model, defaults to False
191
+ engine_cfg: configuration for the inference engine
185
192
  **kwargs: keyword arguments of the CRNN architecture
186
193
 
187
194
  Returns:
188
195
  -------
189
196
  text recognition architecture
190
197
  """
191
- return _crnn("crnn_vgg16_bn", model_path, load_in_8_bit, **kwargs)
198
+ return _crnn("crnn_vgg16_bn", model_path, load_in_8_bit, engine_cfg, **kwargs)
192
199
 
193
200
 
194
201
  def crnn_mobilenet_v3_small(
195
- model_path: str = default_cfgs["crnn_mobilenet_v3_small"]["url"], load_in_8_bit: bool = False, **kwargs: Any
202
+ model_path: str = default_cfgs["crnn_mobilenet_v3_small"]["url"],
203
+ load_in_8_bit: bool = False,
204
+ engine_cfg: EngineConfig = EngineConfig(),
205
+ **kwargs: Any,
196
206
  ) -> CRNN:
197
207
  """CRNN with a MobileNet V3 Small backbone as described in `"An End-to-End Trainable Neural Network for Image-based
198
208
  Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
@@ -207,17 +217,21 @@ def crnn_mobilenet_v3_small(
207
217
  ----
208
218
  model_path: path to onnx model file, defaults to url in default_cfgs
209
219
  load_in_8_bit: whether to load the 8-bit quantized model, defaults to False
220
+ engine_cfg: configuration for the inference engine
210
221
  **kwargs: keyword arguments of the CRNN architecture
211
222
 
212
223
  Returns:
213
224
  -------
214
225
  text recognition architecture
215
226
  """
216
- return _crnn("crnn_mobilenet_v3_small", model_path, load_in_8_bit, **kwargs)
227
+ return _crnn("crnn_mobilenet_v3_small", model_path, load_in_8_bit, engine_cfg, **kwargs)
217
228
 
218
229
 
219
230
  def crnn_mobilenet_v3_large(
220
- model_path: str = default_cfgs["crnn_mobilenet_v3_large"]["url"], load_in_8_bit: bool = False, **kwargs: Any
231
+ model_path: str = default_cfgs["crnn_mobilenet_v3_large"]["url"],
232
+ load_in_8_bit: bool = False,
233
+ engine_cfg: EngineConfig = EngineConfig(),
234
+ **kwargs: Any,
221
235
  ) -> CRNN:
222
236
  """CRNN with a MobileNet V3 Large backbone as described in `"An End-to-End Trainable Neural Network for Image-based
223
237
  Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
@@ -232,10 +246,11 @@ def crnn_mobilenet_v3_large(
232
246
  ----
233
247
  model_path: path to onnx model file, defaults to url in default_cfgs
234
248
  load_in_8_bit: whether to load the 8-bit quantized model, defaults to False
249
+ engine_cfg: configuration for the inference engine
235
250
  **kwargs: keyword arguments of the CRNN architecture
236
251
 
237
252
  Returns:
238
253
  -------
239
254
  text recognition architecture
240
255
  """
241
- return _crnn("crnn_mobilenet_v3_large", model_path, load_in_8_bit, **kwargs)
256
+ return _crnn("crnn_mobilenet_v3_large", model_path, load_in_8_bit, engine_cfg, **kwargs)
@@ -11,7 +11,7 @@ from scipy.special import softmax
11
11
 
12
12
  from onnxtr.utils import VOCABS
13
13
 
14
- from ...engine import Engine
14
+ from ...engine import Engine, EngineConfig
15
15
  from ..core import RecognitionPostProcessor
16
16
 
17
17
  __all__ = ["MASTER", "master"]
@@ -36,6 +36,7 @@ class MASTER(Engine):
36
36
  ----
37
37
  model_path: path or url to onnx model file
38
38
  vocab: vocabulary, (without EOS, SOS, PAD)
39
+ engine_cfg: configuration for the inference engine
39
40
  cfg: dictionary containing information about the model
40
41
  **kwargs: additional arguments to be passed to `Engine`
41
42
  """
@@ -44,10 +45,11 @@ class MASTER(Engine):
44
45
  self,
45
46
  model_path: str,
46
47
  vocab: str,
48
+ engine_cfg: EngineConfig = EngineConfig(),
47
49
  cfg: Optional[Dict[str, Any]] = None,
48
50
  **kwargs: Any,
49
51
  ) -> None:
50
- super().__init__(url=model_path, **kwargs)
52
+ super().__init__(url=model_path, engine_cfg=engine_cfg, **kwargs)
51
53
 
52
54
  self.vocab = vocab
53
55
  self.cfg = cfg
@@ -114,6 +116,7 @@ def _master(
114
116
  arch: str,
115
117
  model_path: str,
116
118
  load_in_8_bit: bool = False,
119
+ engine_cfg: EngineConfig = EngineConfig(),
117
120
  **kwargs: Any,
118
121
  ) -> MASTER:
119
122
  # Patch the config
@@ -125,10 +128,15 @@ def _master(
125
128
  # Patch the url
126
129
  model_path = default_cfgs[arch]["url_8_bit"] if load_in_8_bit and "http" in model_path else model_path
127
130
 
128
- return MASTER(model_path, cfg=_cfg, **kwargs)
131
+ return MASTER(model_path, cfg=_cfg, engine_cfg=engine_cfg, **kwargs)
129
132
 
130
133
 
131
- def master(model_path: str = default_cfgs["master"]["url"], load_in_8_bit: bool = False, **kwargs: Any) -> MASTER:
134
+ def master(
135
+ model_path: str = default_cfgs["master"]["url"],
136
+ load_in_8_bit: bool = False,
137
+ engine_cfg: EngineConfig = EngineConfig(),
138
+ **kwargs: Any,
139
+ ) -> MASTER:
132
140
  """MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_.
133
141
 
134
142
  >>> import numpy as np
@@ -141,10 +149,11 @@ def master(model_path: str = default_cfgs["master"]["url"], load_in_8_bit: bool
141
149
  ----
142
150
  model_path: path to onnx model file, defaults to url in default_cfgs
143
151
  load_in_8_bit: whether to load the 8-bit quantized model, defaults to False
152
+ engine_cfg: configuration for the inference engine
144
153
  **kwargs: keyword arguments passed to the MASTER architecture
145
154
 
146
155
  Returns:
147
156
  -------
148
157
  text recognition architecture
149
158
  """
150
- return _master("master", model_path, load_in_8_bit, **kwargs)
159
+ return _master("master", model_path, load_in_8_bit, engine_cfg, **kwargs)
@@ -11,7 +11,7 @@ from scipy.special import softmax
11
11
 
12
12
  from onnxtr.utils import VOCABS
13
13
 
14
- from ...engine import Engine
14
+ from ...engine import Engine, EngineConfig
15
15
  from ..core import RecognitionPostProcessor
16
16
 
17
17
  __all__ = ["PARSeq", "parseq"]
@@ -35,6 +35,7 @@ class PARSeq(Engine):
35
35
  ----
36
36
  model_path: path to onnx model file
37
37
  vocab: vocabulary used for encoding
38
+ engine_cfg: configuration for the inference engine
38
39
  cfg: dictionary containing information about the model
39
40
  **kwargs: additional arguments to be passed to `Engine`
40
41
  """
@@ -43,10 +44,11 @@ class PARSeq(Engine):
43
44
  self,
44
45
  model_path: str,
45
46
  vocab: str,
47
+ engine_cfg: EngineConfig = EngineConfig(),
46
48
  cfg: Optional[Dict[str, Any]] = None,
47
49
  **kwargs: Any,
48
50
  ) -> None:
49
- super().__init__(url=model_path, **kwargs)
51
+ super().__init__(url=model_path, engine_cfg=engine_cfg, **kwargs)
50
52
  self.vocab = vocab
51
53
  self.cfg = cfg
52
54
  self.postprocessor = PARSeqPostProcessor(vocab=self.vocab)
@@ -102,6 +104,7 @@ def _parseq(
102
104
  arch: str,
103
105
  model_path: str,
104
106
  load_in_8_bit: bool = False,
107
+ engine_cfg: EngineConfig = EngineConfig(),
105
108
  **kwargs: Any,
106
109
  ) -> PARSeq:
107
110
  # Patch the config
@@ -114,10 +117,15 @@ def _parseq(
114
117
  model_path = default_cfgs[arch]["url_8_bit"] if load_in_8_bit and "http" in model_path else model_path
115
118
 
116
119
  # Build the model
117
- return PARSeq(model_path, cfg=_cfg, **kwargs)
120
+ return PARSeq(model_path, cfg=_cfg, engine_cfg=engine_cfg, **kwargs)
118
121
 
119
122
 
120
- def parseq(model_path: str = default_cfgs["parseq"]["url"], load_in_8_bit: bool = False, **kwargs: Any) -> PARSeq:
123
+ def parseq(
124
+ model_path: str = default_cfgs["parseq"]["url"],
125
+ load_in_8_bit: bool = False,
126
+ engine_cfg: EngineConfig = EngineConfig(),
127
+ **kwargs: Any,
128
+ ) -> PARSeq:
121
129
  """PARSeq architecture from
122
130
  `"Scene Text Recognition with Permuted Autoregressive Sequence Models" <https://arxiv.org/pdf/2207.06966>`_.
123
131
 
@@ -131,10 +139,11 @@ def parseq(model_path: str = default_cfgs["parseq"]["url"], load_in_8_bit: bool
131
139
  ----
132
140
  model_path: path to onnx model file, defaults to url in default_cfgs
133
141
  load_in_8_bit: whether to load the 8-bit quantized model, defaults to False
142
+ engine_cfg: configuration for the inference engine
134
143
  **kwargs: keyword arguments of the PARSeq architecture
135
144
 
136
145
  Returns:
137
146
  -------
138
147
  text recognition architecture
139
148
  """
140
- return _parseq("parseq", model_path, load_in_8_bit, **kwargs)
149
+ return _parseq("parseq", model_path, load_in_8_bit, engine_cfg, **kwargs)
@@ -11,7 +11,7 @@ from scipy.special import softmax
11
11
 
12
12
  from onnxtr.utils import VOCABS
13
13
 
14
- from ...engine import Engine
14
+ from ...engine import Engine, EngineConfig
15
15
  from ..core import RecognitionPostProcessor
16
16
 
17
17
  __all__ = ["SAR", "sar_resnet31"]
@@ -35,6 +35,7 @@ class SAR(Engine):
35
35
  ----
36
36
  model_path: path to onnx model file
37
37
  vocab: vocabulary used for encoding
38
+ engine_cfg: configuration for the inference engine
38
39
  cfg: dictionary containing information about the model
39
40
  **kwargs: additional arguments to be passed to `Engine`
40
41
  """
@@ -43,10 +44,11 @@ class SAR(Engine):
43
44
  self,
44
45
  model_path: str,
45
46
  vocab: str,
47
+ engine_cfg: EngineConfig = EngineConfig(),
46
48
  cfg: Optional[Dict[str, Any]] = None,
47
49
  **kwargs: Any,
48
50
  ) -> None:
49
- super().__init__(url=model_path, **kwargs)
51
+ super().__init__(url=model_path, engine_cfg=engine_cfg, **kwargs)
50
52
  self.vocab = vocab
51
53
  self.cfg = cfg
52
54
  self.postprocessor = SARPostProcessor(self.vocab)
@@ -101,6 +103,7 @@ def _sar(
101
103
  arch: str,
102
104
  model_path: str,
103
105
  load_in_8_bit: bool = False,
106
+ engine_cfg: EngineConfig = EngineConfig(),
104
107
  **kwargs: Any,
105
108
  ) -> SAR:
106
109
  # Patch the config
@@ -113,11 +116,14 @@ def _sar(
113
116
  model_path = default_cfgs[arch]["url_8_bit"] if load_in_8_bit and "http" in model_path else model_path
114
117
 
115
118
  # Build the model
116
- return SAR(model_path, cfg=_cfg, **kwargs)
119
+ return SAR(model_path, cfg=_cfg, engine_cfg=engine_cfg, **kwargs)
117
120
 
118
121
 
119
122
  def sar_resnet31(
120
- model_path: str = default_cfgs["sar_resnet31"]["url"], load_in_8_bit: bool = False, **kwargs: Any
123
+ model_path: str = default_cfgs["sar_resnet31"]["url"],
124
+ load_in_8_bit: bool = False,
125
+ engine_cfg: EngineConfig = EngineConfig(),
126
+ **kwargs: Any,
121
127
  ) -> SAR:
122
128
  """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong
123
129
  Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
@@ -132,10 +138,11 @@ def sar_resnet31(
132
138
  ----
133
139
  model_path: path to onnx model file, defaults to url in default_cfgs
134
140
  load_in_8_bit: whether to load the the 8-bit quantized model, defaults to False
141
+ engine_cfg: configuration for the inference engine
135
142
  **kwargs: keyword arguments of the SAR architecture
136
143
 
137
144
  Returns:
138
145
  -------
139
146
  text recognition architecture
140
147
  """
141
- return _sar("sar_resnet31", model_path, load_in_8_bit, **kwargs)
148
+ return _sar("sar_resnet31", model_path, load_in_8_bit, engine_cfg, **kwargs)
@@ -11,7 +11,7 @@ from scipy.special import softmax
11
11
 
12
12
  from onnxtr.utils import VOCABS
13
13
 
14
- from ...engine import Engine
14
+ from ...engine import Engine, EngineConfig
15
15
  from ..core import RecognitionPostProcessor
16
16
 
17
17
  __all__ = ["ViTSTR", "vitstr_small", "vitstr_base"]
@@ -43,6 +43,7 @@ class ViTSTR(Engine):
43
43
  ----
44
44
  model_path: path to onnx model file
45
45
  vocab: vocabulary used for encoding
46
+ engine_cfg: configuration for the inference engine
46
47
  cfg: dictionary containing information about the model
47
48
  **kwargs: additional arguments to be passed to `Engine`
48
49
  """
@@ -51,10 +52,11 @@ class ViTSTR(Engine):
51
52
  self,
52
53
  model_path: str,
53
54
  vocab: str,
55
+ engine_cfg: EngineConfig = EngineConfig(),
54
56
  cfg: Optional[Dict[str, Any]] = None,
55
57
  **kwargs: Any,
56
58
  ) -> None:
57
- super().__init__(url=model_path, **kwargs)
59
+ super().__init__(url=model_path, engine_cfg=engine_cfg, **kwargs)
58
60
  self.vocab = vocab
59
61
  self.cfg = cfg
60
62
 
@@ -112,6 +114,7 @@ def _vitstr(
112
114
  arch: str,
113
115
  model_path: str,
114
116
  load_in_8_bit: bool = False,
117
+ engine_cfg: EngineConfig = EngineConfig(),
115
118
  **kwargs: Any,
116
119
  ) -> ViTSTR:
117
120
  # Patch the config
@@ -124,11 +127,14 @@ def _vitstr(
124
127
  model_path = default_cfgs[arch]["url_8_bit"] if load_in_8_bit and "http" in model_path else model_path
125
128
 
126
129
  # Build the model
127
- return ViTSTR(model_path, cfg=_cfg, **kwargs)
130
+ return ViTSTR(model_path, cfg=_cfg, engine_cfg=engine_cfg, **kwargs)
128
131
 
129
132
 
130
133
  def vitstr_small(
131
- model_path: str = default_cfgs["vitstr_small"]["url"], load_in_8_bit: bool = False, **kwargs: Any
134
+ model_path: str = default_cfgs["vitstr_small"]["url"],
135
+ load_in_8_bit: bool = False,
136
+ engine_cfg: EngineConfig = EngineConfig(),
137
+ **kwargs: Any,
132
138
  ) -> ViTSTR:
133
139
  """ViTSTR-Small as described in `"Vision Transformer for Fast and Efficient Scene Text Recognition"
134
140
  <https://arxiv.org/pdf/2105.08582.pdf>`_.
@@ -143,17 +149,21 @@ def vitstr_small(
143
149
  ----
144
150
  model_path: path to onnx model file, defaults to url in default_cfgs
145
151
  load_in_8_bit: whether to load the 8-bit quantized model, defaults to False
152
+ engine_cfg: configuration for the inference engine
146
153
  **kwargs: keyword arguments of the ViTSTR architecture
147
154
 
148
155
  Returns:
149
156
  -------
150
157
  text recognition architecture
151
158
  """
152
- return _vitstr("vitstr_small", model_path, load_in_8_bit, **kwargs)
159
+ return _vitstr("vitstr_small", model_path, load_in_8_bit, engine_cfg, **kwargs)
153
160
 
154
161
 
155
162
  def vitstr_base(
156
- model_path: str = default_cfgs["vitstr_base"]["url"], load_in_8_bit: bool = False, **kwargs: Any
163
+ model_path: str = default_cfgs["vitstr_base"]["url"],
164
+ load_in_8_bit: bool = False,
165
+ engine_cfg: EngineConfig = EngineConfig(),
166
+ **kwargs: Any,
157
167
  ) -> ViTSTR:
158
168
  """ViTSTR-Base as described in `"Vision Transformer for Fast and Efficient Scene Text Recognition"
159
169
  <https://arxiv.org/pdf/2105.08582.pdf>`_.
@@ -168,10 +178,11 @@ def vitstr_base(
168
178
  ----
169
179
  model_path: path to onnx model file, defaults to url in default_cfgs
170
180
  load_in_8_bit: whether to load the 8-bit quantized model, defaults to False
181
+ engine_cfg: configuration for the inference engine
171
182
  **kwargs: keyword arguments of the ViTSTR architecture
172
183
 
173
184
  Returns:
174
185
  -------
175
186
  text recognition architecture
176
187
  """
177
- return _vitstr("vitstr_base", model_path, load_in_8_bit, **kwargs)
188
+ return _vitstr("vitstr_base", model_path, load_in_8_bit, engine_cfg, **kwargs)