python-doctr 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctr/__init__.py +1 -1
- doctr/contrib/__init__.py +0 -0
- doctr/contrib/artefacts.py +131 -0
- doctr/contrib/base.py +105 -0
- doctr/datasets/cord.py +10 -1
- doctr/datasets/datasets/pytorch.py +2 -2
- doctr/datasets/funsd.py +11 -1
- doctr/datasets/generator/base.py +6 -5
- doctr/datasets/ic03.py +11 -1
- doctr/datasets/ic13.py +10 -1
- doctr/datasets/iiit5k.py +26 -16
- doctr/datasets/imgur5k.py +11 -2
- doctr/datasets/loader.py +1 -6
- doctr/datasets/sroie.py +11 -1
- doctr/datasets/svhn.py +11 -1
- doctr/datasets/svt.py +11 -1
- doctr/datasets/synthtext.py +11 -1
- doctr/datasets/utils.py +9 -3
- doctr/datasets/vocabs.py +15 -4
- doctr/datasets/wildreceipt.py +12 -1
- doctr/file_utils.py +45 -12
- doctr/io/elements.py +52 -10
- doctr/io/html.py +2 -2
- doctr/io/image/pytorch.py +6 -8
- doctr/io/image/tensorflow.py +1 -1
- doctr/io/pdf.py +5 -2
- doctr/io/reader.py +6 -0
- doctr/models/__init__.py +0 -1
- doctr/models/_utils.py +57 -20
- doctr/models/builder.py +73 -15
- doctr/models/classification/magc_resnet/tensorflow.py +13 -6
- doctr/models/classification/mobilenet/pytorch.py +47 -9
- doctr/models/classification/mobilenet/tensorflow.py +51 -14
- doctr/models/classification/predictor/pytorch.py +28 -17
- doctr/models/classification/predictor/tensorflow.py +26 -16
- doctr/models/classification/resnet/tensorflow.py +21 -8
- doctr/models/classification/textnet/pytorch.py +3 -3
- doctr/models/classification/textnet/tensorflow.py +11 -5
- doctr/models/classification/vgg/tensorflow.py +9 -3
- doctr/models/classification/vit/tensorflow.py +10 -4
- doctr/models/classification/zoo.py +55 -19
- doctr/models/detection/_utils/__init__.py +1 -0
- doctr/models/detection/_utils/base.py +66 -0
- doctr/models/detection/differentiable_binarization/base.py +4 -3
- doctr/models/detection/differentiable_binarization/pytorch.py +2 -2
- doctr/models/detection/differentiable_binarization/tensorflow.py +34 -12
- doctr/models/detection/fast/base.py +6 -5
- doctr/models/detection/fast/pytorch.py +4 -4
- doctr/models/detection/fast/tensorflow.py +15 -12
- doctr/models/detection/linknet/base.py +4 -3
- doctr/models/detection/linknet/tensorflow.py +23 -11
- doctr/models/detection/predictor/pytorch.py +15 -1
- doctr/models/detection/predictor/tensorflow.py +17 -3
- doctr/models/detection/zoo.py +7 -2
- doctr/models/factory/hub.py +8 -18
- doctr/models/kie_predictor/base.py +13 -3
- doctr/models/kie_predictor/pytorch.py +45 -20
- doctr/models/kie_predictor/tensorflow.py +44 -17
- doctr/models/modules/layers/pytorch.py +2 -3
- doctr/models/modules/layers/tensorflow.py +6 -8
- doctr/models/modules/transformer/pytorch.py +2 -2
- doctr/models/modules/transformer/tensorflow.py +0 -2
- doctr/models/modules/vision_transformer/pytorch.py +1 -1
- doctr/models/modules/vision_transformer/tensorflow.py +1 -1
- doctr/models/predictor/base.py +97 -58
- doctr/models/predictor/pytorch.py +35 -20
- doctr/models/predictor/tensorflow.py +35 -18
- doctr/models/preprocessor/pytorch.py +4 -4
- doctr/models/preprocessor/tensorflow.py +3 -2
- doctr/models/recognition/crnn/tensorflow.py +8 -6
- doctr/models/recognition/master/pytorch.py +2 -2
- doctr/models/recognition/master/tensorflow.py +9 -4
- doctr/models/recognition/parseq/pytorch.py +4 -3
- doctr/models/recognition/parseq/tensorflow.py +14 -11
- doctr/models/recognition/sar/pytorch.py +7 -6
- doctr/models/recognition/sar/tensorflow.py +10 -12
- doctr/models/recognition/vitstr/pytorch.py +1 -1
- doctr/models/recognition/vitstr/tensorflow.py +9 -4
- doctr/models/recognition/zoo.py +1 -1
- doctr/models/utils/pytorch.py +1 -1
- doctr/models/utils/tensorflow.py +15 -15
- doctr/models/zoo.py +2 -2
- doctr/py.typed +0 -0
- doctr/transforms/functional/base.py +1 -1
- doctr/transforms/functional/pytorch.py +5 -5
- doctr/transforms/modules/base.py +37 -15
- doctr/transforms/modules/pytorch.py +73 -14
- doctr/transforms/modules/tensorflow.py +78 -19
- doctr/utils/fonts.py +7 -5
- doctr/utils/geometry.py +141 -31
- doctr/utils/metrics.py +34 -175
- doctr/utils/reconstitution.py +212 -0
- doctr/utils/visualization.py +5 -118
- doctr/version.py +1 -1
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/METADATA +85 -81
- python_doctr-0.10.0.dist-info/RECORD +173 -0
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/WHEEL +1 -1
- doctr/models/artefacts/__init__.py +0 -2
- doctr/models/artefacts/barcode.py +0 -74
- doctr/models/artefacts/face.py +0 -63
- doctr/models/obj_detection/__init__.py +0 -1
- doctr/models/obj_detection/faster_rcnn/__init__.py +0 -4
- doctr/models/obj_detection/faster_rcnn/pytorch.py +0 -81
- python_doctr-0.8.1.dist-info/RECORD +0 -173
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/LICENSE +0 -0
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/top_level.txt +0 -0
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/zip-safe +0 -0
doctr/__init__.py
CHANGED
File without changes

doctr/contrib/artefacts.py
ADDED
@@ -0,0 +1,131 @@
+# Copyright (C) 2021-2024, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+
+from typing import Any, Dict, List, Optional, Tuple
+
+import cv2
+import numpy as np
+
+from doctr.file_utils import requires_package
+
+from .base import _BasePredictor
+
+__all__ = ["ArtefactDetector"]
+
+default_cfgs: Dict[str, Dict[str, Any]] = {
+    "yolov8_artefact": {
+        "input_shape": (3, 1024, 1024),
+        "labels": ["bar_code", "qr_code", "logo", "photo"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.8.1/yolo_artefact-f9d66f14.onnx&src=0",
+    },
+}
+
+
+class ArtefactDetector(_BasePredictor):
+    """
+    A class to detect artefacts in images
+
+    >>> from doctr.io import DocumentFile
+    >>> from doctr.contrib.artefacts import ArtefactDetector
+    >>> doc = DocumentFile.from_images(["path/to/image.jpg"])
+    >>> detector = ArtefactDetector()
+    >>> results = detector(doc)
+
+    Args:
+    ----
+        arch: the architecture to use
+        batch_size: the batch size to use
+        model_path: the path to the model to use
+        labels: the labels to use
+        input_shape: the input shape to use
+        mask_labels: the mask labels to use
+        conf_threshold: the confidence threshold to use
+        iou_threshold: the intersection over union threshold to use
+        **kwargs: additional arguments to be passed to `download_from_url`
+    """
+
+    def __init__(
+        self,
+        arch: str = "yolov8_artefact",
+        batch_size: int = 2,
+        model_path: Optional[str] = None,
+        labels: Optional[List[str]] = None,
+        input_shape: Optional[Tuple[int, int, int]] = None,
+        conf_threshold: float = 0.5,
+        iou_threshold: float = 0.5,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(batch_size=batch_size, url=default_cfgs[arch]["url"], model_path=model_path, **kwargs)
+        self.labels = labels or default_cfgs[arch]["labels"]
+        self.input_shape = input_shape or default_cfgs[arch]["input_shape"]
+        self.conf_threshold = conf_threshold
+        self.iou_threshold = iou_threshold
+
+    def preprocess(self, img: np.ndarray) -> np.ndarray:
+        return np.transpose(cv2.resize(img, (self.input_shape[2], self.input_shape[1])), (2, 0, 1)) / np.array(255.0)
+
+    def postprocess(self, output: List[np.ndarray], input_images: List[List[np.ndarray]]) -> List[List[Dict[str, Any]]]:
+        results = []
+
+        for batch in zip(output, input_images):
+            for out, img in zip(batch[0], batch[1]):
+                org_height, org_width = img.shape[:2]
+                width_scale, height_scale = org_width / self.input_shape[2], org_height / self.input_shape[1]
+                for res in out:
+                    sample_results = []
+                    for row in np.transpose(np.squeeze(res)):
+                        classes_scores = row[4:]
+                        max_score = np.amax(classes_scores)
+                        if max_score >= self.conf_threshold:
+                            class_id = np.argmax(classes_scores)
+                            x, y, w, h = row[0], row[1], row[2], row[3]
+                            # to rescaled xmin, ymin, xmax, ymax
+                            xmin = int((x - w / 2) * width_scale)
+                            ymin = int((y - h / 2) * height_scale)
+                            xmax = int((x + w / 2) * width_scale)
+                            ymax = int((y + h / 2) * height_scale)
+
+                            sample_results.append({
+                                "label": self.labels[class_id],
+                                "confidence": float(max_score),
+                                "box": [xmin, ymin, xmax, ymax],
+                            })
+
+                    # Filter out overlapping boxes
+                    boxes = [res["box"] for res in sample_results]
+                    scores = [res["confidence"] for res in sample_results]
+                    keep_indices = cv2.dnn.NMSBoxes(boxes, scores, self.conf_threshold, self.iou_threshold)  # type: ignore[arg-type]
+                    sample_results = [sample_results[i] for i in keep_indices]
+
+                    results.append(sample_results)
+
+        self._results = results
+        return results
+
+    def show(self, **kwargs: Any) -> None:
+        """
+        Display the results
+
+        Args:
+        ----
+            **kwargs: additional keyword arguments to be passed to `plt.show`
+        """
+        requires_package("matplotlib", "`.show()` requires matplotlib installed")
+        import matplotlib.pyplot as plt
+        from matplotlib.patches import Rectangle
+
+        # visualize the results with matplotlib
+        if self._results and self._inputs:
+            for img, res in zip(self._inputs, self._results):
+                plt.figure(figsize=(10, 10))
+                plt.imshow(img)
+                for obj in res:
+                    xmin, ymin, xmax, ymax = obj["box"]
+                    label = obj["label"]
+                    plt.text(xmin, ymin, f"{label} {obj['confidence']:.2f}", color="red")
+                    plt.gca().add_patch(
+                        Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, fill=False, edgecolor="red", linewidth=2)
+                    )
+                plt.show(**kwargs)
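The contrib module wires this ONNX-based artefact detector into the package. A minimal usage sketch, following the class docstring above (it assumes onnxruntime is installed, since the weights are loaded through an InferenceSession in _BasePredictor):

from doctr.io import DocumentFile
from doctr.contrib.artefacts import ArtefactDetector

doc = DocumentFile.from_images(["path/to/image.jpg"])
detector = ArtefactDetector()  # default "yolov8_artefact" config; downloads the ONNX weights on first use
results = detector(doc)        # per page: a list of {"label", "confidence", "box"} dicts
detector.show()                # optional matplotlib visualization of the detected boxes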
doctr/contrib/base.py
ADDED
@@ -0,0 +1,105 @@
+# Copyright (C) 2021-2024, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+
+from typing import Any, List, Optional
+
+import numpy as np
+
+from doctr.file_utils import requires_package
+from doctr.utils.data import download_from_url
+
+
+class _BasePredictor:
+    """
+    Base class for all predictors
+
+    Args:
+    ----
+        batch_size: the batch size to use
+        url: the url to use to download a model if needed
+        model_path: the path to the model to use
+        **kwargs: additional arguments to be passed to `download_from_url`
+    """
+
+    def __init__(self, batch_size: int, url: Optional[str] = None, model_path: Optional[str] = None, **kwargs) -> None:
+        self.batch_size = batch_size
+        self.session = self._init_model(url, model_path, **kwargs)
+
+        self._inputs: List[np.ndarray] = []
+        self._results: List[Any] = []
+
+    def _init_model(self, url: Optional[str] = None, model_path: Optional[str] = None, **kwargs: Any) -> Any:
+        """
+        Download the model from the given url if needed
+
+        Args:
+        ----
+            url: the url to use
+            model_path: the path to the model to use
+            **kwargs: additional arguments to be passed to `download_from_url`
+
+        Returns:
+        -------
+            Any: the ONNX loaded model
+        """
+        requires_package("onnxruntime", "`.contrib` module requires `onnxruntime` to be installed.")
+        import onnxruntime as ort
+
+        if not url and not model_path:
+            raise ValueError("You must provide either a url or a model_path")
+        onnx_model_path = model_path if model_path else str(download_from_url(url, cache_subdir="models", **kwargs))  # type: ignore[arg-type]
+        return ort.InferenceSession(onnx_model_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
+
+    def preprocess(self, img: np.ndarray) -> np.ndarray:
+        """
+        Preprocess the input image
+
+        Args:
+        ----
+            img: the input image to preprocess
+
+        Returns:
+        -------
+            np.ndarray: the preprocessed image
+        """
+        raise NotImplementedError
+
+    def postprocess(self, output: List[np.ndarray], input_images: List[List[np.ndarray]]) -> Any:
+        """
+        Postprocess the model output
+
+        Args:
+        ----
+            output: the model output to postprocess
+            input_images: the input images used to generate the output
+
+        Returns:
+        -------
+            Any: the postprocessed output
+        """
+        raise NotImplementedError
+
+    def __call__(self, inputs: List[np.ndarray]) -> Any:
+        """
+        Call the model on the given inputs
+
+        Args:
+        ----
+            inputs: the inputs to use
+
+        Returns:
+        -------
+            Any: the postprocessed output
+        """
+        self._inputs = inputs
+        model_inputs = self.session.get_inputs()
+
+        batched_inputs = [inputs[i : i + self.batch_size] for i in range(0, len(inputs), self.batch_size)]
+        processed_batches = [
+            np.array([self.preprocess(img) for img in batch], dtype=np.float32) for batch in batched_inputs
+        ]
+
+        outputs = [self.session.run(None, {model_inputs[0].name: batch}) for batch in processed_batches]
+        return self.postprocess(outputs, batched_inputs)
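Both preprocess and postprocess raise NotImplementedError, so _BasePredictor is meant to be subclassed. A sketch of a custom ONNX predictor built on it (MyOnnxPredictor and the model path are hypothetical; the pass-through postprocess is only for illustration):

import numpy as np

from doctr.contrib.base import _BasePredictor


class MyOnnxPredictor(_BasePredictor):
    def preprocess(self, img: np.ndarray) -> np.ndarray:
        # HWC uint8 -> CHW float32 in [0, 1]; adapt to whatever the exported model expects
        return np.transpose(img, (2, 0, 1)).astype(np.float32) / 255.0

    def postprocess(self, output, input_images):
        # return the raw ONNX outputs, one entry per input batch
        return output


predictor = MyOnnxPredictor(batch_size=2, model_path="path/to/model.onnx")
# results = predictor(list_of_numpy_images)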
doctr/datasets/cord.py
CHANGED
@@ -33,6 +33,7 @@ class CORD(VisionDataset):
         train: whether the subset should be the training one
         use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
         recognition_task: whether the dataset should be used for recognition task
+        detection_task: whether the dataset should be used for detection task
         **kwargs: keyword arguments from `VisionDataset`.
     """

@@ -53,6 +54,7 @@ class CORD(VisionDataset):
         train: bool = True,
         use_polygons: bool = False,
         recognition_task: bool = False,
+        detection_task: bool = False,
         **kwargs: Any,
     ) -> None:
         url, sha256, name = self.TRAIN if train else self.TEST
@@ -64,10 +66,15 @@ class CORD(VisionDataset):
             pre_transforms=convert_target_to_relative if not recognition_task else None,
             **kwargs,
         )
+        if recognition_task and detection_task:
+            raise ValueError(
+                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
+                + "To get the whole dataset with boxes and labels leave both parameters to False."
+            )

         # List images
         tmp_root = os.path.join(self.root, "image")
-        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
+        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
         self.train = train
         np_dtype = np.float32
         for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking CORD", total=len(os.listdir(tmp_root))):
@@ -109,6 +116,8 @@ class CORD(VisionDataset):
                 )
                 for crop, label in zip(crops, list(text_targets)):
                     self.data.append((crop, label))
+            elif detection_task:
+                self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0)))
             else:
                 self.data.append((
                     img_path,

doctr/datasets/datasets/pytorch.py
CHANGED
@@ -50,9 +50,9 @@ class AbstractDataset(_AbstractDataset):
     @staticmethod
     def collate_fn(samples: List[Tuple[torch.Tensor, Any]]) -> Tuple[torch.Tensor, List[Any]]:
         images, targets = zip(*samples)
-        images = torch.stack(images, dim=0)
+        images = torch.stack(images, dim=0)  # type: ignore[assignment]

-        return images, list(targets)
+        return images, list(targets)  # type: ignore[return-value]


 class VisionDataset(AbstractDataset, _VisionDataset):  # noqa: D101
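The detection_task flag introduced for CORD above is also added to FUNSD, IC03, IC13, IIIT5K and IMGUR5K below. A hedged usage sketch (assuming the usual VisionDataset kwargs such as download=True): with detection_task=True each sample is an (image, boxes array) pair instead of a dict of boxes and labels, and combining it with recognition_task triggers the new ValueError.

from doctr.datasets import CORD

# detection-only targets: each sample is (image, np.ndarray of boxes)
train_set = CORD(train=True, download=True, detection_task=True)
img, boxes = train_set[0]

# rejected by the new check
# CORD(train=True, recognition_task=True, detection_task=True)  # raises ValueError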
doctr/datasets/funsd.py
CHANGED
@@ -33,6 +33,7 @@ class FUNSD(VisionDataset):
         train: whether the subset should be the training one
         use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
         recognition_task: whether the dataset should be used for recognition task
+        detection_task: whether the dataset should be used for detection task
         **kwargs: keyword arguments from `VisionDataset`.
     """

@@ -45,6 +46,7 @@ class FUNSD(VisionDataset):
         train: bool = True,
         use_polygons: bool = False,
         recognition_task: bool = False,
+        detection_task: bool = False,
         **kwargs: Any,
     ) -> None:
         super().__init__(
@@ -55,6 +57,12 @@ class FUNSD(VisionDataset):
             pre_transforms=convert_target_to_relative if not recognition_task else None,
             **kwargs,
         )
+        if recognition_task and detection_task:
+            raise ValueError(
+                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
+                + "To get the whole dataset with boxes and labels leave both parameters to False."
+            )
+
         self.train = train
         np_dtype = np.float32

@@ -63,7 +71,7 @@ class FUNSD(VisionDataset):

         # # List images
         tmp_root = os.path.join(self.root, subfolder, "images")
-        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
+        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
         for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking FUNSD", total=len(os.listdir(tmp_root))):
             # File existence check
             if not os.path.exists(os.path.join(tmp_root, img_path)):
@@ -100,6 +108,8 @@ class FUNSD(VisionDataset):
                     # filter labels with unknown characters
                     if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]):
                         self.data.append((crop, label))
+            elif detection_task:
+                self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype)))
             else:
                 self.data.append((
                     img_path,
doctr/datasets/generator/base.py
CHANGED
@@ -20,7 +20,7 @@ def synthesize_text_img(
     font_family: Optional[str] = None,
     background_color: Optional[Tuple[int, int, int]] = None,
     text_color: Optional[Tuple[int, int, int]] = None,
-) -> Image:
+) -> Image.Image:
     """Generate a synthetic text image

     Args:
@@ -81,7 +81,7 @@ class _CharacterGenerator(AbstractDataset):
         self._data: List[Image.Image] = []
         if cache_samples:
             self._data = [
-                (synthesize_text_img(char, font_family=font), idx)
+                (synthesize_text_img(char, font_family=font), idx)  # type: ignore[misc]
                 for idx, char in enumerate(self.vocab)
                 for font in self.font_family
             ]
@@ -93,7 +93,7 @@ class _CharacterGenerator(AbstractDataset):
         # Samples are already cached
         if len(self._data) > 0:
             idx = index % len(self._data)
-            pil_img, target = self._data[idx]
+            pil_img, target = self._data[idx]  # type: ignore[misc]
         else:
             target = index % len(self.vocab)
             pil_img = synthesize_text_img(self.vocab[target], font_family=random.choice(self.font_family))
@@ -132,7 +132,8 @@ class _WordGenerator(AbstractDataset):
         if cache_samples:
             _words = [self._generate_string(*self.wordlen_range) for _ in range(num_samples)]
             self._data = [
-                (synthesize_text_img(text, font_family=random.choice(self.font_family)), text)
+                (synthesize_text_img(text, font_family=random.choice(self.font_family)), text)  # type: ignore[misc]
+                for text in _words
             ]

     def _generate_string(self, min_chars: int, max_chars: int) -> str:
@@ -145,7 +146,7 @@ class _WordGenerator(AbstractDataset):
     def _read_sample(self, index: int) -> Tuple[Any, str]:
         # Samples are already cached
         if len(self._data) > 0:
-            pil_img, target = self._data[index]
+            pil_img, target = self._data[index]  # type: ignore[misc]
         else:
             target = self._generate_string(*self.wordlen_range)
             pil_img = synthesize_text_img(target, font_family=random.choice(self.font_family))
doctr/datasets/ic03.py
CHANGED
@@ -32,6 +32,7 @@ class IC03(VisionDataset):
         train: whether the subset should be the training one
         use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
         recognition_task: whether the dataset should be used for recognition task
+        detection_task: whether the dataset should be used for detection task
         **kwargs: keyword arguments from `VisionDataset`.
     """

@@ -51,6 +52,7 @@ class IC03(VisionDataset):
         train: bool = True,
         use_polygons: bool = False,
         recognition_task: bool = False,
+        detection_task: bool = False,
         **kwargs: Any,
     ) -> None:
         url, sha256, file_name = self.TRAIN if train else self.TEST
@@ -62,8 +64,14 @@ class IC03(VisionDataset):
             pre_transforms=convert_target_to_relative if not recognition_task else None,
             **kwargs,
         )
+        if recognition_task and detection_task:
+            raise ValueError(
+                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
+                + "To get the whole dataset with boxes and labels leave both parameters to False."
+            )
+
         self.train = train
-        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
+        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
         np_dtype = np.float32

         # Load xml data
@@ -117,6 +125,8 @@ class IC03(VisionDataset):
                 for crop, label in zip(crops, labels):
                     if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
                         self.data.append((crop, label))
+            elif detection_task:
+                self.data.append((name.text, boxes))
             else:
                 self.data.append((name.text, dict(boxes=boxes, labels=labels)))
doctr/datasets/ic13.py
CHANGED
@@ -38,6 +38,7 @@ class IC13(AbstractDataset):
         label_folder: folder with all annotation files for the images
         use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
         recognition_task: whether the dataset should be used for recognition task
+        detection_task: whether the dataset should be used for detection task
         **kwargs: keyword arguments from `AbstractDataset`.
     """

@@ -47,11 +48,17 @@ class IC13(AbstractDataset):
         label_folder: str,
         use_polygons: bool = False,
         recognition_task: bool = False,
+        detection_task: bool = False,
         **kwargs: Any,
     ) -> None:
         super().__init__(
             img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
         )
+        if recognition_task and detection_task:
+            raise ValueError(
+                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
+                + "To get the whole dataset with boxes and labels leave both parameters to False."
+            )

         # File existence check
         if not os.path.exists(label_folder) or not os.path.exists(img_folder):
@@ -59,7 +66,7 @@ class IC13(AbstractDataset):
                 f"unable to locate {label_folder if not os.path.exists(label_folder) else img_folder}"
             )

-        self.data: List[Tuple[Union[Path, np.ndarray], Union[str, Dict[str, Any]]]] = []
+        self.data: List[Tuple[Union[Path, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
         np_dtype = np.float32

         img_names = os.listdir(img_folder)
@@ -95,5 +102,7 @@ class IC13(AbstractDataset):
                 crops = crop_bboxes_from_image(img_path=img_path, geoms=box_targets)
                 for crop, label in zip(crops, labels):
                     self.data.append((crop, label))
+            elif detection_task:
+                self.data.append((img_path, box_targets))
             else:
                 self.data.append((img_path, dict(boxes=box_targets, labels=labels)))
doctr/datasets/iiit5k.py
CHANGED
@@ -34,6 +34,7 @@ class IIIT5K(VisionDataset):
         train: whether the subset should be the training one
         use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
         recognition_task: whether the dataset should be used for recognition task
+        detection_task: whether the dataset should be used for detection task
         **kwargs: keyword arguments from `VisionDataset`.
     """

@@ -45,6 +46,7 @@ class IIIT5K(VisionDataset):
         train: bool = True,
         use_polygons: bool = False,
         recognition_task: bool = False,
+        detection_task: bool = False,
         **kwargs: Any,
     ) -> None:
         super().__init__(
@@ -55,6 +57,12 @@ class IIIT5K(VisionDataset):
             pre_transforms=convert_target_to_relative if not recognition_task else None,
             **kwargs,
         )
+        if recognition_task and detection_task:
+            raise ValueError(
+                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
+                + "To get the whole dataset with boxes and labels leave both parameters to False."
+            )
+
         self.train = train

         # Load mat data
@@ -62,7 +70,7 @@ class IIIT5K(VisionDataset):
         mat_file = "trainCharBound" if self.train else "testCharBound"
         mat_data = sio.loadmat(os.path.join(tmp_root, f"{mat_file}.mat"))[mat_file][0]

-        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
+        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
         np_dtype = np.float32

         for img_path, label, box_targets in tqdm(iterable=mat_data, desc="Unpacking IIIT5K", total=len(mat_data)):
@@ -73,24 +81,26 @@ class IIIT5K(VisionDataset):
             if not os.path.exists(os.path.join(tmp_root, _raw_path)):
                 raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, _raw_path)}")

+            if use_polygons:
+                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
+                box_targets = [
+                    [
+                        [box[0], box[1]],
+                        [box[0] + box[2], box[1]],
+                        [box[0] + box[2], box[1] + box[3]],
+                        [box[0], box[1] + box[3]],
+                    ]
+                    for box in box_targets
+                ]
+            else:
+                # xmin, ymin, xmax, ymax
+                box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]
+
             if recognition_task:
                 self.data.append((_raw_path, _raw_label))
+            elif detection_task:
+                self.data.append((_raw_path, np.asarray(box_targets, dtype=np_dtype)))
             else:
-                if use_polygons:
-                    # (x, y) coordinates of top left, top right, bottom right, bottom left corners
-                    box_targets = [
-                        [
-                            [box[0], box[1]],
-                            [box[0] + box[2], box[1]],
-                            [box[0] + box[2], box[1] + box[3]],
-                            [box[0], box[1] + box[3]],
-                        ]
-                        for box in box_targets
-                    ]
-                else:
-                    # xmin, ymin, xmax, ymax
-                    box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]
-
                 # label are casted to list where each char corresponds to the character's bounding box
                 self.data.append((
                     _raw_path,
doctr/datasets/imgur5k.py
CHANGED
@@ -46,6 +46,7 @@ class IMGUR5K(AbstractDataset):
         train: whether the subset should be the training one
         use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
         recognition_task: whether the dataset should be used for recognition task
+        detection_task: whether the dataset should be used for detection task
         **kwargs: keyword arguments from `AbstractDataset`.
     """

@@ -56,17 +57,23 @@ class IMGUR5K(AbstractDataset):
         train: bool = True,
         use_polygons: bool = False,
         recognition_task: bool = False,
+        detection_task: bool = False,
         **kwargs: Any,
     ) -> None:
         super().__init__(
             img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
         )
+        if recognition_task and detection_task:
+            raise ValueError(
+                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
+                + "To get the whole dataset with boxes and labels leave both parameters to False."
+            )

         # File existence check
         if not os.path.exists(label_path) or not os.path.exists(img_folder):
             raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")

-        self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any]]]] = []
+        self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
         self.train = train
         np_dtype = np.float32

@@ -112,7 +119,7 @@ class IMGUR5K(AbstractDataset):
             if ann["word"] != "."
         ]
         # (x, y) coordinates of top left, top right, bottom right, bottom left corners
-        box_targets = [cv2.boxPoints(((box[0], box[1]), (box[2], box[3]), box[4])) for box in _boxes]
+        box_targets = [cv2.boxPoints(((box[0], box[1]), (box[2], box[3]), box[4])) for box in _boxes]

         if not use_polygons:
             # xmin, ymin, xmax, ymax
@@ -132,6 +139,8 @@ class IMGUR5K(AbstractDataset):
                     tmp_img = Image.fromarray(crop)
                     tmp_img.save(os.path.join(reco_folder_path, f"{reco_images_counter}.png"))
                     reco_images_counter += 1
+            elif detection_task:
+                self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype)))
             else:
                 self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=labels)))
doctr/datasets/loader.py
CHANGED
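The hunks below drop multithread_exec and the num_workers option from the TensorFlow DataLoader, which now fetches samples sequentially with map(). A sketch of the updated call after this change (my_dataset stands for any dataset exposing __getitem__ and, optionally, collate_fn):

from doctr.datasets.loader import DataLoader

loader = DataLoader(my_dataset, shuffle=True, batch_size=16, drop_last=False)  # num_workers is no longer accepted
for images, targets in loader:
    ...  # training or evaluation step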
@@ -9,8 +9,6 @@ from typing import Callable, Optional
 import numpy as np
 import tensorflow as tf

-from doctr.utils.multithreading import multithread_exec
-
 __all__ = ["DataLoader"]


@@ -47,7 +45,6 @@ class DataLoader:
         shuffle: whether the samples should be shuffled before passing it to the iterator
         batch_size: number of elements in each batch
         drop_last: if `True`, drops the last batch if it isn't full
-        num_workers: number of workers to use for data loading
         collate_fn: function to merge samples into a batch
     """

@@ -57,7 +54,6 @@ class DataLoader:
         shuffle: bool = True,
         batch_size: int = 1,
         drop_last: bool = False,
-        num_workers: Optional[int] = None,
         collate_fn: Optional[Callable] = None,
     ) -> None:
         self.dataset = dataset
@@ -69,7 +65,6 @@ class DataLoader:
             self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, "collate_fn") else default_collate
         else:
             self.collate_fn = collate_fn
-        self.num_workers = num_workers
         self.reset()

     def __len__(self) -> int:
@@ -92,7 +87,7 @@ class DataLoader:
         idx = self._num_yielded * self.batch_size
         indices = self.indices[idx : min(len(self.dataset), idx + self.batch_size)]

-        samples = list(
+        samples = list(map(self.dataset.__getitem__, indices))

         batch_data = self.collate_fn(samples)