python-doctr 0.7.0__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctr/datasets/__init__.py +2 -0
- doctr/datasets/cord.py +6 -4
- doctr/datasets/datasets/base.py +3 -2
- doctr/datasets/datasets/pytorch.py +4 -2
- doctr/datasets/datasets/tensorflow.py +4 -2
- doctr/datasets/detection.py +6 -3
- doctr/datasets/doc_artefacts.py +2 -1
- doctr/datasets/funsd.py +7 -8
- doctr/datasets/generator/base.py +3 -2
- doctr/datasets/generator/pytorch.py +3 -1
- doctr/datasets/generator/tensorflow.py +3 -1
- doctr/datasets/ic03.py +3 -2
- doctr/datasets/ic13.py +2 -1
- doctr/datasets/iiit5k.py +6 -4
- doctr/datasets/iiithws.py +2 -1
- doctr/datasets/imgur5k.py +3 -2
- doctr/datasets/loader.py +4 -2
- doctr/datasets/mjsynth.py +2 -1
- doctr/datasets/ocr.py +2 -1
- doctr/datasets/orientation.py +40 -0
- doctr/datasets/recognition.py +3 -2
- doctr/datasets/sroie.py +2 -1
- doctr/datasets/svhn.py +2 -1
- doctr/datasets/svt.py +3 -2
- doctr/datasets/synthtext.py +2 -1
- doctr/datasets/utils.py +27 -11
- doctr/datasets/vocabs.py +26 -1
- doctr/datasets/wildreceipt.py +111 -0
- doctr/file_utils.py +3 -1
- doctr/io/elements.py +52 -35
- doctr/io/html.py +5 -3
- doctr/io/image/base.py +5 -4
- doctr/io/image/pytorch.py +12 -7
- doctr/io/image/tensorflow.py +11 -6
- doctr/io/pdf.py +5 -4
- doctr/io/reader.py +13 -5
- doctr/models/_utils.py +30 -53
- doctr/models/artefacts/barcode.py +4 -3
- doctr/models/artefacts/face.py +4 -2
- doctr/models/builder.py +58 -43
- doctr/models/classification/__init__.py +1 -0
- doctr/models/classification/magc_resnet/pytorch.py +5 -2
- doctr/models/classification/magc_resnet/tensorflow.py +5 -2
- doctr/models/classification/mobilenet/pytorch.py +16 -4
- doctr/models/classification/mobilenet/tensorflow.py +29 -20
- doctr/models/classification/predictor/pytorch.py +3 -2
- doctr/models/classification/predictor/tensorflow.py +2 -1
- doctr/models/classification/resnet/pytorch.py +23 -13
- doctr/models/classification/resnet/tensorflow.py +33 -26
- doctr/models/classification/textnet/__init__.py +6 -0
- doctr/models/classification/textnet/pytorch.py +275 -0
- doctr/models/classification/textnet/tensorflow.py +267 -0
- doctr/models/classification/vgg/pytorch.py +4 -2
- doctr/models/classification/vgg/tensorflow.py +5 -2
- doctr/models/classification/vit/pytorch.py +9 -3
- doctr/models/classification/vit/tensorflow.py +9 -3
- doctr/models/classification/zoo.py +7 -2
- doctr/models/core.py +1 -1
- doctr/models/detection/__init__.py +1 -0
- doctr/models/detection/_utils/pytorch.py +7 -1
- doctr/models/detection/_utils/tensorflow.py +7 -3
- doctr/models/detection/core.py +9 -3
- doctr/models/detection/differentiable_binarization/base.py +37 -25
- doctr/models/detection/differentiable_binarization/pytorch.py +80 -104
- doctr/models/detection/differentiable_binarization/tensorflow.py +74 -55
- doctr/models/detection/fast/__init__.py +6 -0
- doctr/models/detection/fast/base.py +256 -0
- doctr/models/detection/fast/pytorch.py +442 -0
- doctr/models/detection/fast/tensorflow.py +428 -0
- doctr/models/detection/linknet/base.py +12 -5
- doctr/models/detection/linknet/pytorch.py +28 -15
- doctr/models/detection/linknet/tensorflow.py +68 -88
- doctr/models/detection/predictor/pytorch.py +16 -6
- doctr/models/detection/predictor/tensorflow.py +13 -5
- doctr/models/detection/zoo.py +19 -16
- doctr/models/factory/hub.py +20 -10
- doctr/models/kie_predictor/base.py +2 -1
- doctr/models/kie_predictor/pytorch.py +28 -36
- doctr/models/kie_predictor/tensorflow.py +27 -27
- doctr/models/modules/__init__.py +1 -0
- doctr/models/modules/layers/__init__.py +6 -0
- doctr/models/modules/layers/pytorch.py +166 -0
- doctr/models/modules/layers/tensorflow.py +175 -0
- doctr/models/modules/transformer/pytorch.py +24 -22
- doctr/models/modules/transformer/tensorflow.py +6 -4
- doctr/models/modules/vision_transformer/pytorch.py +2 -4
- doctr/models/modules/vision_transformer/tensorflow.py +2 -4
- doctr/models/obj_detection/faster_rcnn/pytorch.py +4 -2
- doctr/models/predictor/base.py +14 -3
- doctr/models/predictor/pytorch.py +26 -29
- doctr/models/predictor/tensorflow.py +25 -22
- doctr/models/preprocessor/pytorch.py +14 -9
- doctr/models/preprocessor/tensorflow.py +10 -5
- doctr/models/recognition/core.py +4 -1
- doctr/models/recognition/crnn/pytorch.py +23 -16
- doctr/models/recognition/crnn/tensorflow.py +25 -17
- doctr/models/recognition/master/base.py +4 -1
- doctr/models/recognition/master/pytorch.py +20 -9
- doctr/models/recognition/master/tensorflow.py +20 -8
- doctr/models/recognition/parseq/base.py +4 -1
- doctr/models/recognition/parseq/pytorch.py +28 -22
- doctr/models/recognition/parseq/tensorflow.py +22 -11
- doctr/models/recognition/predictor/_utils.py +3 -2
- doctr/models/recognition/predictor/pytorch.py +3 -2
- doctr/models/recognition/predictor/tensorflow.py +2 -1
- doctr/models/recognition/sar/pytorch.py +14 -7
- doctr/models/recognition/sar/tensorflow.py +23 -14
- doctr/models/recognition/utils.py +5 -1
- doctr/models/recognition/vitstr/base.py +4 -1
- doctr/models/recognition/vitstr/pytorch.py +22 -13
- doctr/models/recognition/vitstr/tensorflow.py +21 -10
- doctr/models/recognition/zoo.py +4 -2
- doctr/models/utils/pytorch.py +24 -6
- doctr/models/utils/tensorflow.py +22 -3
- doctr/models/zoo.py +21 -3
- doctr/transforms/functional/base.py +8 -3
- doctr/transforms/functional/pytorch.py +23 -6
- doctr/transforms/functional/tensorflow.py +25 -5
- doctr/transforms/modules/base.py +12 -5
- doctr/transforms/modules/pytorch.py +10 -12
- doctr/transforms/modules/tensorflow.py +17 -9
- doctr/utils/common_types.py +1 -1
- doctr/utils/data.py +4 -2
- doctr/utils/fonts.py +3 -2
- doctr/utils/geometry.py +95 -26
- doctr/utils/metrics.py +36 -22
- doctr/utils/multithreading.py +5 -3
- doctr/utils/repr.py +3 -1
- doctr/utils/visualization.py +31 -8
- doctr/version.py +1 -1
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/METADATA +67 -31
- python_doctr-0.8.1.dist-info/RECORD +173 -0
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/WHEEL +1 -1
- python_doctr-0.7.0.dist-info/RECORD +0 -161
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/LICENSE +0 -0
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/top_level.txt +0 -0
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/zip-safe +0 -0
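
Before the line-level changes below, the file list already tells the release story: a new FAST text-detection family (`doctr/models/detection/fast/*`) with TextNet classification backbones, a WildReceipt dataset loader, new orientation utilities, and a substantial rework of the DBNet loss. As a quick orientation, a hedged smoke-test sketch; the constructor name (`fast_base`) is inferred from the file layout above and is worth double-checking against the 0.8.1 release notes:

```python
import torch
from doctr.models import fast_base  # new in 0.8.x: doctr/models/detection/fast/

model = fast_base(pretrained=False)
input_tensor = torch.rand((1, 3, 1024, 1024), dtype=torch.float32)
out = model(input_tensor, return_preds=True)  # same predictor-style API as DBNet
```

The detailed hunks on this page cover the two differentiable-binarization files, which carry most of the behavioral changes.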
`doctr/models/detection/differentiable_binarization/base.py` (+37 -25):

```diff
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2023, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -22,6 +22,7 @@ class DBPostProcessor(DetectionPostProcessor):
     <https://github.com/xuannianz/DifferentiableBinarization>`_.
 
     Args:
+    ----
         unclip ratio: ratio used to unshrink polygons
         min_size_box: minimal length (pix) to keep a box
         max_candidates: maximum boxes to consider in a single page
@@ -37,7 +38,7 @@ class DBPostProcessor(DetectionPostProcessor):
         assume_straight_pages: bool = True,
     ) -> None:
         super().__init__(box_thresh, bin_thresh, assume_straight_pages)
-        self.unclip_ratio = 1.5 if assume_straight_pages else 2.2
+        self.unclip_ratio = 1.5
 
     def polygon_to_box(
         self,
@@ -46,9 +47,11 @@ class DBPostProcessor(DetectionPostProcessor):
         """Expand a polygon (points) by a factor unclip_ratio, and returns a polygon
 
         Args:
+        ----
             points: The first parameter.
 
         Returns:
+        -------
             a box in absolute coordinates (xmin, ymin, xmax, ymax) or (4, 2) array (quadrangle)
         """
         if not self.assume_straight_pages:
```
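
For context, `polygon_to_box` grows each shrunk text region back to full size with pyclipper, using the DB paper's offset D = A * unclip_ratio / L (polygon area over perimeter). A minimal standalone sketch of that step, assuming `points` is an (N, 2) array of absolute pixel coordinates:

```python
import numpy as np
import pyclipper
from shapely.geometry import Polygon

def unclip(points: np.ndarray, unclip_ratio: float = 1.5) -> np.ndarray:
    """Expand a shrunk text polygon, following DB: distance = area * ratio / perimeter."""
    poly = Polygon(points)
    distance = poly.area * unclip_ratio / poly.length
    offset = pyclipper.PyclipperOffset()
    offset.AddPath([tuple(map(int, pt)) for pt in points], pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
    expanded = offset.Execute(distance)  # may be empty for degenerate inputs
    return np.array(expanded[0]).reshape(-1, 2)
```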
Continuing in `base.py`, box extraction:

```diff
@@ -80,7 +83,7 @@ class DBPostProcessor(DetectionPostProcessor):
         if len(expanded_points) < 1:
             return None  # type: ignore[return-value]
         return (
-            cv2.boundingRect(expanded_points)
+            cv2.boundingRect(expanded_points)  # type: ignore[return-value]
             if self.assume_straight_pages
             else np.roll(cv2.boxPoints(cv2.minAreaRect(expanded_points)), -1, axis=0)
         )
@@ -90,20 +93,22 @@ class DBPostProcessor(DetectionPostProcessor):
         pred: np.ndarray,
         bitmap: np.ndarray,
     ) -> np.ndarray:
-        """Compute boxes from a bitmap/pred_map
+        """Compute boxes from a bitmap/pred_map: find connected components then filter boxes
 
         Args:
+        ----
             pred: Pred map from differentiable binarization output
             bitmap: Bitmap map computed from pred (binarized)
             angle_tol: Comparison tolerance of the angle with the median angle across the page
             ratio_tol: Under this limit aspect ratio, we cannot resolve the direction of the crop
 
         Returns:
+        -------
             np tensor boxes for the bitmap, each box is a 5-element list
                 containing x, y, w, h, score for the box
         """
         height, width = bitmap.shape[:2]
-        min_size_box = 1 + int(height / 512)
+        min_size_box = 2
         boxes: List[Union[np.ndarray, List[float]]] = []
         # get contours from connected components on the bitmap
         contours, _ = cv2.findContours(bitmap.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
```
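
The box-size floor is now a flat 2 pixels instead of scaling with the page height. The surrounding flow, compressed into a toy function (the score here is a crude mean over the bounding rectangle, whereas doctr scores over the contour mask):

```python
import cv2
import numpy as np

def boxes_from_pred(pred: np.ndarray, bin_thresh: float = 0.3, min_size_box: int = 2) -> list:
    """Threshold the probability map, extract connected components, filter tiny boxes."""
    bitmap = (pred >= bin_thresh).astype(np.uint8)
    contours, _ = cv2.findContours(bitmap, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    boxes = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if min(w, h) < min_size_box:  # 0.8.x: fixed floor, no longer 1 + height // 512
            continue
        score = float(pred[y : y + h, x : x + w].mean())
        boxes.append((x, y, w, h, score))
    return boxes
```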
In `_DBNet`, the point-to-segment distance used for the threshold map gains two numerical guards:

```diff
@@ -158,6 +163,7 @@ class _DBNet:
     <https://arxiv.org/pdf/1911.08947.pdf>`_.
 
     Args:
+    ----
         feature extractor: the backbone serving as feature extractor
         fpn_channels: number of channels each extracted feature maps is mapped to
     """
@@ -174,17 +180,20 @@ class _DBNet:
         ys: np.ndarray,
         a: np.ndarray,
         b: np.ndarray,
-        eps: float = 1e-
+        eps: float = 1e-6,
     ) -> float:
         """Compute the distance for each point of the map (xs, ys) to the (a, b) segment
 
         Args:
+        ----
             xs : map of x coordinates (height, width)
             ys : map of y coordinates (height, width)
             a: first point defining the [ab] segment
             b: second point defining the [ab] segment
+            eps: epsilon to avoid division by zero
 
         Returns:
+        -------
             The computed distance
 
         """
@@ -192,9 +201,10 @@ class _DBNet:
         square_dist_2 = np.square(xs - b[0]) + np.square(ys - b[1])
         square_dist = np.square(a[0] - b[0]) + np.square(a[1] - b[1])
         cosin = (square_dist - square_dist_1 - square_dist_2) / (2 * np.sqrt(square_dist_1 * square_dist_2) + eps)
+        cosin = np.clip(cosin, -1.0, 1.0)
         square_sin = 1 - np.square(cosin)
         square_sin = np.nan_to_num(square_sin)
-        result = np.sqrt(square_dist_1 * square_dist_2 * square_sin / square_dist)
+        result = np.sqrt(square_dist_1 * square_dist_2 * square_sin / square_dist + eps)
         result[cosin < 0] = np.sqrt(np.fmin(square_dist_1, square_dist_2))[cosin < 0]
         return result
 
```
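
Both guards matter: with float rounding, the law-of-cosines ratio can land just outside [-1, 1] for points nearly collinear with the segment, so `1 - cosin**2` goes negative and the square root yields NaN; clipping plus the `+ eps` under the root keeps the threshold map finite. A two-line demonstration:

```python
import numpy as np

cosin = np.array([1.0000000000000002])  # rounding pushed it past 1.0
print(np.sqrt(1 - np.square(cosin)))    # [nan] (with a RuntimeWarning)
print(np.sqrt(1 - np.square(np.clip(cosin, -1.0, 1.0))))  # [0.]
```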
Threshold-map drawing and target building:

```diff
@@ -207,6 +217,7 @@ class _DBNet:
         """Draw a polygon treshold map on a canvas, as described in the DB paper
 
         Args:
+        ----
             polygon : array of coord., to draw the boundary of the polygon
             canvas : threshold map to fill with polygons
             mask : mask for training on threshold polygons
@@ -223,7 +234,7 @@ class _DBNet:
         padded_polygon: np.ndarray = np.array(padding.Execute(distance)[0])
 
         # Fill the mask with 1 on the new padded polygon
-        cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
+        cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)  # type: ignore[call-overload]
 
         # Get min/max to recover polygon after distance computation
         xmin = padded_polygon[:, 0].min()
@@ -255,7 +266,10 @@ class _DBNet:
 
         # Fill the canvas with the distances computed inside the valid padded polygon
         canvas[ymin_valid : ymax_valid + 1, xmin_valid : xmax_valid + 1] = np.fmax(
-            1 - distance_map[ymin_valid - ymin : ymax_valid - ymax + height, xmin_valid - xmin : xmax_valid - xmax + width],
+            1
+            - distance_map[
+                ymin_valid - ymin : ymax_valid - ymax + height, xmin_valid - xmin : xmax_valid - xmax + width
+            ],
             canvas[ymin_valid : ymax_valid + 1, xmin_valid : xmax_valid + 1],
         )
 
@@ -264,7 +278,7 @@ class _DBNet:
     def build_target(
         self,
         target: List[Dict[str, np.ndarray]],
-        output_shape: Tuple[int, int, int, int],
+        output_shape: Tuple[int, int, int],
         channels_last: bool = True,
     ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
         if any(t.dtype != np.float32 for tgt in target for t in tgt.values()):
@@ -274,23 +288,24 @@ class _DBNet:
 
         input_dtype = next(iter(target[0].values())).dtype if len(target) > 0 else np.float32
 
+        h: int
+        w: int
         if channels_last:
-            h, w = output_shape
-            target_shape = (output_shape[0], output_shape[-1], h, w)  # (Batch_size, num_classes, h, w)
+            h, w, num_classes = output_shape
         else:
-            h, w = output_shape
-
+            num_classes, h, w = output_shape
+        target_shape = (len(target), num_classes, h, w)
+
         seg_target: np.ndarray = np.zeros(target_shape, dtype=np.uint8)
         seg_mask: np.ndarray = np.ones(target_shape, dtype=bool)
         thresh_target: np.ndarray = np.zeros(target_shape, dtype=np.float32)
-        thresh_mask: np.ndarray = np.
+        thresh_mask: np.ndarray = np.zeros(target_shape, dtype=np.uint8)
 
         for idx, tgt in enumerate(target):
             for class_idx, _tgt in enumerate(tgt.values()):
                 # Draw each polygon on gt
                 if _tgt.shape[0] == 0:
                     # Empty image, full masked
-                    # seg_mask[idx, :, :, class_idx] = False
                     seg_mask[idx, class_idx] = False
 
                 # Absolute bounding boxes
```
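
`build_target` previously mixed batched and per-map shapes; it now takes the shape of a single output map and prepends the batch size itself. A minimal sketch of the unpacking (names hypothetical):

```python
from typing import Tuple

def target_shape(output_shape: Tuple[int, int, int], n_samples: int, channels_last: bool) -> Tuple[int, int, int, int]:
    """Derive the (N, C, H, W) target shape from a single output map's shape."""
    if channels_last:  # TensorFlow layout (H, W, C)
        h, w, num_classes = output_shape
    else:  # PyTorch layout (C, H, W)
        num_classes, h, w = output_shape
    return (n_samples, num_classes, h, w)

assert target_shape((256, 256, 1), 8, True) == (8, 1, 256, 256)
assert target_shape((1, 256, 256), 8, False) == (8, 1, 256, 256)
```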
Polygon shrinking for the segmentation target now validates its output before rasterizing:

```diff
@@ -316,10 +331,9 @@ class _DBNet:
                 )
                 boxes_size = np.minimum(abs_boxes[:, 2] - abs_boxes[:, 0], abs_boxes[:, 3] - abs_boxes[:, 1])
 
-                for box, box_size
+                for poly, box, box_size in zip(polys, abs_boxes, boxes_size):
                     # Mask boxes that are too small
                     if box_size < self.min_size_box:
-                        # seg_mask[idx, box[1] : box[3] + 1, box[0] : box[2] + 1, class_idx] = False
                         seg_mask[idx, class_idx, box[1] : box[3] + 1, box[0] : box[2] + 1] = False
                         continue
 
@@ -329,19 +343,17 @@ class _DBNet:
                     subject = [tuple(coor) for coor in poly]
                     padding = pyclipper.PyclipperOffset()
                     padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-
+                    shrunken = padding.Execute(-distance)
 
                     # Draw polygon on gt if it is valid
-                    if len(
-                    # seg_mask[idx, box[1] : box[3] + 1, box[0] : box[2] + 1, class_idx] = False
+                    if len(shrunken) == 0:
                         seg_mask[idx, class_idx, box[1] : box[3] + 1, box[0] : box[2] + 1] = False
                         continue
-
-                    if
-                    # seg_mask[idx, box[1] : box[3] + 1, box[0] : box[2] + 1, class_idx] = False
+                    shrunken = np.array(shrunken[0]).reshape(-1, 2)
+                    if shrunken.shape[0] <= 2 or not Polygon(shrunken).is_valid:
                         seg_mask[idx, class_idx, box[1] : box[3] + 1, box[0] : box[2] + 1] = False
                         continue
-                    cv2.fillPoly(seg_target[idx, class_idx], [
+                    cv2.fillPoly(seg_target[idx, class_idx], [shrunken.astype(np.int32)], 1.0)  # type: ignore[call-overload]
 
                     # Draw on both thresh map and thresh mask
                     poly, thresh_target[idx, class_idx], thresh_mask[idx, class_idx] = self.draw_thresh_map(
```
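
Target building is the mirror image of post-processing: ground-truth polygons are shrunk with a negative pyclipper offset before rasterization, and 0.8.x adds explicit guards for degenerate results. A sketch of the shrink step, using the DB paper's offset D = A(1 - r^2)/L with r = 0.4 (the ratio is an assumption; the diff does not show where `distance` is computed):

```python
from typing import Optional

import numpy as np
import pyclipper
from shapely.geometry import Polygon

def shrink_polygon(poly: np.ndarray, shrink_ratio: float = 0.4) -> Optional[np.ndarray]:
    """Shrink a text polygon for the segmentation target; None if it degenerates."""
    shape = Polygon(poly)
    distance = shape.area * (1 - shrink_ratio**2) / shape.length
    padding = pyclipper.PyclipperOffset()
    padding.AddPath([tuple(map(int, pt)) for pt in poly], pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
    shrunken = padding.Execute(-distance)  # negative offset shrinks
    if len(shrunken) == 0:  # polygon collapsed entirely
        return None
    shrunken = np.array(shrunken[0]).reshape(-1, 2)
    # the new guard: need at least 3 points and a non-self-intersecting shape
    if shrunken.shape[0] <= 2 or not Polygon(shrunken).is_valid:
        return None
    return shrunken
```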
`doctr/models/detection/differentiable_binarization/pytorch.py` (+80 -104):

```diff
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2023, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -16,10 +16,10 @@ from torchvision.ops.deform_conv import DeformConv2d
 from doctr.file_utils import CLASS_NAME
 
 from ...classification import mobilenet_v3_large
-from ...utils import load_pretrained_params
+from ...utils import _bf16_to_float32, load_pretrained_params
 from .base import DBPostProcessor, _DBNet
 
-__all__ = ["DBNet", "db_resnet50", "db_resnet34", "db_mobilenet_v3_large", "db_resnet50_rotation"]
+__all__ = ["DBNet", "db_resnet50", "db_resnet34", "db_mobilenet_v3_large"]
 
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
@@ -27,25 +27,19 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "input_shape": (3, 1024, 1024),
         "mean": (0.798, 0.785, 0.772),
         "std": (0.264, 0.2749, 0.287),
-        "url": "https://doctr-static.mindee.com/models?id=v0.
+        "url": "https://doctr-static.mindee.com/models?id=v0.7.0/db_resnet50-79bd7d70.pt&src=0",
     },
     "db_resnet34": {
         "input_shape": (3, 1024, 1024),
         "mean": (0.798, 0.785, 0.772),
         "std": (0.264, 0.2749, 0.287),
-        "url":
+        "url": "https://doctr-static.mindee.com/models?id=v0.7.0/db_resnet34-cb6aed9e.pt&src=0",
     },
     "db_mobilenet_v3_large": {
         "input_shape": (3, 1024, 1024),
         "mean": (0.798, 0.785, 0.772),
         "std": (0.264, 0.2749, 0.287),
-        "url": "https://doctr-static.mindee.com/models?id=v0.
-    },
-    "db_resnet50_rotation": {
-        "input_shape": (3, 1024, 1024),
-        "mean": (0.798, 0.785, 0.772),
-        "std": (0.264, 0.2749, 0.287),
-        "url": "https://doctr-static.mindee.com/models?id=v0.4.1/db_resnet50-1138863a.pt&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.7.0/db_mobilenet_v3_large-81e9b152.pt&src=0",
     },
 }
 
@@ -63,28 +57,24 @@ class FeaturePyramidNetwork(nn.Module):
 
         conv_layer = DeformConv2d if deform_conv else nn.Conv2d
 
-        self.in_branches = nn.ModuleList(
-            [
-                nn.Sequential(
-                    conv_layer(chans, out_channels, 1, bias=False),
-                    nn.BatchNorm2d(out_channels),
-                    nn.ReLU(inplace=True),
-                )
-                for idx, chans in enumerate(in_channels)
-            ]
-        )
+        self.in_branches = nn.ModuleList([
+            nn.Sequential(
+                conv_layer(chans, out_channels, 1, bias=False),
+                nn.BatchNorm2d(out_channels),
+                nn.ReLU(inplace=True),
+            )
+            for idx, chans in enumerate(in_channels)
+        ])
         self.upsample = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True)
-        self.out_branches = nn.ModuleList(
-            [
-                nn.Sequential(
-                    conv_layer(out_channels, out_chans, 3, padding=1, bias=False),
-                    nn.BatchNorm2d(out_chans),
-                    nn.ReLU(inplace=True),
-                    nn.Upsample(scale_factor=2**idx, mode="bilinear", align_corners=True),
-                )
-                for idx, chans in enumerate(in_channels)
-            ]
-        )
+        self.out_branches = nn.ModuleList([
+            nn.Sequential(
+                conv_layer(out_channels, out_chans, 3, padding=1, bias=False),
+                nn.BatchNorm2d(out_chans),
+                nn.ReLU(inplace=True),
+                nn.Upsample(scale_factor=2**idx, mode="bilinear", align_corners=True),
+            )
+            for idx, chans in enumerate(in_channels)
+        ])
 
     def forward(self, x: List[torch.Tensor]) -> torch.Tensor:
         if len(x) != len(self.out_branches):
@@ -106,9 +96,12 @@ class DBNet(_DBNet, nn.Module):
     <https://arxiv.org/pdf/1911.08947.pdf>`_.
 
     Args:
+    ----
         feature extractor: the backbone serving as feature extractor
         head_chans: the number of channels in the head
         deform_conv: whether to use deformable convolution
+        bin_thresh: threshold for binarization
+        box_thresh: minimal objectness score to consider a box
         assume_straight_pages: if True, fit straight bounding boxes only
         exportable: onnx exportable returns only logits
         cfg: the configuration dict of the model
@@ -121,6 +114,7 @@ class DBNet(_DBNet, nn.Module):
         head_chans: int = 256,
         deform_conv: bool = False,
         bin_thresh: float = 0.3,
+        box_thresh: float = 0.1,
         assume_straight_pages: bool = True,
         exportable: bool = False,
         cfg: Optional[Dict[str, Any]] = None,
```
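
`box_thresh` joins `bin_thresh` as a constructor argument, and both are forwarded to the post-processor (next hunk), so the detection thresholds can be tuned at build time. A usage sketch:

```python
import torch
from doctr.models import db_resnet50

# both thresholds flow through to the DBPostProcessor
model = db_resnet50(pretrained=False, bin_thresh=0.3, box_thresh=0.1)
out = model(torch.rand((1, 3, 1024, 1024)), return_preds=True)
```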
The post-processor wiring, plus a cast of probability maps out of bfloat16 before they leave torch:

```diff
@@ -169,7 +163,9 @@ class DBNet(_DBNet, nn.Module):
             nn.ConvTranspose2d(head_chans // 4, num_classes, 2, stride=2),
         )
 
-        self.postprocessor = DBPostProcessor(assume_straight_pages=assume_straight_pages, bin_thresh=bin_thresh)
+        self.postprocessor = DBPostProcessor(
+            assume_straight_pages=assume_straight_pages, bin_thresh=bin_thresh, box_thresh=box_thresh
+        )
 
         for n, m in self.named_modules():
             # Don't override the initialization of the backbone
@@ -203,7 +199,7 @@ class DBNet(_DBNet, nn.Module):
             return out
 
         if return_model_output or target is None or return_preds:
-            prob_map = torch.sigmoid(logits)
+            prob_map = _bf16_to_float32(torch.sigmoid(logits))
 
         if return_model_output:
             out["out_map"] = prob_map
```
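
The `_bf16_to_float32` wrapper lands wherever a map is about to leave torch for numpy-based post-processing; its implementation lives in `doctr/models/utils` and is not shown in this diff. A plausible minimal version, since numpy has no bfloat16 dtype:

```python
import torch

def _bf16_to_float32(x: torch.Tensor) -> torch.Tensor:
    # numpy cannot represent bfloat16, so upcast before any .numpy() call
    return x.float() if x.dtype == torch.bfloat16 else x
```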
The loss itself is rewritten around a focal term (bare `-` and `...` markers below stand in for removed lines whose content was not preserved in this view):

```diff
@@ -222,64 +218,72 @@ class DBNet(_DBNet, nn.Module):
 
         return out
 
-    def compute_loss(self, out_map: torch.Tensor, thresh_map: torch.Tensor, target: List[np.ndarray]) -> torch.Tensor:
+    def compute_loss(
+        self,
+        out_map: torch.Tensor,
+        thresh_map: torch.Tensor,
+        target: List[np.ndarray],
+        gamma: float = 2.0,
+        alpha: float = 0.5,
+        eps: float = 1e-8,
+    ) -> torch.Tensor:
         """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes
         and a list of masks for each image. From there it computes the loss with the model output
 
         Args:
+        ----
             out_map: output feature map of the model of shape (N, C, H, W)
             thresh_map: threshold map of shape (N, C, H, W)
             target: list of dictionary where each dict has a `boxes` and a `flags` entry
+            gamma: modulating factor in the focal loss formula
+            alpha: balancing factor in the focal loss formula
+            eps: epsilon factor in dice loss
 
         Returns:
+        -------
             A loss tensor
         """
+        if gamma < 0:
+            raise ValueError("Value of gamma should be greater than or equal to zero.")
 
         prob_map = torch.sigmoid(out_map)
         thresh_map = torch.sigmoid(thresh_map)
 
-        targets = self.build_target(target,
+        targets = self.build_target(target, out_map.shape[1:], False)  # type: ignore[arg-type]
 
         seg_target, seg_mask = torch.from_numpy(targets[0]), torch.from_numpy(targets[1])
         seg_target, seg_mask = seg_target.to(out_map.device), seg_mask.to(out_map.device)
         thresh_target, thresh_mask = torch.from_numpy(targets[2]), torch.from_numpy(targets[3])
         thresh_target, thresh_mask = thresh_target.to(out_map.device), thresh_mask.to(out_map.device)
 
-        # Compute balanced BCE loss for proba_map
-        bce_scale = 5.0
-        balanced_bce_loss = torch.zeros(1, device=out_map.device)
-        dice_loss = torch.zeros(1, device=out_map.device)
-        l1_loss = torch.zeros(1, device=out_map.device)
         if torch.any(seg_mask):
-            ...
-            )
-            ...
-            dice_loss = 1 - 2.0 * inter / union
+            # Focal loss
+            focal_scale = 10.0
+            bce_loss = F.binary_cross_entropy_with_logits(out_map, seg_target, reduction="none")
+
+            p_t = prob_map * seg_target + (1 - prob_map) * (1 - seg_target)
+            alpha_t = alpha * seg_target + (1 - alpha) * (1 - seg_target)
+            # Unreduced version
+            focal_loss = alpha_t * (1 - p_t) ** gamma * bce_loss
+            # Class reduced
+            focal_loss = (seg_mask * focal_loss).sum((0, 1, 2, 3)) / seg_mask.sum((0, 1, 2, 3))
+
+            # Compute dice loss for each class or for approx binary_map
+            if len(self.class_names) > 1:
+                dice_map = torch.softmax(out_map, dim=1)
+            else:
+                # compute binary map instead
+                dice_map = 1 / (1 + torch.exp(-50.0 * (prob_map - thresh_map)))  # type: ignore[assignment]
+            # Class reduced
+            inter = (seg_mask * dice_map * seg_target).sum((0, 2, 3))
+            cardinality = (seg_mask * (dice_map + seg_target)).sum((0, 2, 3))
+            dice_loss = (1 - 2 * inter / (cardinality + eps)).mean()
 
         # Compute l1 loss for thresh_map
-        l1_scale = 10.0
         if torch.any(thresh_mask):
-            l1_loss =
+            l1_loss = (torch.abs(thresh_map - thresh_target) * thresh_mask).sum() / (thresh_mask.sum() + eps)
 
-        return
+        return l1_loss + focal_scale * focal_loss + dice_loss
```
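
The headline change: the old balanced-BCE term is replaced by a focal loss, FL(p_t) = alpha_t * (1 - p_t)^gamma * BCE, and the dice term now works per class with an epsilon-stabilized cardinality. A standalone sketch of the focal term on toy tensors (no doctr imports, same gamma and alpha defaults as above):

```python
import torch
import torch.nn.functional as F

def focal_loss(logits: torch.Tensor, target: torch.Tensor, gamma: float = 2.0, alpha: float = 0.5) -> torch.Tensor:
    """FL(p_t) = alpha_t * (1 - p_t)**gamma * BCE, mean-reduced over all pixels."""
    prob = torch.sigmoid(logits)
    bce = F.binary_cross_entropy_with_logits(logits, target, reduction="none")
    p_t = prob * target + (1 - prob) * (1 - target)        # probability of the true class
    alpha_t = alpha * target + (1 - alpha) * (1 - target)  # class-balancing weight
    return (alpha_t * (1 - p_t) ** gamma * bce).mean()

logits = torch.randn(2, 1, 8, 8)
target = (torch.rand(2, 1, 8, 8) > 0.5).float()
print(focal_loss(logits, target))  # easy pixels are down-weighted by (1 - p_t)**gamma
```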
Finally, the model constructors get docstring touch-ups, and the rotation-specific variant is dropped:

```diff
@@ -337,12 +341,14 @@ def db_resnet34(pretrained: bool = False, **kwargs: Any) -> DBNet:
     >>> out = model(input_tensor)
 
     Args:
+    ----
         pretrained (bool): If True, returns a model pre-trained on our text detection dataset
+        **kwargs: keyword arguments of the DBNet architecture
 
     Returns:
+    -------
         text detection architecture
     """
-
     return _dbnet(
         "db_resnet34",
         pretrained,
@@ -370,12 +376,14 @@ def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet:
     >>> out = model(input_tensor)
 
     Args:
+    ----
         pretrained (bool): If True, returns a model pre-trained on our text detection dataset
+        **kwargs: keyword arguments of the DBNet architecture
 
     Returns:
+    -------
         text detection architecture
     """
-
     return _dbnet(
         "db_resnet50",
         pretrained,
@@ -403,12 +411,14 @@ def db_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> DBNet:
     >>> out = model(input_tensor)
 
     Args:
+    ----
         pretrained (bool): If True, returns a model pre-trained on our text detection dataset
+        **kwargs: keyword arguments of the DBNet architecture
 
     Returns:
+    -------
         text detection architecture
     """
-
     return _dbnet(
         "db_mobilenet_v3_large",
         pretrained,
@@ -423,37 +433,3 @@ def db_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> DBNet:
         ],
         **kwargs,
     )
-
-
-def db_resnet50_rotation(pretrained: bool = False, **kwargs: Any) -> DBNet:
-    """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization"
-    <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone.
-    This model is trained with rotated documents
-
-    >>> import torch
-    >>> from doctr.models import db_resnet50_rotation
-    >>> model = db_resnet50_rotation(pretrained=True)
-    >>> input_tensor = torch.rand((1, 3, 1024, 1024), dtype=torch.float32)
-    >>> out = model(input_tensor)
-
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on our text detection dataset
-
-    Returns:
-        text detection architecture
-    """
-
-    return _dbnet(
-        "db_resnet50_rotation",
-        pretrained,
-        resnet50,
-        ["layer1", "layer2", "layer3", "layer4"],
-        None,
-        ignore_keys=[
-            "prob_head.6.weight",
-            "prob_head.6.bias",
-            "thresh_head.6.weight",
-            "thresh_head.6.bias",
-        ],
-        **kwargs,
-    )
```
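
With `db_resnet50_rotation` removed, rotated documents are handled in 0.8.x by the regular checkpoints combined with rotation-aware post-processing rather than a dedicated model. A hedged migration sketch using the public zoo API:

```python
from doctr.models import detection_predictor

# 0.7.0: model = db_resnet50_rotation(pretrained=True)
# 0.8.x: use the standard checkpoint and ask the predictor for rotated boxes
predictor = detection_predictor(
    "db_resnet50",
    pretrained=True,
    assume_straight_pages=False,  # fit rotated (4, 2) polygons instead of straight boxes
)
```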