yomitoku 0.4.0.post1.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yomitoku/__init__.py +20 -0
- yomitoku/base.py +136 -0
- yomitoku/cli/__init__.py +0 -0
- yomitoku/cli/main.py +230 -0
- yomitoku/configs/__init__.py +13 -0
- yomitoku/configs/cfg_layout_parser_rtdtrv2.py +89 -0
- yomitoku/configs/cfg_table_structure_recognizer_rtdtrv2.py +80 -0
- yomitoku/configs/cfg_text_detector_dbnet.py +49 -0
- yomitoku/configs/cfg_text_recognizer_parseq.py +51 -0
- yomitoku/constants.py +32 -0
- yomitoku/data/__init__.py +3 -0
- yomitoku/data/dataset.py +40 -0
- yomitoku/data/functions.py +279 -0
- yomitoku/document_analyzer.py +315 -0
- yomitoku/export/__init__.py +6 -0
- yomitoku/export/export_csv.py +71 -0
- yomitoku/export/export_html.py +188 -0
- yomitoku/export/export_json.py +34 -0
- yomitoku/export/export_markdown.py +145 -0
- yomitoku/layout_analyzer.py +66 -0
- yomitoku/layout_parser.py +189 -0
- yomitoku/models/__init__.py +9 -0
- yomitoku/models/dbnet_plus.py +272 -0
- yomitoku/models/layers/__init__.py +0 -0
- yomitoku/models/layers/activate.py +38 -0
- yomitoku/models/layers/dbnet_feature_attention.py +160 -0
- yomitoku/models/layers/parseq_transformer.py +218 -0
- yomitoku/models/layers/rtdetr_backbone.py +333 -0
- yomitoku/models/layers/rtdetr_hybrid_encoder.py +433 -0
- yomitoku/models/layers/rtdetrv2_decoder.py +811 -0
- yomitoku/models/parseq.py +243 -0
- yomitoku/models/rtdetr.py +22 -0
- yomitoku/ocr.py +87 -0
- yomitoku/postprocessor/__init__.py +9 -0
- yomitoku/postprocessor/dbnet_postporcessor.py +137 -0
- yomitoku/postprocessor/parseq_tokenizer.py +128 -0
- yomitoku/postprocessor/rtdetr_postprocessor.py +107 -0
- yomitoku/reading_order.py +214 -0
- yomitoku/resource/MPLUS1p-Medium.ttf +0 -0
- yomitoku/resource/charset.txt +1 -0
- yomitoku/table_structure_recognizer.py +244 -0
- yomitoku/text_detector.py +103 -0
- yomitoku/text_recognizer.py +128 -0
- yomitoku/utils/__init__.py +0 -0
- yomitoku/utils/graph.py +20 -0
- yomitoku/utils/logger.py +15 -0
- yomitoku/utils/misc.py +102 -0
- yomitoku/utils/visualizer.py +179 -0
- yomitoku-0.4.0.post1.dev0.dist-info/METADATA +127 -0
- yomitoku-0.4.0.post1.dev0.dist-info/RECORD +52 -0
- yomitoku-0.4.0.post1.dev0.dist-info/WHEEL +4 -0
- yomitoku-0.4.0.post1.dev0.dist-info/entry_points.txt +2 -0
yomitoku/models/parseq.py
ADDED
@@ -0,0 +1,243 @@
# Scene Text Recognition Model Hub
# Copyright 2022 Darwin Bautista
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import partial
from typing import Optional, Sequence

import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin
from timm.models.helpers import named_apply
from torch import Tensor

from ..postprocessor import ParseqTokenizer as Tokenizer
from .layers.parseq_transformer import Decoder, Encoder, TokenEmbedding


def init_weights(
    module: nn.Module, name: str = "", exclude: Sequence[str] = ()
):
    """Initialize the weights using the typical initialization schemes used in SOTA models."""
    if any(map(name.startswith, exclude)):
        return
    if isinstance(module, nn.Linear):
        nn.init.trunc_normal_(module.weight, std=0.02)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
        nn.init.trunc_normal_(module.weight, std=0.02)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.Conv2d):
        nn.init.kaiming_normal_(
            module.weight, mode="fan_out", nonlinearity="relu"
        )
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d, nn.GroupNorm)):
        nn.init.ones_(module.weight)
        nn.init.zeros_(module.bias)


class PARSeq(nn.Module, PyTorchModelHubMixin):
    def __init__(
        self,
        cfg,
    ) -> None:
        super().__init__()
        self.cfg = cfg
        self.max_label_length = self.cfg.max_label_length
        self.decode_ar = self.cfg.decode_ar
        self.refine_iters = self.cfg.refine_iters
        embed_dim = self.cfg.decoder.embed_dim

        self.encoder = Encoder(
            self.cfg.data.img_size,
            **self.cfg.encoder,
        )

        self.decoder = Decoder(
            norm=nn.LayerNorm(self.cfg.decoder.embed_dim),
            cfg=self.cfg.decoder,
        )

        # We don't predict <bos> nor <pad>
        self.head = nn.Linear(embed_dim, self.cfg.num_tokens - 2)
        self.text_embed = TokenEmbedding(self.cfg.num_tokens, embed_dim)

        # +1 for <eos>
        self.pos_queries = nn.Parameter(
            torch.Tensor(1, self.max_label_length + 1, embed_dim)
        )
        self.dropout = nn.Dropout()
        # Encoder has its own init.
        named_apply(partial(init_weights, exclude=["encoder"]), self)
        nn.init.trunc_normal_(self.pos_queries, std=0.02)

    @property
    def _device(self) -> torch.device:
        return next(self.head.parameters(recurse=False)).device

    @torch.jit.ignore
    def no_weight_decay(self):
        param_names = {"text_embed.embedding.weight", "pos_queries"}
        enc_param_names = {
            "encoder." + n for n in self.encoder.no_weight_decay()
        }
        return param_names.union(enc_param_names)

    def encode(self, img: torch.Tensor):
        return self.encoder(img)

    def decode(
        self,
        tgt: torch.Tensor,
        memory: torch.Tensor,
        tgt_mask: Optional[Tensor] = None,
        tgt_padding_mask: Optional[Tensor] = None,
        tgt_query: Optional[Tensor] = None,
        tgt_query_mask: Optional[Tensor] = None,
    ):
        N, L = tgt.shape
        # <bos> stands for the null context. We only supply position information for characters after <bos>.
        null_ctx = self.text_embed(tgt[:, :1])
        tgt_emb = self.pos_queries[:, : L - 1] + self.text_embed(tgt[:, 1:])
        tgt_emb = self.dropout(torch.cat([null_ctx, tgt_emb], dim=1))
        if tgt_query is None:
            tgt_query = self.pos_queries[:, :L].expand(N, -1, -1)
        tgt_query = self.dropout(tgt_query)
        return self.decoder(
            tgt_query,
            tgt_emb,
            memory,
            tgt_query_mask,
            tgt_mask,
            tgt_padding_mask,
        )

    def forward(
        self,
        tokenizer: Tokenizer,
        images: Tensor,
        max_length: Optional[int] = None,
    ) -> Tensor:
        testing = max_length is None
        max_length = (
            self.max_label_length
            if max_length is None
            else min(max_length, self.max_label_length)
        )
        bs = images.shape[0]
        # +1 for <eos> at end of sequence.
        num_steps = max_length + 1
        memory = self.encode(images)

        # Query positions up to `num_steps`
        pos_queries = self.pos_queries[:, :num_steps].expand(bs, -1, -1)

        # Special case for the forward permutation. Faster than using `generate_attn_masks()`
        tgt_mask = query_mask = torch.triu(
            torch.ones(
                (num_steps, num_steps), dtype=torch.bool, device=self._device
            ),
            1,
        )

        if self.decode_ar:
            tgt_in = torch.full(
                (bs, num_steps),
                tokenizer.pad_id,
                dtype=torch.long,
                device=self._device,
            )
            tgt_in[:, 0] = tokenizer.bos_id

            logits = []
            for i in range(num_steps):
                j = i + 1  # next token index
                # Efficient decoding:
                # Input the context up to the ith token. We use only one query (at position = i) at a time.
                # This works because of the lookahead masking effect of the canonical (forward) AR context.
                # Past tokens have no access to future tokens, hence are fixed once computed.
                tgt_out = self.decode(
                    tgt_in[:, :j],
                    memory,
                    tgt_mask[:j, :j],
                    tgt_query=pos_queries[:, i:j],
                    tgt_query_mask=query_mask[i:j, :j],
                )
                # the next token probability is in the output's ith token position
                p_i = self.head(tgt_out)
                logits.append(p_i)
                if j < num_steps:
                    # greedy decode. add the next token index to the target input
                    tgt_in[:, j] = p_i.squeeze().argmax(-1)
                    # Efficient batch decoding: If all output words have at least one EOS token, end decoding.
                    if (
                        testing
                        and (tgt_in == tokenizer.eos_id).any(dim=-1).all()
                    ):
                        break

            logits = torch.cat(logits, dim=1)
        else:
            # No prior context, so input is just <bos>. We query all positions.
            tgt_in = torch.full(
                (bs, 1),
                tokenizer.bos_id,
                dtype=torch.long,
                device=self._device,
            )
            tgt_out = self.decode(tgt_in, memory, tgt_query=pos_queries)
            logits = self.head(tgt_out)

        if self.refine_iters:
            # For iterative refinement, we always use a 'cloze' mask.
            # We can derive it from the AR forward mask by unmasking the token context to the right.
            query_mask[
                torch.triu(
                    torch.ones(
                        num_steps,
                        num_steps,
                        dtype=torch.bool,
                        device=self._device,
                    ),
                    2,
                )
            ] = 0
            bos = torch.full(
                (bs, 1),
                tokenizer.bos_id,
                dtype=torch.long,
                device=self._device,
            )
            for i in range(self.refine_iters):
                # Prior context is the previous output.
                tgt_in = torch.cat([bos, logits[:, :-1].argmax(-1)], dim=1)
                # Mask tokens beyond the first EOS token.
                tgt_padding_mask = (tgt_in == tokenizer.eos_id).int().cumsum(
                    -1
                ) > 0
                tgt_out = self.decode(
                    tgt_in,
                    memory,
                    tgt_mask,
                    tgt_padding_mask,
                    pos_queries,
                    query_mask[:, : tgt_in.shape[1]],
                )
                logits = self.head(tgt_out)

        return logits
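To make the masking logic in `forward()` concrete, here is a minimal standalone sketch of the two boolean masks the model builds with `torch.triu` (the value of `num_steps` is arbitrary for illustration, and the "True = blocked" reading assumes the usual PyTorch attention-mask convention):

import torch

num_steps = 5  # illustrative; in the model this is max_label_length + 1
# Forward AR mask: query position i may only attend to context positions <= i.
ar_mask = torch.triu(torch.ones(num_steps, num_steps, dtype=torch.bool), 1)
# Cloze mask for iterative refinement: derived from the AR mask by unmasking the
# context to the right, leaving only the first superdiagonal blocked, i.e. each
# query position sees every context token except the one it is predicting.
cloze_mask = ar_mask.clone()
cloze_mask[torch.triu(torch.ones(num_steps, num_steps, dtype=torch.bool), 2)] = 0
print(ar_mask.int())
print(cloze_mask.int())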
yomitoku/models/rtdetr.py
ADDED
@@ -0,0 +1,22 @@
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin

from .layers.rtdetr_backbone import PResNet
from .layers.rtdetr_hybrid_encoder import HybridEncoder
from .layers.rtdetrv2_decoder import RTDETRTransformerv2


class RTDETRv2(nn.Module, PyTorchModelHubMixin):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.backbone = PResNet(**cfg.PResNet)
        self.encoder = HybridEncoder(**cfg.HybridEncoder)
        self.decoder = RTDETRTransformerv2(**cfg.RTDETRTransformerv2)

    def forward(self, x, targets=None):
        x = self.backbone(x)
        x = self.encoder(x)
        x = self.decoder(x, targets)

        return x
yomitoku/ocr.py
ADDED
@@ -0,0 +1,87 @@
from typing import List

from pydantic import conlist

from yomitoku.text_detector import TextDetector
from yomitoku.text_recognizer import TextRecognizer

from .base import BaseSchema


class WordPrediction(BaseSchema):
    points: conlist(
        conlist(int, min_length=2, max_length=2),
        min_length=4,
        max_length=4,
    )
    content: str
    direction: str
    det_score: float
    rec_score: float


class OCRSchema(BaseSchema):
    words: List[WordPrediction]


class OCR:
    def __init__(self, configs=None, device="cuda", visualize=False):
        text_detector_kwargs = {
            "device": device,
            "visualize": visualize,
        }
        text_recognizer_kwargs = {
            "device": device,
            "visualize": visualize,
        }

        if isinstance(configs, dict):
            assert (
                "text_detector" in configs or "text_recognizer" in configs
            ), "Invalid config key. Please check the config keys."

            if "text_detector" in configs:
                text_detector_kwargs.update(configs["text_detector"])
            if "text_recognizer" in configs:
                text_recognizer_kwargs.update(configs["text_recognizer"])
        else:
            raise ValueError(
                "configs must be a dict. See the https://kotaro-kinoshita.github.io/yomitoku-dev/usage/"
            )

        self.detector = TextDetector(**text_detector_kwargs)
        self.recognizer = TextRecognizer(**text_recognizer_kwargs)

    def aggregate(self, det_outputs, rec_outputs):
        words = []
        for points, det_score, pred, rec_score, direction in zip(
            det_outputs.points,
            det_outputs.scores,
            rec_outputs.contents,
            rec_outputs.scores,
            rec_outputs.directions,
        ):
            words.append(
                {
                    "points": points,
                    "content": pred,
                    "direction": direction,
                    "det_score": det_score,
                    "rec_score": rec_score,
                }
            )
        return words

    def __call__(self, img):
        """_summary_

        Args:
            img (np.ndarray): cv2 image(BGR)
        """

        det_outputs, vis = self.detector(img)
        rec_outputs, vis = self.recognizer(img, det_outputs.points, vis=vis)

        outputs = {"words": self.aggregate(det_outputs, rec_outputs)}
        results = OCRSchema(**outputs)
        return results, vis
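A hedged usage sketch of the `OCR` wrapper above. The image path, device choice, and empty per-module configs are assumptions for illustration only; `TextDetector` and `TextRecognizer` fetch their model weights at construction time, and the linked documentation remains the authoritative usage guide.

import cv2
from yomitoku.ocr import OCR  # import path mirrors the file location above

img = cv2.imread("sample.jpg")  # hypothetical input; __call__ expects a BGR cv2 image

# configs must be a dict keyed by "text_detector" and/or "text_recognizer";
# empty dicts here simply keep each sub-module's defaults.
ocr = OCR(
    configs={"text_detector": {}, "text_recognizer": {}},
    device="cpu",  # "cuda" is the default in the constructor
    visualize=False,
)
results, vis = ocr(img)
for word in results.words:
    print(word.content, word.direction, word.det_score, word.rec_score)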
yomitoku/postprocessor/dbnet_postporcessor.py
ADDED
@@ -0,0 +1,137 @@
import cv2
import numpy as np
import pyclipper
from shapely.geometry import Polygon


class DBnetPostProcessor:
    def __init__(
        self, min_size, thresh, box_thresh, max_candidates, unclip_ratio
    ):
        self.min_size = min_size
        self.thresh = thresh
        self.box_thresh = box_thresh
        self.max_candidates = max_candidates
        self.unclip_ratio = unclip_ratio

    def __call__(self, preds, image_size):
        """
        pred:
            binary: text region segmentation map, with shape (N, H, W)
            thresh: [if exists] thresh hold prediction with shape (N, H, W)
            thresh_binary: [if exists] binarized with threshhold, (N, H, W)
        """
        pred = preds["binary"][0]
        segmentation = self.binarize(pred)[0]
        height, width = image_size
        quads, scores = self.boxes_from_bitmap(
            pred, segmentation, width, height
        )
        return quads, scores

    def binarize(self, pred):
        return pred > self.thresh

    def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
        """
        _bitmap: single map with shape (H, W),
            whose values are binarized as {0, 1}
        """

        assert len(_bitmap.shape) == 2
        bitmap = _bitmap.cpu().numpy()  # The first channel

        pred = pred.cpu().detach().numpy()[0]
        height, width = bitmap.shape
        contours, _ = cv2.findContours(
            (bitmap * 255).astype(np.uint8),
            cv2.RETR_LIST,
            cv2.CHAIN_APPROX_SIMPLE,
        )

        num_contours = min(len(contours), self.max_candidates)

        boxes = []
        scores = []
        for index in range(num_contours):
            contour = contours[index].squeeze(1)
            points, sside = self.get_mini_boxes(contour)

            if sside < self.min_size:
                continue
            points = np.array(points)
            score = self.box_score_fast(pred, contour)

            if self.box_thresh > score:
                continue

            box = self.unclip(points, unclip_ratio=self.unclip_ratio).reshape(
                -1, 1, 2
            )
            box, sside = self.get_mini_boxes(box)
            if sside < self.min_size + 2:
                continue
            box = np.array(box)
            if not isinstance(dest_width, int):
                dest_width = dest_width.item()
                dest_height = dest_height.item()

            box[:, 0] = np.clip(
                np.round(box[:, 0] / width * dest_width), 0, dest_width
            )
            box[:, 1] = np.clip(
                np.round(box[:, 1] / height * dest_height), 0, dest_height
            )

            boxes.append(box.astype(np.int16).tolist())
            scores.append(score)

        return boxes, scores

    def unclip(self, box, unclip_ratio=1.5):
        poly = Polygon(box)
        distance = poly.area * unclip_ratio / poly.length
        offset = pyclipper.PyclipperOffset()
        offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
        expanded = np.array(offset.Execute(distance))
        return expanded

    def get_mini_boxes(self, contour):
        bounding_box = cv2.minAreaRect(contour)
        points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])

        index_1, index_2, index_3, index_4 = 0, 1, 2, 3
        if points[1][1] > points[0][1]:
            index_1 = 0
            index_4 = 1
        else:
            index_1 = 1
            index_4 = 0
        if points[3][1] > points[2][1]:
            index_2 = 2
            index_3 = 3
        else:
            index_2 = 3
            index_3 = 2

        box = [
            points[index_1],
            points[index_2],
            points[index_3],
            points[index_4],
        ]
        return box, min(bounding_box[1])

    def box_score_fast(self, bitmap, _box):
        h, w = bitmap.shape[:2]
        box = _box.copy()
        xmin = np.clip(np.floor(box[:, 0].min()).astype(int), 0, w - 1)
        xmax = np.clip(np.ceil(box[:, 0].max()).astype(int), 0, w - 1)
        ymin = np.clip(np.floor(box[:, 1].min()).astype(int), 0, h - 1)
        ymax = np.clip(np.ceil(box[:, 1].max()).astype(int), 0, h - 1)

        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
        box[:, 0] = box[:, 0] - xmin
        box[:, 1] = box[:, 1] - ymin
        cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
        return cv2.mean(bitmap[ymin : ymax + 1, xmin : xmax + 1], mask)[0]
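A minimal sketch of driving the post-processor above with a synthetic probability map. The threshold and size parameters are placeholders (the packaged defaults live in cfg_text_detector_dbnet.py and are not shown here), and the import path simply mirrors the file path above.

import torch
from yomitoku.postprocessor.dbnet_postporcessor import DBnetPostProcessor

pp = DBnetPostProcessor(
    min_size=3,          # placeholder values, for illustration only
    thresh=0.3,
    box_thresh=0.5,
    max_candidates=1000,
    unclip_ratio=1.5,
)
# "binary" stands in for the DBNet probability map, shape (N, 1, H, W).
preds = {"binary": torch.zeros(1, 1, 320, 320)}
preds["binary"][0, 0, 100:140, 60:260] = 0.9  # one synthetic text region
# Quads are rescaled from the map resolution to the original image size.
quads, scores = pp(preds, image_size=(640, 640))
print(quads, scores)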
yomitoku/postprocessor/parseq_tokenizer.py
ADDED
@@ -0,0 +1,128 @@
# Scene Text Recognition Model Hub
# Copyright 2022 Darwin Bautista
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod
from typing import Optional

import torch
from torch import Tensor
from torch.nn.utils.rnn import pad_sequence


class BaseTokenizer(ABC):
    def __init__(
        self,
        charset: str,
        specials_first: tuple = (),
        specials_last: tuple = (),
    ) -> None:
        self._itos = specials_first + tuple(charset) + specials_last
        self._stoi = {s: i for i, s in enumerate(self._itos)}

    def __len__(self):
        return len(self._itos)

    def _tok2ids(self, tokens: str) -> list[int]:
        return [self._stoi[s] for s in tokens]

    def _ids2tok(self, token_ids: list[int], join: bool = True) -> str:
        tokens = [self._itos[i] for i in token_ids]
        return "".join(tokens) if join else tokens

    @abstractmethod
    def encode(
        self, labels: list[str], device: Optional[torch.device] = None
    ) -> Tensor:
        """Encode a batch of labels to a representation suitable for the model.

        Args:
            labels: List of labels. Each can be of arbitrary length.
            device: Create tensor on this device.

        Returns:
            Batched tensor representation padded to the max label length. Shape: N, L
        """
        raise NotImplementedError

    @abstractmethod
    def _filter(self, probs: Tensor, ids: Tensor) -> tuple[Tensor, list[int]]:
        """Internal method which performs the necessary filtering prior to decoding."""
        raise NotImplementedError

    def decode(
        self, token_dists: Tensor, raw: bool = False
    ) -> tuple[list[str], list[Tensor]]:
        """Decode a batch of token distributions.

        Args:
            token_dists: softmax probabilities over the token distribution. Shape: N, L, C
            raw: return unprocessed labels (will return list of list of strings)

        Returns:
            list of string labels (arbitrary length) and
            their corresponding sequence probabilities as a list of Tensors
        """
        batch_tokens = []
        batch_probs = []
        for dist in token_dists:
            probs, ids = dist.max(-1)  # greedy selection
            if not raw:
                probs, ids = self._filter(probs, ids)
            tokens = self._ids2tok(ids, not raw)
            probs = probs.cpu().numpy()
            probs = float(probs.prod())
            batch_tokens.append(tokens)
            batch_probs.append(probs)
        return batch_tokens, batch_probs


class ParseqTokenizer(BaseTokenizer):
    BOS = "[B]"
    EOS = "[E]"
    PAD = "[P]"

    def __init__(self, charset: str) -> None:
        specials_first = (self.EOS,)
        specials_last = (self.BOS, self.PAD)
        super().__init__(charset, specials_first, specials_last)
        self.eos_id, self.bos_id, self.pad_id = [
            self._stoi[s] for s in specials_first + specials_last
        ]

    def encode(
        self, labels: list[str], device: Optional[torch.device] = None
    ) -> Tensor:
        batch = [
            torch.as_tensor(
                [self.bos_id] + self._tok2ids(y) + [self.eos_id],
                dtype=torch.long,
                device=device,
            )
            for y in labels
        ]
        return pad_sequence(batch, batch_first=True, padding_value=self.pad_id)

    def _filter(self, probs: Tensor, ids: Tensor) -> tuple[Tensor, list[int]]:
        ids = ids.tolist()
        try:
            eos_idx = ids.index(self.eos_id)
        except ValueError:
            eos_idx = len(ids)  # Nothing to truncate.
        # Truncate after EOS
        ids = ids[:eos_idx]
        probs = probs[
            : eos_idx + 1
        ]  # but include prob. for EOS (if it exists)
        return probs, ids
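A self-contained sketch of the tokenizer round trip. The charset string below is an arbitrary example rather than the packaged charset.txt, and the random tensor merely stands in for PARSeq's softmax output; the import matches the re-export used by parseq.py (`from ..postprocessor import ParseqTokenizer`).

import torch
from yomitoku.postprocessor import ParseqTokenizer

tokenizer = ParseqTokenizer(charset="abcdefghijklmnopqrstuvwxyz0123456789")
ids = tokenizer.encode(["yomitoku", "ocr"])  # (N, L) with <bos>/<eos> added and <pad> filling
dists = torch.rand(2, 12, len(tokenizer)).softmax(-1)  # stand-in model output, shape (N, L, C)
labels, probs = tokenizer.decode(dists)  # greedy decode, truncated at the first <eos>
print(ids.shape, labels, probs)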