yomitoku 0.5.3__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
- yomitoku/cli/main.py +47 -1
- yomitoku/configs/__init__.py +2 -0
- yomitoku/configs/cfg_text_recognizer_parseq_small.py +51 -0
- yomitoku/document_analyzer.py +229 -26
- yomitoku/export/export_csv.py +39 -2
- yomitoku/export/export_html.py +2 -1
- yomitoku/export/export_json.py +40 -2
- yomitoku/export/export_markdown.py +2 -1
- yomitoku/layout_analyzer.py +1 -5
- yomitoku/layout_parser.py +58 -4
- yomitoku/models/layers/rtdetr_backbone.py +5 -15
- yomitoku/models/layers/rtdetr_hybrid_encoder.py +6 -18
- yomitoku/models/layers/rtdetrv2_decoder.py +17 -42
- yomitoku/models/parseq.py +9 -9
- yomitoku/ocr.py +24 -27
- yomitoku/onnx/.gitkeep +0 -0
- yomitoku/postprocessor/rtdetr_postprocessor.py +4 -13
- yomitoku/table_structure_recognizer.py +79 -9
- yomitoku/text_detector.py +57 -7
- yomitoku/text_recognizer.py +80 -16
- yomitoku/utils/misc.py +20 -13
- yomitoku/utils/visualizer.py +5 -5
- {yomitoku-0.5.3.dist-info → yomitoku-0.7.0.dist-info}/METADATA +21 -9
- {yomitoku-0.5.3.dist-info → yomitoku-0.7.0.dist-info}/RECORD +26 -24
- {yomitoku-0.5.3.dist-info → yomitoku-0.7.0.dist-info}/WHEEL +1 -1
- {yomitoku-0.5.3.dist-info → yomitoku-0.7.0.dist-info}/entry_points.txt +0 -0
yomitoku/layout_parser.py
CHANGED
@@ -1,11 +1,16 @@
 from typing import List, Union

 import cv2
+import os
+import onnx
+import onnxruntime
 import torch
 import torchvision.transforms as T
 from PIL import Image
 from pydantic import conlist

+from .constants import ROOT_DIR
+
 from .base import BaseModelCatalog, BaseModule, BaseSchema
 from .configs import LayoutParserRTDETRv2Config
 from .models import RTDETRv2
@@ -91,6 +96,7 @@ class LayoutParser(BaseModule):
         device="cuda",
         visualize=False,
         from_pretrained=True,
+        infer_onnx=False,
     ):
         super().__init__()
         self.load_model(model_name, path_cfg, from_pretrained)
@@ -98,7 +104,6 @@ class LayoutParser(BaseModule):
         self.visualize = visualize

         self.model.eval()
-        self.model.to(self.device)

         self.postprocessor = RTDETRPostProcessor(
             num_classes=self._cfg.RTDETRTransformerv2.num_classes,
@@ -119,11 +124,49 @@ class LayoutParser(BaseModule):
         }

         self.role = self._cfg.role
+        self.infer_onnx = infer_onnx
+        if infer_onnx:
+            name = self._cfg.hf_hub_repo.split("/")[-1]
+            path_onnx = f"{ROOT_DIR}/onnx/{name}.onnx"
+            if not os.path.exists(path_onnx):
+                self.convert_onnx(path_onnx)
+
+            self.model = None
+
+            model = onnx.load(path_onnx)
+            if torch.cuda.is_available() and device == "cuda":
+                self.sess = onnxruntime.InferenceSession(
+                    model.SerializeToString(), providers=["CUDAExecutionProvider"]
+                )
+            else:
+                self.sess = onnxruntime.InferenceSession(model.SerializeToString())
+
+        if self.model is not None:
+            self.model.to(self.device)
+
+    def convert_onnx(self, path_onnx):
+        dynamic_axes = {
+            "input": {0: "batch_size"},
+            "output": {0: "batch_size"},
+        }
+
+        img_size = self._cfg.data.img_size
+        dummy_input = torch.randn(1, 3, *img_size, requires_grad=True)
+
+        torch.onnx.export(
+            self.model,
+            dummy_input,
+            path_onnx,
+            opset_version=16,
+            input_names=["input"],
+            output_names=["pred_logits", "pred_boxes"],
+            dynamic_axes=dynamic_axes,
+        )

     def preprocess(self, img):
         cv_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
         img = Image.fromarray(cv_img)
-        img_tensor = self.transforms(img)[None]
+        img_tensor = self.transforms(img)[None]
         return img_tensor

     def postprocess(self, preds, image_size):
@@ -175,8 +218,19 @@ class LayoutParser(BaseModule):
         ori_h, ori_w = img.shape[:2]
         img_tensor = self.preprocess(img)

-
-
+        if self.infer_onnx:
+            input = img_tensor.numpy()
+            results = self.sess.run(None, {"input": input})
+            preds = {
+                "pred_logits": torch.tensor(results[0]).to(self.device),
+                "pred_boxes": torch.tensor(results[1]).to(self.device),
+            }
+
+        else:
+            with torch.inference_mode():
+                img_tensor = img_tensor.to(self.device)
+                preds = self.model(img_tensor)
+
         results = self.postprocess(preds, (ori_h, ori_w))

         vis = None
yomitoku/models/layers/rtdetr_backbone.py
CHANGED
@@ -59,9 +59,7 @@ class ConvNormLayer(nn.Module):
 class BasicBlock(nn.Module):
     expansion = 1

-    def __init__(
-        self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
-    ):
+    def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
         super().__init__()

         self.shortcut = shortcut
@@ -100,9 +98,7 @@ class BasicBlock(nn.Module):
 class BottleNeck(nn.Module):
     expansion = 4

-    def __init__(
-        self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
-    ):
+    def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
         super().__init__()

         if variant == "a":
@@ -125,17 +121,13 @@ class BottleNeck(nn.Module):
                             ("pool", nn.AvgPool2d(2, 2, 0, ceil_mode=True)),
                             (
                                 "conv",
-                                ConvNormLayer(
-                                    ch_in, ch_out * self.expansion, 1, 1
-                                ),
+                                ConvNormLayer(ch_in, ch_out * self.expansion, 1, 1),
                             ),
                         ]
                     )
                 )
             else:
-                self.short = ConvNormLayer(
-                    ch_in, ch_out * self.expansion, 1, stride
-                )
+                self.short = ConvNormLayer(ch_in, ch_out * self.expansion, 1, stride)

         self.act = nn.Identity() if act is None else get_activation(act)

@@ -156,9 +148,7 @@ class BottleNeck(nn.Module):


 class Blocks(nn.Module):
-    def __init__(
-        self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"
-    ):
+    def __init__(self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"):
         super().__init__()

         self.blocks = nn.ModuleList()
yomitoku/models/layers/rtdetr_hybrid_encoder.py
CHANGED
@@ -252,9 +252,7 @@ class HybridEncoder(nn.Module):
         for in_channel in in_channels:
             if version == "v1":
                 proj = nn.Sequential(
-                    nn.Conv2d(
-                        in_channel, hidden_dim, kernel_size=1, bias=False
-                    ),
+                    nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False),
                     nn.BatchNorm2d(hidden_dim),
                 )
             elif version == "v2":
@@ -290,9 +288,7 @@ class HybridEncoder(nn.Module):

         self.encoder = nn.ModuleList(
             [
-                TransformerEncoder(
-                    copy.deepcopy(encoder_layer), num_encoder_layers
-                )
+                TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers)
                 for _ in range(len(use_encoder_idx))
             ]
         )
@@ -347,9 +343,7 @@ class HybridEncoder(nn.Module):
            # self.register_buffer(f'pos_embed{idx}', pos_embed)

    @staticmethod
-    def build_2d_sincos_position_embedding(
-        w, h, embed_dim=256, temperature=10000.0
-    ):
+    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
        """ """
        grid_w = torch.arange(int(w), dtype=torch.float32)
        grid_h = torch.arange(int(h), dtype=torch.float32)
@@ -387,9 +381,7 @@ class HybridEncoder(nn.Module):
                    src_flatten.device
                )

-                memory: torch.Tensor = self.encoder[i](
-                    src_flatten, pos_embed=pos_embed
-                )
+                memory: torch.Tensor = self.encoder[i](src_flatten, pos_embed=pos_embed)
                proj_feats[enc_ind] = (
                    memory.permute(0, 2, 1)
                    .reshape(-1, self.hidden_dim, h, w)
@@ -401,13 +393,9 @@ class HybridEncoder(nn.Module):
        for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_heigh = inner_outs[0]
            feat_low = proj_feats[idx - 1]
-            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](
-                feat_heigh
-            )
+            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_heigh)
            inner_outs[0] = feat_heigh
-            upsample_feat = F.interpolate(
-                feat_heigh, scale_factor=2.0, mode="nearest"
-            )
+            upsample_feat = F.interpolate(feat_heigh, scale_factor=2.0, mode="nearest")
            inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](
                torch.concat([upsample_feat, feat_low], dim=1)
            )
yomitoku/models/layers/rtdetrv2_decoder.py
CHANGED
@@ -40,9 +40,7 @@ def inverse_sigmoid(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:


 class MLP(nn.Module):
-    def __init__(
-        self, input_dim, hidden_dim, output_dim, num_layers, act="relu"
-    ):
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act="relu"):
         super().__init__()
         self.num_layers = num_layers
         h = [hidden_dim] * (num_layers - 1)
@@ -193,9 +191,7 @@ class MSDeformableAttention(nn.Module):
         elif reference_points.shape[-1] == 4:
             # reference_points [8, 480, None, 1, 4]
             # sampling_offsets [8, 480, 8, 12, 2]
-            num_points_scale = self.num_points_scale.to(
-                dtype=query.dtype
-            ).unsqueeze(-1)
+            num_points_scale = self.num_points_scale.to(dtype=query.dtype).unsqueeze(-1)
             offset = (
                 sampling_offsets
                 * num_points_scale
@@ -330,9 +326,7 @@ def deformable_attention_core_func_v2(
     _, Len_q, _, _, _ = sampling_locations.shape

     split_shape = [h * w for h, w in value_spatial_shapes]
-    value_list = (
-        value.permute(0, 2, 3, 1).flatten(0, 1).split(split_shape, dim=-1)
-    )
+    value_list = value.permute(0, 2, 3, 1).flatten(0, 1).split(split_shape, dim=-1)

     # sampling_offsets [8, 480, 8, 12, 2]
     if method == "default":
@@ -361,8 +355,7 @@ def deformable_attention_core_func_v2(
     elif method == "discrete":
         # n * m, seq, n, 2
         sampling_coord = (
-            sampling_grid_l * torch.tensor([[w, h]], device=value.device)
-            + 0.5
+            sampling_grid_l * torch.tensor([[w, h]], device=value.device) + 0.5
         ).to(torch.int64)

         # FIX ME? for rectangle input
@@ -389,9 +382,7 @@ def deformable_attention_core_func_v2(
     attn_weights = attention_weights.permute(0, 2, 1, 3).reshape(
         bs * n_head, 1, Len_q, sum(num_points_list)
     )
-    weighted_sample_locs = (
-        torch.concat(sampling_value_list, dim=-1) * attn_weights
-    )
+    weighted_sample_locs = torch.concat(sampling_value_list, dim=-1) * attn_weights
     output = weighted_sample_locs.sum(-1).reshape(bs, n_head * c, Len_q)

     return output.permute(0, 2, 1)
@@ -606,9 +597,7 @@ class RTDETRTransformerv2(nn.Module):
                 [
                     (
                         "conv",
-                        nn.Conv2d(
-                            in_channels, self.hidden_dim, 1, bias=False
-                        ),
+                        nn.Conv2d(in_channels, self.hidden_dim, 1, bias=False),
                     ),
                     (
                         "norm",
@@ -689,13 +678,9 @@ class RTDETRTransformerv2(nn.Module):
                 torch.arange(h), torch.arange(w), indexing="ij"
             )
             grid_xy = torch.stack([grid_x, grid_y], dim=-1)
-            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / torch.tensor(
-                [w, h], dtype=dtype
-            )
+            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / torch.tensor([w, h], dtype=dtype)
             wh = torch.ones_like(grid_xy) * grid_size * (2.0**lvl)
-            lvl_anchors = torch.concat([grid_xy, wh], dim=-1).reshape(
-                -1, h * w, 4
-            )
+            lvl_anchors = torch.concat([grid_xy, wh], dim=-1).reshape(-1, h * w, 4)
             anchors.append(lvl_anchors)

         anchors = torch.concat(anchors, dim=1).to(device)
@@ -729,22 +714,18 @@ class RTDETRTransformerv2(nn.Module):
         )

         enc_topk_bboxes_list, enc_topk_logits_list = [], []
-        enc_topk_memory, enc_topk_logits, enc_topk_bbox_unact = (
-
-
-
-
-            self.num_queries,
-        )
+        enc_topk_memory, enc_topk_logits, enc_topk_bbox_unact = self._select_topk(
+            output_memory,
+            enc_outputs_logits,
+            enc_outputs_coord_unact,
+            self.num_queries,
         )

         # if self.num_select_queries != self.num_queries:
         #     raise NotImplementedError('')

         if self.learn_query_content:
-            content = self.tgt_embed.weight.unsqueeze(0).tile(
-                [memory.shape[0], 1, 1]
-            )
+            content = self.tgt_embed.weight.unsqueeze(0).tile([memory.shape[0], 1, 1])
         else:
             content = enc_topk_memory.detach()

@@ -771,9 +752,7 @@ class RTDETRTransformerv2(nn.Module):
         topk: int,
     ):
         if self.query_select_method == "default":
-            _, topk_ind = torch.topk(
-                outputs_logits.max(-1).values, topk, dim=-1
-            )
+            _, topk_ind = torch.topk(outputs_logits.max(-1).values, topk, dim=-1)

         elif self.query_select_method == "one2many":
             _, topk_ind = torch.topk(outputs_logits.flatten(1), topk, dim=-1)
@@ -786,16 +765,12 @@ class RTDETRTransformerv2(nn.Module):

         topk_coords = outputs_coords_unact.gather(
             dim=1,
-            index=topk_ind.unsqueeze(-1).repeat(
-                1, 1, outputs_coords_unact.shape[-1]
-            ),
+            index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_coords_unact.shape[-1]),
         )

         topk_logits = outputs_logits.gather(
             dim=1,
-            index=topk_ind.unsqueeze(-1).repeat(
-                1, 1, outputs_logits.shape[-1]
-            ),
+            index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_logits.shape[-1]),
         )

         topk_memory = memory.gather(
yomitoku/models/parseq.py
CHANGED
@@ -22,7 +22,6 @@ from huggingface_hub import PyTorchModelHubMixin
 from timm.models.helpers import named_apply
 from torch import Tensor

-from ..postprocessor import ParseqTokenizer as Tokenizer
 from .layers.parseq_transformer import Decoder, Encoder, TokenEmbedding


@@ -123,7 +122,6 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):

     def forward(
         self,
-        tokenizer: Tokenizer,
         images: Tensor,
         max_length: Optional[int] = None,
     ) -> Tensor:
@@ -150,11 +148,11 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
         if self.decode_ar:
             tgt_in = torch.full(
                 (bs, num_steps),
-                tokenizer.pad_id,
+                self.tokenizer.pad_id,
                 dtype=torch.long,
                 device=self._device,
             )
-            tgt_in[:, 0] = tokenizer.bos_id
+            tgt_in[:, 0] = self.tokenizer.bos_id

             logits = []
             for i in range(num_steps):
@@ -177,7 +175,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
                     # greedy decode. add the next token index to the target input
                     tgt_in[:, j] = p_i.squeeze().argmax(-1)
                     # Efficient batch decoding: If all output words have at least one EOS token, end decoding.
-                    if testing and (tgt_in == tokenizer.eos_id).any(dim=-1).all():
+                    if testing and (tgt_in == self.tokenizer.eos_id).any(dim=-1).all():
                         break

             logits = torch.cat(logits, dim=1)
@@ -185,7 +183,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
             # No prior context, so input is just <bos>. We query all positions.
             tgt_in = torch.full(
                 (bs, 1),
-                tokenizer.bos_id,
+                self.tokenizer.bos_id,
                 dtype=torch.long,
                 device=self._device,
             )
@@ -200,7 +198,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
                 torch.ones(
                     num_steps,
                     num_steps,
-                    dtype=torch.
+                    dtype=torch.int64,
                     device=self._device,
                 ),
                 2,
@@ -208,7 +206,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
             ] = 0
             bos = torch.full(
                 (bs, 1),
-                tokenizer.bos_id,
+                self.tokenizer.bos_id,
                 dtype=torch.long,
                 device=self._device,
             )
@@ -216,7 +214,9 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
             # Prior context is the previous output.
             tgt_in = torch.cat([bos, logits[:, :-1].argmax(-1)], dim=1)
             # Mask tokens beyond the first EOS token.
-            tgt_padding_mask = (tgt_in == tokenizer.eos_id).int().cumsum(
+            tgt_padding_mask = (tgt_in == self.tokenizer.eos_id).int().cumsum(
+                -1
+            ) > 0
             tgt_out = self.decode(
                 tgt_in,
                 memory,
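The forward() change above drops the explicit tokenizer argument in favor of the model's own self.tokenizer. A minimal sketch of the call-site difference, assuming model is an already-constructed PARSeq carrying its tokenizer and images is a preprocessed batch tensor (both names are illustrative):

import torch

def run_parseq(model: torch.nn.Module, images: torch.Tensor) -> torch.Tensor:
    # 0.5.3 call sites passed the tokenizer explicitly: model(tokenizer, images)
    # 0.7.0 reads model.tokenizer internally, so only the image batch is passed.
    return model(images)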
yomitoku/ocr.py
CHANGED
@@ -16,16 +16,37 @@ class WordPrediction(BaseSchema):
     )
     content: str
     direction: str
-    det_score: float
     rec_score: float
+    det_score: float


 class OCRSchema(BaseSchema):
     words: List[WordPrediction]


+def ocr_aggregate(det_outputs, rec_outputs):
+    words = []
+    for points, det_score, pred, rec_score, direction in zip(
+        det_outputs.points,
+        det_outputs.scores,
+        rec_outputs.contents,
+        rec_outputs.scores,
+        rec_outputs.directions,
+    ):
+        words.append(
+            {
+                "points": points,
+                "content": pred,
+                "direction": direction,
+                "det_score": det_score,
+                "rec_score": rec_score,
+            }
+        )
+    return words
+
+
 class OCR:
-    def __init__(self, configs=
+    def __init__(self, configs={}, device="cuda", visualize=False):
         text_detector_kwargs = {
             "device": device,
             "visualize": visualize,
@@ -36,10 +57,6 @@ class OCR:
         }

         if isinstance(configs, dict):
-            assert (
-                "text_detector" in configs or "text_recognizer" in configs
-            ), "Invalid config key. Please check the config keys."
-
             if "text_detector" in configs:
                 text_detector_kwargs.update(configs["text_detector"])
             if "text_recognizer" in configs:
@@ -52,26 +69,6 @@ class OCR:
         self.detector = TextDetector(**text_detector_kwargs)
         self.recognizer = TextRecognizer(**text_recognizer_kwargs)

-    def aggregate(self, det_outputs, rec_outputs):
-        words = []
-        for points, det_score, pred, rec_score, direction in zip(
-            det_outputs.points,
-            det_outputs.scores,
-            rec_outputs.contents,
-            rec_outputs.scores,
-            rec_outputs.directions,
-        ):
-            words.append(
-                {
-                    "points": points,
-                    "content": pred,
-                    "direction": direction,
-                    "det_score": det_score,
-                    "rec_score": rec_score,
-                }
-            )
-        return words
-
     def __call__(self, img):
         """_summary_

@@ -82,6 +79,6 @@ class OCR:
         det_outputs, vis = self.detector(img)
         rec_outputs, vis = self.recognizer(img, det_outputs.points, vis=vis)

-        outputs = {"words":
+        outputs = {"words": ocr_aggregate(det_outputs, rec_outputs)}
         results = OCRSchema(**outputs)
         return results, vis
yomitoku/onnx/.gitkeep
ADDED
File without changes
yomitoku/postprocessor/rtdetr_postprocessor.py
CHANGED
@@ -54,16 +54,12 @@ class RTDETRPostProcessor(nn.Module):
         logits, boxes = outputs["pred_logits"], outputs["pred_boxes"]
         # orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)

-        bbox_pred = torchvision.ops.box_convert(
-            boxes, in_fmt="cxcywh", out_fmt="xyxy"
-        )
+        bbox_pred = torchvision.ops.box_convert(boxes, in_fmt="cxcywh", out_fmt="xyxy")
         bbox_pred *= orig_target_sizes.repeat(1, 2).unsqueeze(1)

         if self.use_focal_loss:
             scores = F.sigmoid(logits)
-            scores, index = torch.topk(
-                scores.flatten(1), self.num_top_queries, dim=-1
-            )
+            scores, index = torch.topk(scores.flatten(1), self.num_top_queries, dim=-1)
             # TODO for older tensorrt
             # labels = index % self.num_classes
             labels = mod(index, self.num_classes)
@@ -77,9 +73,7 @@ class RTDETRPostProcessor(nn.Module):
             scores = F.softmax(logits)[:, :, :-1]
             scores, labels = scores.max(dim=-1)
             if scores.shape[1] > self.num_top_queries:
-                scores, index = torch.topk(
-                    scores, self.num_top_queries, dim=-1
-                )
+                scores, index = torch.topk(scores, self.num_top_queries, dim=-1)
                 labels = torch.gather(labels, dim=1, index=index)
                 boxes = torch.gather(
                     boxes,
@@ -97,10 +91,7 @@ class RTDETRPostProcessor(nn.Module):

         labels = (
             torch.tensor(
-                [
-                    mscoco_label2category[int(x.item())]
-                    for x in labels.flatten()
-                ]
+                [mscoco_label2category[int(x.item())] for x in labels.flatten()]
             )
             .to(boxes.device)
             .reshape(labels.shape)