yomitoku 0.4.1__py3-none-any.whl → 0.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. yomitoku/base.py +1 -1
  2. yomitoku/cli/main.py +219 -27
  3. yomitoku/configs/__init__.py +2 -0
  4. yomitoku/configs/cfg_text_detector_dbnet.py +1 -1
  5. yomitoku/configs/cfg_text_recognizer_parseq_small.py +51 -0
  6. yomitoku/data/functions.py +48 -23
  7. yomitoku/document_analyzer.py +243 -41
  8. yomitoku/export/__init__.py +18 -5
  9. yomitoku/export/export_csv.py +71 -2
  10. yomitoku/export/export_html.py +46 -12
  11. yomitoku/export/export_json.py +66 -3
  12. yomitoku/export/export_markdown.py +42 -6
  13. yomitoku/layout_analyzer.py +2 -9
  14. yomitoku/layout_parser.py +58 -4
  15. yomitoku/models/dbnet_plus.py +13 -39
  16. yomitoku/models/layers/activate.py +13 -0
  17. yomitoku/models/layers/rtdetr_backbone.py +18 -17
  18. yomitoku/models/layers/rtdetr_hybrid_encoder.py +19 -20
  19. yomitoku/models/layers/rtdetrv2_decoder.py +14 -1
  20. yomitoku/models/parseq.py +15 -22
  21. yomitoku/ocr.py +24 -27
  22. yomitoku/onnx/.gitkeep +0 -0
  23. yomitoku/postprocessor/dbnet_postporcessor.py +15 -14
  24. yomitoku/postprocessor/parseq_tokenizer.py +1 -3
  25. yomitoku/postprocessor/rtdetr_postprocessor.py +14 -1
  26. yomitoku/table_structure_recognizer.py +82 -9
  27. yomitoku/text_detector.py +57 -7
  28. yomitoku/text_recognizer.py +84 -16
  29. yomitoku/utils/misc.py +21 -14
  30. yomitoku/utils/visualizer.py +15 -8
  31. {yomitoku-0.4.1.dist-info → yomitoku-0.7.4.dist-info}/METADATA +34 -41
  32. yomitoku-0.7.4.dist-info/RECORD +54 -0
  33. {yomitoku-0.4.1.dist-info → yomitoku-0.7.4.dist-info}/WHEEL +1 -1
  34. yomitoku-0.4.1.dist-info/RECORD +0 -52
  35. {yomitoku-0.4.1.dist-info → yomitoku-0.7.4.dist-info}/entry_points.txt +0 -0
yomitoku/export/export_markdown.py CHANGED
@@ -1,10 +1,11 @@
+import os
 import re
+
 import cv2
-import os


 def escape_markdown_special_chars(text):
-    special_chars = r"([`*_{}[\]()#+.!|-])"
+    special_chars = r"([`*{}[\]()#+!~|-])"
     return re.sub(special_chars, r"\\\1", text)


@@ -75,6 +76,8 @@ def figure_to_md(
     width=200,
     figure_dir="figures",
 ):
+    assert img is not None, "img is required for saving figures"
+
     elements = []
     for i, figure in enumerate(figures):
         x1, y1, x2, y2 = map(int, figure.box)
@@ -108,11 +111,11 @@ def figure_to_md(
     return elements


-def export_markdown(
+def convert_markdown(
     inputs,
-    out_path: str,
+    out_path,
+    ignore_line_break=False,
     img=None,
-    ignore_line_break: bool = False,
     export_figure_letter=False,
     export_figure=True,
     figure_width=200,
@@ -140,6 +143,39 @@ def export_markdown(

     elements = sorted(elements, key=lambda x: x["order"])
     markdown = "\n".join([element["md"] for element in elements])
+    return markdown, elements

-    with open(out_path, "w", encoding="utf-8") as f:
+
+def export_markdown(
+    inputs,
+    out_path: str,
+    ignore_line_break: bool = False,
+    img=None,
+    export_figure_letter=False,
+    export_figure=True,
+    figure_width=200,
+    figure_dir="figures",
+    encoding: str = "utf-8",
+):
+    markdown, elements = convert_markdown(
+        inputs,
+        out_path,
+        ignore_line_break,
+        img,
+        export_figure_letter,
+        export_figure,
+        figure_width,
+        figure_dir,
+    )
+
+    save_markdown(markdown, out_path, encoding)
+    return markdown
+
+
+def save_markdown(
+    markdown,
+    out_path,
+    encoding,
+):
+    with open(out_path, "w", encoding=encoding, errors="ignore") as f:
         f.write(markdown)
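A minimal usage sketch of the reworked exporter (hypothetical, not taken from the diff): convert_markdown now builds the markdown string and element list without writing the .md file, export_markdown gains an encoding keyword, and save_markdown performs the actual write. The names `results` (an analyzer output) and `img` (the source image) are assumptions.

    from yomitoku.export.export_markdown import convert_markdown, export_markdown

    # Build the markdown in memory; the .md file is not written by this call.
    markdown, elements = convert_markdown(results, "out.md", img=img)

    # One-shot export: convert, then write with the new encoding argument
    # (unencodable characters are dropped because of errors="ignore").
    export_markdown(results, "out.md", img=img, encoding="utf-8")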
yomitoku/layout_analyzer.py CHANGED
@@ -15,7 +15,7 @@ class LayoutAnalyzerSchema(BaseSchema):


 class LayoutAnalyzer:
-    def __init__(self, configs=None, device="cuda", visualize=False):
+    def __init__(self, configs={}, device="cuda", visualize=False):
         layout_parser_kwargs = {
             "device": device,
             "visualize": visualize,
@@ -26,11 +26,6 @@ class LayoutAnalyzer:
         }

         if isinstance(configs, dict):
-            assert (
-                "layout_parser" in configs
-                or "table_structure_recognizer" in configs
-            ), "Invalid config key. Please check the config keys."
-
             if "layout_parser" in configs:
                 layout_parser_kwargs.update(configs["layout_parser"])

@@ -53,9 +48,7 @@ class LayoutAnalyzer:
     def __call__(self, img):
         layout_results, vis = self.layout_parser(img)
         table_boxes = [table.box for table in layout_results.tables]
-        table_results, vis = self.table_structure_recognizer(
-            img, table_boxes, vis=vis
-        )
+        table_results, vis = self.table_structure_recognizer(img, table_boxes, vis=vis)

         results = LayoutAnalyzerSchema(
             paragraphs=layout_results.paragraphs,
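A short sketch (hypothetical) of what the relaxed constructor accepts: with the key assertion removed, a configs dict that carries only one of the two sections, or an empty dict, no longer raises.

    from yomitoku.layout_analyzer import LayoutAnalyzer

    # Only the "layout_parser" section is supplied; in 0.4.1 a configs dict with
    # neither "layout_parser" nor "table_structure_recognizer" raised an
    # AssertionError, while unknown or missing sections are now simply ignored.
    analyzer = LayoutAnalyzer(
        configs={"layout_parser": {"device": "cpu"}},
        device="cpu",
        visualize=False,
    )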
yomitoku/layout_parser.py CHANGED
@@ -1,11 +1,16 @@
 from typing import List, Union

 import cv2
+import os
+import onnx
+import onnxruntime
 import torch
 import torchvision.transforms as T
 from PIL import Image
 from pydantic import conlist

+from .constants import ROOT_DIR
+
 from .base import BaseModelCatalog, BaseModule, BaseSchema
 from .configs import LayoutParserRTDETRv2Config
 from .models import RTDETRv2
@@ -91,6 +96,7 @@ class LayoutParser(BaseModule):
         device="cuda",
         visualize=False,
         from_pretrained=True,
+        infer_onnx=False,
     ):
         super().__init__()
         self.load_model(model_name, path_cfg, from_pretrained)
@@ -98,7 +104,6 @@ class LayoutParser(BaseModule):
         self.visualize = visualize

         self.model.eval()
-        self.model.to(self.device)

         self.postprocessor = RTDETRPostProcessor(
             num_classes=self._cfg.RTDETRTransformerv2.num_classes,
@@ -119,11 +124,49 @@ class LayoutParser(BaseModule):
         }

         self.role = self._cfg.role
+        self.infer_onnx = infer_onnx
+        if infer_onnx:
+            name = self._cfg.hf_hub_repo.split("/")[-1]
+            path_onnx = f"{ROOT_DIR}/onnx/{name}.onnx"
+            if not os.path.exists(path_onnx):
+                self.convert_onnx(path_onnx)
+
+            self.model = None
+
+            model = onnx.load(path_onnx)
+            if torch.cuda.is_available() and device == "cuda":
+                self.sess = onnxruntime.InferenceSession(
+                    model.SerializeToString(), providers=["CUDAExecutionProvider"]
+                )
+            else:
+                self.sess = onnxruntime.InferenceSession(model.SerializeToString())
+
+        if self.model is not None:
+            self.model.to(self.device)
+
+    def convert_onnx(self, path_onnx):
+        dynamic_axes = {
+            "input": {0: "batch_size"},
+            "output": {0: "batch_size"},
+        }
+
+        img_size = self._cfg.data.img_size
+        dummy_input = torch.randn(1, 3, *img_size, requires_grad=True)
+
+        torch.onnx.export(
+            self.model,
+            dummy_input,
+            path_onnx,
+            opset_version=16,
+            input_names=["input"],
+            output_names=["pred_logits", "pred_boxes"],
+            dynamic_axes=dynamic_axes,
+        )

     def preprocess(self, img):
         cv_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
         img = Image.fromarray(cv_img)
-        img_tensor = self.transforms(img)[None].to(self.device)
+        img_tensor = self.transforms(img)[None]
         return img_tensor

     def postprocess(self, preds, image_size):
@@ -175,8 +218,19 @@ class LayoutParser(BaseModule):
         ori_h, ori_w = img.shape[:2]
         img_tensor = self.preprocess(img)

-        with torch.inference_mode():
-            preds = self.model(img_tensor)
+        if self.infer_onnx:
+            input = img_tensor.numpy()
+            results = self.sess.run(None, {"input": input})
+            preds = {
+                "pred_logits": torch.tensor(results[0]).to(self.device),
+                "pred_boxes": torch.tensor(results[1]).to(self.device),
+            }
+
+        else:
+            with torch.inference_mode():
+                img_tensor = img_tensor.to(self.device)
+                preds = self.model(img_tensor)
+
         results = self.postprocess(preds, (ori_h, ori_w))

         vis = None
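A hypothetical sketch of the new ONNX path, based only on the code above: with infer_onnx=True the constructor exports the PyTorch weights once to {ROOT_DIR}/onnx/<repo-name>.onnx via convert_onnx() and then serves predictions through onnxruntime, picking the CUDA execution provider only when CUDA is available and requested. The input image and the call pattern are assumptions.

    import cv2
    from yomitoku.layout_parser import LayoutParser

    # CPU inference through onnxruntime; the .onnx file is created on first use.
    parser = LayoutParser(device="cpu", visualize=False, infer_onnx=True)

    img = cv2.imread("page.png")  # hypothetical input (BGR image as read by OpenCV)
    results, vis = parser(img)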
yomitoku/models/dbnet_plus.py CHANGED
@@ -20,9 +20,7 @@ class BackboneBase(nn.Module):
             "layer4": "layer4",
         }

-        self.body = IntermediateLayerGetter(
-            backbone, return_layers=return_layers
-        )
+        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)

     def forward(self, tensor):
         xs = self.body(tensor)
@@ -57,18 +55,10 @@ class DBNetDecoder(nn.Module):
         self.training = True
         self.input_proj = nn.ModuleDict(
             {
-                "layer1": nn.Conv2d(
-                    in_channels[0], self.d_model, 1, bias=False
-                ),
-                "layer2": nn.Conv2d(
-                    in_channels[1], self.d_model, 1, bias=False
-                ),
-                "layer3": nn.Conv2d(
-                    in_channels[2], self.d_model, 1, bias=False
-                ),
-                "layer4": nn.Conv2d(
-                    in_channels[3], self.d_model, 1, bias=False
-                ),
+                "layer1": nn.Conv2d(in_channels[0], self.d_model, 1, bias=False),
+                "layer2": nn.Conv2d(in_channels[1], self.d_model, 1, bias=False),
+                "layer3": nn.Conv2d(in_channels[2], self.d_model, 1, bias=False),
+                "layer4": nn.Conv2d(in_channels[3], self.d_model, 1, bias=False),
             }
         )

@@ -89,9 +79,7 @@ class DBNetDecoder(nn.Module):
                     padding=1,
                     bias=False,
                 ),
-                nn.Upsample(
-                    scale_factor=2, mode="bilinear", align_corners=False
-                ),
+                nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False),
             ),
             "layer3": nn.Sequential(
                 nn.Conv2d(
@@ -101,9 +89,7 @@ class DBNetDecoder(nn.Module):
                     padding=1,
                     bias=False,
                 ),
-                nn.Upsample(
-                    scale_factor=4, mode="bilinear", align_corners=False
-                ),
+                nn.Upsample(scale_factor=4, mode="bilinear", align_corners=False),
             ),
             "layer4": nn.Sequential(
                 nn.Conv2d(
@@ -113,17 +99,13 @@ class DBNetDecoder(nn.Module):
                     padding=1,
                     bias=False,
                 ),
-                nn.Upsample(
-                    scale_factor=4, mode="bilinear", align_corners=False
-                ),
+                nn.Upsample(scale_factor=4, mode="bilinear", align_corners=False),
             ),
         }
         )

         self.binarize = nn.Sequential(
-            nn.Conv2d(
-                self.d_model, self.d_model // 4, 3, padding=1, bias=False
-            ),
+            nn.Conv2d(self.d_model, self.d_model // 4, 3, padding=1, bias=False),
             nn.BatchNorm2d(self.d_model // 4),
             nn.ReLU(inplace=True),
             nn.ConvTranspose2d(self.d_model // 4, self.d_model // 4, 2, 2),
@@ -166,16 +148,12 @@ class DBNetDecoder(nn.Module):
                 m.weight.data.fill_(1.0)
                 m.bias.data.fill_(1e-4)

-    def _init_thresh(
-        self, inner_channels, serial=False, smooth=False, bias=False
-    ):
+    def _init_thresh(self, inner_channels, serial=False, smooth=False, bias=False):
         in_channels = inner_channels
         if serial:
             in_channels += 1
         self.thresh = nn.Sequential(
-            nn.Conv2d(
-                in_channels, inner_channels // 4, 3, padding=1, bias=bias
-            ),
+            nn.Conv2d(in_channels, inner_channels // 4, 3, padding=1, bias=bias),
             nn.BatchNorm2d(inner_channels // 4),
             nn.ReLU(inplace=True),
             self._init_upsample(
@@ -186,16 +164,12 @@ class DBNetDecoder(nn.Module):
             ),
             nn.BatchNorm2d(inner_channels // 4),
             nn.ReLU(inplace=True),
-            self._init_upsample(
-                inner_channels // 4, 1, smooth=smooth, bias=bias
-            ),
+            self._init_upsample(inner_channels // 4, 1, smooth=smooth, bias=bias),
             nn.Sigmoid(),
         )
         return self.thresh

-    def _init_upsample(
-        self, in_channels, out_channels, smooth=False, bias=False
-    ):
+    def _init_upsample(self, in_channels, out_channels, smooth=False, bias=False):
         if smooth:
             inter_out_channels = out_channels
             if out_channels == 1:
yomitoku/models/layers/activate.py CHANGED
@@ -1,3 +1,16 @@
+# Copyright(c) 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import torch.nn as nn


yomitoku/models/layers/rtdetr_backbone.py CHANGED
@@ -1,5 +1,16 @@
-"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
-"""
+# Copyright 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

 from collections import OrderedDict

@@ -48,9 +59,7 @@ class ConvNormLayer(nn.Module):
 class BasicBlock(nn.Module):
     expansion = 1

-    def __init__(
-        self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
-    ):
+    def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
         super().__init__()

         self.shortcut = shortcut
@@ -89,9 +98,7 @@ class BasicBlock(nn.Module):
 class BottleNeck(nn.Module):
     expansion = 4

-    def __init__(
-        self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
-    ):
+    def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
         super().__init__()

         if variant == "a":
@@ -114,17 +121,13 @@ class BottleNeck(nn.Module):
                             ("pool", nn.AvgPool2d(2, 2, 0, ceil_mode=True)),
                             (
                                 "conv",
-                                ConvNormLayer(
-                                    ch_in, ch_out * self.expansion, 1, 1
-                                ),
+                                ConvNormLayer(ch_in, ch_out * self.expansion, 1, 1),
                             ),
                         ]
                     )
                 )
             else:
-                self.short = ConvNormLayer(
-                    ch_in, ch_out * self.expansion, 1, stride
-                )
+                self.short = ConvNormLayer(ch_in, ch_out * self.expansion, 1, stride)

         self.act = nn.Identity() if act is None else get_activation(act)

@@ -145,9 +148,7 @@


 class Blocks(nn.Module):
-    def __init__(
-        self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"
-    ):
+    def __init__(self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"):
         super().__init__()

         self.blocks = nn.ModuleList()
yomitoku/models/layers/rtdetr_hybrid_encoder.py CHANGED
@@ -1,5 +1,16 @@
-"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
-"""
+# Copyright 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

 import copy
 from collections import OrderedDict
@@ -241,9 +252,7 @@ class HybridEncoder(nn.Module):
         for in_channel in in_channels:
             if version == "v1":
                 proj = nn.Sequential(
-                    nn.Conv2d(
-                        in_channel, hidden_dim, kernel_size=1, bias=False
-                    ),
+                    nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False),
                     nn.BatchNorm2d(hidden_dim),
                 )
             elif version == "v2":
@@ -279,9 +288,7 @@ class HybridEncoder(nn.Module):

         self.encoder = nn.ModuleList(
             [
-                TransformerEncoder(
-                    copy.deepcopy(encoder_layer), num_encoder_layers
-                )
+                TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers)
                 for _ in range(len(use_encoder_idx))
             ]
         )
@@ -336,9 +343,7 @@ class HybridEncoder(nn.Module):
             # self.register_buffer(f'pos_embed{idx}', pos_embed)

     @staticmethod
-    def build_2d_sincos_position_embedding(
-        w, h, embed_dim=256, temperature=10000.0
-    ):
+    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
         """ """
         grid_w = torch.arange(int(w), dtype=torch.float32)
         grid_h = torch.arange(int(h), dtype=torch.float32)
@@ -376,9 +381,7 @@ class HybridEncoder(nn.Module):
                         src_flatten.device
                     )

-                memory: torch.Tensor = self.encoder[i](
-                    src_flatten, pos_embed=pos_embed
-                )
+                memory: torch.Tensor = self.encoder[i](src_flatten, pos_embed=pos_embed)
                 proj_feats[enc_ind] = (
                     memory.permute(0, 2, 1)
                     .reshape(-1, self.hidden_dim, h, w)
@@ -390,13 +393,9 @@ class HybridEncoder(nn.Module):
         for idx in range(len(self.in_channels) - 1, 0, -1):
             feat_heigh = inner_outs[0]
             feat_low = proj_feats[idx - 1]
-            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](
-                feat_heigh
-            )
+            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_heigh)
             inner_outs[0] = feat_heigh
-            upsample_feat = F.interpolate(
-                feat_heigh, scale_factor=2.0, mode="nearest"
-            )
+            upsample_feat = F.interpolate(feat_heigh, scale_factor=2.0, mode="nearest")
             inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](
                 torch.concat([upsample_feat, feat_low], dim=1)
             )
yomitoku/models/layers/rtdetrv2_decoder.py CHANGED
@@ -1,4 +1,17 @@
-"""Copyright(c) 2023 lyuwenyu. All Rights Reserved."""
+# Scene Text Recognition Model Hub
+# Copyright 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

 import copy
 import functools
yomitoku/models/parseq.py CHANGED
@@ -22,13 +22,10 @@ from huggingface_hub import PyTorchModelHubMixin
 from timm.models.helpers import named_apply
 from torch import Tensor

-from ..postprocessor import ParseqTokenizer as Tokenizer
 from .layers.parseq_transformer import Decoder, Encoder, TokenEmbedding


-def init_weights(
-    module: nn.Module, name: str = "", exclude: Sequence[str] = ()
-):
+def init_weights(module: nn.Module, name: str = "", exclude: Sequence[str] = ()):
     """Initialize the weights using the typical initialization schemes used in SOTA models."""
     if any(map(name.startswith, exclude)):
         return
@@ -41,9 +38,7 @@ def init_weights(
         if module.padding_idx is not None:
             module.weight.data[module.padding_idx].zero_()
     elif isinstance(module, nn.Conv2d):
-        nn.init.kaiming_normal_(
-            module.weight, mode="fan_out", nonlinearity="relu"
-        )
+        nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
         if module.bias is not None:
             nn.init.zeros_(module.bias)
     elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d, nn.GroupNorm)):
@@ -86,6 +81,8 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
         named_apply(partial(init_weights, exclude=["encoder"]), self)
         nn.init.trunc_normal_(self.pos_queries, std=0.02)

+        self.export_onnx = False
+
     @property
     def _device(self) -> torch.device:
         return next(self.head.parameters(recurse=False)).device
@@ -93,9 +90,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
     @torch.jit.ignore
     def no_weight_decay(self):
         param_names = {"text_embed.embedding.weight", "pos_queries"}
-        enc_param_names = {
-            "encoder." + n for n in self.encoder.no_weight_decay()
-        }
+        enc_param_names = {"encoder." + n for n in self.encoder.no_weight_decay()}
         return param_names.union(enc_param_names)

     def encode(self, img: torch.Tensor):
@@ -129,7 +124,6 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):

     def forward(
         self,
-        tokenizer: Tokenizer,
         images: Tensor,
         max_length: Optional[int] = None,
     ) -> Tensor:
@@ -149,20 +143,18 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):

         # Special case for the forward permutation. Faster than using `generate_attn_masks()`
         tgt_mask = query_mask = torch.triu(
-            torch.ones(
-                (num_steps, num_steps), dtype=torch.bool, device=self._device
-            ),
+            torch.ones((num_steps, num_steps), dtype=torch.bool, device=self._device),
             1,
         )

         if self.decode_ar:
             tgt_in = torch.full(
                 (bs, num_steps),
-                tokenizer.pad_id,
+                self.tokenizer.pad_id,
                 dtype=torch.long,
                 device=self._device,
             )
-            tgt_in[:, 0] = tokenizer.bos_id
+            tgt_in[:, 0] = self.tokenizer.bos_id

             logits = []
             for i in range(num_steps):
@@ -186,8 +178,9 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
                     tgt_in[:, j] = p_i.squeeze().argmax(-1)
                     # Efficient batch decoding: If all output words have at least one EOS token, end decoding.
                     if (
-                        testing
-                        and (tgt_in == tokenizer.eos_id).any(dim=-1).all()
+                        not self.export_onnx
+                        and testing
+                        and (tgt_in == self.tokenizer.eos_id).any(dim=-1).all()
                     ):
                         break

@@ -196,7 +189,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
             # No prior context, so input is just <bos>. We query all positions.
             tgt_in = torch.full(
                 (bs, 1),
-                tokenizer.bos_id,
+                self.tokenizer.bos_id,
                 dtype=torch.long,
                 device=self._device,
             )
@@ -211,7 +204,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
                 torch.ones(
                     num_steps,
                     num_steps,
-                    dtype=torch.bool,
+                    dtype=torch.int64,
                     device=self._device,
                 ),
                 2,
@@ -219,7 +212,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
             ] = 0
             bos = torch.full(
                 (bs, 1),
-                tokenizer.bos_id,
+                self.tokenizer.bos_id,
                 dtype=torch.long,
                 device=self._device,
             )
@@ -227,7 +220,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
             # Prior context is the previous output.
             tgt_in = torch.cat([bos, logits[:, :-1].argmax(-1)], dim=1)
             # Mask tokens beyond the first EOS token.
-            tgt_padding_mask = (tgt_in == tokenizer.eos_id).int().cumsum(
+            tgt_padding_mask = (tgt_in == self.tokenizer.eos_id).int().cumsum(
                 -1
             ) > 0
             tgt_out = self.decode(
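A hypothetical sketch of the changed PARSeq calling convention implied by this hunk: the tokenizer is no longer passed to forward() but read from self.tokenizer, which the caller is assumed to attach beforehand, and the new export_onnx flag keeps the autoregressive loop from breaking early on EOS so the graph stays traceable for ONNX export.

    import torch

    # `model` is a loaded PARSeq instance, `images` a preprocessed batch, and
    # `tokenizer` a ParseqTokenizer; how they are obtained is elided here.
    model.tokenizer = tokenizer   # attached by the caller instead of passed per call
    model.export_onnx = False     # True disables the early EOS break in the decode loop

    with torch.inference_mode():
        logits = model(images)    # the 0.4.1 signature was model(tokenizer, images)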