yomitoku 0.5.2__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
yomitoku/cli/main.py CHANGED
@@ -104,6 +104,12 @@ def main():
         default="results",
         help="output directory",
     )
+    parser.add_argument(
+        "-l",
+        "--lite",
+        action="store_true",
+        help="if set, use lite model",
+    )
     parser.add_argument(
         "-d",
         "--device",
@@ -197,6 +203,15 @@ def main():
         },
     }
 
+    if args.lite:
+        configs["ocr"]["text_recognizer"]["model_name"] = "parseq-small"
+        configs["ocr"]["text_detector"]["infer_onnx"] = True
+
+        # Note: for modules other than the text detector, PyTorch inference is faster than ONNX inference, so ONNX is not used for them.
+        # configs["ocr"]["text_recognizer"]["infer_onnx"] = True
+        # configs["layout_analyzer"]["table_structure_recognizer"]["infer_onnx"] = True
+        # configs["layout_analyzer"]["layout_parser"]["infer_onnx"] = True
+
     analyzer = DocumentAnalyzer(
         configs=configs,
         visualize=args.vis,
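The `--lite` branch above is only a config rewrite applied before `DocumentAnalyzer` is constructed, so the same setup can be reproduced from Python. A minimal sketch, assuming `DocumentAnalyzer` is importable from the package root and accepts a partial `configs` dict plus a `device` keyword; the key paths are exactly the ones the CLI sets above:

```python
from yomitoku import DocumentAnalyzer  # top-level import path assumed

# Same overrides as the CLI's --lite branch.
configs = {
    "ocr": {
        "text_recognizer": {"model_name": "parseq-small"},  # lighter PARSeq variant
        "text_detector": {"infer_onnx": True},  # ONNX inference for the detector only
    },
}

analyzer = DocumentAnalyzer(configs=configs, visualize=False, device="cpu")
```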
yomitoku/configs/__init__.py CHANGED
@@ -4,10 +4,12 @@ from .cfg_table_structure_recognizer_rtdtrv2 import (
 )
 from .cfg_text_detector_dbnet import TextDetectorDBNetConfig
 from .cfg_text_recognizer_parseq import TextRecognizerPARSeqConfig
+from .cfg_text_recognizer_parseq_small import TextRecognizerPARSeqSmallConfig
 
 __all__ = [
     "TextDetectorDBNetConfig",
     "TextRecognizerPARSeqConfig",
     "LayoutParserRTDETRv2Config",
     "TableStructureRecognizerRTDETRv2Config",
+    "TextRecognizerPARSeqSmallConfig",
 ]
yomitoku/configs/cfg_text_recognizer_parseq_small.py ADDED
@@ -0,0 +1,51 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from ..constants import ROOT_DIR
+
+
+@dataclass
+class Data:
+    num_workers: int = 4
+    batch_size: int = 128
+    img_size: List[int] = field(default_factory=lambda: [32, 800])
+
+
+@dataclass
+class Encoder:
+    patch_size: List[int] = field(default_factory=lambda: [16, 16])
+    num_heads: int = 8
+    embed_dim: int = 384
+    mlp_ratio: int = 4
+    depth: int = 9
+
+
+@dataclass
+class Decoder:
+    embed_dim: int = 384
+    num_heads: int = 8
+    mlp_ratio: int = 4
+    depth: int = 1
+
+
+@dataclass
+class Visualize:
+    font: str = str(ROOT_DIR + "/resource/MPLUS1p-Medium.ttf")
+    color: List[int] = field(default_factory=lambda: [0, 0, 255])  # RGB
+    font_size: int = 18
+
+
+@dataclass
+class TextRecognizerPARSeqSmallConfig:
+    hf_hub_repo: str = "KotaroKinoshita/yomitoku-text-recognizer-parseq-small-open-beta"
+    charset: str = str(ROOT_DIR + "/resource/charset.txt")
+    num_tokens: int = 7312
+    max_label_length: int = 100
+    decode_ar: int = 1
+    refine_iters: int = 1
+
+    data: Data = field(default_factory=Data)
+    encoder: Encoder = field(default_factory=Encoder)
+    decoder: Decoder = field(default_factory=Decoder)
+
+    visualize: Visualize = field(default_factory=Visualize)
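A quick usage sketch of the new config dataclass, assuming the module is importable as packaged; the printed values are simply the defaults defined above:

```python
from yomitoku.configs.cfg_text_recognizer_parseq_small import (
    Data,
    TextRecognizerPARSeqSmallConfig,
)

cfg = TextRecognizerPARSeqSmallConfig()
print(cfg.encoder.depth, cfg.decoder.depth)  # 9 1
print(cfg.data.img_size)                     # [32, 800]

# Each section can be overridden independently without touching the other defaults.
cfg_small_batch = TextRecognizerPARSeqSmallConfig(data=Data(batch_size=16))
print(cfg_small_batch.data.batch_size)       # 16
```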
yomitoku/layout_parser.py CHANGED
@@ -1,11 +1,16 @@
 from typing import List, Union
 
 import cv2
+import os
+import onnx
+import onnxruntime
 import torch
 import torchvision.transforms as T
 from PIL import Image
 from pydantic import conlist
 
+from .constants import ROOT_DIR
+
 from .base import BaseModelCatalog, BaseModule, BaseSchema
 from .configs import LayoutParserRTDETRv2Config
 from .models import RTDETRv2
@@ -91,6 +96,7 @@ class LayoutParser(BaseModule):
         device="cuda",
         visualize=False,
         from_pretrained=True,
+        infer_onnx=False,
     ):
         super().__init__()
         self.load_model(model_name, path_cfg, from_pretrained)
@@ -119,11 +125,44 @@ class LayoutParser(BaseModule):
         }
 
         self.role = self._cfg.role
+        self.infer_onnx = infer_onnx
+        if infer_onnx:
+            name = self._cfg.hf_hub_repo.split("/")[-1]
+            path_onnx = f"{ROOT_DIR}/onnx/{name}.onnx"
+            if not os.path.exists(path_onnx):
+                self.convert_onnx(path_onnx)
+
+            model = onnx.load(path_onnx)
+            if torch.cuda.is_available() and device == "cuda":
+                self.sess = onnxruntime.InferenceSession(
+                    model.SerializeToString(), providers=["CUDAExecutionProvider"]
+                )
+            else:
+                self.sess = onnxruntime.InferenceSession(model.SerializeToString())
+
+    def convert_onnx(self, path_onnx):
+        dynamic_axes = {
+            "input": {0: "batch_size"},
+            "output": {0: "batch_size"},
+        }
+
+        img_size = self._cfg.data.img_size
+        dummy_input = torch.randn(1, 3, *img_size, requires_grad=True)
+
+        torch.onnx.export(
+            self.model,
+            dummy_input,
+            path_onnx,
+            opset_version=16,
+            input_names=["input"],
+            output_names=["pred_logits", "pred_boxes"],
+            dynamic_axes=dynamic_axes,
+        )
 
     def preprocess(self, img):
         cv_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
         img = Image.fromarray(cv_img)
-        img_tensor = self.transforms(img)[None].to(self.device)
+        img_tensor = self.transforms(img)[None]
         return img_tensor
 
     def postprocess(self, preds, image_size):
@@ -175,8 +214,19 @@ class LayoutParser(BaseModule):
         ori_h, ori_w = img.shape[:2]
         img_tensor = self.preprocess(img)
 
-        with torch.inference_mode():
-            preds = self.model(img_tensor)
+        if self.infer_onnx:
+            input = img_tensor.numpy()
+            results = self.sess.run(None, {"input": input})
+            preds = {
+                "pred_logits": torch.tensor(results[0]).to(self.device),
+                "pred_boxes": torch.tensor(results[1]).to(self.device),
+            }
+
+        else:
+            with torch.inference_mode():
+                img_tensor = img_tensor.to(self.device)
+                preds = self.model(img_tensor)
+
         results = self.postprocess(preds, (ori_h, ori_w))
 
         vis = None
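`LayoutParser`, and below it `TableStructureRecognizer`, `TextDetector`, and `TextRecognizer`, all gain the same mechanism: export the loaded PyTorch weights to `{ROOT_DIR}/onnx/<hub-repo-name>.onnx` the first time `infer_onnx=True` is requested, then run inference through onnxruntime, preferring `CUDAExecutionProvider` when CUDA is available. A self-contained sketch of that export-once-then-reuse pattern with a toy model (the model, path, and shapes are illustrative, not yomitoku's):

```python
import os

import numpy as np
import onnxruntime
import torch
import torch.nn as nn


class TinyNet(nn.Module):
    """Stand-in for the real detector/recognizer."""

    def forward(self, x):
        return x.mean(dim=(2, 3))


path_onnx = "/tmp/tiny_net.onnx"
model = TinyNet().eval()

if not os.path.exists(path_onnx):
    # Export once with a dynamic batch axis, mirroring convert_onnx() above.
    torch.onnx.export(
        model,
        torch.randn(1, 3, 32, 32),
        path_onnx,
        opset_version=16,
        input_names=["input"],
        output_names=["output"],
        dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},
    )

providers = (
    ["CUDAExecutionProvider"] if torch.cuda.is_available() else ["CPUExecutionProvider"]
)
sess = onnxruntime.InferenceSession(path_onnx, providers=providers)

batch = np.random.randn(4, 3, 32, 32).astype(np.float32)
(out,) = sess.run(["output"], {"input": batch})
print(out.shape)  # (4, 3)
```

The modules above load the exported graph with `onnx.load` and hand `model.SerializeToString()` to the session rather than the file path; onnxruntime accepts either form.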
@@ -1,3 +1,16 @@
+# Copyright(c) 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import torch.nn as nn
 
 
@@ -1,4 +1,16 @@
-"""Copyright(c) 2023 lyuwenyu. All Rights Reserved."""
+# Copyright 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 from collections import OrderedDict
 
@@ -1,4 +1,16 @@
-"""Copyright(c) 2023 lyuwenyu. All Rights Reserved."""
+# Copyright 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import copy
 from collections import OrderedDict
@@ -1,4 +1,17 @@
-"""Copyright(c) 2023 lyuwenyu. All Rights Reserved."""
+# Scene Text Recognition Model Hub
+# Copyright 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import copy
 import functools
yomitoku/models/parseq.py CHANGED
@@ -22,7 +22,6 @@ from huggingface_hub import PyTorchModelHubMixin
 from timm.models.helpers import named_apply
 from torch import Tensor
 
-from ..postprocessor import ParseqTokenizer as Tokenizer
 from .layers.parseq_transformer import Decoder, Encoder, TokenEmbedding
 
 
@@ -123,7 +122,6 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
 
     def forward(
         self,
-        tokenizer: Tokenizer,
         images: Tensor,
         max_length: Optional[int] = None,
     ) -> Tensor:
@@ -150,11 +148,11 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
         if self.decode_ar:
             tgt_in = torch.full(
                 (bs, num_steps),
-                tokenizer.pad_id,
+                self.tokenizer.pad_id,
                 dtype=torch.long,
                 device=self._device,
             )
-            tgt_in[:, 0] = tokenizer.bos_id
+            tgt_in[:, 0] = self.tokenizer.bos_id
 
             logits = []
             for i in range(num_steps):
@@ -177,7 +175,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
                     # greedy decode. add the next token index to the target input
                     tgt_in[:, j] = p_i.squeeze().argmax(-1)
                     # Efficient batch decoding: If all output words have at least one EOS token, end decoding.
-                    if testing and (tgt_in == tokenizer.eos_id).any(dim=-1).all():
+                    if testing and (tgt_in == self.tokenizer.eos_id).any(dim=-1).all():
                         break
 
             logits = torch.cat(logits, dim=1)
@@ -185,7 +183,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
             # No prior context, so input is just <bos>. We query all positions.
             tgt_in = torch.full(
                 (bs, 1),
-                tokenizer.bos_id,
+                self.tokenizer.bos_id,
                 dtype=torch.long,
                 device=self._device,
             )
@@ -200,7 +198,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
                 torch.ones(
                     num_steps,
                     num_steps,
-                    dtype=torch.bool,
+                    dtype=torch.int64,
                     device=self._device,
                 ),
                 2,
@@ -208,7 +206,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
             ] = 0
             bos = torch.full(
                 (bs, 1),
-                tokenizer.bos_id,
+                self.tokenizer.bos_id,
                 dtype=torch.long,
                 device=self._device,
             )
@@ -216,7 +214,9 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
             # Prior context is the previous output.
             tgt_in = torch.cat([bos, logits[:, :-1].argmax(-1)], dim=1)
             # Mask tokens beyond the first EOS token.
-            tgt_padding_mask = (tgt_in == tokenizer.eos_id).int().cumsum(-1) > 0
+            tgt_padding_mask = (tgt_in == self.tokenizer.eos_id).int().cumsum(
+                -1
+            ) > 0
             tgt_out = self.decode(
                 tgt_in,
                 memory,
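Dropping the `tokenizer` argument from `forward` and reading `self.tokenizer.*` instead fits the ONNX work elsewhere in this release: `torch.onnx.export` traces `forward` with tensor inputs only, so non-tensor collaborators have to hang off the module (the `TextRecognizer` hunks below attach it with `self.model.tokenizer = self.tokenizer`). A minimal sketch of the idea, with illustrative names rather than yomitoku's actual classes:

```python
import torch
import torch.nn as nn


class DummyTokenizer:
    pad_id, bos_id, eos_id = 0, 1, 2


class Demo(nn.Module):
    def __init__(self):
        super().__init__()
        self.tokenizer = None  # attached after construction, as TextRecognizer does
        self.proj = nn.Linear(8, 8)

    def forward(self, images: torch.Tensor) -> torch.Tensor:
        logits = self.proj(images)
        # Special-token ids come from self.tokenizer, not from a forward() argument,
        # so the traced graph only ever sees tensor inputs.
        return logits + float(self.tokenizer.bos_id)


m = Demo().eval()
m.tokenizer = DummyTokenizer()
torch.onnx.export(
    m,
    torch.randn(2, 8),
    "/tmp/demo.onnx",
    input_names=["input"],
    output_names=["output"],
)
```

The `dtype=torch.bool` to `torch.int64` switch in the attention-mask construction above looks like it serves the same export path, though the diff itself does not say so.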
yomitoku/onnx/.gitkeep ADDED
File without changes
@@ -1,4 +1,17 @@
-"""Copyright(c) 2023 lyuwenyu. All Rights Reserved."""
+# Copyright 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 
 import torch
 import torch.nn as nn
yomitoku/table_structure_recognizer.py CHANGED
@@ -1,11 +1,16 @@
 from typing import List, Union
 
 import cv2
+import os
+import onnx
+import onnxruntime
 import torch
 import torchvision.transforms as T
 from PIL import Image
 from pydantic import conlist
 
+from .constants import ROOT_DIR
+
 from .base import BaseModelCatalog, BaseModule, BaseSchema
 from .configs import TableStructureRecognizerRTDETRv2Config
 from .layout_parser import filter_contained_rectangles_within_category
@@ -109,12 +114,13 @@ class TableStructureRecognizer(BaseModule):
         device="cuda",
         visualize=False,
         from_pretrained=True,
+        infer_onnx=False,
     ):
         super().__init__()
         self.load_model(
             model_name,
             path_cfg,
-            from_pretrained=True,
+            from_pretrained=from_pretrained,
         )
         self.device = device
         self.visualize = visualize
@@ -127,6 +133,8 @@ class TableStructureRecognizer(BaseModule):
             num_top_queries=self._cfg.RTDETRTransformerv2.num_queries,
         )
 
+        self.save_config("table_structure_recognitizer.yaml")
+
         self.transforms = T.Compose(
             [
                 T.Resize(self._cfg.data.img_size),
@@ -140,6 +148,40 @@ class TableStructureRecognizer(BaseModule):
             id: category for id, category in enumerate(self._cfg.category)
         }
 
+        self.infer_onnx = infer_onnx
+        if infer_onnx:
+            name = self._cfg.hf_hub_repo.split("/")[-1]
+            path_onnx = f"{ROOT_DIR}/onnx/{name}.onnx"
+            if not os.path.exists(path_onnx):
+                self.convert_onnx(path_onnx)
+
+            model = onnx.load(path_onnx)
+            if torch.cuda.is_available() and device == "cuda":
+                self.sess = onnxruntime.InferenceSession(
+                    model.SerializeToString(), providers=["CUDAExecutionProvider"]
+                )
+            else:
+                self.sess = onnxruntime.InferenceSession(model.SerializeToString())
+
+    def convert_onnx(self, path_onnx):
+        dynamic_axes = {
+            "input": {0: "batch_size"},
+            "output": {0: "batch_size"},
+        }
+
+        img_size = self._cfg.data.img_size
+        dummy_input = torch.randn(1, 3, *img_size, requires_grad=True)
+
+        torch.onnx.export(
+            self.model,
+            dummy_input,
+            path_onnx,
+            opset_version=16,
+            input_names=["input"],
+            output_names=["pred_logits", "pred_boxes"],
+            dynamic_axes=dynamic_axes,
+        )
+
     def preprocess(self, img, boxes):
         cv_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
 
@@ -149,7 +191,7 @@ class TableStructureRecognizer(BaseModule):
             table_img = cv_img[y1:y2, x1:x2, :]
             th, hw = table_img.shape[:2]
             table_img = Image.fromarray(table_img)
-            img_tensor = self.transforms(table_img)[None].to(self.device)
+            img_tensor = self.transforms(table_img)[None]
             table_imgs.append(
                 {
                     "tensor": img_tensor,
@@ -226,8 +268,19 @@ class TableStructureRecognizer(BaseModule):
         img_tensors = self.preprocess(img, table_boxes)
         outputs = []
         for data in img_tensors:
-            with torch.inference_mode():
-                pred = self.model(data["tensor"])
+            if self.infer_onnx:
+                input = data["tensor"].numpy()
+                results = self.sess.run(None, {"input": input})
+                pred = {
+                    "pred_logits": torch.tensor(results[0]).to(self.device),
+                    "pred_boxes": torch.tensor(results[1]).to(self.device),
+                }
+
+            else:
+                with torch.inference_mode():
+                    data["tensor"] = data["tensor"].to(self.device)
+                    pred = self.model(data["tensor"])
+
             table = self.postprocess(pred, data)
             outputs.append(table)
 
yomitoku/text_detector.py CHANGED
@@ -2,6 +2,7 @@ from typing import List
 
 import numpy as np
 import torch
+import os
 from pydantic import conlist
 
 from .base import BaseModelCatalog, BaseModule, BaseSchema
@@ -14,6 +15,10 @@ from .data.functions import (
 from .models import DBNet
 from .postprocessor import DBnetPostProcessor
 from .utils.visualizer import det_visualizer
+from .constants import ROOT_DIR
+
+import onnx
+import onnxruntime
 
 
 class TextDetectorModelCatalog(BaseModelCatalog):
@@ -43,12 +48,13 @@ class TextDetector(BaseModule):
         device="cuda",
         visualize=False,
         from_pretrained=True,
+        infer_onnx=False,
     ):
         super().__init__()
         self.load_model(
             model_name,
             path_cfg,
-            from_pretrained=True,
+            from_pretrained=from_pretrained,
         )
 
         self.device = device
@@ -58,6 +64,39 @@ class TextDetector(BaseModule):
         self.model.to(self.device)
 
         self.post_processor = DBnetPostProcessor(**self._cfg.post_process)
+        self.infer_onnx = infer_onnx
+
+        if infer_onnx:
+            name = self._cfg.hf_hub_repo.split("/")[-1]
+            path_onnx = f"{ROOT_DIR}/onnx/{name}.onnx"
+            if not os.path.exists(path_onnx):
+                self.convert_onnx(path_onnx)
+
+            model = onnx.load(path_onnx)
+            if torch.cuda.is_available() and device == "cuda":
+                self.sess = onnxruntime.InferenceSession(
+                    model.SerializeToString(), providers=["CUDAExecutionProvider"]
+                )
+            else:
+                self.sess = onnxruntime.InferenceSession(model.SerializeToString())
+
+    def convert_onnx(self, path_onnx):
+        dynamic_axes = {
+            "input": {0: "batch_size", 2: "height", 3: "width"},
+            "output": {0: "batch_size", 2: "height", 3: "width"},
+        }
+
+        dummy_input = torch.randn(1, 3, 256, 256, requires_grad=True)
+
+        torch.onnx.export(
+            self.model,
+            dummy_input,
+            path_onnx,
+            opset_version=14,
+            input_names=["input"],
+            output_names=["output"],
+            dynamic_axes=dynamic_axes,
+        )
 
     def preprocess(self, img):
         img = img.copy()
@@ -81,9 +120,15 @@ class TextDetector(BaseModule):
 
         ori_h, ori_w = img.shape[:2]
         tensor = self.preprocess(img)
-        tensor = tensor.to(self.device)
-        with torch.inference_mode():
-            preds = self.model(tensor)
+
+        if self.infer_onnx:
+            input = tensor.numpy()
+            results = self.sess.run(["output"], {"input": input})
+            preds = {"binary": torch.tensor(results[0])}
+        else:
+            with torch.inference_mode():
+                tensor = tensor.to(self.device)
+                preds = self.model(tensor)
 
         quads, scores = self.postprocess(preds, (ori_h, ori_w))
         outputs = {"points": quads, "scores": scores}
yomitoku/text_recognizer.py CHANGED
@@ -2,22 +2,28 @@ from typing import List
 
 import numpy as np
 import torch
+import os
 import unicodedata
 from pydantic import conlist
 
 from .base import BaseModelCatalog, BaseModule, BaseSchema
-from .configs import TextRecognizerPARSeqConfig
+from .configs import TextRecognizerPARSeqConfig, TextRecognizerPARSeqSmallConfig
 from .data.dataset import ParseqDataset
 from .models import PARSeq
 from .postprocessor import ParseqTokenizer as Tokenizer
 from .utils.misc import load_charset
 from .utils.visualizer import rec_visualizer
 
+from .constants import ROOT_DIR
+import onnx
+import onnxruntime
+
 
 class TextRecognizerModelCatalog(BaseModelCatalog):
     def __init__(self):
         super().__init__()
         self.register("parseq", TextRecognizerPARSeqConfig, PARSeq)
+        self.register("parseq-small", TextRecognizerPARSeqSmallConfig, PARSeq)
 
 
 class TextRecognizerSchema(BaseSchema):
@@ -43,23 +49,41 @@ class TextRecognizer(BaseModule):
         device="cuda",
         visualize=False,
         from_pretrained=True,
+        infer_onnx=False,
     ):
         super().__init__()
         self.load_model(
             model_name,
             path_cfg,
-            from_pretrained=True,
+            from_pretrained=from_pretrained,
        )
         self.charset = load_charset(self._cfg.charset)
         self.tokenizer = Tokenizer(self.charset)
 
         self.device = device
 
+        self.model.tokenizer = self.tokenizer
         self.model.eval()
         self.model.to(self.device)
 
         self.visualize = visualize
 
+        self.infer_onnx = infer_onnx
+
+        if infer_onnx:
+            name = self._cfg.hf_hub_repo.split("/")[-1]
+            path_onnx = f"{ROOT_DIR}/onnx/{name}.onnx"
+            if not os.path.exists(path_onnx):
+                self.convert_onnx(path_onnx)
+
+            model = onnx.load(path_onnx)
+            if torch.cuda.is_available() and device == "cuda":
+                self.sess = onnxruntime.InferenceSession(
+                    model.SerializeToString(), providers=["CUDAExecutionProvider"]
+                )
+            else:
+                self.sess = onnxruntime.InferenceSession(model.SerializeToString())
+
     def preprocess(self, img, polygons):
         dataset = ParseqDataset(self._cfg, img, polygons)
         dataloader = torch.utils.data.DataLoader(
@@ -71,6 +95,25 @@ class TextRecognizer(BaseModule):
 
         return dataloader
 
+    def convert_onnx(self, path_onnx):
+        img_size = self._cfg.data.img_size
+        input = torch.randn(1, 3, *img_size, requires_grad=True)
+        dynamic_axes = {
+            "input": {0: "batch_size"},
+            "output": {0: "batch_size"},
+        }
+
+        torch.onnx.export(
+            self.model,
+            input,
+            path_onnx,
+            opset_version=14,
+            input_names=["input"],
+            output_names=["output"],
+            do_constant_folding=True,
+            dynamic_axes=dynamic_axes,
+        )
+
     def postprocess(self, p, points):
         pred, score = self.tokenizer.decode(p)
         pred = [unicodedata.normalize("NFKC", x) for x in pred]
@@ -101,13 +144,19 @@ class TextRecognizer(BaseModule):
         scores = []
         directions = []
         for data in dataloader:
-            data = data.to(self.device)
-            with torch.inference_mode():
-                p = self.model(self.tokenizer, data).softmax(-1)
-            pred, score, direction = self.postprocess(p, points)
-            preds.extend(pred)
-            scores.extend(score)
-            directions.extend(direction)
+            if self.infer_onnx:
+                input = data.numpy()
+                results = self.sess.run(["output"], {"input": input})
+                p = torch.tensor(results[0])
+            else:
+                with torch.inference_mode():
+                    data = data.to(self.device)
+                    p = self.model(data).softmax(-1)
+
+            pred, score, direction = self.postprocess(p, points)
+            preds.extend(pred)
+            scores.extend(score)
+            directions.extend(direction)
 
         outputs = {
             "contents": preds,
@@ -1,14 +1,17 @@
 Metadata-Version: 2.3
 Name: yomitoku
-Version: 0.5.2
+Version: 0.6.0
 Summary: Yomitoku is an AI-powered document image analysis package designed specifically for the Japanese language.
 Author-email: Kotaro Kinoshita <kotaro.kinoshita@mlism.com>
 License: CC BY-NC-SA 4.0
 Keywords: Deep Learning,Japanese,OCR
-Requires-Python: <3.13,>=3.9
+Requires-Python: <3.13,>=3.10
 Requires-Dist: huggingface-hub>=0.26.1
 Requires-Dist: lxml>=5.3.0
 Requires-Dist: omegaconf>=2.3.0
+Requires-Dist: onnx>=1.17.0
+Requires-Dist: onnxruntime-gpu>=1.20.1
+Requires-Dist: onnxruntime>=1.20.1
 Requires-Dist: opencv-python>=4.10.0.84
 Requires-Dist: pyclipper>=1.3.0.post6
 Requires-Dist: pydantic>=2.9.2
@@ -23,7 +26,7 @@ Description-Content-Type: text/markdown
 
 <img src="static/logo/horizontal.png" width="800px">
 
-![Python](https://img.shields.io/badge/Python-3.9|3.10|3.11|3.12-F9DC3E.svg?logo=python&logoColor=&style=flat)
+![Python](https://img.shields.io/badge/Python-3.10|3.11|3.12-F9DC3E.svg?logo=python&logoColor=&style=flat)
 ![Pytorch](https://img.shields.io/badge/Pytorch-2.5-EE4C2C.svg?logo=Pytorch&style=fla)
 ![CUDA](https://img.shields.io/badge/CUDA->=11.8-76B900.svg?logo=NVIDIA&style=fla)
 ![OS](https://img.shields.io/badge/OS-Linux|Mac|Win-1793D1.svg?&style=fla)
@@ -69,19 +72,20 @@ Markdown でエクスポートした結果は関してはリポジトリ内の[s
 pip install yomitoku
 ```
 
-- pytorch はご自身の CUDAのバージョンにあったものをインストールしてください。デフォルトではCUDA12.4以上に対応したものがインストールされます。
-- pytorch は2.5以上のバージョンに対応しています。その関係でCUDA11.8以上のバージョンが必要になります。対応できない場合は、リポジトリ内のDockerfileを利用してください。
+- pytorch はご自身の CUDA のバージョンにあったものをインストールしてください。デフォルトでは CUDA12.4 以上に対応したものがインストールされます。
+- pytorch は 2.5 以上のバージョンに対応しています。その関係で CUDA11.8 以上のバージョンが必要になります。対応できない場合は、リポジトリ内の Dockerfile を利用してください。
 
 ## 🚀 実行方法
 
 ```
-yomitoku ${path_data} -f md -o results -v --figure
+yomitoku ${path_data} -f md -o results -v --figure --lite
 ```
 
 - `${path_data}` 解析対象の画像が含まれたディレクトリか画像ファイルのパスを直接して指定してください。ディレクトリを対象とした場合はディレクトリのサブディレクトリ内の画像も含めて処理を実行します。
 - `-f`, `--format` 出力形式のファイルフォーマットを指定します。(json, csv, html, md をサポート)
 - `-o`, `--outdir` 出力先のディレクトリ名を指定します。存在しない場合は新規で作成されます。
 - `-v`, `--vis` を指定すると解析結果を可視化した画像を出力します。
+- `-l`, `--lite` runs inference with the lightweight models. It is faster than the default models, but accuracy may drop slightly.
 - `-d`, `--device` モデルを実行するためのデバイスを指定します。gpu が利用できない場合は cpu で推論が実行されます。(デフォルト: cuda)
 - `--ignore_line_break` 画像の改行位置を無視して、段落内の文章を連結して返します。(デフォルト:画像通りの改行位置位置で改行します。)
 - `--figure_letter` 検出した図表に含まれる文字も出力ファイルにエクスポートします。
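For example, running the lightweight models on CPU (the input file name is illustrative):

```
yomitoku image.jpg -f md -o results --lite -d cpu
```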
@@ -94,6 +98,7 @@ yomitoku --help
 ```
 
 **NOTE**
+
 - GPU での実行を推奨します。CPU を用いての推論向けに最適化されておらず、処理時間が長くなります。
 - 活字のみ識別をサポートしております。手書き文字に関しては、読み取れる場合もありますが、公式にはサポートしておりません。
 - Yomitoku は文書 OCR 向けに最適化されており、情景 OCR(看板など紙以外にプリントされた文字の読み取り)向けには最適化されていません。
@@ -107,6 +112,6 @@
 
 本リポジトリ内に格納されているソースコードおよび本プロジェクトに関連する HuggingFaceHub 上のモデルの重みファイルのライセンスは CC BY-NC-SA 4.0 に従います。
 非商用での個人利用、研究目的での利用はご自由にお使いください。
-商用目的での利用に関しては、別途、商用ライセンスを提供しますので、開発者にお問い合わせください。
+商用目的での利用に関しては、別途、商用ライセンスを提供しますので、https://www.mlism.com/ にお問い合わせください。
 
 YomiToku © 2024 by Kotaro Kinoshita is licensed under CC BY-NC-SA 4.0. To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/
@@ -3,19 +3,20 @@ yomitoku/base.py,sha256=lzR_V8t87aRasmFdFwD-8KAeSahSTI3AZaEn6g8sOv8,3871
 yomitoku/constants.py,sha256=zlW5QRc_u_F3C2RAgBFWyHJZexBnJT5N15GC-9d3iLo,686
 yomitoku/document_analyzer.py,sha256=HIg-nVzDhJIP-h-tn4uU86KakgHdlAhosEqK_i-SWe4,9906
 yomitoku/layout_analyzer.py,sha256=QTeRcVd8aySz8u6dg2ikET77ar3sqlukRLBwYfTyMPM,2033
-yomitoku/layout_parser.py,sha256=V2jCNHE61jNp8ytYdKwPV34V5qEK7y-7-Mq7-AkoQhU,5898
+yomitoku/layout_parser.py,sha256=Yni1C_7j4fzHcdmBNNGRZPc23W_6J6HwPPQVjYvaztM,7539
 yomitoku/ocr.py,sha256=Rcojw0aGA6yDF2RjqfK23_rMw-xm61KGd8JmTCTOOVU,2516
 yomitoku/reading_order.py,sha256=OfhOS9ttPDoPSuHrIRKyOzG19GGeRufbuSKDqhsohh4,6404
-yomitoku/table_structure_recognizer.py,sha256=CouRzfdO_toZKUQbzQqocKdMcgA3Pr7glkZuqD5itpg,7280
-yomitoku/text_detector.py,sha256=okp0xuq4lXgEDcfgCzeJcrj8hfSI4NvAgorsNwi_NYI,2682
-yomitoku/text_recognizer.py,sha256=RHdq1M3-e3C1RECgbaoqPngtxicG3izAma12juD2ICQ,3789
+yomitoku/table_structure_recognizer.py,sha256=Wf_Ehmf6V27iVLmw2o9i7kJnbwEOhuExI-ljIO3a8NE,9043
+yomitoku/text_detector.py,sha256=fbwKelsVfwCt5YL4h-WEf4qkniv5cXmyaLR6oSYz0eA,4167
+yomitoku/text_recognizer.py,sha256=Iu-IzwaziNjmrTeSw9aoN9BDTHkNOzsZhViCv45yiN8,5422
 yomitoku/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-yomitoku/cli/main.py,sha256=MBD0S4sXgquJ8P2egkZjJcglXvCke5Uw46C28SDtr8g,6252
-yomitoku/configs/__init__.py,sha256=KBhb9S7xt22HZaIcoWSgZHfscXXj9YlimOwLH5z9CRo,454
+yomitoku/cli/main.py,sha256=qDB_YNK7abstIr9tYLiJjNU3xLSCd5x1UNDKqwUi2Rk,6885
+yomitoku/configs/__init__.py,sha256=e1Alss5QJLZSNfD6zLEG6xu5vDQDw-4Jayiqq8bq52s,571
 yomitoku/configs/cfg_layout_parser_rtdtrv2.py,sha256=8PRxB2Ar9UF7-DLtbgSokhrzdXb0veWI6Wc-X8qigRw,2329
 yomitoku/configs/cfg_table_structure_recognizer_rtdtrv2.py,sha256=o70GMHD8k-zeBeJtuhPS8x7vVB-ffucnJXeSyn-0AXo,2116
 yomitoku/configs/cfg_text_detector_dbnet.py,sha256=U9k48PON7haoOaytiELhbZRpv9RMiUm6nnfHmdxIa9Q,1153
 yomitoku/configs/cfg_text_recognizer_parseq.py,sha256=hpFs3nKqh4XdU3BZMTultegtLEGahEsCaZdjfKC_MO8,1247
+yomitoku/configs/cfg_text_recognizer_parseq_small.py,sha256=uCm_VC_G79IbZpOiK8fgYzAJ4b98H5pf328wyQomtfo,1259
 yomitoku/data/__init__.py,sha256=KAofFc9rk9ZdTKBjemu9RM8Vj9XnKbWC2MPZ2RWtOdE,82
 yomitoku/data/dataset.py,sha256=-I4f-FDtgsPnJ2MnXB7FtwihMW3koDaSI1OEoqKneIg,1014
 yomitoku/data/functions.py,sha256=eOyxo8S6EoAf1xGSPLWQFb9-t5Rg52NggD9MFIrOSpY,7506
@@ -26,19 +27,20 @@ yomitoku/export/export_json.py,sha256=1ChvCAHfCmMQvCfcAb1p3fSpr4elNAs3xBSIbpfn3b
 yomitoku/export/export_markdown.py,sha256=mCcsXUWBLrYc1NcRSBFfBT28d6eCddAF1oHp0qdBEnE,3986
 yomitoku/models/__init__.py,sha256=Enxq9sjJWusZuxecTori8IQa8NEYKaiiptDluHX1avg,144
 yomitoku/models/dbnet_plus.py,sha256=jeWJZm0ihbxoJeAXBFK7uVIwoosx2IUNk7Ut5wRH0vA,7998
-yomitoku/models/parseq.py,sha256=7QT-q5_oWqXTDXobRk1R6Lpap_AxdC4AzkSsOgXjOwM,8611
+yomitoku/models/parseq.py,sha256=-DQMQuON2jwtb4Ib2V0O19un9w-WG4rXS0SiscydrXU,8593
 yomitoku/models/rtdetr.py,sha256=oJsr8RHz3frslhLfXdVJve47lUsrmqLjfdTrZ41tlQ0,687
 yomitoku/models/layers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-yomitoku/models/layers/activate.py,sha256=HUw0q-76RNjZF-o9O3fowfJcw0t1H5o0pbyioGdqUvU,668
+yomitoku/models/layers/activate.py,sha256=S54GPssZBMloM2oFAXeDVMmBBZOWyjwU98Niq758txE,1244
 yomitoku/models/layers/dbnet_feature_attention.py,sha256=Vpp_PiLVuI7Zs30TTg4RNRn16KTb81ewonADpUHd4aE,6060
 yomitoku/models/layers/parseq_transformer.py,sha256=33eroJf8rmgIptP-NpZLJMhG7XOTwV4rXsq674VrKnU,6704
-yomitoku/models/layers/rtdetr_backbone.py,sha256=QjfLW-3qn2My3Jbg6yLORX8A-D2sph9J9u3r5nNnDLo,9386
-yomitoku/models/layers/rtdetr_hybrid_encoder.py,sha256=D3dK37k7_0jPqV39-6Se8kBzF_SyZttNlbLleyNFiJU,13607
-yomitoku/models/layers/rtdetrv2_decoder.py,sha256=5bVYPLFYCy3PcjyHTPFHNLWqg3bctrk-dKVG4kayhaw,27517
+yomitoku/models/layers/rtdetr_backbone.py,sha256=VOWFW7XFfJl4cvPaupqqP4-I-YHdwlVltQEgliD69As,9904
+yomitoku/models/layers/rtdetr_hybrid_encoder.py,sha256=ZnpEzJLzHgu_hrx7YK6myXZ4F1CDHRM501RbAPQdzdQ,14125
+yomitoku/models/layers/rtdetrv2_decoder.py,sha256=ggUwTdWpBfyYHnZuLx8vyH8n0XfZkQFtxgpY-1YI2sI,28070
+yomitoku/onnx/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 yomitoku/postprocessor/__init__.py,sha256=W4vUuqBaFtH5dlSBIYgyaCroGLMjpV6RrNGIBQ8NFVw,243
 yomitoku/postprocessor/dbnet_postporcessor.py,sha256=o_y8b5REd2dFEdIpRcr6o-XBfOCHo9rBYGwokP_uhTc,4948
 yomitoku/postprocessor/parseq_tokenizer.py,sha256=e89_g_bc4Au3SchuxoJfJNATJTxFmVYetzXyAzPWm28,4315
-yomitoku/postprocessor/rtdetr_postprocessor.py,sha256=f52wfRKrxqSXy_LeidKDR9XAta_qPjto-oYEdO0XL8A,3386
+yomitoku/postprocessor/rtdetr_postprocessor.py,sha256=TCv1t1zCxg2rSirsLm4sXlaltGubH-roVdEqnUoRs-8,3905
 yomitoku/resource/MPLUS1p-Medium.ttf,sha256=KLL1KkCumIBkgQtx1n4SffdaFuCNffThktEAbkB1OU8,1758908
 yomitoku/resource/charset.txt,sha256=sU91kSi-9Wk4733bCXy4j_UDmvcsj96sHOq1ppUJlOY,21672
 yomitoku/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -46,7 +48,7 @@ yomitoku/utils/graph.py,sha256=LKNB8ZhSQwOZMfeAimPMF5UCVVr2ZaUWoGDkz8z-uGU,456
 yomitoku/utils/logger.py,sha256=uOmtQDr0A0JD7wyFshedL08BiNrQorHnpktRXba8bjU,424
 yomitoku/utils/misc.py,sha256=2Eyy7-9K_h4Mal1zGXq6OlxubfNzhS0mEYwn_xt7xl8,2497
 yomitoku/utils/visualizer.py,sha256=2pSmbhUPylzVVJ0bXtGDoNmMdArAByab4Py7Xavvs_A,5230
-yomitoku-0.5.2.dist-info/METADATA,sha256=qG0aq8sJb6iD-i0WvZL__YclRytpBdzyPzu6HNqtgIM,7819
-yomitoku-0.5.2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-yomitoku-0.5.2.dist-info/entry_points.txt,sha256=nFV3S11zgBNW0Qq_D0XQNg2R4lNXU_9XUFr6rdJoyF8,52
-yomitoku-0.5.2.dist-info/RECORD,,
+yomitoku-0.6.0.dist-info/METADATA,sha256=XDmMBtDx9MjXPuzcARwOwJXRN7PMCsQDwc38jDSwX5g,8134
+yomitoku-0.6.0.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
+yomitoku-0.6.0.dist-info/entry_points.txt,sha256=nFV3S11zgBNW0Qq_D0XQNg2R4lNXU_9XUFr6rdJoyF8,52
+yomitoku-0.6.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: hatchling 1.25.0
+Generator: hatchling 1.26.3
 Root-Is-Purelib: true
 Tag: py3-none-any