yomitoku 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yomitoku/configs/cfg_text_detector_dbnet.py +1 -1
- yomitoku/data/functions.py +13 -6
- yomitoku/document_analyzer.py +4 -10
- yomitoku/layout_analyzer.py +2 -5
- yomitoku/models/dbnet_plus.py +13 -39
- yomitoku/models/layers/rtdetr_backbone.py +6 -17
- yomitoku/models/layers/rtdetr_hybrid_encoder.py +7 -20
- yomitoku/models/parseq.py +6 -19
- yomitoku/postprocessor/dbnet_postporcessor.py +15 -14
- yomitoku/postprocessor/parseq_tokenizer.py +1 -3
- yomitoku/utils/misc.py +1 -1
- yomitoku/utils/visualizer.py +10 -3
- {yomitoku-0.4.1.dist-info → yomitoku-0.5.1.dist-info}/METADATA +21 -36
- {yomitoku-0.4.1.dist-info → yomitoku-0.5.1.dist-info}/RECORD +16 -16
- {yomitoku-0.4.1.dist-info → yomitoku-0.5.1.dist-info}/WHEEL +0 -0
- {yomitoku-0.4.1.dist-info → yomitoku-0.5.1.dist-info}/entry_points.txt +0 -0
yomitoku/data/functions.py
CHANGED
@@ -3,7 +3,7 @@ from pathlib import Path
 import cv2
 import numpy as np
 import torch
-from pdf2image import convert_from_path
+import pypdfium2
 
 from ..constants import (
     MIN_IMAGE_SIZE,
@@ -70,6 +70,7 @@ def load_pdf(pdf_path: str, dpi=200) -> list[np.ndarray]:
     Returns:
         list[np.ndarray]: list of image data(BGR)
     """
+
     pdf_path = Path(pdf_path)
     if not pdf_path.exists():
         raise FileNotFoundError(f"File not found: {pdf_path}")
@@ -86,11 +87,19 @@ def load_pdf(pdf_path: str, dpi=200) -> list[np.ndarray]:
         )
 
     try:
-        images = convert_from_path(pdf_path, dpi=dpi)
+        doc = pypdfium2.PdfDocument(pdf_path)
+        renderer = doc.render(
+            pypdfium2.PdfBitmap.to_pil,
+            scale=dpi / 72,
+        )
+        images = list(renderer)
+        images = [np.array(image.convert("RGB"))[:, :, ::-1] for image in images]
+
+        doc.close()
     except Exception as e:
         raise ValueError(f"Failed to open the PDF file: {pdf_path}") from e
 
-    return
+    return images
 
 
 def resize_shortest_edge(
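As context for the hunk above: 0.5.1 swaps the poppler-backed pdf2image dependency for pypdfium2, which needs no system package. A minimal standalone sketch of the same loading path, assuming pypdfium2>=4.30 as pinned in METADATA below (`sample.pdf` is a hypothetical input file):

```python
import numpy as np
import pypdfium2


def load_pdf_bgr(pdf_path: str, dpi: int = 200) -> list[np.ndarray]:
    # Render each page at the requested DPI (PDF user space is 72 units per inch).
    doc = pypdfium2.PdfDocument(pdf_path)
    renderer = doc.render(pypdfium2.PdfBitmap.to_pil, scale=dpi / 72)
    # PIL yields RGB; reversing the channel axis gives the OpenCV-style BGR
    # layout the rest of the pipeline expects.
    images = [np.array(img.convert("RGB"))[:, :, ::-1] for img in renderer]
    doc.close()
    return images


pages = load_pdf_bgr("sample.pdf")
print(len(pages), pages[0].shape)  # e.g. number of pages, (H, W, 3)
```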
@@ -193,9 +202,7 @@ def validate_quads(img: np.ndarray, quads: list[list[list[int]]]):
         h, w = img.shape[:2]
 
         if x1 < 0 or x2 > w or y1 < 0 or y2 > h:
-            raise ValueError(
-                f"The vertices are out of the image. {quad.tolist()}"
-            )
+            raise ValueError(f"The vertices are out of the image. {quad.tolist()}")
 
     return True
yomitoku/document_analyzer.py
CHANGED
@@ -109,8 +109,8 @@ def extract_words_within_element(pred_words, element):
     if len(contained_words) == 0:
         return None, None, check_list
 
-    mean_width = word_sum_width / len(contained_words)
-    mean_height = word_sum_height / len(contained_words)
+    # mean_width = word_sum_width / len(contained_words)
+    # mean_height = word_sum_height / len(contained_words)
 
     word_direction = [word.direction for word in contained_words]
     cnt_horizontal = word_direction.count("horizontal")
@@ -120,18 +120,12 @@ def extract_words_within_element(pred_words, element):
     if element_direction == "horizontal":
         contained_words = sorted(
             contained_words,
-            key=lambda x: (
-                x.points[0][1] // int(mean_height),
-                x.points[0][0],
-            ),
+            key=lambda x: (sum([p[1] for p in x.points]) / 4),
         )
     else:
         contained_words = sorted(
             contained_words,
-            key=lambda x: (
-                x.points[1][0] // int(mean_width),
-                x.points[1][1],
-            ),
+            key=lambda x: (sum([p[0] for p in x.points]) / 4),
             reverse=True,
         )
 
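The replaced keys bucketed words by mean word size before ordering; the new keys simply order by the centroid coordinate of each quad: mean y for horizontal text (top to bottom) and mean x, reversed, for vertical text (right to left, the reading order of Japanese columns). A toy illustration with hypothetical quads:

```python
# Hypothetical word quads: four (x, y) corners per word, as in word.points.
words = [
    {"text": "line2", "points": [[0, 30], [40, 30], [40, 42], [0, 42]]},
    {"text": "line1", "points": [[0, 0], [40, 0], [40, 12], [0, 12]]},
]

# Horizontal element: top-to-bottom by the mean y of the four corners.
horizontal = sorted(words, key=lambda w: sum(p[1] for p in w["points"]) / 4)

# Vertical element: right-to-left by the mean x, hence reverse=True.
vertical = sorted(words, key=lambda w: sum(p[0] for p in w["points"]) / 4, reverse=True)

print([w["text"] for w in horizontal])  # ['line1', 'line2']
```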
yomitoku/layout_analyzer.py
CHANGED
@@ -27,8 +27,7 @@ class LayoutAnalyzer:
 
         if isinstance(configs, dict):
             assert (
-                "layout_parser" in configs
-                or "table_structure_recognizer" in configs
+                "layout_parser" in configs or "table_structure_recognizer" in configs
             ), "Invalid config key. Please check the config keys."
 
             if "layout_parser" in configs:
@@ -53,9 +52,7 @@ class LayoutAnalyzer:
     def __call__(self, img):
         layout_results, vis = self.layout_parser(img)
         table_boxes = [table.box for table in layout_results.tables]
-        table_results, vis = self.table_structure_recognizer(
-            img, table_boxes, vis=vis
-        )
+        table_results, vis = self.table_structure_recognizer(img, table_boxes, vis=vis)
 
         results = LayoutAnalyzerSchema(
             paragraphs=layout_results.paragraphs,
yomitoku/models/dbnet_plus.py
CHANGED
@@ -20,9 +20,7 @@ class BackboneBase(nn.Module):
             "layer4": "layer4",
         }
 
-        self.body = IntermediateLayerGetter(
-            backbone, return_layers=return_layers
-        )
+        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
 
     def forward(self, tensor):
         xs = self.body(tensor)
@@ -57,18 +55,10 @@ class DBNetDecoder(nn.Module):
         self.training = True
         self.input_proj = nn.ModuleDict(
             {
-                "layer1": nn.Conv2d(
-                    in_channels[0], self.d_model, 1, bias=False
-                ),
-                "layer2": nn.Conv2d(
-                    in_channels[1], self.d_model, 1, bias=False
-                ),
-                "layer3": nn.Conv2d(
-                    in_channels[2], self.d_model, 1, bias=False
-                ),
-                "layer4": nn.Conv2d(
-                    in_channels[3], self.d_model, 1, bias=False
-                ),
+                "layer1": nn.Conv2d(in_channels[0], self.d_model, 1, bias=False),
+                "layer2": nn.Conv2d(in_channels[1], self.d_model, 1, bias=False),
+                "layer3": nn.Conv2d(in_channels[2], self.d_model, 1, bias=False),
+                "layer4": nn.Conv2d(in_channels[3], self.d_model, 1, bias=False),
             }
         )
 
@@ -89,9 +79,7 @@ class DBNetDecoder(nn.Module):
                     padding=1,
                     bias=False,
                 ),
-                nn.Upsample(
-                    scale_factor=2, mode="bilinear", align_corners=False
-                ),
+                nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False),
             ),
             "layer3": nn.Sequential(
                 nn.Conv2d(
@@ -101,9 +89,7 @@ class DBNetDecoder(nn.Module):
                     padding=1,
                     bias=False,
                 ),
-                nn.Upsample(
-                    scale_factor=4, mode="bilinear", align_corners=False
-                ),
+                nn.Upsample(scale_factor=4, mode="bilinear", align_corners=False),
             ),
             "layer4": nn.Sequential(
                 nn.Conv2d(
@@ -113,17 +99,13 @@ class DBNetDecoder(nn.Module):
                     padding=1,
                     bias=False,
                 ),
-                nn.Upsample(
-                    scale_factor=4, mode="bilinear", align_corners=False
-                ),
+                nn.Upsample(scale_factor=4, mode="bilinear", align_corners=False),
             ),
             }
         )
 
         self.binarize = nn.Sequential(
-            nn.Conv2d(
-                self.d_model, self.d_model // 4, 3, padding=1, bias=False
-            ),
+            nn.Conv2d(self.d_model, self.d_model // 4, 3, padding=1, bias=False),
             nn.BatchNorm2d(self.d_model // 4),
             nn.ReLU(inplace=True),
             nn.ConvTranspose2d(self.d_model // 4, self.d_model // 4, 2, 2),
@@ -166,16 +148,12 @@ class DBNetDecoder(nn.Module):
                 m.weight.data.fill_(1.0)
                 m.bias.data.fill_(1e-4)
 
-    def _init_thresh(
-        self, inner_channels, serial=False, smooth=False, bias=False
-    ):
+    def _init_thresh(self, inner_channels, serial=False, smooth=False, bias=False):
         in_channels = inner_channels
         if serial:
             in_channels += 1
         self.thresh = nn.Sequential(
-            nn.Conv2d(
-                in_channels, inner_channels // 4, 3, padding=1, bias=bias
-            ),
+            nn.Conv2d(in_channels, inner_channels // 4, 3, padding=1, bias=bias),
             nn.BatchNorm2d(inner_channels // 4),
             nn.ReLU(inplace=True),
             self._init_upsample(
@@ -186,16 +164,12 @@ class DBNetDecoder(nn.Module):
             ),
             nn.BatchNorm2d(inner_channels // 4),
             nn.ReLU(inplace=True),
-            self._init_upsample(
-                inner_channels // 4, 1, smooth=smooth, bias=bias
-            ),
+            self._init_upsample(inner_channels // 4, 1, smooth=smooth, bias=bias),
             nn.Sigmoid(),
         )
         return self.thresh
 
-    def _init_upsample(
-        self, in_channels, out_channels, smooth=False, bias=False
-    ):
+    def _init_upsample(self, in_channels, out_channels, smooth=False, bias=False):
         if smooth:
             inter_out_channels = out_channels
             if out_channels == 1:
yomitoku/models/layers/rtdetr_backbone.py
CHANGED
@@ -1,5 +1,4 @@
-"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
-"""
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved."""
 
 from collections import OrderedDict
 
@@ -48,9 +47,7 @@ class ConvNormLayer(nn.Module):
 class BasicBlock(nn.Module):
     expansion = 1
 
-    def __init__(
-        self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
-    ):
+    def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
         super().__init__()
 
         self.shortcut = shortcut
@@ -89,9 +86,7 @@ class BasicBlock(nn.Module):
 class BottleNeck(nn.Module):
     expansion = 4
 
-    def __init__(
-        self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
-    ):
+    def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
         super().__init__()
 
         if variant == "a":
@@ -114,17 +109,13 @@ class BottleNeck(nn.Module):
                         ("pool", nn.AvgPool2d(2, 2, 0, ceil_mode=True)),
                         (
                             "conv",
-                            ConvNormLayer(
-                                ch_in, ch_out * self.expansion, 1, 1
-                            ),
+                            ConvNormLayer(ch_in, ch_out * self.expansion, 1, 1),
                         ),
                     ]
                 )
             )
         else:
-            self.short = ConvNormLayer(
-                ch_in, ch_out * self.expansion, 1, stride
-            )
+            self.short = ConvNormLayer(ch_in, ch_out * self.expansion, 1, stride)
 
         self.act = nn.Identity() if act is None else get_activation(act)
 
@@ -145,9 +136,7 @@ class BottleNeck(nn.Module):
 
 
 class Blocks(nn.Module):
-    def __init__(
-        self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"
-    ):
+    def __init__(self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"):
        super().__init__()
 
        self.blocks = nn.ModuleList()
yomitoku/models/layers/rtdetr_hybrid_encoder.py
CHANGED
@@ -1,5 +1,4 @@
-"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
-"""
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved."""
 
 import copy
 from collections import OrderedDict
@@ -241,9 +240,7 @@ class HybridEncoder(nn.Module):
         for in_channel in in_channels:
             if version == "v1":
                 proj = nn.Sequential(
-                    nn.Conv2d(
-                        in_channel, hidden_dim, kernel_size=1, bias=False
-                    ),
+                    nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False),
                     nn.BatchNorm2d(hidden_dim),
                 )
             elif version == "v2":
@@ -279,9 +276,7 @@ class HybridEncoder(nn.Module):
 
         self.encoder = nn.ModuleList(
             [
-                TransformerEncoder(
-                    copy.deepcopy(encoder_layer), num_encoder_layers
-                )
+                TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers)
                 for _ in range(len(use_encoder_idx))
             ]
         )
@@ -336,9 +331,7 @@ class HybridEncoder(nn.Module):
             # self.register_buffer(f'pos_embed{idx}', pos_embed)
 
     @staticmethod
-    def build_2d_sincos_position_embedding(
-        w, h, embed_dim=256, temperature=10000.0
-    ):
+    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
         """ """
         grid_w = torch.arange(int(w), dtype=torch.float32)
         grid_h = torch.arange(int(h), dtype=torch.float32)
@@ -376,9 +369,7 @@ class HybridEncoder(nn.Module):
                     src_flatten.device
                 )
 
-                memory: torch.Tensor = self.encoder[i](
-                    src_flatten, pos_embed=pos_embed
-                )
+                memory: torch.Tensor = self.encoder[i](src_flatten, pos_embed=pos_embed)
                 proj_feats[enc_ind] = (
                     memory.permute(0, 2, 1)
                     .reshape(-1, self.hidden_dim, h, w)
@@ -390,13 +381,9 @@ class HybridEncoder(nn.Module):
         for idx in range(len(self.in_channels) - 1, 0, -1):
             feat_heigh = inner_outs[0]
             feat_low = proj_feats[idx - 1]
-            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](
-                feat_heigh
-            )
+            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_heigh)
             inner_outs[0] = feat_heigh
-            upsample_feat = F.interpolate(
-                feat_heigh, scale_factor=2.0, mode="nearest"
-            )
+            upsample_feat = F.interpolate(feat_heigh, scale_factor=2.0, mode="nearest")
             inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](
                 torch.concat([upsample_feat, feat_low], dim=1)
             )
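For reference, `build_2d_sincos_position_embedding`, whose signature is merely reflowed above, follows the standard RT-DETR-style 2D sin-cos construction; a self-contained sketch of that formula (not the package's exact code):

```python
import torch


def sincos_2d(w: int, h: int, embed_dim: int = 256, temperature: float = 10000.0):
    # One frequency band per quarter of the embedding: sin/cos over w and over h.
    grid_w = torch.arange(w, dtype=torch.float32)
    grid_h = torch.arange(h, dtype=torch.float32)
    grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
    pos_dim = embed_dim // 4
    omega = 1.0 / temperature ** (torch.arange(pos_dim, dtype=torch.float32) / pos_dim)
    out_w = grid_w.flatten()[:, None] * omega[None, :]
    out_h = grid_h.flatten()[:, None] * omega[None, :]
    # Result shape: (1, w*h, embed_dim), ready to add to the flattened feature map.
    return torch.concat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None, :, :]


print(sincos_2d(4, 3).shape)  # torch.Size([1, 12, 256])
```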
yomitoku/models/parseq.py
CHANGED
@@ -26,9 +26,7 @@ from ..postprocessor import ParseqTokenizer as Tokenizer
 from .layers.parseq_transformer import Decoder, Encoder, TokenEmbedding
 
 
-def init_weights(
-    module: nn.Module, name: str = "", exclude: Sequence[str] = ()
-):
+def init_weights(module: nn.Module, name: str = "", exclude: Sequence[str] = ()):
     """Initialize the weights using the typical initialization schemes used in SOTA models."""
     if any(map(name.startswith, exclude)):
         return
@@ -41,9 +39,7 @@ def init_weights(
         if module.padding_idx is not None:
             module.weight.data[module.padding_idx].zero_()
     elif isinstance(module, nn.Conv2d):
-        nn.init.kaiming_normal_(
-            module.weight, mode="fan_out", nonlinearity="relu"
-        )
+        nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
         if module.bias is not None:
             nn.init.zeros_(module.bias)
     elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d, nn.GroupNorm)):
@@ -93,9 +89,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
     @torch.jit.ignore
     def no_weight_decay(self):
         param_names = {"text_embed.embedding.weight", "pos_queries"}
-        enc_param_names = {
-            "encoder." + n for n in self.encoder.no_weight_decay()
-        }
+        enc_param_names = {"encoder." + n for n in self.encoder.no_weight_decay()}
         return param_names.union(enc_param_names)
 
     def encode(self, img: torch.Tensor):
@@ -149,9 +143,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
 
         # Special case for the forward permutation. Faster than using `generate_attn_masks()`
         tgt_mask = query_mask = torch.triu(
-            torch.ones(
-                (num_steps, num_steps), dtype=torch.bool, device=self._device
-            ),
+            torch.ones((num_steps, num_steps), dtype=torch.bool, device=self._device),
             1,
         )
 
@@ -185,10 +177,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
                 # greedy decode. add the next token index to the target input
                 tgt_in[:, j] = p_i.squeeze().argmax(-1)
                 # Efficient batch decoding: If all output words have at least one EOS token, end decoding.
-                if (
-                    testing
-                    and (tgt_in == tokenizer.eos_id).any(dim=-1).all()
-                ):
+                if testing and (tgt_in == tokenizer.eos_id).any(dim=-1).all():
                     break
 
         logits = torch.cat(logits, dim=1)
@@ -227,9 +216,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
             # Prior context is the previous output.
             tgt_in = torch.cat([bos, logits[:, :-1].argmax(-1)], dim=1)
             # Mask tokens beyond the first EOS token.
-            tgt_padding_mask = (tgt_in == tokenizer.eos_id).int().cumsum(
-                -1
-            ) > 0
+            tgt_padding_mask = (tgt_in == tokenizer.eos_id).int().cumsum(-1) > 0
             tgt_out = self.decode(
                 tgt_in,
                 memory,
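The `cumsum` one-liner above and the early-exit condition in the previous hunk are easy to sanity-check in isolation; a small sketch with a hypothetical `eos_id`:

```python
import torch

eos_id = 0  # hypothetical; the real id comes from the tokenizer

tgt_in = torch.tensor([[5, 3, 0, 7, 0],
                       [2, 0, 4, 4, 4]])

# Early-exit condition: every sequence in the batch contains at least one EOS.
print((tgt_in == eos_id).any(dim=-1).all())  # tensor(True)

# Padding mask: True at the first EOS and at every position after it.
mask = (tgt_in == eos_id).int().cumsum(-1) > 0
print(mask)
# tensor([[False, False,  True,  True,  True],
#         [False,  True,  True,  True,  True]])
```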
yomitoku/postprocessor/dbnet_postporcessor.py
CHANGED
@@ -1,13 +1,12 @@
 import cv2
+import math
 import numpy as np
 import pyclipper
 from shapely.geometry import Polygon
 
 
 class DBnetPostProcessor:
-    def __init__(
-        self, min_size, thresh, box_thresh, max_candidates, unclip_ratio
-    ):
+    def __init__(self, min_size, thresh, box_thresh, max_candidates, unclip_ratio):
         self.min_size = min_size
         self.thresh = thresh
         self.box_thresh = box_thresh
@@ -24,9 +23,7 @@ class DBnetPostProcessor:
         pred = preds["binary"][0]
         segmentation = self.binarize(pred)[0]
         height, width = image_size
-        quads, scores = self.boxes_from_bitmap(
-            pred, segmentation, width, height
-        )
+        quads, scores = self.boxes_from_bitmap(pred, segmentation, width, height)
         return quads, scores
 
     def binarize(self, pred):
@@ -65,9 +62,7 @@ class DBnetPostProcessor:
             if self.box_thresh > score:
                 continue
 
-            box = self.unclip(points, unclip_ratio=self.unclip_ratio).reshape(
-                -1, 1, 2
-            )
+            box = self.unclip(points, unclip_ratio=self.unclip_ratio).reshape(-1, 1, 2)
             box, sside = self.get_mini_boxes(box)
             if sside < self.min_size + 2:
                 continue
@@ -76,9 +71,7 @@ class DBnetPostProcessor:
             dest_width = dest_width.item()
             dest_height = dest_height.item()
 
-            box[:, 0] = np.clip(
-                np.round(box[:, 0] / width * dest_width), 0, dest_width
-            )
+            box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
             box[:, 1] = np.clip(
                 np.round(box[:, 1] / height * dest_height), 0, dest_height
             )
@@ -88,9 +81,17 @@ class DBnetPostProcessor:
 
         return boxes, scores
 
-    def unclip(self, box, unclip_ratio=
+    def unclip(self, box, unclip_ratio=7):
+        # Small characters tend to get cut off while large characters get an overly
+        # large margin, so the expansion parameter is varied with the character size.
+        # Note: this rule is a heuristic with no theoretical grounding.
         poly = Polygon(box)
-        distance = poly.area * unclip_ratio / poly.length
+        width = box[:, 0].max() - box[:, 0].min()
+        height = box[:, 1].max() - box[:, 1].min()
+        box_dist = min(width, height)
+        ratio = unclip_ratio / math.sqrt(box_dist)
+
+        distance = poly.area * ratio / poly.length
         offset = pyclipper.PyclipperOffset()
         offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
         expanded = np.array(offset.Execute(distance))
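The substantive change in this file is `unclip`: the offset distance is now damped by the square root of the box's short side, so small text gets proportionally more margin and large text less. A standalone check of that arithmetic (hypothetical boxes):

```python
import math
import numpy as np
from shapely.geometry import Polygon


def unclip_distance(box: np.ndarray, unclip_ratio: float = 7) -> float:
    # Offset distance grows with area/perimeter, damped by sqrt of the short side.
    poly = Polygon(box)
    short_side = min(box[:, 0].max() - box[:, 0].min(),
                     box[:, 1].max() - box[:, 1].min())
    ratio = unclip_ratio / math.sqrt(short_side)
    return poly.area * ratio / poly.length


small = np.array([[0, 0], [10, 0], [10, 10], [0, 10]])
large = np.array([[0, 0], [100, 0], [100, 100], [0, 100]])
# ~5.53 for the 10px box (55% of its side) vs 17.5 for the 100px box (17.5%).
print(unclip_distance(small), unclip_distance(large))
```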
yomitoku/postprocessor/parseq_tokenizer.py
CHANGED
@@ -122,7 +122,5 @@ class ParseqTokenizer(BaseTokenizer):
             eos_idx = len(ids)  # Nothing to truncate.
         # Truncate after EOS
         ids = ids[:eos_idx]
-        probs = probs[
-            : eos_idx + 1
-        ]  # but include prob. for EOS (if it exists)
+        probs = probs[: eos_idx + 1]  # but include prob. for EOS (if it exists)
         return probs, ids
yomitoku/utils/misc.py
CHANGED
yomitoku/utils/visualizer.py
CHANGED
@@ -1,8 +1,10 @@
 import cv2
 import numpy as np
-from PIL import Image, ImageDraw, ImageFont
-
+from PIL import Image, ImageDraw, ImageFont, features
 from ..constants import PALETTE
+from .logger import set_logger
+
+logger = set_logger(__name__, "INFO")
 
 
 def _reading_order_visualizer(img, elements, line_color, tip_size):
@@ -148,13 +150,18 @@ def rec_visualizer(
     out = img.copy()
     pillow_img = Image.fromarray(out)
     draw = ImageDraw.Draw(pillow_img)
+    has_raqm = features.check_feature(feature="raqm")
+    if not has_raqm:
+        logger.warning(
+            "libraqm is not installed. Vertical text rendering is not supported. Rendering horizontally instead."
+        )
 
     for pred, quad, direction in zip(
         outputs.contents, outputs.points, outputs.directions
     ):
         quad = np.array(quad).astype(np.int32)
         font = ImageFont.truetype(font_path, font_size)
-        if direction == "horizontal":
+        if direction == "horizontal" or not has_raqm:
             x_offset = 0
             y_offset = -font_size
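Pillow only supports vertical text layout (`direction="ttb"`) when it was built against libraqm, which is exactly what the new `features.check_feature` guard detects; a minimal sketch of the same fallback, with a hypothetical font file:

```python
from PIL import Image, ImageDraw, ImageFont, features

img = Image.new("RGB", (200, 200), "white")
draw = ImageDraw.Draw(img)
font = ImageFont.truetype("NotoSansJP-Regular.ttf", 24)  # hypothetical font path

if features.check_feature(feature="raqm"):
    # Vertical (top-to-bottom) rendering needs the raqm layout engine.
    draw.text((100, 10), "縦書き", font=font, direction="ttb", fill="black")
else:
    # Same fallback as rec_visualizer above: render horizontally instead.
    draw.text((10, 10), "縦書き", font=font, fill="black")
```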
{yomitoku-0.4.1.dist-info → yomitoku-0.5.1.dist-info}/METADATA
CHANGED
@@ -1,7 +1,7 @@
 Metadata-Version: 2.3
 Name: yomitoku
-Version: 0.4.1
-Summary: Yomitoku is
+Version: 0.5.1
+Summary: Yomitoku is an AI-powered document image analysis package designed specifically for the Japanese language.
 Author-email: Kotaro Kinoshita <kotaro.kinoshita@mlism.com>
 License: CC BY-NC-SA 4.0
 Keywords: Deep Learning,Japanese,OCR
@@ -10,24 +10,25 @@ Requires-Dist: huggingface-hub>=0.26.1
 Requires-Dist: lxml>=5.3.0
 Requires-Dist: omegaconf>=2.3.0
 Requires-Dist: opencv-python>=4.10.0.84
-Requires-Dist: pdf2image>=1.17.0
 Requires-Dist: pyclipper>=1.3.0.post6
 Requires-Dist: pydantic>=2.9.2
+Requires-Dist: pypdfium2>=4.30.0
 Requires-Dist: shapely>=2.0.6
 Requires-Dist: timm>=1.0.11
-Requires-Dist: torch
+Requires-Dist: torch==2.5.0
 Requires-Dist: torchvision>=0.20.0
 Description-Content-Type: text/markdown
 
-
+Japanese version | [English](README_EN.md)
+
+<img src="static/logo/horizontal.png" width="800px">
 
 
 
-
+
 [](https://kotaro-kinoshita.github.io/yomitoku-dev/)
 
-<img src="static/logo/horizontal.png" width="800px">
-
 ## 🌟 Overview
 
 YomiToku is an AI document-image analysis engine (Document AI) specialized for Japanese. It performs full-text OCR and layout analysis on images, recognizing, extracting, and converting the text and figures they contain.
@@ -60,31 +61,16 @@ For the results exported to Markdown, see the [s
 
 ## 📣 Release notes
 
-- 2024
+- November 26, 2024: YomiToku v0.5.1 (beta) released
 
 ## 💡 Installation
 
 ```
-pip install
+pip install yomitoku
 ```
 
-- pytorch
-
-### Dependencies
-
-Parsing PDF files additionally requires installing [poppler](https://poppler.freedesktop.org/).
-
-**Mac**
-
-```
-brew install poppler
-```
-
-**Linux**
-
-```
-apt install poppler-utils -y
-```
+- Install the pytorch build that matches your own CUDA version. By default, a build for CUDA 12.4 or later is installed.
+- pytorch 2.5 and later is supported, which in turn requires CUDA 11.8 or later. If you cannot meet this, use the Dockerfile in the repository.
 
 ## 🚀 Usage
 
@@ -98,8 +84,8 @@ yomitoku ${path_data} -f md -o results -v --figure
 - `-v`, `--vis`: outputs images visualizing the analysis results.
 - `-d`, `--device`: specifies the device the models run on. Inference falls back to CPU when no GPU is available. (default: cuda)
 - `--ignore_line_break`: ignores the line-break positions in the image and concatenates the text within each paragraph. (default: line breaks follow the image.)
--
--
+- `--figure_letter`: also exports the text contained in detected figures and tables to the output file.
+- `--figure`: exports detected figures and images to the output file. (html and markdown only)
 
 For other options, see the help:
 
@@ -107,11 +93,10 @@ yomitoku ${path_data} -f md -o results -v --figure
 yomitoku --help
 ```
 
-
-
-- Inference is not optimized for CPU and takes a long time, so running on a GPU is recommended.
+**NOTE**
+- Running on a GPU is recommended. Inference is not optimized for CPU, so processing takes a long time.
 - Only printed text is supported. Handwritten text can sometimes be read, but is not officially supported.
--
+- Yomitoku is optimized for document OCR, not for scene OCR (reading text printed on things other than paper, such as signs).
 - Input image resolution matters for AI-OCR accuracy; recognition degrades on low-resolution images. We recommend running inference on images whose short side is at least 720 px.
 
 ## 📝 Documentation
@@ -120,8 +105,8 @@ yomitoku --help
 
 ## LICENSE
 
-
-
+The source code in this repository and the weight files of the related models on HuggingFaceHub are licensed under CC BY-NC-SA 4.0.
+You are free to use them for non-commercial personal use and for research.
 For commercial use, a separate commercial license is available; please contact the developers.
 
-YomiToku © 2024 by
+YomiToku © 2024 by Kotaro Kinoshita is licensed under CC BY-NC-SA 4.0. To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/
{yomitoku-0.4.1.dist-info → yomitoku-0.5.1.dist-info}/RECORD
CHANGED
@@ -1,8 +1,8 @@
 yomitoku/__init__.py,sha256=kXOM8RbpwwLABG3p3vPT3dJWBk4JX2MFGrOeBEW0hKM,543
 yomitoku/base.py,sha256=lzR_V8t87aRasmFdFwD-8KAeSahSTI3AZaEn6g8sOv8,3871
 yomitoku/constants.py,sha256=zlW5QRc_u_F3C2RAgBFWyHJZexBnJT5N15GC-9d3iLo,686
-yomitoku/document_analyzer.py,sha256=
-yomitoku/layout_analyzer.py,sha256=
+yomitoku/document_analyzer.py,sha256=HIg-nVzDhJIP-h-tn4uU86KakgHdlAhosEqK_i-SWe4,9906
+yomitoku/layout_analyzer.py,sha256=QTeRcVd8aySz8u6dg2ikET77ar3sqlukRLBwYfTyMPM,2033
 yomitoku/layout_parser.py,sha256=V2jCNHE61jNp8ytYdKwPV34V5qEK7y-7-Mq7-AkoQhU,5898
 yomitoku/ocr.py,sha256=Rcojw0aGA6yDF2RjqfK23_rMw-xm61KGd8JmTCTOOVU,2516
 yomitoku/reading_order.py,sha256=OfhOS9ttPDoPSuHrIRKyOzG19GGeRufbuSKDqhsohh4,6404
@@ -14,39 +14,39 @@ yomitoku/cli/main.py,sha256=MBD0S4sXgquJ8P2egkZjJcglXvCke5Uw46C28SDtr8g,6252
 yomitoku/configs/__init__.py,sha256=KBhb9S7xt22HZaIcoWSgZHfscXXj9YlimOwLH5z9CRo,454
 yomitoku/configs/cfg_layout_parser_rtdtrv2.py,sha256=8PRxB2Ar9UF7-DLtbgSokhrzdXb0veWI6Wc-X8qigRw,2329
 yomitoku/configs/cfg_table_structure_recognizer_rtdtrv2.py,sha256=o70GMHD8k-zeBeJtuhPS8x7vVB-ffucnJXeSyn-0AXo,2116
-yomitoku/configs/cfg_text_detector_dbnet.py,sha256=
+yomitoku/configs/cfg_text_detector_dbnet.py,sha256=U9k48PON7haoOaytiELhbZRpv9RMiUm6nnfHmdxIa9Q,1153
 yomitoku/configs/cfg_text_recognizer_parseq.py,sha256=hpFs3nKqh4XdU3BZMTultegtLEGahEsCaZdjfKC_MO8,1247
 yomitoku/data/__init__.py,sha256=KAofFc9rk9ZdTKBjemu9RM8Vj9XnKbWC2MPZ2RWtOdE,82
 yomitoku/data/dataset.py,sha256=-I4f-FDtgsPnJ2MnXB7FtwihMW3koDaSI1OEoqKneIg,1014
-yomitoku/data/functions.py,sha256=
+yomitoku/data/functions.py,sha256=eOyxo8S6EoAf1xGSPLWQFb9-t5Rg52NggD9MFIrOSpY,7506
 yomitoku/export/__init__.py,sha256=aANEfuovH2aevFjb2pGrBLFP-4iRzEzD9wcriCR-M7I,229
 yomitoku/export/export_csv.py,sha256=-n8eYPIzDQuiixeqpTbWaN9aQ5oFyl7XRfpv51oKPTI,1979
 yomitoku/export/export_html.py,sha256=X3H_orkS1BRlQo8Z1NzgrFwsIboDzRAx9etmqj90k2Y,4866
 yomitoku/export/export_json.py,sha256=1ChvCAHfCmMQvCfcAb1p3fSpr4elNAs3xBSIbpfn3bc,998
 yomitoku/export/export_markdown.py,sha256=mCcsXUWBLrYc1NcRSBFfBT28d6eCddAF1oHp0qdBEnE,3986
 yomitoku/models/__init__.py,sha256=Enxq9sjJWusZuxecTori8IQa8NEYKaiiptDluHX1avg,144
-yomitoku/models/dbnet_plus.py,sha256=
-yomitoku/models/parseq.py,sha256=
+yomitoku/models/dbnet_plus.py,sha256=jeWJZm0ihbxoJeAXBFK7uVIwoosx2IUNk7Ut5wRH0vA,7998
+yomitoku/models/parseq.py,sha256=7QT-q5_oWqXTDXobRk1R6Lpap_AxdC4AzkSsOgXjOwM,8611
 yomitoku/models/rtdetr.py,sha256=oJsr8RHz3frslhLfXdVJve47lUsrmqLjfdTrZ41tlQ0,687
 yomitoku/models/layers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 yomitoku/models/layers/activate.py,sha256=HUw0q-76RNjZF-o9O3fowfJcw0t1H5o0pbyioGdqUvU,668
 yomitoku/models/layers/dbnet_feature_attention.py,sha256=Vpp_PiLVuI7Zs30TTg4RNRn16KTb81ewonADpUHd4aE,6060
 yomitoku/models/layers/parseq_transformer.py,sha256=33eroJf8rmgIptP-NpZLJMhG7XOTwV4rXsq674VrKnU,6704
-yomitoku/models/layers/rtdetr_backbone.py,sha256=
-yomitoku/models/layers/rtdetr_hybrid_encoder.py,sha256=
+yomitoku/models/layers/rtdetr_backbone.py,sha256=QjfLW-3qn2My3Jbg6yLORX8A-D2sph9J9u3r5nNnDLo,9386
+yomitoku/models/layers/rtdetr_hybrid_encoder.py,sha256=D3dK37k7_0jPqV39-6Se8kBzF_SyZttNlbLleyNFiJU,13607
 yomitoku/models/layers/rtdetrv2_decoder.py,sha256=5bVYPLFYCy3PcjyHTPFHNLWqg3bctrk-dKVG4kayhaw,27517
 yomitoku/postprocessor/__init__.py,sha256=W4vUuqBaFtH5dlSBIYgyaCroGLMjpV6RrNGIBQ8NFVw,243
-yomitoku/postprocessor/dbnet_postporcessor.py,sha256=
-yomitoku/postprocessor/parseq_tokenizer.py,sha256=
+yomitoku/postprocessor/dbnet_postporcessor.py,sha256=o_y8b5REd2dFEdIpRcr6o-XBfOCHo9rBYGwokP_uhTc,4948
+yomitoku/postprocessor/parseq_tokenizer.py,sha256=e89_g_bc4Au3SchuxoJfJNATJTxFmVYetzXyAzPWm28,4315
 yomitoku/postprocessor/rtdetr_postprocessor.py,sha256=f52wfRKrxqSXy_LeidKDR9XAta_qPjto-oYEdO0XL8A,3386
 yomitoku/resource/MPLUS1p-Medium.ttf,sha256=KLL1KkCumIBkgQtx1n4SffdaFuCNffThktEAbkB1OU8,1758908
 yomitoku/resource/charset.txt,sha256=sU91kSi-9Wk4733bCXy4j_UDmvcsj96sHOq1ppUJlOY,21672
 yomitoku/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 yomitoku/utils/graph.py,sha256=LKNB8ZhSQwOZMfeAimPMF5UCVVr2ZaUWoGDkz8z-uGU,456
 yomitoku/utils/logger.py,sha256=uOmtQDr0A0JD7wyFshedL08BiNrQorHnpktRXba8bjU,424
-yomitoku/utils/misc.py,sha256=
-yomitoku/utils/visualizer.py,sha256=
-yomitoku-0.
-yomitoku-0.
-yomitoku-0.
-yomitoku-0.
+yomitoku/utils/misc.py,sha256=2Eyy7-9K_h4Mal1zGXq6OlxubfNzhS0mEYwn_xt7xl8,2497
+yomitoku/utils/visualizer.py,sha256=2pSmbhUPylzVVJ0bXtGDoNmMdArAByab4Py7Xavvs_A,5230
+yomitoku-0.5.1.dist-info/METADATA,sha256=-8bUVnN26cxYlZO0ZQH3liki_xMfhUX47ruHLl-2BGM,7817
+yomitoku-0.5.1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+yomitoku-0.5.1.dist-info/entry_points.txt,sha256=nFV3S11zgBNW0Qq_D0XQNg2R4lNXU_9XUFr6rdJoyF8,52
+yomitoku-0.5.1.dist-info/RECORD,,
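The `sha256=` values in RECORD follow the wheel record format: an urlsafe-base64 SHA-256 digest with the padding stripped, followed by the file size in bytes. A sketch that reproduces an entry for any file in an unpacked wheel:

```python
import base64
import hashlib
from pathlib import Path


def record_entry(path: str) -> str:
    # "path,sha256=<urlsafe base64 digest, no padding>,<size in bytes>"
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"


print(record_entry("yomitoku/utils/misc.py"))  # run from the wheel root
```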
{yomitoku-0.4.1.dist-info → yomitoku-0.5.1.dist-info}/WHEEL
File without changes
{yomitoku-0.4.1.dist-info → yomitoku-0.5.1.dist-info}/entry_points.txt
File without changes