PyPI - yomitoku - Versions diffs - 0.4.1__py3-none-any.whl → 0.7.4__py3-none-any.whl - Mend

yomitoku 0.4.1__py3-none-any.whl → 0.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

yomitoku/base.py +1 -1
yomitoku/cli/main.py +219 -27
yomitoku/configs/__init__.py +2 -0
yomitoku/configs/cfg_text_detector_dbnet.py +1 -1
yomitoku/configs/cfg_text_recognizer_parseq_small.py +51 -0
yomitoku/data/functions.py +48 -23
yomitoku/document_analyzer.py +243 -41
yomitoku/export/__init__.py +18 -5
yomitoku/export/export_csv.py +71 -2
yomitoku/export/export_html.py +46 -12
yomitoku/export/export_json.py +66 -3
yomitoku/export/export_markdown.py +42 -6
yomitoku/layout_analyzer.py +2 -9
yomitoku/layout_parser.py +58 -4
yomitoku/models/dbnet_plus.py +13 -39
yomitoku/models/layers/activate.py +13 -0
yomitoku/models/layers/rtdetr_backbone.py +18 -17
yomitoku/models/layers/rtdetr_hybrid_encoder.py +19 -20
yomitoku/models/layers/rtdetrv2_decoder.py +14 -1
yomitoku/models/parseq.py +15 -22
yomitoku/ocr.py +24 -27
yomitoku/onnx/.gitkeep +0 -0
yomitoku/postprocessor/dbnet_postporcessor.py +15 -14
yomitoku/postprocessor/parseq_tokenizer.py +1 -3
yomitoku/postprocessor/rtdetr_postprocessor.py +14 -1
yomitoku/table_structure_recognizer.py +82 -9
yomitoku/text_detector.py +57 -7
yomitoku/text_recognizer.py +84 -16
yomitoku/utils/misc.py +21 -14
yomitoku/utils/visualizer.py +15 -8
{yomitoku-0.4.1.dist-info → yomitoku-0.7.4.dist-info}/METADATA +34 -41
yomitoku-0.7.4.dist-info/RECORD +54 -0
{yomitoku-0.4.1.dist-info → yomitoku-0.7.4.dist-info}/WHEEL +1 -1
yomitoku-0.4.1.dist-info/RECORD +0 -52
{yomitoku-0.4.1.dist-info → yomitoku-0.7.4.dist-info}/entry_points.txt +0 -0

yomitoku/base.py CHANGED Viewed

@@ -54,7 +54,7 @@ class BaseSchema(BaseModel):
         validate_assignment = True
     def to_json(self, out_path: str, **kwargs):
-        export_json(self, out_path, **kwargs)
+        return export_json(self, out_path, **kwargs)
 class BaseModule:

yomitoku/cli/main.py CHANGED Viewed

@@ -1,30 +1,94 @@
 import argparse
 import os
+import time
 from pathlib import Path
 import cv2
-import time
+import torch
 from ..constants import SUPPORT_OUTPUT_FORMAT
 from ..data.functions import load_image, load_pdf
 from ..document_analyzer import DocumentAnalyzer
 from ..utils.logger import set_logger
+from ..export import save_csv, save_html, save_json, save_markdown
+from ..export import convert_json, convert_csv, convert_html, convert_markdown
 logger = set_logger(__name__, "INFO")
+def merge_all_pages(results):
+    out = None
+    for result in results:
+        format = result["format"]
+        data = result["data"]
+        if format == "json":
+            if out is None:
+                out = [data]
+            else:
+                out.append(data)
+        elif format == "csv":
+            if out is None:
+                out = data
+            else:
+                out.extend(data)
+        elif format == "html":
+            if out is None:
+                out = data
+            else:
+                out += "\n" + data
+        elif format == "md":
+            if out is None:
+                out = data
+            else:
+                out += "\n" + data
+    return out
+def save_merged_file(out_path, args, out):
+    if args.format == "json":
+        save_json(out, out_path, args.encoding)
+    elif args.format == "csv":
+        save_csv(out, out_path, args.encoding)
+    elif args.format == "html":
+        save_html(out, out_path, args.encoding)
+    elif args.format == "md":
+        save_markdown(out, out_path, args.encoding)
+def validate_encoding(encoding):
+    if encoding not in [
+        "utf-8",
+        "utf-8-sig",
+        "shift-jis",
+        "euc-jp",
+        "cp932",
+    ]:
+        raise ValueError(f"Invalid encoding: {encoding}")
+    return True
 def process_single_file(args, analyzer, path, format):
     if path.suffix[1:].lower() in ["pdf"]:
         imgs = load_pdf(path)
     else:
-        imgs = [load_image(path)]
+        imgs = load_image(path)
+    results = []
     for page, img in enumerate(imgs):
-        results, ocr, layout = analyzer(img)
+        result, ocr, layout = analyzer(img)
         dirname = path.parent.name
         filename = path.stem
+        # cv2.imwrite(
+        #    os.path.join(args.outdir, f"{dirname}_{filename}_p{page+1}.jpg"), img
+        # )
         if ocr is not None:
             out_path = os.path.join(
                 args.outdir, f"{dirname}_{filename}_p{page+1}_ocr.jpg"
@@ -44,37 +108,129 @@ def process_single_file(args, analyzer, path, format):
         out_path = os.path.join(args.outdir, f"{dirname}_{filename}_p{page+1}.{format}")
         if format == "json":
-            results.to_json(
-                out_path,
-                ignore_line_break=args.ignore_line_break,
+            if args.combine:
+                json = convert_json(
+                    result,
+                    out_path,
+                    args.ignore_line_break,
+                    img,
+                    args.figure,
+                    args.figure_dir,
+                )
+            else:
+                json = result.to_json(
+                    out_path,
+                    ignore_line_break=args.ignore_line_break,
+                    encoding=args.encoding,
+                    img=img,
+                    export_figure=args.figure,
+                    figure_dir=args.figure_dir,
+                )
+            results.append(
+                {
+                    "format": format,
+                    "data": json.model_dump(),
+                }
             )
         elif format == "csv":
-            results.to_csv(
-                out_path,
-                ignore_line_break=args.ignore_line_break,
+            if args.combine:
+                csv = convert_csv(
+                    result,
+                    out_path,
+                    args.ignore_line_break,
+                    img,
+                    args.figure,
+                    args.figure_dir,
+                )
+            else:
+                csv = result.to_csv(
+                    out_path,
+                    ignore_line_break=args.ignore_line_break,
+                    encoding=args.encoding,
+                    img=img,
+                    export_figure=args.figure,
+                    figure_dir=args.figure_dir,
+                )
+            results.append(
+                {
+                    "format": format,
+                    "data": csv,
+                }
             )
         elif format == "html":
-            results.to_html(
-                out_path,
-                ignore_line_break=args.ignore_line_break,
-                img=img,
-                export_figure=args.figure,
-                export_figure_letter=args.figure_letter,
-                figure_width=args.figure_width,
-                figure_dir=args.figure_dir,
+            if args.combine:
+                html, _ = convert_html(
+                    result,
+                    out_path,
+                    ignore_line_break=args.ignore_line_break,
+                    img=img,
+                    export_figure=args.figure,
+                    export_figure_letter=args.figure_letter,
+                    figure_width=args.figure_width,
+                    figure_dir=args.figure_dir,
+                )
+            else:
+                html = result.to_html(
+                    out_path,
+                    ignore_line_break=args.ignore_line_break,
+                    img=img,
+                    export_figure=args.figure,
+                    export_figure_letter=args.figure_letter,
+                    figure_width=args.figure_width,
+                    figure_dir=args.figure_dir,
+                    encoding=args.encoding,
+                )
+            results.append(
+                {
+                    "format": format,
+                    "data": html,
+                }
             )
         elif format == "md":
-            results.to_markdown(
-                out_path,
-                ignore_line_break=args.ignore_line_break,
-                img=img,
-                export_figure=args.figure,
-                export_figure_letter=args.figure_letter,
-                figure_width=args.figure_width,
-                figure_dir=args.figure_dir,
+            if args.combine:
+                md, _ = convert_markdown(
+                    result,
+                    out_path,
+                    ignore_line_break=args.ignore_line_break,
+                    img=img,
+                    export_figure=args.figure,
+                    export_figure_letter=args.figure_letter,
+                    figure_width=args.figure_width,
+                    figure_dir=args.figure_dir,
+                )
+            else:
+                md = result.to_markdown(
+                    out_path,
+                    ignore_line_break=args.ignore_line_break,
+                    img=img,
+                    export_figure=args.figure,
+                    export_figure_letter=args.figure_letter,
+                    figure_width=args.figure_width,
+                    figure_dir=args.figure_dir,
+                    encoding=args.encoding,
+                )
+            results.append(
+                {
+                    "format": format,
+                    "data": md,
+                }
             )
-        logger.info(f"Output file: {out_path}")
+    out = merge_all_pages(results)
+    if args.combine:
+        out_path = os.path.join(args.outdir, f"{dirname}_{filename}.{format}")
+        save_merged_file(
+            out_path,
+            args,
+            out,
+        )
 def main():
@@ -104,6 +260,12 @@ def main():
         default="results",
         help="output directory",
     )
+    parser.add_argument(
+        "-l",
+        "--lite",
+        action="store_true",
+        help="if set, use lite model",
+    )
     parser.add_argument(
         "-d",
         "--device",
@@ -162,6 +324,22 @@ def main():
         default="figures",
         help="directory to save figure images",
     )
+    parser.add_argument(
+        "--encoding",
+        type=str,
+        default="utf-8",
+        help="Specifies the character encoding for the output file to be exported. If unsupported characters are included, they will be ignored.",
+    )
+    parser.add_argument(
+        "--combine",
+        action="store_true",
+        help="if set, merge all pages in the output",
+    )
+    parser.add_argument(
+        "--ignore_meta",
+        action="store_true",
+        help="if set, ignore meta information(header, footer) in the output",
+    )
     args = parser.parse_args()
@@ -175,6 +353,8 @@ def main():
             f"Invalid output format: {args.format}. Supported formats are {SUPPORT_OUTPUT_FORMAT}"
         )
+    validate_encoding(args.encoding)
     if format == "markdown":
         format = "md"
@@ -197,10 +377,22 @@ def main():
         },
     }
+    if args.lite:
+        configs["ocr"]["text_recognizer"]["model_name"] = "parseq-small"
+        if args.device == "cpu" or not torch.cuda.is_available():
+            configs["ocr"]["text_detector"]["infer_onnx"] = True
+        # Note: Text Detector以外はONNX推論よりもPyTorch推論の方が速いため、ONNX推論は行わない
+        # configs["ocr"]["text_recognizer"]["infer_onnx"] = True
+        # configs["layout_analyzer"]["table_structure_recognizer"]["infer_onnx"] = True
+        # configs["layout_analyzer"]["layout_parser"]["infer_onnx"] = True
     analyzer = DocumentAnalyzer(
         configs=configs,
         visualize=args.vis,
         device=args.device,
+        ignore_meta=args.ignore_meta,
     )
     os.makedirs(args.outdir, exist_ok=True)

yomitoku/configs/__init__.py CHANGED Viewed

@@ -4,10 +4,12 @@ from .cfg_table_structure_recognizer_rtdtrv2 import (
 )
 from .cfg_text_detector_dbnet import TextDetectorDBNetConfig
 from .cfg_text_recognizer_parseq import TextRecognizerPARSeqConfig
+from .cfg_text_recognizer_parseq_small import TextRecognizerPARSeqSmallConfig
 __all__ = [
     "TextDetectorDBNetConfig",
     "TextRecognizerPARSeqConfig",
     "LayoutParserRTDETRv2Config",
     "TableStructureRecognizerRTDETRv2Config",
+    "TextRecognizerPARSeqSmallConfig",
 ]

yomitoku/configs/cfg_text_detector_dbnet.py CHANGED Viewed

@@ -30,7 +30,7 @@ class PostProcess:
     thresh: float = 0.2
     box_thresh: float = 0.5
     max_candidates: int = 1500
-    unclip_ratio: float = 2.0
+    unclip_ratio: float = 7.0
 @dataclass

yomitoku/configs/cfg_text_recognizer_parseq_small.py ADDED Viewed

@@ -0,0 +1,51 @@
+from dataclasses import dataclass, field
+from typing import List
+from ..constants import ROOT_DIR
+@dataclass
+class Data:
+    num_workers: int = 4
+    batch_size: int = 128
+    img_size: List[int] = field(default_factory=lambda: [32, 800])
+@dataclass
+class Encoder:
+    patch_size: List[int] = field(default_factory=lambda: [16, 16])
+    num_heads: int = 8
+    embed_dim: int = 384
+    mlp_ratio: int = 4
+    depth: int = 9
+@dataclass
+class Decoder:
+    embed_dim: int = 384
+    num_heads: int = 8
+    mlp_ratio: int = 4
+    depth: int = 1
+@dataclass
+class Visualize:
+    font: str = str(ROOT_DIR + "/resource/MPLUS1p-Medium.ttf")
+    color: List[int] = field(default_factory=lambda: [0, 0, 255])  # RGB
+    font_size: int = 18
+@dataclass
+class TextRecognizerPARSeqSmallConfig:
+    hf_hub_repo: str = "KotaroKinoshita/yomitoku-text-recognizer-parseq-small-open-beta"
+    charset: str = str(ROOT_DIR + "/resource/charset.txt")
+    num_tokens: int = 7312
+    max_label_length: int = 100
+    decode_ar: int = 1
+    refine_iters: int = 1
+    data: Data = field(default_factory=Data)
+    encoder: Encoder = field(default_factory=Encoder)
+    decoder: Decoder = field(default_factory=Decoder)
+    visualize: Visualize = field(default_factory=Visualize)

yomitoku/data/functions.py CHANGED Viewed

@@ -1,9 +1,10 @@
 from pathlib import Path
 import cv2
+from PIL import Image
 import numpy as np
 import torch
-from pdf2image import convert_from_path
+import pypdfium2
 from ..constants import (
     MIN_IMAGE_SIZE,
@@ -15,6 +16,20 @@ from ..utils.logger import set_logger
 logger = set_logger(__name__)
+def validate_image(img: np.ndarray):
+    h, w = img.shape[:2]
+    if h < MIN_IMAGE_SIZE or w < MIN_IMAGE_SIZE:
+        raise ValueError("Image size is too small.")
+    if min(h, w) < WARNING_IMAGE_SIZE:
+        logger.warning(
+            """
+            The image size is small, which may result in reduced OCR accuracy.
+            The process will continue, but it is recommended to input images with a minimum size of 720 pixels on the shorter side.
+            """
+        )
 def load_image(image_path: str) -> np.ndarray:
     """
     Open an image file.
@@ -40,24 +55,27 @@ def load_image(image_path: str) -> np.ndarray:
             "PDF file is not supported by load_image(). Use load_pdf() instead."
         )
-    img = cv2.imread(image_path, cv2.IMREAD_COLOR)
-    if img is None:
+    try:
+        img = Image.open(image_path)
+    except Exception:
         raise ValueError("Invalid image data.")
-    h, w = img.shape[:2]
-    if h < MIN_IMAGE_SIZE or w < MIN_IMAGE_SIZE:
-        raise ValueError("Image size is too small.")
-    if min(h, w) < WARNING_IMAGE_SIZE:
-        logger.warning(
-            """
-            The image size is small, which may result in reduced OCR accuracy.
-            The process will continue, but it is recommended to input images with a minimum size of 720 pixels on the shorter side.
-            """
-        )
+    pages = []
+    if ext in ["tif", "tiff"]:
+        try:
+            while True:
+                img_arr = np.array(img.copy().convert("RGB"))
+                validate_image(img_arr)
+                pages.append(img_arr[:, :, ::-1])
+                img.seek(img.tell() + 1)
+        except EOFError:
+            pass
+    else:
+        img_arr = np.array(img.convert("RGB"))
+        validate_image(img_arr)
+        pages.append(img_arr[:, :, ::-1])
-    return img
+    return pages
 def load_pdf(pdf_path: str, dpi=200) -> list[np.ndarray]:
@@ -70,6 +88,7 @@ def load_pdf(pdf_path: str, dpi=200) -> list[np.ndarray]:
     Returns:
         list[np.ndarray]: list of image data(BGR)
     """
     pdf_path = Path(pdf_path)
     if not pdf_path.exists():
         raise FileNotFoundError(f"File not found: {pdf_path}")
@@ -86,11 +105,19 @@ def load_pdf(pdf_path: str, dpi=200) -> list[np.ndarray]:
         )
     try:
-        images = convert_from_path(pdf_path, dpi=dpi)
+        doc = pypdfium2.PdfDocument(pdf_path)
+        renderer = doc.render(
+            pypdfium2.PdfBitmap.to_pil,
+            scale=dpi / 72,
+        )
+        images = list(renderer)
+        images = [np.array(image.convert("RGB"))[:, :, ::-1] for image in images]
+        doc.close()
     except Exception as e:
         raise ValueError(f"Failed to open the PDF file: {pdf_path}") from e
-    return [np.array(img)[:, :, ::-1] for img in images]
+    return images
 def resize_shortest_edge(
@@ -123,7 +150,7 @@ def resize_shortest_edge(
     neww = max(int(new_w / 32) * 32, 32)
     newh = max(int(new_h / 32) * 32, 32)
-    img = cv2.resize(img, (neww, newh))
+    img = cv2.resize(img, (neww, newh), interpolation=cv2.INTER_AREA)
     return img
@@ -193,9 +220,7 @@ def validate_quads(img: np.ndarray, quads: list[list[list[int]]]):
         h, w = img.shape[:2]
         if x1 < 0 or x2 > w or y1 < 0 or y2 > h:
-            raise ValueError(
-                f"The vertices are out of the image. {quad.tolist()}"
-            )
+            raise ValueError(f"The vertices are out of the image. {quad.tolist()}")
     return True
@@ -268,7 +293,7 @@ def resize_with_padding(img, target_size, background_color=(0, 0, 0)):
     new_w = int(w * min(scale_w, scale_h))
     new_h = int(h * min(scale_w, scale_h))
-    resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+    resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
     canvas = np.zeros((target_size[0], target_size[1], 3), dtype=np.uint8)
     canvas[:, :] = background_color

yomitoku 0.4.1__py3-none-any.whl → 0.7.4__py3-none-any.whl

yomitoku 0.4.1__py3-none-any.whl → 0.7.4__py3-none-any.whl