yomitoku 0.5.3__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
- yomitoku/cli/main.py +47 -1
- yomitoku/configs/__init__.py +2 -0
- yomitoku/configs/cfg_text_recognizer_parseq_small.py +51 -0
- yomitoku/document_analyzer.py +229 -26
- yomitoku/export/export_csv.py +39 -2
- yomitoku/export/export_html.py +2 -1
- yomitoku/export/export_json.py +40 -2
- yomitoku/export/export_markdown.py +2 -1
- yomitoku/layout_analyzer.py +1 -5
- yomitoku/layout_parser.py +58 -4
- yomitoku/models/layers/rtdetr_backbone.py +5 -15
- yomitoku/models/layers/rtdetr_hybrid_encoder.py +6 -18
- yomitoku/models/layers/rtdetrv2_decoder.py +17 -42
- yomitoku/models/parseq.py +9 -9
- yomitoku/ocr.py +24 -27
- yomitoku/onnx/.gitkeep +0 -0
- yomitoku/postprocessor/rtdetr_postprocessor.py +4 -13
- yomitoku/table_structure_recognizer.py +79 -9
- yomitoku/text_detector.py +57 -7
- yomitoku/text_recognizer.py +80 -16
- yomitoku/utils/misc.py +20 -13
- yomitoku/utils/visualizer.py +5 -5
- {yomitoku-0.5.3.dist-info → yomitoku-0.7.0.dist-info}/METADATA +21 -9
- {yomitoku-0.5.3.dist-info → yomitoku-0.7.0.dist-info}/RECORD +26 -24
- {yomitoku-0.5.3.dist-info → yomitoku-0.7.0.dist-info}/WHEEL +1 -1
- {yomitoku-0.5.3.dist-info → yomitoku-0.7.0.dist-info}/entry_points.txt +0 -0
yomitoku/cli/main.py
CHANGED
@@ -13,6 +13,18 @@ from ..utils.logger import set_logger
 logger = set_logger(__name__, "INFO")
 
 
+def validate_encoding(encoding):
+    if encoding not in [
+        "utf-8",
+        "utf-8-sig",
+        "shift-jis",
+        "euc-jp",
+        "cp932",
+    ]:
+        raise ValueError(f"Invalid encoding: {encoding}")
+    return True
+
+
 def process_single_file(args, analyzer, path, format):
     if path.suffix[1:].lower() in ["pdf"]:
         imgs = load_pdf(path)
@@ -21,7 +33,6 @@ def process_single_file(args, analyzer, path, format):
 
     for page, img in enumerate(imgs):
         results, ocr, layout = analyzer(img)
-
         dirname = path.parent.name
         filename = path.stem
 
@@ -47,11 +58,19 @@ def process_single_file(args, analyzer, path, format):
         results.to_json(
             out_path,
             ignore_line_break=args.ignore_line_break,
+            encoding=args.encoding,
+            img=img,
+            export_figure=args.figure,
+            figure_dir=args.figure_dir,
         )
     elif format == "csv":
         results.to_csv(
             out_path,
             ignore_line_break=args.ignore_line_break,
+            encoding=args.encoding,
+            img=img,
+            export_figure=args.figure,
+            figure_dir=args.figure_dir,
         )
     elif format == "html":
         results.to_html(
@@ -62,6 +81,7 @@ def process_single_file(args, analyzer, path, format):
             export_figure_letter=args.figure_letter,
             figure_width=args.figure_width,
             figure_dir=args.figure_dir,
+            encoding=args.encoding,
         )
     elif format == "md":
         results.to_markdown(
@@ -72,6 +92,7 @@ def process_single_file(args, analyzer, path, format):
             export_figure_letter=args.figure_letter,
             figure_width=args.figure_width,
             figure_dir=args.figure_dir,
+            encoding=args.encoding,
         )
 
     logger.info(f"Output file: {out_path}")
@@ -104,6 +125,12 @@ def main():
         default="results",
         help="output directory",
     )
+    parser.add_argument(
+        "-l",
+        "--lite",
+        action="store_true",
+        help="if set, use lite model",
+    )
     parser.add_argument(
         "-d",
         "--device",
@@ -162,6 +189,12 @@ def main():
         default="figures",
         help="directory to save figure images",
     )
+    parser.add_argument(
+        "--encoding",
+        type=str,
+        default="utf-8",
+        help="Specifies the character encoding for the output file to be exported. If unsupported characters are included, they will be ignored.",
+    )
 
     args = parser.parse_args()
 
@@ -175,6 +208,8 @@ def main():
             f"Invalid output format: {args.format}. Supported formats are {SUPPORT_OUTPUT_FORMAT}"
         )
 
+    validate_encoding(args.encoding)
+
    if format == "markdown":
        format = "md"
 
@@ -197,6 +232,17 @@ def main():
         },
     }
 
+    if args.lite:
+        configs["ocr"]["text_recognizer"]["model_name"] = "parseq-small"
+
+        if args.device == "cpu":
+            configs["ocr"]["text_detector"]["infer_onnx"] = True
+
+        # Note: for everything other than the Text Detector, PyTorch inference is faster than ONNX inference, so ONNX inference is not used
+        # configs["ocr"]["text_recognizer"]["infer_onnx"] = True
+        # configs["layout_analyzer"]["table_structure_recognizer"]["infer_onnx"] = True
+        # configs["layout_analyzer"]["layout_parser"]["infer_onnx"] = True
+
     analyzer = DocumentAnalyzer(
         configs=configs,
         visualize=args.vis,
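Taken together, these hunks add two user-facing switches: `--lite` swaps in the `parseq-small` recognizer (and, on CPU, ONNX inference for the text detector), while `--encoding` is validated once and then threaded through every exporter. A minimal programmatic sketch of the same configuration, assuming the top-level `DocumentAnalyzer` export and a hypothetical local `sample.png`:

```python
import cv2

from yomitoku import DocumentAnalyzer

# Mirror of what `--lite` on a CPU device does in main() above:
# swap the recognizer to parseq-small and run the detector via ONNX.
configs = {
    "ocr": {
        "text_detector": {"infer_onnx": True},
        "text_recognizer": {"model_name": "parseq-small"},
    },
}

analyzer = DocumentAnalyzer(configs=configs, device="cpu", visualize=False)

img = cv2.imread("sample.png")  # hypothetical input image
results, ocr_vis, layout_vis = analyzer(img)

# The exporters now accept the encoding that `--encoding` validates.
results.to_csv("result.csv", encoding="shift-jis", img=img)
```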
yomitoku/configs/__init__.py
CHANGED
@@ -4,10 +4,12 @@ from .cfg_table_structure_recognizer_rtdtrv2 import (
 )
 from .cfg_text_detector_dbnet import TextDetectorDBNetConfig
 from .cfg_text_recognizer_parseq import TextRecognizerPARSeqConfig
+from .cfg_text_recognizer_parseq_small import TextRecognizerPARSeqSmallConfig
 
 __all__ = [
     "TextDetectorDBNetConfig",
     "TextRecognizerPARSeqConfig",
     "LayoutParserRTDETRv2Config",
     "TableStructureRecognizerRTDETRv2Config",
+    "TextRecognizerPARSeqSmallConfig",
 ]
yomitoku/configs/cfg_text_recognizer_parseq_small.py
ADDED
@@ -0,0 +1,51 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from ..constants import ROOT_DIR
+
+
+@dataclass
+class Data:
+    num_workers: int = 4
+    batch_size: int = 128
+    img_size: List[int] = field(default_factory=lambda: [32, 800])
+
+
+@dataclass
+class Encoder:
+    patch_size: List[int] = field(default_factory=lambda: [16, 16])
+    num_heads: int = 8
+    embed_dim: int = 384
+    mlp_ratio: int = 4
+    depth: int = 9
+
+
+@dataclass
+class Decoder:
+    embed_dim: int = 384
+    num_heads: int = 8
+    mlp_ratio: int = 4
+    depth: int = 1
+
+
+@dataclass
+class Visualize:
+    font: str = str(ROOT_DIR + "/resource/MPLUS1p-Medium.ttf")
+    color: List[int] = field(default_factory=lambda: [0, 0, 255])  # RGB
+    font_size: int = 18
+
+
+@dataclass
+class TextRecognizerPARSeqSmallConfig:
+    hf_hub_repo: str = "KotaroKinoshita/yomitoku-text-recognizer-parseq-small-open-beta"
+    charset: str = str(ROOT_DIR + "/resource/charset.txt")
+    num_tokens: int = 7312
+    max_label_length: int = 100
+    decode_ar: int = 1
+    refine_iters: int = 1
+
+    data: Data = field(default_factory=Data)
+    encoder: Encoder = field(default_factory=Encoder)
+    decoder: Decoder = field(default_factory=Decoder)
+
+    visualize: Visualize = field(default_factory=Visualize)
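The new file defines the lite recognizer's hyperparameters: a 9-block encoder and a single-block decoder at embed dim 384, over a 7312-token charset. A quick sketch of how the nested dataclasses compose, using the `yomitoku.configs` export added in the `__init__.py` hunk above:

```python
from dataclasses import asdict

from yomitoku.configs import TextRecognizerPARSeqSmallConfig

cfg = TextRecognizerPARSeqSmallConfig()

# Nested dataclasses are built via default_factory, so each instance
# gets its own Data/Encoder/Decoder/Visualize objects.
print(cfg.encoder.depth, cfg.decoder.depth)  # 9 1
print(cfg.data.img_size)                     # [32, 800]
print(asdict(cfg)["hf_hub_repo"])
```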
yomitoku/document_analyzer.py
CHANGED
@@ -2,17 +2,26 @@ import asyncio
 from concurrent.futures import ThreadPoolExecutor
 from typing import List, Union
 
+import numpy as np
+
 from pydantic import conlist
 
 from .base import BaseSchema
 from .export import export_csv, export_html, export_markdown
 from .layout_analyzer import LayoutAnalyzer
-from .ocr import
-from .table_structure_recognizer import TableStructureRecognizerSchema
-from .utils.misc import is_contained, quad_to_xyxy
+from .ocr import OCRSchema, WordPrediction, ocr_aggregate
 from .reading_order import prediction_reading_order
-
+from .table_structure_recognizer import TableStructureRecognizerSchema
+from .utils.misc import (
+    is_contained,
+    quad_to_xyxy,
+    calc_overlap_ratio,
+)
 from .utils.visualizer import reading_order_visualizer
+from yomitoku.text_detector import TextDetector
+from yomitoku.text_recognizer import TextRecognizer
+
+from .utils.visualizer import det_visualizer
 
 
 class ParagraphSchema(BaseSchema):
@@ -98,41 +107,57 @@ def extract_words_within_element(pred_words, element):
     word_sum_width = 0
     word_sum_height = 0
     check_list = [False] * len(pred_words)
+
     for i, word in enumerate(pred_words):
         word_box = quad_to_xyxy(word.points)
         if is_contained(element.box, word_box, threshold=0.5):
-            contained_words.append(word)
             word_sum_width += word_box[2] - word_box[0]
             word_sum_height += word_box[3] - word_box[1]
             check_list[i] = True
 
+            word_element = ParagraphSchema(
+                box=word_box,
+                contents=word.content,
+                direction=word.direction,
+                order=0,
+                role=None,
+            )
+            contained_words.append(word_element)
+
     if len(contained_words) == 0:
         return None, None, check_list
 
-
-    # mean_height = word_sum_height / len(contained_words)
-
+    element_direction = "horizontal"
     word_direction = [word.direction for word in contained_words]
     cnt_horizontal = word_direction.count("horizontal")
     cnt_vertical = word_direction.count("vertical")
 
     element_direction = "horizontal" if cnt_horizontal > cnt_vertical else "vertical"
-    if element_direction == "horizontal":
-        contained_words = sorted(
-            contained_words,
-            key=lambda x: (sum([p[1] for p in x.points]) / 4),
-        )
-    else:
-        contained_words = sorted(
-            contained_words,
-            key=lambda x: (sum([p[0] for p in x.points]) / 4),
-            reverse=True,
-        )
 
-    contained_words
+    prediction_reading_order(contained_words, element_direction)
+    contained_words = sorted(contained_words, key=lambda x: x.order)
+
+    contained_words = "\n".join([content.contents for content in contained_words])
+
     return (contained_words, element_direction, check_list)
 
 
+def is_vertical(quad, thresh_aspect=2):
+    quad = np.array(quad)
+    width = np.linalg.norm(quad[0] - quad[1])
+    height = np.linalg.norm(quad[1] - quad[2])
+
+    return height > width * thresh_aspect
+
+
+def is_noise(quad, thresh=15):
+    quad = np.array(quad)
+    width = np.linalg.norm(quad[0] - quad[1])
+    height = np.linalg.norm(quad[1] - quad[2])
+
+    return width < thresh or height < thresh
+
+
 def recursive_update(original, new_data):
     for key, value in new_data.items():
         # If `value` is a dict, update it recursively
@@ -148,8 +173,163 @@ def recursive_update(original, new_data):
     return original
 
 
+def _extract_words_within_table(words, table, check_list):
+    horizontal_words = []
+    vertical_words = []
+
+    for i, (points, score) in enumerate(zip(words.points, words.scores)):
+        word_box = quad_to_xyxy(points)
+        if is_contained(table.box, word_box, threshold=0.5):
+            if is_vertical(points):
+                vertical_words.append({"points": points, "score": score})
+            else:
+                horizontal_words.append({"points": points, "score": score})
+
+            check_list[i] = True
+
+    return (horizontal_words, vertical_words, check_list)
+
+
+def _calc_overlap_words_on_lines(lines, words):
+    overlap_ratios = [[0 for _ in lines] for _ in words]
+
+    for i, word in enumerate(words):
+        word_box = quad_to_xyxy(word["points"])
+        for j, row in enumerate(lines):
+            overlap_ratio, _ = calc_overlap_ratio(
+                row.box,
+                word_box,
+            )
+            overlap_ratios[i][j] = overlap_ratio
+
+    return overlap_ratios
+
+
+def _correct_vertical_word_boxes(overlap_ratios_vertical, table, table_words_vertical):
+    allocated_cols = [cols.index(max(cols)) for cols in overlap_ratios_vertical]
+
+    new_points = []
+    new_scores = []
+    for i, col_index in enumerate(allocated_cols):
+        col_cells = []
+        for cell in table.cells:
+            if cell.col <= (col_index + 1) < (cell.col + cell.col_span):
+                col_cells.append(cell)
+
+        word_point = table_words_vertical[i]["points"]
+        word_score = table_words_vertical[i]["score"]
+
+        for cell in col_cells:
+            word_box = quad_to_xyxy(word_point)
+
+            _, intersection = calc_overlap_ratio(
+                cell.box,
+                word_box,
+            )
+
+            if intersection is not None:
+                _, y1, _, y2 = intersection
+
+                new_point = [
+                    [word_point[0][0], max(word_point[0][1], y1)],
+                    [word_point[1][0], max(word_point[1][1], y1)],
+                    [word_point[2][0], min(word_point[2][1], y2)],
+                    [word_point[3][0], min(word_point[3][1], y2)],
+                ]
+
+                if not is_noise(new_point):
+                    new_points.append(new_point)
+                    new_scores.append(word_score)
+
+    return new_points, new_scores
+
+
+def _correct_horizontal_word_boxes(
+    overlap_ratios_horizontal, table, table_words_horizontal
+):
+    allocated_rows = [rows.index(max(rows)) for rows in overlap_ratios_horizontal]
+
+    new_points = []
+    new_scores = []
+    for i, row_index in enumerate(allocated_rows):
+        row_cells = []
+        for cell in table.cells:
+            if cell.row <= (row_index + 1) < (cell.row + cell.row_span):
+                row_cells.append(cell)
+
+        word_point = table_words_horizontal[i]["points"]
+        word_score = table_words_horizontal[i]["score"]
+
+        for cell in row_cells:
+            word_box = quad_to_xyxy(word_point)
+
+            _, intersection = calc_overlap_ratio(
+                cell.box,
+                word_box,
+            )
+
+            if intersection is not None:
+                x1, _, x2, _ = intersection
+
+                new_point = [
+                    [max(word_point[0][0], x1), word_point[0][1]],
+                    [min(word_point[1][0], x2), word_point[1][1]],
+                    [min(word_point[2][0], x2), word_point[2][1]],
+                    [max(word_point[3][0], x1), word_point[3][1]],
+                ]
+
+                if not is_noise(new_point):
+                    new_points.append(new_point)
+                    new_scores.append(word_score)
+
+    return new_points, new_scores
+
+
+def _split_text_across_cells(results_det, results_layout):
+    check_list = [False] * len(results_det.points)
+    new_points = []
+    new_scores = []
+    for table in results_layout.tables:
+        table_words_horizontal, table_words_vertical, check_list = (
+            _extract_words_within_table(results_det, table, check_list)
+        )
+
+        overlap_ratios_horizontal = _calc_overlap_words_on_lines(
+            table.rows,
+            table_words_horizontal,
+        )
+
+        overlap_ratios_vertical = _calc_overlap_words_on_lines(
+            table.cols,
+            table_words_vertical,
+        )
+
+        new_points_horizontal, new_scores_horizontal = _correct_horizontal_word_boxes(
+            overlap_ratios_horizontal, table, table_words_horizontal
+        )
+
+        new_points_vertical, new_scores_vertical = _correct_vertical_word_boxes(
+            overlap_ratios_vertical, table, table_words_vertical
+        )
+
+        new_points.extend(new_points_horizontal)
+        new_scores.extend(new_scores_horizontal)
+        new_points.extend(new_points_vertical)
+        new_scores.extend(new_scores_vertical)
+
+    for i, flag in enumerate(check_list):
+        if not flag:
+            new_points.append(results_det.points[i])
+            new_scores.append(results_det.scores[i])
+
+    results_det.points = new_points
+    results_det.scores = new_scores
+
+    return results_det
+
+
 class DocumentAnalyzer:
-    def __init__(self, configs=
+    def __init__(self, configs={}, device="cuda", visualize=False):
         default_configs = {
             "ocr": {
                 "text_detector": {
@@ -180,8 +360,16 @@ class DocumentAnalyzer:
                 "configs must be a dict. See the https://kotaro-kinoshita.github.io/yomitoku-dev/usage/"
             )
 
-        self.
-
+        self.text_detector = TextDetector(
+            **default_configs["ocr"]["text_detector"],
+        )
+        self.text_recognizer = TextRecognizer(
+            **default_configs["ocr"]["text_recognizer"]
+        )
+
+        self.layout = LayoutAnalyzer(
+            configs=default_configs["layout_analyzer"],
+        )
         self.visualize = visualize
 
     def aggregate(self, ocr_res, layout_res):
@@ -286,16 +474,31 @@ class DocumentAnalyzer:
         with ThreadPoolExecutor(max_workers=2) as executor:
             loop = asyncio.get_running_loop()
             tasks = [
-                loop.run_in_executor(executor, self.ocr, img),
+                # loop.run_in_executor(executor, self.ocr, img),
+                loop.run_in_executor(executor, self.text_detector, img),
                 loop.run_in_executor(executor, self.layout, img),
             ]
 
             results = await asyncio.gather(*tasks)
 
-
+            results_det, _ = results[0]
             results_layout, layout = results[1]
 
-
+            results_det = _split_text_across_cells(results_det, results_layout)
+
+            vis_det = None
+            if self.visualize:
+                vis_det = det_visualizer(
+                    img,
+                    results_det.points,
+                )
+
+            results_rec, ocr = self.text_recognizer(img, results_det.points, vis_det)
+
+            outputs = {"words": ocr_aggregate(results_det, results_rec)}
+            results_ocr = OCRSchema(**outputs)
+            outputs = self.aggregate(results_ocr, results_layout)
+
         results = DocumentAnalyzerSchema(**outputs)
         return results, ocr, layout
 
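The analyzer pipeline changes shape here: detection and layout analysis run concurrently, detected boxes that span multiple table cells are split by `_split_text_across_cells`, and recognition runs once on the corrected boxes. To make the new geometry helpers concrete, here is a standalone trace of the `is_vertical` and `is_noise` logic on a synthetic quad (clockwise from top-left):

```python
import numpy as np

# Synthetic detected quad, clockwise from top-left: 10px wide, 40px tall.
quad = np.array([[0, 0], [10, 0], [10, 40], [0, 40]])

width = np.linalg.norm(quad[0] - quad[1])   # 10.0
height = np.linalg.norm(quad[1] - quad[2])  # 40.0

print(height > width * 2)         # True -> is_vertical: treated as vertical text
print(width < 15 or height < 15)  # True -> is_noise: a clipped box this thin is discarded
```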
yomitoku/export/export_csv.py
CHANGED
@@ -1,4 +1,6 @@
 import csv
+import cv2
+import os
 
 
 def table_to_csv(table, ignore_line_break):
@@ -33,7 +35,34 @@ def paragraph_to_csv(paragraph, ignore_line_break):
     return contents
 
 
-def export_csv(inputs, out_path: str, ignore_line_break: bool = False):
+def save_figure(
+    figures,
+    img,
+    out_path,
+    figure_dir="figures",
+):
+    for i, figure in enumerate(figures):
+        x1, y1, x2, y2 = map(int, figure.box)
+        figure_img = img[y1:y2, x1:x2, :]
+        save_dir = os.path.dirname(out_path)
+        save_dir = os.path.join(save_dir, figure_dir)
+        os.makedirs(save_dir, exist_ok=True)
+
+        filename = os.path.splitext(os.path.basename(out_path))[0]
+        figure_name = f"{filename}_figure_{i}.png"
+        figure_path = os.path.join(save_dir, figure_name)
+        cv2.imwrite(figure_path, figure_img)
+
+
+def export_csv(
+    inputs,
+    out_path: str,
+    ignore_line_break: bool = False,
+    encoding: str = "utf-8",
+    img=None,
+    export_figure: bool = True,
+    figure_dir="figures",
+):
     elements = []
     for table in inputs.tables:
         table_csv = table_to_csv(table, ignore_line_break)
@@ -58,9 +87,17 @@ def export_csv(inputs, out_path: str, ignore_line_break: bool = False):
         }
     )
 
+    if export_figure:
+        save_figure(
+            inputs.figures,
+            img,
+            out_path,
+            figure_dir=figure_dir,
+        )
+
     elements = sorted(elements, key=lambda x: x["order"])
 
-    with open(out_path, "w", newline="", encoding="utf-8") as f:
+    with open(out_path, "w", newline="", encoding=encoding, errors="ignore") as f:
         writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
         for element in elements:
             if element["type"] == "table":
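`save_figure` crops each figure's bounding box out of the page image and writes it under a `figure_dir` subdirectory next to the output file, named after that file. The path arithmetic, traced by hand:

```python
import os

out_path = "results/demo.csv"
figure_dir = "figures"

# Same steps as save_figure above: sibling directory, stem-based filename.
save_dir = os.path.join(os.path.dirname(out_path), figure_dir)
filename = os.path.splitext(os.path.basename(out_path))[0]

print(os.path.join(save_dir, f"{filename}_figure_0.png"))
# results/figures/demo_figure_0.png
```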
yomitoku/export/export_html.py
CHANGED
@@ -154,6 +154,7 @@ def export_html(
     img=None,
     figure_width=200,
     figure_dir="figures",
+    encoding: str = "utf-8",
 ):
     html_string = ""
     elements = []
@@ -184,5 +185,5 @@ def export_html(
     parsed_html = html.fromstring(html_string)
     formatted_html = etree.tostring(parsed_html, pretty_print=True, encoding="unicode")
 
-    with open(out_path, "w", encoding="utf-8") as f:
+    with open(out_path, "w", encoding=encoding, errors="ignore") as f:
         f.write(formatted_html)
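All of the export writers now pass `errors="ignore"`, matching the CLI help text: characters the chosen codec cannot represent are silently dropped instead of raising `UnicodeEncodeError`. A quick illustration:

```python
text = "カタカナ 🙂"

# shift-jis has no mapping for the emoji; with errors="ignore" it is
# dropped rather than raising UnicodeEncodeError on write.
print(text.encode("shift-jis", errors="ignore").decode("shift-jis"))
# -> "カタカナ "
```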
yomitoku/export/export_json.py
CHANGED
@@ -1,5 +1,8 @@
 import json
 
+import cv2
+import os
+
 
 def paragraph_to_json(paragraph, ignore_line_break):
     if ignore_line_break:
@@ -12,7 +15,34 @@ def table_to_json(table, ignore_line_break):
             cell.contents = cell.contents.replace("\n", "")
 
 
-def export_json(inputs, out_path, ignore_line_break=False):
+def save_figure(
+    figures,
+    img,
+    out_path,
+    figure_dir="figures",
+):
+    for i, figure in enumerate(figures):
+        x1, y1, x2, y2 = map(int, figure.box)
+        figure_img = img[y1:y2, x1:x2, :]
+        save_dir = os.path.dirname(out_path)
+        save_dir = os.path.join(save_dir, figure_dir)
+        os.makedirs(save_dir, exist_ok=True)
+
+        filename = os.path.splitext(os.path.basename(out_path))[0]
+        figure_name = f"{filename}_figure_{i}.png"
+        figure_path = os.path.join(save_dir, figure_name)
+        cv2.imwrite(figure_path, figure_img)
+
+
+def export_json(
+    inputs,
+    out_path,
+    ignore_line_break=False,
+    encoding: str = "utf-8",
+    img=None,
+    export_figure=False,
+    figure_dir="figures",
+):
     from yomitoku.document_analyzer import DocumentAnalyzerSchema
 
     if isinstance(inputs, DocumentAnalyzerSchema):
@@ -23,7 +53,15 @@ def export_json(inputs, out_path, ignore_line_break=False):
     for paragraph in inputs.paragraphs:
         paragraph_to_json(paragraph, ignore_line_break)
 
-    with open(out_path, "w", encoding="utf-8") as f:
+    if export_figure:
+        save_figure(
+            inputs.figures,
+            img,
+            out_path,
+            figure_dir=figure_dir,
+        )
+
+    with open(out_path, "w", encoding=encoding, errors="ignore") as f:
         json.dump(
             inputs.model_dump(),
             f,
yomitoku/export/export_markdown.py
CHANGED
@@ -117,6 +117,7 @@ def export_markdown(
     export_figure=True,
     figure_width=200,
     figure_dir="figures",
+    encoding: str = "utf-8",
 ):
     elements = []
     for table in inputs.tables:
@@ -141,5 +142,5 @@ def export_markdown(
     elements = sorted(elements, key=lambda x: x["order"])
     markdown = "\n".join([element["md"] for element in elements])
 
-    with open(out_path, "w", encoding="utf-8") as f:
+    with open(out_path, "w", encoding=encoding, errors="ignore") as f:
         f.write(markdown)
yomitoku/layout_analyzer.py
CHANGED
@@ -15,7 +15,7 @@ class LayoutAnalyzerSchema(BaseSchema):
 
 
 class LayoutAnalyzer:
-    def __init__(self, configs=
+    def __init__(self, configs={}, device="cuda", visualize=False):
         layout_parser_kwargs = {
             "device": device,
             "visualize": visualize,
@@ -26,10 +26,6 @@ class LayoutAnalyzer:
         }
 
         if isinstance(configs, dict):
-            assert (
-                "layout_parser" in configs or "table_structure_recognizer" in configs
-            ), "Invalid config key. Please check the config keys."
-
             if "layout_parser" in configs:
                 layout_parser_kwargs.update(configs["layout_parser"])
 
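Dropping the assert has one behavioral consequence: an empty config dict, which is now also the default, is accepted without complaint. A minimal sketch (constructing the analyzer will still fetch or load its default model weights):

```python
from yomitoku.layout_analyzer import LayoutAnalyzer

# Before 0.7.0 this asserted unless "layout_parser" or
# "table_structure_recognizer" was present in configs.
analyzer = LayoutAnalyzer(configs={}, device="cpu", visualize=False)
```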