yomitoku 0.4.0.post1.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yomitoku/__init__.py +20 -0
- yomitoku/base.py +136 -0
- yomitoku/cli/__init__.py +0 -0
- yomitoku/cli/main.py +230 -0
- yomitoku/configs/__init__.py +13 -0
- yomitoku/configs/cfg_layout_parser_rtdtrv2.py +89 -0
- yomitoku/configs/cfg_table_structure_recognizer_rtdtrv2.py +80 -0
- yomitoku/configs/cfg_text_detector_dbnet.py +49 -0
- yomitoku/configs/cfg_text_recognizer_parseq.py +51 -0
- yomitoku/constants.py +32 -0
- yomitoku/data/__init__.py +3 -0
- yomitoku/data/dataset.py +40 -0
- yomitoku/data/functions.py +279 -0
- yomitoku/document_analyzer.py +315 -0
- yomitoku/export/__init__.py +6 -0
- yomitoku/export/export_csv.py +71 -0
- yomitoku/export/export_html.py +188 -0
- yomitoku/export/export_json.py +34 -0
- yomitoku/export/export_markdown.py +145 -0
- yomitoku/layout_analyzer.py +66 -0
- yomitoku/layout_parser.py +189 -0
- yomitoku/models/__init__.py +9 -0
- yomitoku/models/dbnet_plus.py +272 -0
- yomitoku/models/layers/__init__.py +0 -0
- yomitoku/models/layers/activate.py +38 -0
- yomitoku/models/layers/dbnet_feature_attention.py +160 -0
- yomitoku/models/layers/parseq_transformer.py +218 -0
- yomitoku/models/layers/rtdetr_backbone.py +333 -0
- yomitoku/models/layers/rtdetr_hybrid_encoder.py +433 -0
- yomitoku/models/layers/rtdetrv2_decoder.py +811 -0
- yomitoku/models/parseq.py +243 -0
- yomitoku/models/rtdetr.py +22 -0
- yomitoku/ocr.py +87 -0
- yomitoku/postprocessor/__init__.py +9 -0
- yomitoku/postprocessor/dbnet_postporcessor.py +137 -0
- yomitoku/postprocessor/parseq_tokenizer.py +128 -0
- yomitoku/postprocessor/rtdetr_postprocessor.py +107 -0
- yomitoku/reading_order.py +214 -0
- yomitoku/resource/MPLUS1p-Medium.ttf +0 -0
- yomitoku/resource/charset.txt +1 -0
- yomitoku/table_structure_recognizer.py +244 -0
- yomitoku/text_detector.py +103 -0
- yomitoku/text_recognizer.py +128 -0
- yomitoku/utils/__init__.py +0 -0
- yomitoku/utils/graph.py +20 -0
- yomitoku/utils/logger.py +15 -0
- yomitoku/utils/misc.py +102 -0
- yomitoku/utils/visualizer.py +179 -0
- yomitoku-0.4.0.post1.dev0.dist-info/METADATA +127 -0
- yomitoku-0.4.0.post1.dev0.dist-info/RECORD +52 -0
- yomitoku-0.4.0.post1.dev0.dist-info/WHEEL +4 -0
- yomitoku-0.4.0.post1.dev0.dist-info/entry_points.txt +2 -0
yomitoku/data/dataset.py
ADDED
@@ -0,0 +1,40 @@
from torch.utils.data import Dataset
from torchvision import transforms as T

from .functions import (
    extract_roi_with_perspective,
    resize_with_padding,
    rotate_text_image,
    validate_quads,
)


class ParseqDataset(Dataset):
    def __init__(self, cfg, img, quads):
        self.img = img[:, :, ::-1]
        self.quads = quads
        self.cfg = cfg
        self.img = img
        self.transform = T.Compose(
            [
                T.ToTensor(),
                T.Normalize(0.5, 0.5),
            ]
        )

        validate_quads(self.img, self.quads)

    def __len__(self):
        return len(self.quads)

    def __getitem__(self, index):
        polygon = self.quads[index]
        roi_img = extract_roi_with_perspective(self.img, polygon)
        if roi_img is None:
            return

        roi_img = rotate_text_image(roi_img, thresh_aspect=2)
        resized = resize_with_padding(roi_img, self.cfg.data.img_size)
        tensor = self.transform(resized)

        return tensor
yomitoku/data/functions.py
ADDED
@@ -0,0 +1,279 @@
from pathlib import Path

import cv2
import numpy as np
import torch
from pdf2image import convert_from_path

from ..constants import (
    MIN_IMAGE_SIZE,
    SUPPORT_INPUT_FORMAT,
    WARNING_IMAGE_SIZE,
)
from ..utils.logger import set_logger

logger = set_logger(__name__)


def load_image(image_path: str) -> np.ndarray:
    """
    Open an image file.

    Args:
        image_path (str): path to the image file

    Returns:
        np.ndarray: image data(BGR)
    """
    image_path = Path(image_path)
    if not image_path.exists():
        raise FileNotFoundError(f"File not found: {image_path}")

    ext = image_path.suffix[1:].lower()
    if ext not in SUPPORT_INPUT_FORMAT:
        raise ValueError(
            f"Unsupported image format. Supported formats are {SUPPORT_INPUT_FORMAT}"
        )

    if ext == "pdf":
        raise ValueError(
            "PDF file is not supported by load_image(). Use load_pdf() instead."
        )

    img = cv2.imread(image_path, cv2.IMREAD_COLOR)

    if img is None:
        raise ValueError("Invalid image data.")

    h, w = img.shape[:2]
    if h < MIN_IMAGE_SIZE or w < MIN_IMAGE_SIZE:
        raise ValueError("Image size is too small.")

    if min(h, w) < WARNING_IMAGE_SIZE:
        logger.warning(
            """
            The image size is small, which may result in reduced OCR accuracy.
            The process will continue, but it is recommended to input images with a minimum size of 720 pixels on the shorter side.
            """
        )

    return img


def load_pdf(pdf_path: str, dpi=200) -> list[np.ndarray]:
    """
    Open a PDF file.

    Args:
        pdf_path (str): path to the PDF file

    Returns:
        list[np.ndarray]: list of image data(BGR)
    """
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"File not found: {pdf_path}")

    ext = pdf_path.suffix[1:].lower()
    if ext not in SUPPORT_INPUT_FORMAT:
        raise ValueError(
            f"Unsupported image format. Supported formats are {SUPPORT_INPUT_FORMAT}"
        )

    if ext != "pdf":
        raise ValueError(
            "image file is not supported by load_pdf(). Use load_image() instead."
        )

    try:
        images = convert_from_path(pdf_path, dpi=dpi)
    except Exception as e:
        raise ValueError(f"Failed to open the PDF file: {pdf_path}") from e

    return [np.array(img)[:, :, ::-1] for img in images]


def resize_shortest_edge(
    img: np.ndarray, shortest_edge_length: int, max_length: int
) -> np.ndarray:
    """
    Resize the shortest edge of the image to `shortest_edge_length` while keeping the aspect ratio.
    if the longest edge is longer than `max_length`, resize the longest edge to `max_length` while keeping the aspect ratio.

    Args:
        img (np.ndarray): target image
        shortest_edge_length (int): pixel length of the shortest edge after resizing
        max_length (int): pixel length of maximum edge after resizing

    Returns:
        np.ndarray: resized image
    """

    h, w = img.shape[:2]
    scale = shortest_edge_length / min(h, w)
    if h < w:
        new_h, new_w = shortest_edge_length, int(w * scale)
    else:
        new_h, new_w = int(h * scale), shortest_edge_length

    if max(new_h, new_w) > max_length:
        scale = float(max_length) / max(new_h, new_w)
        new_h, new_w = int(new_h * scale), int(new_w * scale)

    neww = max(int(new_w / 32) * 32, 32)
    newh = max(int(new_h / 32) * 32, 32)

    img = cv2.resize(img, (neww, newh))
    return img


def standardization_image(
    img: np.ndarray, rgb=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)
) -> np.ndarray:
    """
    Normalize the image data.

    Args:
        img (np.ndarray): target image

    Returns:
        np.ndarray: normalized image
    """
    img = img[:, :, ::-1]
    img = img / 255.0
    img = (img - np.array(rgb)) / np.array(std)
    img = img.astype(np.float32)

    return img


def array_to_tensor(img: np.ndarray) -> torch.Tensor:
    """
    Convert the image data to tensor.
    (H, W, C) -> (N, C, H, W)

    Args:
        img (np.ndarray): target image(H, W, C)

    Returns:
        torch.Tensor: (N, C, H, W) tensor
    """
    img = np.transpose(img, (2, 0, 1))
    tensor = torch.as_tensor(img, dtype=torch.float)
    tensor = tensor[None, :, :, :]
    return tensor


def validate_quads(img: np.ndarray, quads: list[list[list[int]]]):
    """
    Validate the vertices of the quadrilateral.

    Args:
        img (np.ndarray): target image
        quads (list[list[list[int]]]): list of quadrilateral

    Raises:
        ValueError: if the vertices are invalid
    """

    h, w = img.shape[:2]
    for quad in quads:
        if len(quad) != 4:
            raise ValueError("The number of vertices must be 4.")

        for point in quad:
            if len(point) != 2:
                raise ValueError("The number of coordinates must be 2.")

        quad = np.array(quad, dtype=int)
        x1 = np.min(quad[:, 0])
        x2 = np.max(quad[:, 0])
        y1 = np.min(quad[:, 1])
        y2 = np.max(quad[:, 1])
        h, w = img.shape[:2]

        if x1 < 0 or x2 > w or y1 < 0 or y2 > h:
            raise ValueError(
                f"The vertices are out of the image. {quad.tolist()}"
            )

    return True


def extract_roi_with_perspective(img, quad):
    """
    Extract the word image from the image with perspective transformation.

    Args:
        img (np.ndarray): target image
        polygon (np.ndarray): polygon vertices

    Returns:
        np.ndarray: extracted image
    """
    dst = img.copy()
    quad = np.array(quad, dtype=np.float32)
    width = np.linalg.norm(quad[0] - quad[1])
    height = np.linalg.norm(quad[1] - quad[2])

    width = int(width)
    height = int(height)

    pts1 = np.float32(quad)
    pts2 = np.float32([[0, 0], [width, 0], [width, height], [0, height]])

    M = cv2.getPerspectiveTransform(pts1, pts2)
    dst = cv2.warpPerspective(dst, M, (width, height))

    return dst


def rotate_text_image(img, thresh_aspect=2):
    """
    Rotate the image if the aspect ratio is too high.

    Args:
        img (np.ndarray): target image
        thresh_aspect (int): threshold of aspect ratio

    Returns:
        np.ndarray: rotated image
    """
    h, w = img.shape[:2]
    if h > thresh_aspect * w:
        img = cv2.rotate(img, cv2.ROTATE_90_COUNTERCLOCKWISE)
    return img


def resize_with_padding(img, target_size, background_color=(0, 0, 0)):
    """
    Resize the image with padding.

    Args:
        img (np.ndarray): target image
        target_size (int, int): target size
        background_color (Tuple[int, int, int]): background color

    Returns:
        np.ndarray: resized image
    """
    h, w = img.shape[:2]
    scale_w = 1.0
    scale_h = 1.0
    if w > target_size[1]:
        scale_w = target_size[1] / w
    if h > target_size[0]:
        scale_h = target_size[0] / h

    new_w = int(w * min(scale_w, scale_h))
    new_h = int(h * min(scale_w, scale_h))

    resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)

    canvas = np.zeros((target_size[0], target_size[1], 3), dtype=np.uint8)
    canvas[:, :] = background_color

    resized_size = resized.shape[:2]
    canvas[: resized_size[0], : resized_size[1], :] = resized

    return canvas
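
For illustration only (not part of the package files), the helpers above compose into a typical detector-style preprocessing pipeline. The file name and the 1024/1280 edge lengths below are placeholder values.

from yomitoku.data.functions import (
    array_to_tensor,
    load_image,
    resize_shortest_edge,
    standardization_image,
)

img = load_image("sample.jpg")                   # placeholder path; returns a validated BGR image
resized = resize_shortest_edge(img, 1024, 1280)  # edges snapped down to multiples of 32
normalized = standardization_image(resized)      # BGR -> RGB, scaled to [0, 1], mean/std standardized
tensor = array_to_tensor(normalized)             # (1, 3, H, W) float tensor
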
yomitoku/document_analyzer.py
ADDED
@@ -0,0 +1,315 @@
import asyncio
from concurrent.futures import ThreadPoolExecutor
from typing import List, Union

from pydantic import conlist

from .base import BaseSchema
from .export import export_csv, export_html, export_markdown
from .layout_analyzer import LayoutAnalyzer
from .ocr import OCR, WordPrediction
from .table_structure_recognizer import TableStructureRecognizerSchema
from .utils.misc import is_contained, quad_to_xyxy
from .reading_order import prediction_reading_order

from .utils.visualizer import reading_order_visualizer


class ParagraphSchema(BaseSchema):
    box: conlist(int, min_length=4, max_length=4)
    contents: Union[str, None]
    direction: Union[str, None]
    order: Union[int, None]
    role: Union[str, None]


class FigureSchema(BaseSchema):
    box: conlist(int, min_length=4, max_length=4)
    order: Union[int, None]
    paragraphs: List[ParagraphSchema]
    order: Union[int, None]
    direction: Union[str, None]


class DocumentAnalyzerSchema(BaseSchema):
    paragraphs: List[ParagraphSchema]
    tables: List[TableStructureRecognizerSchema]
    words: List[WordPrediction]
    figures: List[FigureSchema]

    def to_html(self, out_path: str, **kwargs):
        export_html(self, out_path, **kwargs)

    def to_markdown(self, out_path: str, **kwargs):
        export_markdown(self, out_path, **kwargs)

    def to_csv(self, out_path: str, **kwargs):
        export_csv(self, out_path, **kwargs)


def combine_flags(flag1, flag2):
    return [f1 or f2 for f1, f2 in zip(flag1, flag2)]


def judge_page_direction(paragraphs):
    h_sum_area = 0
    v_sum_area = 0

    for paragraph in paragraphs:
        x1, y1, x2, y2 = paragraph.box
        w = x2 - x1
        h = y2 - y1

        if paragraph.direction == "horizontal":
            h_sum_area += w * h
        else:
            v_sum_area += w * h

    if v_sum_area > h_sum_area:
        return "vertical"

    return "horizontal"


def extract_paragraph_within_figure(paragraphs, figures):
    new_figures = []
    check_list = [False] * len(paragraphs)
    for figure in figures:
        figure = {"box": figure.box, "order": 0}
        contained_paragraphs = []
        for i, paragraph in enumerate(paragraphs):
            if is_contained(figure["box"], paragraph.box, threshold=0.7):
                contained_paragraphs.append(paragraph)
                check_list[i] = True

        figure["direction"] = judge_page_direction(contained_paragraphs)
        figure_paragraphs = prediction_reading_order(
            contained_paragraphs, figure["direction"]
        )
        figure["paragraphs"] = sorted(figure_paragraphs, key=lambda x: x.order)
        figure = FigureSchema(**figure)
        new_figures.append(figure)

    return new_figures, check_list


def extract_words_within_element(pred_words, element):
    contained_words = []
    word_sum_width = 0
    word_sum_height = 0
    check_list = [False] * len(pred_words)
    for i, word in enumerate(pred_words):
        word_box = quad_to_xyxy(word.points)
        if is_contained(element.box, word_box, threshold=0.5):
            contained_words.append(word)
            word_sum_width += word_box[2] - word_box[0]
            word_sum_height += word_box[3] - word_box[1]
            check_list[i] = True

    if len(contained_words) == 0:
        return None, None, check_list

    mean_width = word_sum_width / len(contained_words)
    mean_height = word_sum_height / len(contained_words)

    word_direction = [word.direction for word in contained_words]
    cnt_horizontal = word_direction.count("horizontal")
    cnt_vertical = word_direction.count("vertical")

    element_direction = "horizontal" if cnt_horizontal > cnt_vertical else "vertical"
    if element_direction == "horizontal":
        contained_words = sorted(
            contained_words,
            key=lambda x: (
                x.points[0][1] // int(mean_height),
                x.points[0][0],
            ),
        )
    else:
        contained_words = sorted(
            contained_words,
            key=lambda x: (
                x.points[1][0] // int(mean_width),
                x.points[1][1],
            ),
            reverse=True,
        )

    contained_words = "\n".join([content.content for content in contained_words])
    return (contained_words, element_direction, check_list)


def recursive_update(original, new_data):
    for key, value in new_data.items():
        # If `value` is a dict, update the corresponding entry recursively
        if (
            isinstance(value, dict)
            and key in original
            and isinstance(original[key], dict)
        ):
            recursive_update(original[key], value)
        # Otherwise, or if the key does not exist, overwrite the value
        else:
            original[key] = value
    return original


class DocumentAnalyzer:
    def __init__(self, configs=None, device="cuda", visualize=False):
        default_configs = {
            "ocr": {
                "text_detector": {
                    "device": device,
                    "visualize": visualize,
                },
                "text_recognizer": {
                    "device": device,
                    "visualize": visualize,
                },
            },
            "layout_analyzer": {
                "layout_parser": {
                    "device": device,
                    "visualize": visualize,
                },
                "table_structure_recognizer": {
                    "device": device,
                    "visualize": visualize,
                },
            },
        }

        if isinstance(configs, dict):
            recursive_update(default_configs, configs)
        else:
            raise ValueError(
                "configs must be a dict. See the https://kotaro-kinoshita.github.io/yomitoku-dev/usage/"
            )

        self.ocr = OCR(configs=default_configs["ocr"])
        self.layout = LayoutAnalyzer(configs=default_configs["layout_analyzer"])
        self.visualize = visualize

    def aggregate(self, ocr_res, layout_res):
        paragraphs = []
        check_list = [False] * len(ocr_res.words)
        for table in layout_res.tables:
            for cell in table.cells:
                words, direction, flags = extract_words_within_element(
                    ocr_res.words, cell
                )

                if words is None:
                    words = ""

                cell.contents = words
                check_list = combine_flags(check_list, flags)

        for paragraph in layout_res.paragraphs:
            words, direction, flags = extract_words_within_element(
                ocr_res.words, paragraph
            )

            if words is None:
                continue

            paragraph = {
                "contents": words,
                "box": paragraph.box,
                "direction": direction,
                "order": 0,
                "role": paragraph.role,
            }

            check_list = combine_flags(check_list, flags)
            paragraph = ParagraphSchema(**paragraph)
            paragraphs.append(paragraph)

        for i, word in enumerate(ocr_res.words):
            direction = word.direction
            if not check_list[i]:
                paragraph = {
                    "contents": word.content,
                    "box": quad_to_xyxy(word.points),
                    "direction": direction,
                    "order": 0,
                    "role": None,
                }

                paragraph = ParagraphSchema(**paragraph)
                paragraphs.append(paragraph)

        figures, check_list = extract_paragraph_within_figure(
            paragraphs, layout_res.figures
        )

        paragraphs = [
            paragraph for paragraph, flag in zip(paragraphs, check_list) if not flag
        ]

        page_direction = judge_page_direction(paragraphs)

        headers = [
            paragraph for paragraph in paragraphs if paragraph.role == "page_header"
        ]

        footers = [
            paragraph for paragraph in paragraphs if paragraph.role == "page_footer"
        ]

        page_contents = [
            paragraph
            for paragraph in paragraphs
            if paragraph.role is None or paragraph.role == "section_headings"
        ]

        elements = page_contents + layout_res.tables + figures

        prediction_reading_order(headers, page_direction)
        prediction_reading_order(footers, page_direction)
        prediction_reading_order(elements, page_direction, self.img)

        for i, element in enumerate(elements):
            element.order += len(headers)
        for i, footer in enumerate(footers):
            footer.order += len(elements) + len(headers)

        paragraphs = headers + page_contents + footers
        paragraphs = sorted(paragraphs, key=lambda x: x.order)
        figures = sorted(figures, key=lambda x: x.order)
        tables = sorted(layout_res.tables, key=lambda x: x.order)

        outputs = {
            "paragraphs": paragraphs,
            "tables": tables,
            "figures": figures,
            "words": ocr_res.words,
        }

        return outputs

    async def run(self, img):
        with ThreadPoolExecutor(max_workers=2) as executor:
            loop = asyncio.get_running_loop()
            tasks = [
                loop.run_in_executor(executor, self.ocr, img),
                loop.run_in_executor(executor, self.layout, img),
            ]

            results = await asyncio.gather(*tasks)

            results_ocr, ocr = results[0]
            results_layout, layout = results[1]

        outputs = self.aggregate(results_ocr, results_layout)
        results = DocumentAnalyzerSchema(**outputs)
        return results, ocr, layout

    def __call__(self, img):
        self.img = img
        results, ocr, layout = asyncio.run(self.run(img))

        if self.visualize:
            layout = reading_order_visualizer(layout, results)

        return results, ocr, layout
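
As a hedged end-to-end sketch (not contained in the wheel), DocumentAnalyzer might be driven as follows. The input path and output file name are placeholders, configs must be a dict (an empty one keeps the defaults), device can be set to "cpu" when CUDA is unavailable, and the two extra return values are assumed to be the OCR and layout visualization outputs.

from yomitoku.data.functions import load_image
from yomitoku.document_analyzer import DocumentAnalyzer

analyzer = DocumentAnalyzer(configs={}, device="cpu", visualize=False)

img = load_image("input.jpg")                  # placeholder path
results, ocr_vis, layout_vis = analyzer(img)   # DocumentAnalyzerSchema plus visualization outputs

results.to_html("output.html")                 # or to_markdown() / to_csv()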