PyPI - yomitoku - Versions diffs - 0.7.3__tar.gz → 0.7.4__tar.gz - Mend

yomitoku 0.7.3__tar.gz → 0.7.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (170) hide show

{yomitoku-0.7.3 → yomitoku-0.7.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: yomitoku
-Version: 0.7.3
+Version: 0.7.4
 Summary: Yomitoku is an AI-powered document image analysis package designed specifically for the Japanese language.
 Author-email: Kotaro Kinoshita <kotaro.kinoshita@mlism.com>
 License: CC BY-NC-SA 4.0

{yomitoku-0.7.3 → yomitoku-0.7.4}/configs/yomitoku-table-structure-recognizer-rtdtrv2-open-beta.yaml RENAMED Viewed

@@ -64,4 +64,4 @@ RTDETRTransformerv2:
 category:
 - row
 - col
-- span
+- span

{yomitoku-0.7.3 → yomitoku-0.7.4}/configs/yomitoku-text-detector-dbnet-open-beta.yaml RENAMED Viewed

@@ -27,4 +27,4 @@ visualize:
   - 0
   - 255
   - 0
-  heatmap: false
+  heatmap: false

{yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/cli/main.py RENAMED Viewed

@@ -12,6 +12,7 @@ from ..document_analyzer import DocumentAnalyzer
 from ..utils.logger import set_logger
 from ..export import save_csv, save_html, save_json, save_markdown
+from ..export import convert_json, convert_csv, convert_html, convert_markdown
 logger = set_logger(__name__, "INFO")
@@ -51,13 +52,13 @@ def merge_all_pages(results):
 def save_merged_file(out_path, args, out):
     if args.format == "json":
-        save_json(out_path, args.encoding, out)
+        save_json(out, out_path, args.encoding)
     elif args.format == "csv":
-        save_csv(out_path, args.encoding, out)
+        save_csv(out, out_path, args.encoding)
     elif args.format == "html":
-        save_html(out_path, args.encoding, out)
+        save_html(out, out_path, args.encoding)
     elif args.format == "md":
-        save_markdown(out_path, args.encoding, out)
+        save_markdown(out, out_path, args.encoding)
 def validate_encoding(encoding):
@@ -76,7 +77,7 @@ def process_single_file(args, analyzer, path, format):
     if path.suffix[1:].lower() in ["pdf"]:
         imgs = load_pdf(path)
     else:
-        imgs = [load_image(path)]
+        imgs = load_image(path)
     results = []
     for page, img in enumerate(imgs):
@@ -84,6 +85,10 @@ def process_single_file(args, analyzer, path, format):
         dirname = path.parent.name
         filename = path.stem
+        # cv2.imwrite(
+        #    os.path.join(args.outdir, f"{dirname}_{filename}_p{page+1}.jpg"), img
+        # )
         if ocr is not None:
             out_path = os.path.join(
                 args.outdir, f"{dirname}_{filename}_p{page+1}_ocr.jpg"
@@ -103,34 +108,51 @@ def process_single_file(args, analyzer, path, format):
         out_path = os.path.join(args.outdir, f"{dirname}_{filename}_p{page+1}.{format}")
         if format == "json":
-            json = result.to_json(
-                out_path,
-                ignore_line_break=args.ignore_line_break,
-                encoding=args.encoding,
-                img=img,
-                export_figure=args.figure,
-                figure_dir=args.figure_dir,
-            )
+            if args.combine:
+                json = convert_json(
+                    result,
+                    out_path,
+                    args.ignore_line_break,
+                    img,
+                    args.figure,
+                    args.figure_dir,
+                )
+            else:
+                json = result.to_json(
+                    out_path,
+                    ignore_line_break=args.ignore_line_break,
+                    encoding=args.encoding,
+                    img=img,
+                    export_figure=args.figure,
+                    figure_dir=args.figure_dir,
+                )
             results.append(
                 {
                     "format": format,
-                    "data": json,
+                    "data": json.model_dump(),
                 }
             )
-            if not args.combine:
-                save_json(out_path, args.encoding, json)
         elif format == "csv":
-            csv = result.to_csv(
-                out_path,
-                ignore_line_break=args.ignore_line_break,
-                encoding=args.encoding,
-                img=img,
-                export_figure=args.figure,
-                figure_dir=args.figure_dir,
-            )
+            if args.combine:
+                csv = convert_csv(
+                    result,
+                    out_path,
+                    args.ignore_line_break,
+                    img,
+                    args.figure,
+                    args.figure_dir,
+                )
+            else:
+                csv = result.to_csv(
+                    out_path,
+                    ignore_line_break=args.ignore_line_break,
+                    encoding=args.encoding,
+                    img=img,
+                    export_figure=args.figure,
+                    figure_dir=args.figure_dir,
+                )
             results.append(
                 {
@@ -139,20 +161,29 @@ def process_single_file(args, analyzer, path, format):
                 }
             )
-            if not args.combine:
-                save_csv(out_path, args.encoding, csv)
         elif format == "html":
-            html = result.to_html(
-                out_path,
-                ignore_line_break=args.ignore_line_break,
-                img=img,
-                export_figure=args.figure,
-                export_figure_letter=args.figure_letter,
-                figure_width=args.figure_width,
-                figure_dir=args.figure_dir,
-                encoding=args.encoding,
-            )
+            if args.combine:
+                html, _ = convert_html(
+                    result,
+                    out_path,
+                    ignore_line_break=args.ignore_line_break,
+                    img=img,
+                    export_figure=args.figure,
+                    export_figure_letter=args.figure_letter,
+                    figure_width=args.figure_width,
+                    figure_dir=args.figure_dir,
+                )
+            else:
+                html = result.to_html(
+                    out_path,
+                    ignore_line_break=args.ignore_line_break,
+                    img=img,
+                    export_figure=args.figure,
+                    export_figure_letter=args.figure_letter,
+                    figure_width=args.figure_width,
+                    figure_dir=args.figure_dir,
+                    encoding=args.encoding,
+                )
             results.append(
                 {
@@ -161,20 +192,29 @@ def process_single_file(args, analyzer, path, format):
                 }
             )
-            if not args.combine:
-                save_html(out_path, args.encoding, html)
         elif format == "md":
-            md = result.to_markdown(
-                out_path,
-                ignore_line_break=args.ignore_line_break,
-                img=img,
-                export_figure=args.figure,
-                export_figure_letter=args.figure_letter,
-                figure_width=args.figure_width,
-                figure_dir=args.figure_dir,
-                encoding=args.encoding,
-            )
+            if args.combine:
+                md, _ = convert_markdown(
+                    result,
+                    out_path,
+                    ignore_line_break=args.ignore_line_break,
+                    img=img,
+                    export_figure=args.figure,
+                    export_figure_letter=args.figure_letter,
+                    figure_width=args.figure_width,
+                    figure_dir=args.figure_dir,
+                )
+            else:
+                md = result.to_markdown(
+                    out_path,
+                    ignore_line_break=args.ignore_line_break,
+                    img=img,
+                    export_figure=args.figure,
+                    export_figure_letter=args.figure_letter,
+                    figure_width=args.figure_width,
+                    figure_dir=args.figure_dir,
+                    encoding=args.encoding,
+                )
             results.append(
                 {
@@ -183,9 +223,6 @@ def process_single_file(args, analyzer, path, format):
                 }
             )
-            if not args.combine:
-                save_markdown(out_path, args.encoding, md)
     out = merge_all_pages(results)
     if args.combine:
         out_path = os.path.join(args.outdir, f"{dirname}_{filename}.{format}")

{yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/data/functions.py RENAMED Viewed

@@ -1,6 +1,7 @@
 from pathlib import Path
 import cv2
+from PIL import Image
 import numpy as np
 import torch
 import pypdfium2
@@ -15,6 +16,20 @@ from ..utils.logger import set_logger
 logger = set_logger(__name__)
+def validate_image(img: np.ndarray):
+    h, w = img.shape[:2]
+    if h < MIN_IMAGE_SIZE or w < MIN_IMAGE_SIZE:
+        raise ValueError("Image size is too small.")
+    if min(h, w) < WARNING_IMAGE_SIZE:
+        logger.warning(
+            """
+            The image size is small, which may result in reduced OCR accuracy.
+            The process will continue, but it is recommended to input images with a minimum size of 720 pixels on the shorter side.
+            """
+        )
 def load_image(image_path: str) -> np.ndarray:
     """
     Open an image file.
@@ -40,24 +55,27 @@ def load_image(image_path: str) -> np.ndarray:
             "PDF file is not supported by load_image(). Use load_pdf() instead."
         )
-    img = cv2.imread(image_path, cv2.IMREAD_COLOR)
-    if img is None:
+    try:
+        img = Image.open(image_path)
+    except Exception:
         raise ValueError("Invalid image data.")
-    h, w = img.shape[:2]
-    if h < MIN_IMAGE_SIZE or w < MIN_IMAGE_SIZE:
-        raise ValueError("Image size is too small.")
-    if min(h, w) < WARNING_IMAGE_SIZE:
-        logger.warning(
-            """
-            The image size is small, which may result in reduced OCR accuracy.
-            The process will continue, but it is recommended to input images with a minimum size of 720 pixels on the shorter side.
-            """
-        )
+    pages = []
+    if ext in ["tif", "tiff"]:
+        try:
+            while True:
+                img_arr = np.array(img.copy().convert("RGB"))
+                validate_image(img_arr)
+                pages.append(img_arr[:, :, ::-1])
+                img.seek(img.tell() + 1)
+        except EOFError:
+            pass
+    else:
+        img_arr = np.array(img.convert("RGB"))
+        validate_image(img_arr)
+        pages.append(img_arr[:, :, ::-1])
-    return img
+    return pages
 def load_pdf(pdf_path: str, dpi=200) -> list[np.ndarray]:

yomitoku-0.7.4/src/yomitoku/export/__init__.py ADDED Viewed

@@ -0,0 +1,19 @@
+from .export_csv import export_csv, save_csv, convert_csv
+from .export_html import export_html, save_html, convert_html
+from .export_json import export_json, save_json, convert_json
+from .export_markdown import export_markdown, save_markdown, convert_markdown
+__all__ = [
+    "export_html",
+    "export_markdown",
+    "export_csv",
+    "export_json",
+    "save_html",
+    "save_markdown",
+    "save_csv",
+    "save_json",
+    "convert_html",
+    "convert_markdown",
+    "convert_csv",
+    "convert_json",
+]

{yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/export/export_csv.py RENAMED Viewed

@@ -57,11 +57,10 @@ def save_figure(
         cv2.imwrite(figure_path, figure_img)
-def export_csv(
+def convert_csv(
     inputs,
-    out_path: str,
-    ignore_line_break: bool = False,
-    encoding: str = "utf-8",
+    out_path,
+    ignore_line_break,
     img=None,
     export_figure: bool = True,
     figure_dir="figures",
@@ -90,6 +89,8 @@ def export_csv(
             }
         )
+    elements = sorted(elements, key=lambda x: x["order"])
     if export_figure:
         save_figure(
             inputs.figures,
@@ -98,11 +99,36 @@ def export_csv(
             figure_dir=figure_dir,
         )
-    elements = sorted(elements, key=lambda x: x["order"])
     return elements
-def save_csv(out_path, encoding, elements):
+def export_csv(
+    inputs,
+    out_path: str,
+    ignore_line_break: bool = False,
+    encoding: str = "utf-8",
+    img=None,
+    export_figure: bool = True,
+    figure_dir="figures",
+):
+    elements = convert_csv(
+        inputs,
+        out_path,
+        ignore_line_break,
+        img,
+        export_figure,
+        figure_dir,
+    )
+    save_csv(elements, out_path, encoding)
+    return elements
+def save_csv(
+    elements,
+    out_path,
+    encoding,
+):
     with open(out_path, "w", newline="", encoding=encoding, errors="ignore") as f:
         writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
         for element in elements:

{yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/export/export_html.py RENAMED Viewed

@@ -146,16 +146,15 @@ def figure_to_html(
     return elements
-def export_html(
+def convert_html(
     inputs,
-    out_path: str,
-    ignore_line_break: bool = False,
-    export_figure: bool = True,
-    export_figure_letter: bool = False,
+    out_path,
+    ignore_line_break,
+    export_figure,
+    export_figure_letter,
     img=None,
     figure_width=200,
     figure_dir="figures",
-    encoding: str = "utf-8",
 ):
     html_string = ""
     elements = []
@@ -181,13 +180,43 @@ def export_html(
     elements = sorted(elements, key=lambda x: x["order"])
     html_string = "".join([element["html"] for element in elements])
-    # html_string = add_html_tag(html_string)
     parsed_html = html.fromstring(html_string)
     formatted_html = etree.tostring(parsed_html, pretty_print=True, encoding="unicode")
+    return formatted_html, elements
+def export_html(
+    inputs,
+    out_path: str,
+    ignore_line_break: bool = False,
+    export_figure: bool = True,
+    export_figure_letter: bool = False,
+    img=None,
+    figure_width=200,
+    figure_dir="figures",
+    encoding: str = "utf-8",
+):
+    formatted_html, elements = convert_html(
+        inputs,
+        out_path,
+        ignore_line_break,
+        export_figure,
+        export_figure_letter,
+        img,
+        figure_width,
+        figure_dir,
+    )
+    save_html(formatted_html, out_path, encoding)
     return formatted_html
-def save_html(out_path, encoding, html):
+def save_html(
+    html,
+    out_path,
+    encoding,
+):
     with open(out_path, "w", encoding=encoding, errors="ignore") as f:
         f.write(html)

{yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/export/export_json.py RENAMED Viewed

@@ -36,15 +36,7 @@ def save_figure(
         cv2.imwrite(figure_path, figure_img)
-def export_json(
-    inputs,
-    out_path,
-    ignore_line_break=False,
-    encoding: str = "utf-8",
-    img=None,
-    export_figure=False,
-    figure_dir="figures",
-):
+def convert_json(inputs, out_path, ignore_line_break, img, export_figure, figure_dir):
     from yomitoku.document_analyzer import DocumentAnalyzerSchema
     if isinstance(inputs, DocumentAnalyzerSchema):
@@ -55,18 +47,45 @@ def export_json(
         for paragraph in inputs.paragraphs:
             paragraph_to_json(paragraph, ignore_line_break)
-        if export_figure:
-            save_figure(
-                inputs.figures,
-                img,
-                out_path,
-                figure_dir=figure_dir,
-            )
+    if isinstance(inputs, DocumentAnalyzerSchema) and export_figure:
+        save_figure(
+            inputs.figures,
+            img,
+            out_path,
+            figure_dir=figure_dir,
+        )
+    return inputs
+def export_json(
+    inputs,
+    out_path,
+    ignore_line_break=False,
+    encoding: str = "utf-8",
+    img=None,
+    export_figure=False,
+    figure_dir="figures",
+):
+    inputs = convert_json(
+        inputs,
+        out_path,
+        ignore_line_break,
+        img,
+        export_figure,
+        figure_dir,
+    )
+    save_json(
+        inputs.model_dump(),
+        out_path,
+        encoding,
+    )
-    return inputs.model_dump()
+    return inputs
-def save_json(out_path, encoding, data):
+def save_json(data, out_path, encoding):
     with open(out_path, "w", encoding=encoding, errors="ignore") as f:
         json.dump(
             data,

{yomitoku-0.7.3 → yomitoku-0.7.4}/src/yomitoku/export/export_markdown.py RENAMED Viewed

@@ -111,16 +111,15 @@ def figure_to_md(
     return elements
-def export_markdown(
+def convert_markdown(
     inputs,
-    out_path: str,
+    out_path,
+    ignore_line_break=False,
     img=None,
-    ignore_line_break: bool = False,
     export_figure_letter=False,
     export_figure=True,
     figure_width=200,
     figure_dir="figures",
-    encoding: str = "utf-8",
 ):
     elements = []
     for table in inputs.tables:
@@ -144,10 +143,39 @@ def export_markdown(
     elements = sorted(elements, key=lambda x: x["order"])
     markdown = "\n".join([element["md"] for element in elements])
+    return markdown, elements
+def export_markdown(
+    inputs,
+    out_path: str,
+    ignore_line_break: bool = False,
+    img=None,
+    export_figure_letter=False,
+    export_figure=True,
+    figure_width=200,
+    figure_dir="figures",
+    encoding: str = "utf-8",
+):
+    markdown, elements = convert_markdown(
+        inputs,
+        out_path,
+        ignore_line_break,
+        img,
+        export_figure_letter,
+        export_figure,
+        figure_width,
+        figure_dir,
+    )
+    save_markdown(markdown, out_path, encoding)
     return markdown
-def save_markdown(out_path, encoding, markdown):
+def save_markdown(
+    markdown,
+    out_path,
+    encoding,
+):
     with open(out_path, "w", encoding=encoding, errors="ignore") as f:
         f.write(markdown)

yomitoku-0.7.4/tests/data/sampldoc.tif ADDED Viewed

Binary file

{yomitoku-0.7.3 → yomitoku-0.7.4}/tests/test_data.py RENAMED Viewed

@@ -36,14 +36,16 @@ def test_load_image():
         "tests/data/test.bmp",
         "tests/data/test_gray.jpg",
         "tests/data/rgba.png",
+        "tests/data/sampldoc.tif",
     ]
     for target in targets:
         image = load_image(target)
-        assert image.shape[2] == 3
-        assert image.shape[0] > 32
-        assert image.shape[1] > 32
-        assert image.dtype == "uint8"
+        assert len(image) >= 1
+        assert image[0].shape[2] == 3
+        assert image[0].shape[0] > 32
+        assert image[0].shape[1] > 32
+        assert image[0].dtype == "uint8"
 def test_load_pdf():

yomitoku 0.7.3__tar.gz → 0.7.4__tar.gz

yomitoku 0.7.3__tar.gz → 0.7.4__tar.gz