yomitoku 0.4.1__py3-none-any.whl → 0.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yomitoku/base.py +1 -1
 - yomitoku/cli/main.py +219 -27
 - yomitoku/configs/__init__.py +2 -0
 - yomitoku/configs/cfg_text_detector_dbnet.py +1 -1
 - yomitoku/configs/cfg_text_recognizer_parseq_small.py +51 -0
 - yomitoku/data/functions.py +48 -23
 - yomitoku/document_analyzer.py +243 -41
 - yomitoku/export/__init__.py +18 -5
 - yomitoku/export/export_csv.py +71 -2
 - yomitoku/export/export_html.py +46 -12
 - yomitoku/export/export_json.py +66 -3
 - yomitoku/export/export_markdown.py +42 -6
 - yomitoku/layout_analyzer.py +2 -9
 - yomitoku/layout_parser.py +58 -4
 - yomitoku/models/dbnet_plus.py +13 -39
 - yomitoku/models/layers/activate.py +13 -0
 - yomitoku/models/layers/rtdetr_backbone.py +18 -17
 - yomitoku/models/layers/rtdetr_hybrid_encoder.py +19 -20
 - yomitoku/models/layers/rtdetrv2_decoder.py +14 -1
 - yomitoku/models/parseq.py +15 -22
 - yomitoku/ocr.py +24 -27
 - yomitoku/onnx/.gitkeep +0 -0
 - yomitoku/postprocessor/dbnet_postporcessor.py +15 -14
 - yomitoku/postprocessor/parseq_tokenizer.py +1 -3
 - yomitoku/postprocessor/rtdetr_postprocessor.py +14 -1
 - yomitoku/table_structure_recognizer.py +82 -9
 - yomitoku/text_detector.py +57 -7
 - yomitoku/text_recognizer.py +84 -16
 - yomitoku/utils/misc.py +21 -14
 - yomitoku/utils/visualizer.py +15 -8
 - {yomitoku-0.4.1.dist-info → yomitoku-0.7.4.dist-info}/METADATA +34 -41
 - yomitoku-0.7.4.dist-info/RECORD +54 -0
 - {yomitoku-0.4.1.dist-info → yomitoku-0.7.4.dist-info}/WHEEL +1 -1
 - yomitoku-0.4.1.dist-info/RECORD +0 -52
 - {yomitoku-0.4.1.dist-info → yomitoku-0.7.4.dist-info}/entry_points.txt +0 -0
 
    
        yomitoku/base.py
    CHANGED
    
    
    
        yomitoku/cli/main.py
    CHANGED
    
    | 
         @@ -1,30 +1,94 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            import argparse
         
     | 
| 
       2 
2 
     | 
    
         
             
            import os
         
     | 
| 
      
 3 
     | 
    
         
            +
            import time
         
     | 
| 
       3 
4 
     | 
    
         
             
            from pathlib import Path
         
     | 
| 
       4 
5 
     | 
    
         | 
| 
       5 
6 
     | 
    
         
             
            import cv2
         
     | 
| 
       6 
     | 
    
         
            -
            import  
     | 
| 
      
 7 
     | 
    
         
            +
            import torch
         
     | 
| 
       7 
8 
     | 
    
         | 
| 
       8 
9 
     | 
    
         
             
            from ..constants import SUPPORT_OUTPUT_FORMAT
         
     | 
| 
       9 
10 
     | 
    
         
             
            from ..data.functions import load_image, load_pdf
         
     | 
| 
       10 
11 
     | 
    
         
             
            from ..document_analyzer import DocumentAnalyzer
         
     | 
| 
       11 
12 
     | 
    
         
             
            from ..utils.logger import set_logger
         
     | 
| 
       12 
13 
     | 
    
         | 
| 
      
 14 
     | 
    
         
            +
            from ..export import save_csv, save_html, save_json, save_markdown
         
     | 
| 
      
 15 
     | 
    
         
            +
            from ..export import convert_json, convert_csv, convert_html, convert_markdown
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
       13 
17 
     | 
    
         
             
            logger = set_logger(__name__, "INFO")
         
     | 
| 
       14 
18 
     | 
    
         | 
| 
       15 
19 
     | 
    
         | 
| 
      
 20 
     | 
    
         
            +
            def merge_all_pages(results):
         
     | 
| 
      
 21 
     | 
    
         
            +
                out = None
         
     | 
| 
      
 22 
     | 
    
         
            +
                for result in results:
         
     | 
| 
      
 23 
     | 
    
         
            +
                    format = result["format"]
         
     | 
| 
      
 24 
     | 
    
         
            +
                    data = result["data"]
         
     | 
| 
      
 25 
     | 
    
         
            +
             
     | 
| 
      
 26 
     | 
    
         
            +
                    if format == "json":
         
     | 
| 
      
 27 
     | 
    
         
            +
                        if out is None:
         
     | 
| 
      
 28 
     | 
    
         
            +
                            out = [data]
         
     | 
| 
      
 29 
     | 
    
         
            +
                        else:
         
     | 
| 
      
 30 
     | 
    
         
            +
                            out.append(data)
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
                    elif format == "csv":
         
     | 
| 
      
 33 
     | 
    
         
            +
                        if out is None:
         
     | 
| 
      
 34 
     | 
    
         
            +
                            out = data
         
     | 
| 
      
 35 
     | 
    
         
            +
                        else:
         
     | 
| 
      
 36 
     | 
    
         
            +
                            out.extend(data)
         
     | 
| 
      
 37 
     | 
    
         
            +
             
     | 
| 
      
 38 
     | 
    
         
            +
                    elif format == "html":
         
     | 
| 
      
 39 
     | 
    
         
            +
                        if out is None:
         
     | 
| 
      
 40 
     | 
    
         
            +
                            out = data
         
     | 
| 
      
 41 
     | 
    
         
            +
                        else:
         
     | 
| 
      
 42 
     | 
    
         
            +
                            out += "\n" + data
         
     | 
| 
      
 43 
     | 
    
         
            +
             
     | 
| 
      
 44 
     | 
    
         
            +
                    elif format == "md":
         
     | 
| 
      
 45 
     | 
    
         
            +
                        if out is None:
         
     | 
| 
      
 46 
     | 
    
         
            +
                            out = data
         
     | 
| 
      
 47 
     | 
    
         
            +
                        else:
         
     | 
| 
      
 48 
     | 
    
         
            +
                            out += "\n" + data
         
     | 
| 
      
 49 
     | 
    
         
            +
             
     | 
| 
      
 50 
     | 
    
         
            +
                return out
         
     | 
| 
      
 51 
     | 
    
         
            +
             
     | 
| 
      
 52 
     | 
    
         
            +
             
     | 
| 
      
 53 
     | 
    
         
            +
            def save_merged_file(out_path, args, out):
         
     | 
| 
      
 54 
     | 
    
         
            +
                if args.format == "json":
         
     | 
| 
      
 55 
     | 
    
         
            +
                    save_json(out, out_path, args.encoding)
         
     | 
| 
      
 56 
     | 
    
         
            +
                elif args.format == "csv":
         
     | 
| 
      
 57 
     | 
    
         
            +
                    save_csv(out, out_path, args.encoding)
         
     | 
| 
      
 58 
     | 
    
         
            +
                elif args.format == "html":
         
     | 
| 
      
 59 
     | 
    
         
            +
                    save_html(out, out_path, args.encoding)
         
     | 
| 
      
 60 
     | 
    
         
            +
                elif args.format == "md":
         
     | 
| 
      
 61 
     | 
    
         
            +
                    save_markdown(out, out_path, args.encoding)
         
     | 
| 
      
 62 
     | 
    
         
            +
             
     | 
| 
      
 63 
     | 
    
         
            +
             
     | 
| 
      
 64 
     | 
    
         
            +
            def validate_encoding(encoding):
         
     | 
| 
      
 65 
     | 
    
         
            +
                if encoding not in [
         
     | 
| 
      
 66 
     | 
    
         
            +
                    "utf-8",
         
     | 
| 
      
 67 
     | 
    
         
            +
                    "utf-8-sig",
         
     | 
| 
      
 68 
     | 
    
         
            +
                    "shift-jis",
         
     | 
| 
      
 69 
     | 
    
         
            +
                    "euc-jp",
         
     | 
| 
      
 70 
     | 
    
         
            +
                    "cp932",
         
     | 
| 
      
 71 
     | 
    
         
            +
                ]:
         
     | 
| 
      
 72 
     | 
    
         
            +
                    raise ValueError(f"Invalid encoding: {encoding}")
         
     | 
| 
      
 73 
     | 
    
         
            +
                return True
         
     | 
| 
      
 74 
     | 
    
         
            +
             
     | 
| 
      
 75 
     | 
    
         
            +
             
     | 
| 
       16 
76 
     | 
    
         
             
            def process_single_file(args, analyzer, path, format):
         
     | 
| 
       17 
77 
     | 
    
         
             
                if path.suffix[1:].lower() in ["pdf"]:
         
     | 
| 
       18 
78 
     | 
    
         
             
                    imgs = load_pdf(path)
         
     | 
| 
       19 
79 
     | 
    
         
             
                else:
         
     | 
| 
       20 
     | 
    
         
            -
                    imgs =  
     | 
| 
      
 80 
     | 
    
         
            +
                    imgs = load_image(path)
         
     | 
| 
       21 
81 
     | 
    
         | 
| 
      
 82 
     | 
    
         
            +
                results = []
         
     | 
| 
       22 
83 
     | 
    
         
             
                for page, img in enumerate(imgs):
         
     | 
| 
       23 
     | 
    
         
            -
                     
     | 
| 
       24 
     | 
    
         
            -
             
     | 
| 
      
 84 
     | 
    
         
            +
                    result, ocr, layout = analyzer(img)
         
     | 
| 
       25 
85 
     | 
    
         
             
                    dirname = path.parent.name
         
     | 
| 
       26 
86 
     | 
    
         
             
                    filename = path.stem
         
     | 
| 
       27 
87 
     | 
    
         | 
| 
      
 88 
     | 
    
         
            +
                    # cv2.imwrite(
         
     | 
| 
      
 89 
     | 
    
         
            +
                    #    os.path.join(args.outdir, f"{dirname}_{filename}_p{page+1}.jpg"), img
         
     | 
| 
      
 90 
     | 
    
         
            +
                    # )
         
     | 
| 
      
 91 
     | 
    
         
            +
             
     | 
| 
       28 
92 
     | 
    
         
             
                    if ocr is not None:
         
     | 
| 
       29 
93 
     | 
    
         
             
                        out_path = os.path.join(
         
     | 
| 
       30 
94 
     | 
    
         
             
                            args.outdir, f"{dirname}_{filename}_p{page+1}_ocr.jpg"
         
     | 
| 
         @@ -44,37 +108,129 @@ def process_single_file(args, analyzer, path, format): 
     | 
|
| 
       44 
108 
     | 
    
         
             
                    out_path = os.path.join(args.outdir, f"{dirname}_{filename}_p{page+1}.{format}")
         
     | 
| 
       45 
109 
     | 
    
         | 
| 
       46 
110 
     | 
    
         
             
                    if format == "json":
         
     | 
| 
       47 
     | 
    
         
            -
                         
     | 
| 
       48 
     | 
    
         
            -
                             
     | 
| 
       49 
     | 
    
         
            -
             
     | 
| 
      
 111 
     | 
    
         
            +
                        if args.combine:
         
     | 
| 
      
 112 
     | 
    
         
            +
                            json = convert_json(
         
     | 
| 
      
 113 
     | 
    
         
            +
                                result,
         
     | 
| 
      
 114 
     | 
    
         
            +
                                out_path,
         
     | 
| 
      
 115 
     | 
    
         
            +
                                args.ignore_line_break,
         
     | 
| 
      
 116 
     | 
    
         
            +
                                img,
         
     | 
| 
      
 117 
     | 
    
         
            +
                                args.figure,
         
     | 
| 
      
 118 
     | 
    
         
            +
                                args.figure_dir,
         
     | 
| 
      
 119 
     | 
    
         
            +
                            )
         
     | 
| 
      
 120 
     | 
    
         
            +
                        else:
         
     | 
| 
      
 121 
     | 
    
         
            +
                            json = result.to_json(
         
     | 
| 
      
 122 
     | 
    
         
            +
                                out_path,
         
     | 
| 
      
 123 
     | 
    
         
            +
                                ignore_line_break=args.ignore_line_break,
         
     | 
| 
      
 124 
     | 
    
         
            +
                                encoding=args.encoding,
         
     | 
| 
      
 125 
     | 
    
         
            +
                                img=img,
         
     | 
| 
      
 126 
     | 
    
         
            +
                                export_figure=args.figure,
         
     | 
| 
      
 127 
     | 
    
         
            +
                                figure_dir=args.figure_dir,
         
     | 
| 
      
 128 
     | 
    
         
            +
                            )
         
     | 
| 
      
 129 
     | 
    
         
            +
             
     | 
| 
      
 130 
     | 
    
         
            +
                        results.append(
         
     | 
| 
      
 131 
     | 
    
         
            +
                            {
         
     | 
| 
      
 132 
     | 
    
         
            +
                                "format": format,
         
     | 
| 
      
 133 
     | 
    
         
            +
                                "data": json.model_dump(),
         
     | 
| 
      
 134 
     | 
    
         
            +
                            }
         
     | 
| 
       50 
135 
     | 
    
         
             
                        )
         
     | 
| 
      
 136 
     | 
    
         
            +
             
     | 
| 
       51 
137 
     | 
    
         
             
                    elif format == "csv":
         
     | 
| 
       52 
     | 
    
         
            -
                         
     | 
| 
       53 
     | 
    
         
            -
                             
     | 
| 
       54 
     | 
    
         
            -
             
     | 
| 
      
 138 
     | 
    
         
            +
                        if args.combine:
         
     | 
| 
      
 139 
     | 
    
         
            +
                            csv = convert_csv(
         
     | 
| 
      
 140 
     | 
    
         
            +
                                result,
         
     | 
| 
      
 141 
     | 
    
         
            +
                                out_path,
         
     | 
| 
      
 142 
     | 
    
         
            +
                                args.ignore_line_break,
         
     | 
| 
      
 143 
     | 
    
         
            +
                                img,
         
     | 
| 
      
 144 
     | 
    
         
            +
                                args.figure,
         
     | 
| 
      
 145 
     | 
    
         
            +
                                args.figure_dir,
         
     | 
| 
      
 146 
     | 
    
         
            +
                            )
         
     | 
| 
      
 147 
     | 
    
         
            +
                        else:
         
     | 
| 
      
 148 
     | 
    
         
            +
                            csv = result.to_csv(
         
     | 
| 
      
 149 
     | 
    
         
            +
                                out_path,
         
     | 
| 
      
 150 
     | 
    
         
            +
                                ignore_line_break=args.ignore_line_break,
         
     | 
| 
      
 151 
     | 
    
         
            +
                                encoding=args.encoding,
         
     | 
| 
      
 152 
     | 
    
         
            +
                                img=img,
         
     | 
| 
      
 153 
     | 
    
         
            +
                                export_figure=args.figure,
         
     | 
| 
      
 154 
     | 
    
         
            +
                                figure_dir=args.figure_dir,
         
     | 
| 
      
 155 
     | 
    
         
            +
                            )
         
     | 
| 
      
 156 
     | 
    
         
            +
             
     | 
| 
      
 157 
     | 
    
         
            +
                        results.append(
         
     | 
| 
      
 158 
     | 
    
         
            +
                            {
         
     | 
| 
      
 159 
     | 
    
         
            +
                                "format": format,
         
     | 
| 
      
 160 
     | 
    
         
            +
                                "data": csv,
         
     | 
| 
      
 161 
     | 
    
         
            +
                            }
         
     | 
| 
       55 
162 
     | 
    
         
             
                        )
         
     | 
| 
      
 163 
     | 
    
         
            +
             
     | 
| 
       56 
164 
     | 
    
         
             
                    elif format == "html":
         
     | 
| 
       57 
     | 
    
         
            -
                         
     | 
| 
       58 
     | 
    
         
            -
                             
     | 
| 
       59 
     | 
    
         
            -
             
     | 
| 
       60 
     | 
    
         
            -
             
     | 
| 
       61 
     | 
    
         
            -
             
     | 
| 
       62 
     | 
    
         
            -
             
     | 
| 
       63 
     | 
    
         
            -
             
     | 
| 
       64 
     | 
    
         
            -
             
     | 
| 
      
 165 
     | 
    
         
            +
                        if args.combine:
         
     | 
| 
      
 166 
     | 
    
         
            +
                            html, _ = convert_html(
         
     | 
| 
      
 167 
     | 
    
         
            +
                                result,
         
     | 
| 
      
 168 
     | 
    
         
            +
                                out_path,
         
     | 
| 
      
 169 
     | 
    
         
            +
                                ignore_line_break=args.ignore_line_break,
         
     | 
| 
      
 170 
     | 
    
         
            +
                                img=img,
         
     | 
| 
      
 171 
     | 
    
         
            +
                                export_figure=args.figure,
         
     | 
| 
      
 172 
     | 
    
         
            +
                                export_figure_letter=args.figure_letter,
         
     | 
| 
      
 173 
     | 
    
         
            +
                                figure_width=args.figure_width,
         
     | 
| 
      
 174 
     | 
    
         
            +
                                figure_dir=args.figure_dir,
         
     | 
| 
      
 175 
     | 
    
         
            +
                            )
         
     | 
| 
      
 176 
     | 
    
         
            +
                        else:
         
     | 
| 
      
 177 
     | 
    
         
            +
                            html = result.to_html(
         
     | 
| 
      
 178 
     | 
    
         
            +
                                out_path,
         
     | 
| 
      
 179 
     | 
    
         
            +
                                ignore_line_break=args.ignore_line_break,
         
     | 
| 
      
 180 
     | 
    
         
            +
                                img=img,
         
     | 
| 
      
 181 
     | 
    
         
            +
                                export_figure=args.figure,
         
     | 
| 
      
 182 
     | 
    
         
            +
                                export_figure_letter=args.figure_letter,
         
     | 
| 
      
 183 
     | 
    
         
            +
                                figure_width=args.figure_width,
         
     | 
| 
      
 184 
     | 
    
         
            +
                                figure_dir=args.figure_dir,
         
     | 
| 
      
 185 
     | 
    
         
            +
                                encoding=args.encoding,
         
     | 
| 
      
 186 
     | 
    
         
            +
                            )
         
     | 
| 
      
 187 
     | 
    
         
            +
             
     | 
| 
      
 188 
     | 
    
         
            +
                        results.append(
         
     | 
| 
      
 189 
     | 
    
         
            +
                            {
         
     | 
| 
      
 190 
     | 
    
         
            +
                                "format": format,
         
     | 
| 
      
 191 
     | 
    
         
            +
                                "data": html,
         
     | 
| 
      
 192 
     | 
    
         
            +
                            }
         
     | 
| 
       65 
193 
     | 
    
         
             
                        )
         
     | 
| 
      
 194 
     | 
    
         
            +
             
     | 
| 
       66 
195 
     | 
    
         
             
                    elif format == "md":
         
     | 
| 
       67 
     | 
    
         
            -
                         
     | 
| 
       68 
     | 
    
         
            -
                             
     | 
| 
       69 
     | 
    
         
            -
             
     | 
| 
       70 
     | 
    
         
            -
             
     | 
| 
       71 
     | 
    
         
            -
             
     | 
| 
       72 
     | 
    
         
            -
             
     | 
| 
       73 
     | 
    
         
            -
             
     | 
| 
       74 
     | 
    
         
            -
             
     | 
| 
      
 196 
     | 
    
         
            +
                        if args.combine:
         
     | 
| 
      
 197 
     | 
    
         
            +
                            md, _ = convert_markdown(
         
     | 
| 
      
 198 
     | 
    
         
            +
                                result,
         
     | 
| 
      
 199 
     | 
    
         
            +
                                out_path,
         
     | 
| 
      
 200 
     | 
    
         
            +
                                ignore_line_break=args.ignore_line_break,
         
     | 
| 
      
 201 
     | 
    
         
            +
                                img=img,
         
     | 
| 
      
 202 
     | 
    
         
            +
                                export_figure=args.figure,
         
     | 
| 
      
 203 
     | 
    
         
            +
                                export_figure_letter=args.figure_letter,
         
     | 
| 
      
 204 
     | 
    
         
            +
                                figure_width=args.figure_width,
         
     | 
| 
      
 205 
     | 
    
         
            +
                                figure_dir=args.figure_dir,
         
     | 
| 
      
 206 
     | 
    
         
            +
                            )
         
     | 
| 
      
 207 
     | 
    
         
            +
                        else:
         
     | 
| 
      
 208 
     | 
    
         
            +
                            md = result.to_markdown(
         
     | 
| 
      
 209 
     | 
    
         
            +
                                out_path,
         
     | 
| 
      
 210 
     | 
    
         
            +
                                ignore_line_break=args.ignore_line_break,
         
     | 
| 
      
 211 
     | 
    
         
            +
                                img=img,
         
     | 
| 
      
 212 
     | 
    
         
            +
                                export_figure=args.figure,
         
     | 
| 
      
 213 
     | 
    
         
            +
                                export_figure_letter=args.figure_letter,
         
     | 
| 
      
 214 
     | 
    
         
            +
                                figure_width=args.figure_width,
         
     | 
| 
      
 215 
     | 
    
         
            +
                                figure_dir=args.figure_dir,
         
     | 
| 
      
 216 
     | 
    
         
            +
                                encoding=args.encoding,
         
     | 
| 
      
 217 
     | 
    
         
            +
                            )
         
     | 
| 
      
 218 
     | 
    
         
            +
             
     | 
| 
      
 219 
     | 
    
         
            +
                        results.append(
         
     | 
| 
      
 220 
     | 
    
         
            +
                            {
         
     | 
| 
      
 221 
     | 
    
         
            +
                                "format": format,
         
     | 
| 
      
 222 
     | 
    
         
            +
                                "data": md,
         
     | 
| 
      
 223 
     | 
    
         
            +
                            }
         
     | 
| 
       75 
224 
     | 
    
         
             
                        )
         
     | 
| 
       76 
225 
     | 
    
         | 
| 
       77 
     | 
    
         
            -
             
     | 
| 
      
 226 
     | 
    
         
            +
                out = merge_all_pages(results)
         
     | 
| 
      
 227 
     | 
    
         
            +
                if args.combine:
         
     | 
| 
      
 228 
     | 
    
         
            +
                    out_path = os.path.join(args.outdir, f"{dirname}_{filename}.{format}")
         
     | 
| 
      
 229 
     | 
    
         
            +
                    save_merged_file(
         
     | 
| 
      
 230 
     | 
    
         
            +
                        out_path,
         
     | 
| 
      
 231 
     | 
    
         
            +
                        args,
         
     | 
| 
      
 232 
     | 
    
         
            +
                        out,
         
     | 
| 
      
 233 
     | 
    
         
            +
                    )
         
     | 
| 
       78 
234 
     | 
    
         | 
| 
       79 
235 
     | 
    
         | 
| 
       80 
236 
     | 
    
         
             
            def main():
         
     | 
| 
         @@ -104,6 +260,12 @@ def main(): 
     | 
|
| 
       104 
260 
     | 
    
         
             
                    default="results",
         
     | 
| 
       105 
261 
     | 
    
         
             
                    help="output directory",
         
     | 
| 
       106 
262 
     | 
    
         
             
                )
         
     | 
| 
      
 263 
     | 
    
         
            +
                parser.add_argument(
         
     | 
| 
      
 264 
     | 
    
         
            +
                    "-l",
         
     | 
| 
      
 265 
     | 
    
         
            +
                    "--lite",
         
     | 
| 
      
 266 
     | 
    
         
            +
                    action="store_true",
         
     | 
| 
      
 267 
     | 
    
         
            +
                    help="if set, use lite model",
         
     | 
| 
      
 268 
     | 
    
         
            +
                )
         
     | 
| 
       107 
269 
     | 
    
         
             
                parser.add_argument(
         
     | 
| 
       108 
270 
     | 
    
         
             
                    "-d",
         
     | 
| 
       109 
271 
     | 
    
         
             
                    "--device",
         
     | 
| 
         @@ -162,6 +324,22 @@ def main(): 
     | 
|
| 
       162 
324 
     | 
    
         
             
                    default="figures",
         
     | 
| 
       163 
325 
     | 
    
         
             
                    help="directory to save figure images",
         
     | 
| 
       164 
326 
     | 
    
         
             
                )
         
     | 
| 
      
 327 
     | 
    
         
            +
                parser.add_argument(
         
     | 
| 
      
 328 
     | 
    
         
            +
                    "--encoding",
         
     | 
| 
      
 329 
     | 
    
         
            +
                    type=str,
         
     | 
| 
      
 330 
     | 
    
         
            +
                    default="utf-8",
         
     | 
| 
      
 331 
     | 
    
         
            +
                    help="Specifies the character encoding for the output file to be exported. If unsupported characters are included, they will be ignored.",
         
     | 
| 
      
 332 
     | 
    
         
            +
                )
         
     | 
| 
      
 333 
     | 
    
         
            +
                parser.add_argument(
         
     | 
| 
      
 334 
     | 
    
         
            +
                    "--combine",
         
     | 
| 
      
 335 
     | 
    
         
            +
                    action="store_true",
         
     | 
| 
      
 336 
     | 
    
         
            +
                    help="if set, merge all pages in the output",
         
     | 
| 
      
 337 
     | 
    
         
            +
                )
         
     | 
| 
      
 338 
     | 
    
         
            +
                parser.add_argument(
         
     | 
| 
      
 339 
     | 
    
         
            +
                    "--ignore_meta",
         
     | 
| 
      
 340 
     | 
    
         
            +
                    action="store_true",
         
     | 
| 
      
 341 
     | 
    
         
            +
                    help="if set, ignore meta information(header, footer) in the output",
         
     | 
| 
      
 342 
     | 
    
         
            +
                )
         
     | 
| 
       165 
343 
     | 
    
         | 
| 
       166 
344 
     | 
    
         
             
                args = parser.parse_args()
         
     | 
| 
       167 
345 
     | 
    
         | 
| 
         @@ -175,6 +353,8 @@ def main(): 
     | 
|
| 
       175 
353 
     | 
    
         
             
                        f"Invalid output format: {args.format}. Supported formats are {SUPPORT_OUTPUT_FORMAT}"
         
     | 
| 
       176 
354 
     | 
    
         
             
                    )
         
     | 
| 
       177 
355 
     | 
    
         | 
| 
      
 356 
     | 
    
         
            +
                validate_encoding(args.encoding)
         
     | 
| 
      
 357 
     | 
    
         
            +
             
     | 
| 
       178 
358 
     | 
    
         
             
                if format == "markdown":
         
     | 
| 
       179 
359 
     | 
    
         
             
                    format = "md"
         
     | 
| 
       180 
360 
     | 
    
         | 
| 
         @@ -197,10 +377,22 @@ def main(): 
     | 
|
| 
       197 
377 
     | 
    
         
             
                    },
         
     | 
| 
       198 
378 
     | 
    
         
             
                }
         
     | 
| 
       199 
379 
     | 
    
         | 
| 
      
 380 
     | 
    
         
            +
                if args.lite:
         
     | 
| 
      
 381 
     | 
    
         
            +
                    configs["ocr"]["text_recognizer"]["model_name"] = "parseq-small"
         
     | 
| 
      
 382 
     | 
    
         
            +
             
     | 
| 
      
 383 
     | 
    
         
            +
                    if args.device == "cpu" or not torch.cuda.is_available():
         
     | 
| 
      
 384 
     | 
    
         
            +
                        configs["ocr"]["text_detector"]["infer_onnx"] = True
         
     | 
| 
      
 385 
     | 
    
         
            +
             
     | 
| 
      
 386 
     | 
    
         
            +
                    # Note: for every model other than the Text Detector, PyTorch inference is faster than ONNX inference, so ONNX inference is not used for them
         
     | 
| 
      
 387 
     | 
    
         
            +
                    # configs["ocr"]["text_recognizer"]["infer_onnx"] = True
         
     | 
| 
      
 388 
     | 
    
         
            +
                    # configs["layout_analyzer"]["table_structure_recognizer"]["infer_onnx"] = True
         
     | 
| 
      
 389 
     | 
    
         
            +
                    # configs["layout_analyzer"]["layout_parser"]["infer_onnx"] = True
         
     | 
| 
      
 390 
     | 
    
         
            +
             
     | 
| 
       200 
391 
     | 
    
         
             
                analyzer = DocumentAnalyzer(
         
     | 
| 
       201 
392 
     | 
    
         
             
                    configs=configs,
         
     | 
| 
       202 
393 
     | 
    
         
             
                    visualize=args.vis,
         
     | 
| 
       203 
394 
     | 
    
         
             
                    device=args.device,
         
     | 
| 
      
 395 
     | 
    
         
            +
                    ignore_meta=args.ignore_meta,
         
     | 
| 
       204 
396 
     | 
    
         
             
                )
         
     | 
| 
       205 
397 
     | 
    
         | 
| 
       206 
398 
     | 
    
         
             
                os.makedirs(args.outdir, exist_ok=True)
         
     | 
    
        yomitoku/configs/__init__.py
    CHANGED
    
    | 
         @@ -4,10 +4,12 @@ from .cfg_table_structure_recognizer_rtdtrv2 import ( 
     | 
|
| 
       4 
4 
     | 
    
         
             
            )
         
     | 
| 
       5 
5 
     | 
    
         
             
            from .cfg_text_detector_dbnet import TextDetectorDBNetConfig
         
     | 
| 
       6 
6 
     | 
    
         
             
            from .cfg_text_recognizer_parseq import TextRecognizerPARSeqConfig
         
     | 
| 
      
 7 
     | 
    
         
            +
            from .cfg_text_recognizer_parseq_small import TextRecognizerPARSeqSmallConfig
         
     | 
| 
       7 
8 
     | 
    
         | 
| 
       8 
9 
     | 
    
         
             
            __all__ = [
         
     | 
| 
       9 
10 
     | 
    
         
             
                "TextDetectorDBNetConfig",
         
     | 
| 
       10 
11 
     | 
    
         
             
                "TextRecognizerPARSeqConfig",
         
     | 
| 
       11 
12 
     | 
    
         
             
                "LayoutParserRTDETRv2Config",
         
     | 
| 
       12 
13 
     | 
    
         
             
                "TableStructureRecognizerRTDETRv2Config",
         
     | 
| 
      
 14 
     | 
    
         
            +
                "TextRecognizerPARSeqSmallConfig",
         
     | 
| 
       13 
15 
     | 
    
         
             
            ]
         
     | 
| 
         @@ -0,0 +1,51 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            from dataclasses import dataclass, field
         
     | 
| 
      
 2 
     | 
    
         
            +
            from typing import List
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
            from ..constants import ROOT_DIR
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
            @dataclass
            class Data:
                """Data-related defaults for the small PARSeq text recognizer.

                Every field is a plain default value; how each one is consumed is
                decided by the code that reads this config, not by this class.
                """

                # Worker count (presumably for the data loader -- confirm at the call site).
                num_workers: int = 4
                # Number of samples per batch.
                batch_size: int = 128
                # Two-element image size; looks like [height, width] -- TODO confirm.
                # A factory is used so each instance gets its own list.
                img_size: List[int] = field(default_factory=lambda: [32, 800])
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
            @dataclass
            class Encoder:
                """Encoder hyper-parameters for the small PARSeq text recognizer.

                The values are defaults only; they must match the checkpoint that the
                model code loads (not verifiable from this file).
                """

                # Two-element patch size; a factory keeps the list per-instance.
                patch_size: List[int] = field(default_factory=lambda: [16, 16])
                # Number of attention heads.
                num_heads: int = 8
                # Embedding width.
                embed_dim: int = 384
                # MLP expansion ratio (presumably hidden width = embed_dim * mlp_ratio -- confirm).
                mlp_ratio: int = 4
                # Number of encoder layers.
                depth: int = 9
         
     | 
| 
      
 21 
     | 
    
         
            +
             
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
            @dataclass
            class Decoder:
                """Decoder hyper-parameters for the small PARSeq text recognizer."""

                # Embedding width.
                embed_dim: int = 384
                # Number of attention heads.
                num_heads: int = 8
                # MLP expansion ratio (presumably hidden width = embed_dim * mlp_ratio -- confirm).
                mlp_ratio: int = 4
                # Number of decoder layers.
                depth: int = 1
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
             
     | 
| 
      
 31 
     | 
    
         
            +
            @dataclass
            class Visualize:
                """Drawing defaults used when recognition results are visualized."""

                # Path of the TrueType font bundled under the package resource directory.
                font: str = f"{ROOT_DIR}/resource/MPLUS1p-Medium.ttf"
                # Three-component colour; a factory keeps the list per-instance.
                color: List[int] = field(default_factory=lambda: [0, 0, 255])  # RGB
                # Font size used for drawn text.
                font_size: int = 18
         
     | 
| 
      
 36 
     | 
    
         
            +
             
     | 
| 
      
 37 
     | 
    
         
            +
             
     | 
| 
      
 38 
     | 
    
         
            +
            @dataclass
            class TextRecognizerPARSeqSmallConfig:
                """Top-level configuration of the small ("lite") PARSeq text recognizer.

                Scalar settings are declared first, followed by the nested sections
                (data, encoder, decoder, visualize). Each nested section is created by
                a factory so that separate config instances never share a section.
                """

                # Hugging Face Hub repository id holding the model weights.
                hf_hub_repo: str = "KotaroKinoshita/yomitoku-text-recognizer-parseq-small-open-beta"
                # Path of the character-set file bundled under the package resource directory.
                charset: str = f"{ROOT_DIR}/resource/charset.txt"
                # Presumably the output vocabulary size; should agree with the charset -- TODO confirm.
                num_tokens: int = 7312
                # Upper bound on the length of a recognized label.
                max_label_length: int = 100
                # Integer flag; looks like it toggles autoregressive decoding -- confirm in the model code.
                decode_ar: int = 1
                # Number of refinement iterations (semantics defined by the model code).
                refine_iters: int = 1

                data: Data = field(default_factory=Data)
                encoder: Encoder = field(default_factory=Encoder)
                decoder: Decoder = field(default_factory=Decoder)

                visualize: Visualize = field(default_factory=Visualize)
         
     | 
    
        yomitoku/data/functions.py
    CHANGED
    
    | 
         @@ -1,9 +1,10 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            from pathlib import Path
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            import cv2
         
     | 
| 
      
 4 
     | 
    
         
            +
            from PIL import Image
         
     | 
| 
       4 
5 
     | 
    
         
             
            import numpy as np
         
     | 
| 
       5 
6 
     | 
    
         
             
            import torch
         
     | 
| 
       6 
     | 
    
         
            -
             
     | 
| 
      
 7 
     | 
    
         
            +
            import pypdfium2
         
     | 
| 
       7 
8 
     | 
    
         | 
| 
       8 
9 
     | 
    
         
             
            from ..constants import (
         
     | 
| 
       9 
10 
     | 
    
         
             
                MIN_IMAGE_SIZE,
         
     | 
| 
         @@ -15,6 +16,20 @@ from ..utils.logger import set_logger 
     | 
|
| 
       15 
16 
     | 
    
         
             
            logger = set_logger(__name__)
         
     | 
| 
       16 
17 
     | 
    
         | 
| 
       17 
18 
     | 
    
         | 
| 
      
 19 
     | 
    
         
            +
            def validate_image(img: np.ndarray) -> None:
                """Check that an image is large enough to be processed.

                Only the first two dimensions of ``img`` (height, width) are inspected.

                Args:
                    img: image array whose ``shape[:2]`` is ``(height, width)``.

                Raises:
                    ValueError: if the height or the width is smaller than
                        ``MIN_IMAGE_SIZE``.
                """
                h, w = img.shape[:2]
                shorter_side = min(h, w)

                # "h < MIN or w < MIN" is the same test as "min(h, w) < MIN".
                if shorter_side < MIN_IMAGE_SIZE:
                    raise ValueError(
                        f"Image size is too small. Got {w}x{h} (width x height); "
                        f"each side must be at least {MIN_IMAGE_SIZE} pixels."
                    )

                if shorter_side < WARNING_IMAGE_SIZE:
                    # Emit a single-line record: a triple-quoted literal would put raw
                    # newlines and source indentation into the log. The recommended size
                    # comes from the same constant as the check, so the two cannot drift.
                    logger.warning(
                        "The image size is small, which may result in reduced OCR accuracy. "
                        "The process will continue, but it is recommended to input images "
                        "with a minimum size of %s pixels on the shorter side.",
                        WARNING_IMAGE_SIZE,
                    )
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
             
     | 
| 
       18 
33 
     | 
    
         
             
            def load_image(image_path: str) -> np.ndarray:
         
     | 
| 
       19 
34 
     | 
    
         
             
                """
         
     | 
| 
       20 
35 
     | 
    
         
             
                Open an image file.
         
     | 
| 
         @@ -40,24 +55,27 @@ def load_image(image_path: str) -> np.ndarray: 
     | 
|
| 
       40 
55 
     | 
    
         
             
                        "PDF file is not supported by load_image(). Use load_pdf() instead."
         
     | 
| 
       41 
56 
     | 
    
         
             
                    )
         
     | 
| 
       42 
57 
     | 
    
         | 
| 
       43 
     | 
    
         
            -
                 
     | 
| 
       44 
     | 
    
         
            -
             
     | 
| 
       45 
     | 
    
         
            -
                 
     | 
| 
      
 58 
     | 
    
         
            +
                try:
         
     | 
| 
      
 59 
     | 
    
         
            +
                    img = Image.open(image_path)
         
     | 
| 
      
 60 
     | 
    
         
            +
                except Exception:
         
     | 
| 
       46 
61 
     | 
    
         
             
                    raise ValueError("Invalid image data.")
         
     | 
| 
       47 
62 
     | 
    
         | 
| 
       48 
     | 
    
         
            -
                 
     | 
| 
       49 
     | 
    
         
            -
                if  
     | 
| 
       50 
     | 
    
         
            -
                     
     | 
| 
       51 
     | 
    
         
            -
             
     | 
| 
       52 
     | 
    
         
            -
             
     | 
| 
       53 
     | 
    
         
            -
             
     | 
| 
       54 
     | 
    
         
            -
             
     | 
| 
       55 
     | 
    
         
            -
             
     | 
| 
       56 
     | 
    
         
            -
             
     | 
| 
       57 
     | 
    
         
            -
                         
     | 
| 
       58 
     | 
    
         
            -
             
     | 
| 
      
 63 
     | 
    
         
            +
                pages = []
         
     | 
| 
      
 64 
     | 
    
         
            +
                if ext in ["tif", "tiff"]:
         
     | 
| 
      
 65 
     | 
    
         
            +
                    try:
         
     | 
| 
      
 66 
     | 
    
         
            +
                        while True:
         
     | 
| 
      
 67 
     | 
    
         
            +
                            img_arr = np.array(img.copy().convert("RGB"))
         
     | 
| 
      
 68 
     | 
    
         
            +
                            validate_image(img_arr)
         
     | 
| 
      
 69 
     | 
    
         
            +
                            pages.append(img_arr[:, :, ::-1])
         
     | 
| 
      
 70 
     | 
    
         
            +
                            img.seek(img.tell() + 1)
         
     | 
| 
      
 71 
     | 
    
         
            +
                    except EOFError:
         
     | 
| 
      
 72 
     | 
    
         
            +
                        pass
         
     | 
| 
      
 73 
     | 
    
         
            +
                else:
         
     | 
| 
      
 74 
     | 
    
         
            +
                    img_arr = np.array(img.convert("RGB"))
         
     | 
| 
      
 75 
     | 
    
         
            +
                    validate_image(img_arr)
         
     | 
| 
      
 76 
     | 
    
         
            +
                    pages.append(img_arr[:, :, ::-1])
         
     | 
| 
       59 
77 
     | 
    
         | 
| 
       60 
     | 
    
         
            -
                return  
     | 
| 
      
 78 
     | 
    
         
            +
                return pages
         
     | 
| 
       61 
79 
     | 
    
         | 
| 
       62 
80 
     | 
    
         | 
| 
       63 
81 
     | 
    
         
             
            def load_pdf(pdf_path: str, dpi=200) -> list[np.ndarray]:
         
     | 
| 
         @@ -70,6 +88,7 @@ def load_pdf(pdf_path: str, dpi=200) -> list[np.ndarray]: 
     | 
|
| 
       70 
88 
     | 
    
         
             
                Returns:
         
     | 
| 
       71 
89 
     | 
    
         
             
                    list[np.ndarray]: list of image data(BGR)
         
     | 
| 
       72 
90 
     | 
    
         
             
                """
         
     | 
| 
      
 91 
     | 
    
         
            +
             
     | 
| 
       73 
92 
     | 
    
         
             
                pdf_path = Path(pdf_path)
         
     | 
| 
       74 
93 
     | 
    
         
             
                if not pdf_path.exists():
         
     | 
| 
       75 
94 
     | 
    
         
             
                    raise FileNotFoundError(f"File not found: {pdf_path}")
         
     | 
| 
         @@ -86,11 +105,19 @@ def load_pdf(pdf_path: str, dpi=200) -> list[np.ndarray]: 
     | 
|
| 
       86 
105 
     | 
    
         
             
                    )
         
     | 
| 
       87 
106 
     | 
    
         | 
| 
       88 
107 
     | 
    
         
             
                try:
         
     | 
| 
       89 
     | 
    
         
            -
                     
     | 
| 
      
 108 
     | 
    
         
            +
                    doc = pypdfium2.PdfDocument(pdf_path)
         
     | 
| 
      
 109 
     | 
    
         
            +
                    renderer = doc.render(
         
     | 
| 
      
 110 
     | 
    
         
            +
                        pypdfium2.PdfBitmap.to_pil,
         
     | 
| 
      
 111 
     | 
    
         
            +
                        scale=dpi / 72,
         
     | 
| 
      
 112 
     | 
    
         
            +
                    )
         
     | 
| 
      
 113 
     | 
    
         
            +
                    images = list(renderer)
         
     | 
| 
      
 114 
     | 
    
         
            +
                    images = [np.array(image.convert("RGB"))[:, :, ::-1] for image in images]
         
     | 
| 
      
 115 
     | 
    
         
            +
             
     | 
| 
      
 116 
     | 
    
         
            +
                    doc.close()
         
     | 
| 
       90 
117 
     | 
    
         
             
                except Exception as e:
         
     | 
| 
       91 
118 
     | 
    
         
             
                    raise ValueError(f"Failed to open the PDF file: {pdf_path}") from e
         
     | 
| 
       92 
119 
     | 
    
         | 
| 
       93 
     | 
    
         
            -
                return  
     | 
| 
      
 120 
     | 
    
         
            +
                return images
         
     | 
| 
       94 
121 
     | 
    
         | 
| 
       95 
122 
     | 
    
         | 
| 
       96 
123 
     | 
    
         
             
            def resize_shortest_edge(
         
     | 
| 
         @@ -123,7 +150,7 @@ def resize_shortest_edge( 
     | 
|
| 
       123 
150 
     | 
    
         
             
                neww = max(int(new_w / 32) * 32, 32)
         
     | 
| 
       124 
151 
     | 
    
         
             
                newh = max(int(new_h / 32) * 32, 32)
         
     | 
| 
       125 
152 
     | 
    
         | 
| 
       126 
     | 
    
         
            -
                img = cv2.resize(img, (neww, newh))
         
     | 
| 
      
 153 
     | 
    
         
            +
                img = cv2.resize(img, (neww, newh), interpolation=cv2.INTER_AREA)
         
     | 
| 
       127 
154 
     | 
    
         
             
                return img
         
     | 
| 
       128 
155 
     | 
    
         | 
| 
       129 
156 
     | 
    
         | 
| 
         @@ -193,9 +220,7 @@ def validate_quads(img: np.ndarray, quads: list[list[list[int]]]): 
     | 
|
| 
       193 
220 
     | 
    
         
             
                    h, w = img.shape[:2]
         
     | 
| 
       194 
221 
     | 
    
         | 
| 
       195 
222 
     | 
    
         
             
                    if x1 < 0 or x2 > w or y1 < 0 or y2 > h:
         
     | 
| 
       196 
     | 
    
         
            -
                        raise ValueError(
         
     | 
| 
       197 
     | 
    
         
            -
                            f"The vertices are out of the image. {quad.tolist()}"
         
     | 
| 
       198 
     | 
    
         
            -
                        )
         
     | 
| 
      
 223 
     | 
    
         
            +
                        raise ValueError(f"The vertices are out of the image. {quad.tolist()}")
         
     | 
| 
       199 
224 
     | 
    
         | 
| 
       200 
225 
     | 
    
         
             
                return True
         
     | 
| 
       201 
226 
     | 
    
         | 
| 
         @@ -268,7 +293,7 @@ def resize_with_padding(img, target_size, background_color=(0, 0, 0)): 
     | 
|
| 
       268 
293 
     | 
    
         
             
                new_w = int(w * min(scale_w, scale_h))
         
     | 
| 
       269 
294 
     | 
    
         
             
                new_h = int(h * min(scale_w, scale_h))
         
     | 
| 
       270 
295 
     | 
    
         | 
| 
       271 
     | 
    
         
            -
                resized = cv2.resize(img, (new_w, new_h), interpolation=cv2. 
     | 
| 
      
 296 
     | 
    
         
            +
                resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
         
     | 
| 
       272 
297 
     | 
    
         | 
| 
       273 
298 
     | 
    
         
             
                canvas = np.zeros((target_size[0], target_size[1], 3), dtype=np.uint8)
         
     | 
| 
       274 
299 
     | 
    
         
             
                canvas[:, :] = background_color
         
     |