yomitoku 0.9.3__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yomitoku/cli/main.py +11 -1
- yomitoku/document_analyzer.py +7 -40
- yomitoku/export/export_csv.py +17 -0
- yomitoku/layout_analyzer.py +3 -13
- yomitoku/layout_parser.py +2 -15
- yomitoku/ocr.py +1 -22
- yomitoku/schemas.py +241 -0
- yomitoku/table_structure_recognizer.py +2 -29
- yomitoku/text_detector.py +2 -15
- yomitoku/text_recognizer.py +17 -20
- yomitoku/utils/searchable_pdf.py +0 -2
- yomitoku/utils/visualizer.py +16 -5
- {yomitoku-0.9.3.dist-info → yomitoku-0.10.0.dist-info}/METADATA +2 -2
- {yomitoku-0.9.3.dist-info → yomitoku-0.10.0.dist-info}/RECORD +16 -15
- {yomitoku-0.9.3.dist-info → yomitoku-0.10.0.dist-info}/WHEEL +0 -0
- {yomitoku-0.9.3.dist-info → yomitoku-0.10.0.dist-info}/entry_points.txt +0 -0
yomitoku/cli/main.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
import argparse
|
2
2
|
import os
|
3
|
+
import re
|
3
4
|
import time
|
4
5
|
from pathlib import Path
|
5
6
|
|
@@ -96,7 +97,7 @@ def process_single_file(args, analyzer, path, format):
|
|
96
97
|
format_results = []
|
97
98
|
for page, img in enumerate(imgs):
|
98
99
|
result, ocr, layout = analyzer(img)
|
99
|
-
dirname = path.parent.name
|
100
|
+
dirname = _sanitize_path_component(path.parent.name)
|
100
101
|
filename = path.stem
|
101
102
|
|
102
103
|
# cv2.imwrite(
|
@@ -158,6 +159,7 @@ def process_single_file(args, analyzer, path, format):
|
|
158
159
|
args.ignore_line_break,
|
159
160
|
img,
|
160
161
|
args.figure,
|
162
|
+
args.figure_letter,
|
161
163
|
args.figure_dir,
|
162
164
|
)
|
163
165
|
else:
|
@@ -167,6 +169,7 @@ def process_single_file(args, analyzer, path, format):
|
|
167
169
|
encoding=args.encoding,
|
168
170
|
img=img,
|
169
171
|
export_figure=args.figure,
|
172
|
+
export_figure_letter=args.figure_letter,
|
170
173
|
figure_dir=args.figure_dir,
|
171
174
|
)
|
172
175
|
|
@@ -469,5 +472,12 @@ def main():
|
|
469
472
|
logger.info(f"Total Processing time: {end - start:.2f} sec")
|
470
473
|
|
471
474
|
|
475
|
+
def _sanitize_path_component(component):
|
476
|
+
if not component:
|
477
|
+
return component
|
478
|
+
|
479
|
+
return re.sub(r"^\.+", lambda m: "_" * len(m.group(0)), component)
|
480
|
+
|
481
|
+
|
472
482
|
if __name__ == "__main__":
|
473
483
|
main()
|
yomitoku/document_analyzer.py
CHANGED
@@ -1,53 +1,17 @@
|
|
1
1
|
import asyncio
|
2
2
|
from concurrent.futures import ThreadPoolExecutor
|
3
|
-
from typing import List, Union
|
4
3
|
|
5
4
|
import numpy as np
|
6
|
-
from pydantic import conlist
|
7
5
|
|
8
6
|
from yomitoku.text_detector import TextDetector
|
9
7
|
from yomitoku.text_recognizer import TextRecognizer
|
10
8
|
|
11
|
-
from .base import BaseSchema
|
12
|
-
from .export import export_csv, export_html, export_markdown
|
13
9
|
from .layout_analyzer import LayoutAnalyzer
|
14
|
-
from .ocr import OCRSchema,
|
10
|
+
from .ocr import OCRSchema, ocr_aggregate
|
15
11
|
from .reading_order import prediction_reading_order
|
16
|
-
from .table_structure_recognizer import TableStructureRecognizerSchema
|
17
12
|
from .utils.misc import calc_overlap_ratio, is_contained, quad_to_xyxy
|
18
13
|
from .utils.visualizer import det_visualizer, reading_order_visualizer
|
19
|
-
|
20
|
-
|
21
|
-
class ParagraphSchema(BaseSchema):
|
22
|
-
box: conlist(int, min_length=4, max_length=4)
|
23
|
-
contents: Union[str, None]
|
24
|
-
direction: Union[str, None]
|
25
|
-
order: Union[int, None]
|
26
|
-
role: Union[str, None]
|
27
|
-
|
28
|
-
|
29
|
-
class FigureSchema(BaseSchema):
|
30
|
-
box: conlist(int, min_length=4, max_length=4)
|
31
|
-
order: Union[int, None]
|
32
|
-
paragraphs: List[ParagraphSchema]
|
33
|
-
order: Union[int, None]
|
34
|
-
direction: Union[str, None]
|
35
|
-
|
36
|
-
|
37
|
-
class DocumentAnalyzerSchema(BaseSchema):
|
38
|
-
paragraphs: List[ParagraphSchema]
|
39
|
-
tables: List[TableStructureRecognizerSchema]
|
40
|
-
words: List[WordPrediction]
|
41
|
-
figures: List[FigureSchema]
|
42
|
-
|
43
|
-
def to_html(self, out_path: str, **kwargs):
|
44
|
-
return export_html(self, out_path, **kwargs)
|
45
|
-
|
46
|
-
def to_markdown(self, out_path: str, **kwargs):
|
47
|
-
return export_markdown(self, out_path, **kwargs)
|
48
|
-
|
49
|
-
def to_csv(self, out_path: str, **kwargs):
|
50
|
-
return export_csv(self, out_path, **kwargs)
|
14
|
+
from .schemas import ParagraphSchema, FigureSchema, DocumentAnalyzerSchema
|
51
15
|
|
52
16
|
|
53
17
|
def combine_flags(flag1, flag2):
|
@@ -333,6 +297,7 @@ class DocumentAnalyzer:
|
|
333
297
|
visualize=False,
|
334
298
|
ignore_meta=False,
|
335
299
|
reading_order="auto",
|
300
|
+
split_text_across_cells=False,
|
336
301
|
):
|
337
302
|
default_configs = {
|
338
303
|
"ocr": {
|
@@ -363,7 +328,7 @@ class DocumentAnalyzer:
|
|
363
328
|
recursive_update(default_configs, configs)
|
364
329
|
else:
|
365
330
|
raise ValueError(
|
366
|
-
"configs must be a dict. See the https://kotaro-kinoshita.github.io/yomitoku
|
331
|
+
"configs must be a dict. See the https://kotaro-kinoshita.github.io/yomitoku/module/#config"
|
367
332
|
)
|
368
333
|
|
369
334
|
self.text_detector = TextDetector(
|
@@ -379,6 +344,7 @@ class DocumentAnalyzer:
|
|
379
344
|
self.visualize = visualize
|
380
345
|
|
381
346
|
self.ignore_meta = ignore_meta
|
347
|
+
self.split_text_across_cells = split_text_across_cells
|
382
348
|
|
383
349
|
def aggregate(self, ocr_res, layout_res):
|
384
350
|
paragraphs = []
|
@@ -504,7 +470,8 @@ class DocumentAnalyzer:
|
|
504
470
|
results_det, _ = results[0]
|
505
471
|
results_layout, layout = results[1]
|
506
472
|
|
507
|
-
|
473
|
+
if self.split_text_across_cells:
|
474
|
+
results_det = _split_text_across_cells(results_det, results_layout)
|
508
475
|
|
509
476
|
vis_det = None
|
510
477
|
if self.visualize:
|
yomitoku/export/export_csv.py
CHANGED
@@ -63,6 +63,7 @@ def convert_csv(
|
|
63
63
|
ignore_line_break,
|
64
64
|
img=None,
|
65
65
|
export_figure: bool = True,
|
66
|
+
export_figure_letter: bool = False,
|
66
67
|
figure_dir="figures",
|
67
68
|
):
|
68
69
|
elements = []
|
@@ -89,6 +90,20 @@ def convert_csv(
|
|
89
90
|
}
|
90
91
|
)
|
91
92
|
|
93
|
+
if export_figure_letter:
|
94
|
+
for figure in inputs.figures:
|
95
|
+
paragraphs = sorted(figure.paragraphs, key=lambda x: x.order)
|
96
|
+
for paragraph in paragraphs:
|
97
|
+
contents = paragraph_to_csv(paragraph, ignore_line_break)
|
98
|
+
elements.append(
|
99
|
+
{
|
100
|
+
"type": "paragraph",
|
101
|
+
"box": paragraph.box,
|
102
|
+
"element": contents,
|
103
|
+
"order": figure.order,
|
104
|
+
}
|
105
|
+
)
|
106
|
+
|
92
107
|
elements = sorted(elements, key=lambda x: x["order"])
|
93
108
|
|
94
109
|
if export_figure:
|
@@ -109,6 +124,7 @@ def export_csv(
|
|
109
124
|
encoding: str = "utf-8",
|
110
125
|
img=None,
|
111
126
|
export_figure: bool = True,
|
127
|
+
export_figure_letter: bool = False,
|
112
128
|
figure_dir="figures",
|
113
129
|
):
|
114
130
|
elements = convert_csv(
|
@@ -117,6 +133,7 @@ def export_csv(
|
|
117
133
|
ignore_line_break,
|
118
134
|
img,
|
119
135
|
export_figure,
|
136
|
+
export_figure_letter,
|
120
137
|
figure_dir,
|
121
138
|
)
|
122
139
|
|
yomitoku/layout_analyzer.py
CHANGED
@@ -1,17 +1,7 @@
|
|
1
|
-
from
|
1
|
+
from .layout_parser import LayoutParser
|
2
|
+
from .table_structure_recognizer import TableStructureRecognizer
|
2
3
|
|
3
|
-
from .
|
4
|
-
from .layout_parser import Element, LayoutParser
|
5
|
-
from .table_structure_recognizer import (
|
6
|
-
TableStructureRecognizer,
|
7
|
-
TableStructureRecognizerSchema,
|
8
|
-
)
|
9
|
-
|
10
|
-
|
11
|
-
class LayoutAnalyzerSchema(BaseSchema):
|
12
|
-
paragraphs: List[Element]
|
13
|
-
tables: List[TableStructureRecognizerSchema]
|
14
|
-
figures: List[Element]
|
4
|
+
from .schemas import LayoutAnalyzerSchema
|
15
5
|
|
16
6
|
|
17
7
|
class LayoutAnalyzer:
|
yomitoku/layout_parser.py
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
from typing import List, Union
|
2
|
-
|
3
1
|
import cv2
|
4
2
|
import os
|
5
3
|
import onnx
|
@@ -7,28 +5,17 @@ import onnxruntime
|
|
7
5
|
import torch
|
8
6
|
import torchvision.transforms as T
|
9
7
|
from PIL import Image
|
10
|
-
from pydantic import conlist
|
11
8
|
|
12
9
|
from .constants import ROOT_DIR
|
13
10
|
|
14
|
-
from .base import BaseModelCatalog, BaseModule
|
11
|
+
from .base import BaseModelCatalog, BaseModule
|
15
12
|
from .configs import LayoutParserRTDETRv2Config, LayoutParserRTDETRv2V2Config
|
16
13
|
from .models import RTDETRv2
|
17
14
|
from .postprocessor import RTDETRPostProcessor
|
18
15
|
from .utils.misc import filter_by_flag, is_contained
|
19
16
|
from .utils.visualizer import layout_visualizer
|
20
17
|
|
21
|
-
|
22
|
-
class Element(BaseSchema):
|
23
|
-
box: conlist(int, min_length=4, max_length=4)
|
24
|
-
score: float
|
25
|
-
role: Union[str, None]
|
26
|
-
|
27
|
-
|
28
|
-
class LayoutParserSchema(BaseSchema):
|
29
|
-
paragraphs: List[Element]
|
30
|
-
tables: List[Element]
|
31
|
-
figures: List[Element]
|
18
|
+
from .schemas import LayoutParserSchema
|
32
19
|
|
33
20
|
|
34
21
|
class LayoutParserModelCatalog(BaseModelCatalog):
|
yomitoku/ocr.py
CHANGED
@@ -1,27 +1,6 @@
|
|
1
|
-
from typing import List
|
2
|
-
|
3
|
-
from pydantic import conlist
|
4
|
-
|
5
1
|
from yomitoku.text_detector import TextDetector
|
6
2
|
from yomitoku.text_recognizer import TextRecognizer
|
7
|
-
|
8
|
-
from .base import BaseSchema
|
9
|
-
|
10
|
-
|
11
|
-
class WordPrediction(BaseSchema):
|
12
|
-
points: conlist(
|
13
|
-
conlist(int, min_length=2, max_length=2),
|
14
|
-
min_length=4,
|
15
|
-
max_length=4,
|
16
|
-
)
|
17
|
-
content: str
|
18
|
-
direction: str
|
19
|
-
rec_score: float
|
20
|
-
det_score: float
|
21
|
-
|
22
|
-
|
23
|
-
class OCRSchema(BaseSchema):
|
24
|
-
words: List[WordPrediction]
|
3
|
+
from .schemas import OCRSchema
|
25
4
|
|
26
5
|
|
27
6
|
def ocr_aggregate(det_outputs, rec_outputs):
|
yomitoku/schemas.py
ADDED
@@ -0,0 +1,241 @@
|
|
1
|
+
from typing import List, Union
|
2
|
+
from pydantic import conlist, Field
|
3
|
+
|
4
|
+
from .base import BaseSchema
|
5
|
+
from .export import export_csv, export_html, export_markdown, export_json
|
6
|
+
|
7
|
+
|
8
|
+
class Element(BaseSchema):
    # Schema for one layout element: bounding box, detection score and role.
    # Every field is required (declared with `Field(...)`).

    # Exactly four integers (conlist with min_length == max_length == 4).
    box: conlist(int, min_length=4, max_length=4) = Field(
        ...,
        description="Bounding box of the layout element in the format [x1, y1, x2, y2]",
    )
    score: float = Field(
        ...,
        description="Confidence score of the layout element detection",
    )
    # The annotation accepts None as well as a string.
    role: Union[str, None] = Field(
        ...,
        description="Role of the element, e.g., ['section_headings', 'page_header', 'page_footer', 'list_item', 'caption', 'inline_formula', 'display_formula', 'index']",
    )
|
21
|
+
|
22
|
+
|
23
|
+
class ParagraphSchema(BaseSchema):
    # Schema for one paragraph: box, text, direction, reading order and role.
    # Every field is required (declared with `Field(...)`); all fields except
    # `box` also accept None.

    # Exactly four integers (conlist with min_length == max_length == 4).
    box: conlist(int, min_length=4, max_length=4) = Field(
        ...,
        description="Bounding box of the paragraph in the format [x1, y1, x2, y2]",
    )
    contents: Union[str, None] = Field(
        ...,
        description="Text content of the paragraph",
    )
    direction: Union[str, None] = Field(
        ...,
        description="Text direction, e.g., ['horizontal' or 'vertical']",
    )
    order: Union[int, None] = Field(
        ...,
        description="Order of the paragraph in the document",
    )
    role: Union[str, None] = Field(
        ...,
        description="Role of the paragraph, e.g., ['section_headings', 'page_header', 'page_footer'])",
    )
|
44
|
+
|
45
|
+
|
46
|
+
class TableCellSchema(BaseSchema):
    # Schema for one table cell: grid position, spans, box and text.
    # Every field is required (declared with `Field(...)`).

    col: int = Field(
        ...,
        description="Column index of the cell",
    )
    row: int = Field(
        ...,
        description="Row index of the cell",
    )
    col_span: int = Field(
        ...,
        description="Number of columns spanned by the cell",
    )
    row_span: int = Field(
        ...,
        description="Number of rows spanned by the cell",
    )
    # Exactly four integers (conlist with min_length == max_length == 4).
    box: conlist(int, min_length=4, max_length=4) = Field(
        ...,
        description="Bounding box of the cell in the format [x1, y1, x2, y2]",
    )
    # The annotation accepts None as well as a string.
    contents: Union[str, None] = Field(
        ...,
        description="Text content of the cell",
    )
|
71
|
+
|
72
|
+
|
73
|
+
class TableLineSchema(BaseSchema):
    # Schema for one detected table line: its box and detection score.
    # Used by TableStructureRecognizerSchema for rows, cols and spans.

    # Exactly four integers (conlist with min_length == max_length == 4).
    box: conlist(int, min_length=4, max_length=4) = Field(
        ...,
        description="Bounding box of the table line in the format [x1, y1, x2, y2]",
    )
    score: float = Field(
        ...,
        description="Confidence score of the table line detection",
    )
|
82
|
+
|
83
|
+
|
84
|
+
class TableStructureRecognizerSchema(BaseSchema):
    # Schema for one recognized table: its box, grid size, row/column/span
    # lines, cells and position in the document order.
    # Every field is required (declared with `Field(...)`).

    # Exactly four integers (conlist with min_length == max_length == 4).
    box: conlist(int, min_length=4, max_length=4) = Field(
        ...,
        description="Bounding box of the table in the format [x1, y1, x2, y2]",
    )
    n_row: int = Field(..., description="Number of rows in the table")
    n_col: int = Field(..., description="Number of columns in the table")
    # rows, cols and spans share the same element type (TableLineSchema).
    rows: List[TableLineSchema] = Field(
        ...,
        description="List of table lines representing rows",
    )
    cols: List[TableLineSchema] = Field(
        ...,
        description="List of table lines representing columns",
    )
    spans: List[TableLineSchema] = Field(
        ...,
        description="List of table lines representing spans",
    )
    cells: List[TableCellSchema] = Field(
        ...,
        description="List of table cells",
    )
    # Unlike the paragraph/figure schemas, `order` here does not accept None.
    order: int = Field(
        ...,
        description="Order of the table in the document",
    )
|
111
|
+
|
112
|
+
|
113
|
+
class LayoutAnalyzerSchema(BaseSchema):
    # Schema for a layout analysis result: paragraphs and figures are plain
    # Element entries, tables carry the full TableStructureRecognizerSchema.
    # Every field is required (declared with `Field(...)`).

    paragraphs: List[Element] = Field(
        ...,
        description="List of detected paragraphs",
    )
    tables: List[TableStructureRecognizerSchema] = Field(
        ...,
        description="List of detected tables",
    )
    figures: List[Element] = Field(
        ...,
        description="List of detected figures",
    )
|
126
|
+
|
127
|
+
|
128
|
+
class WordPrediction(BaseSchema):
    # Schema for one recognized word: quadrilateral, text, direction and the
    # recognition/detection scores.
    # Every field is required (declared with `Field(...)`).

    # Exactly four points, each exactly two integers.
    points: conlist(
        conlist(int, min_length=2, max_length=2),
        min_length=4,
        max_length=4,
    ) = Field(
        ...,
        description="Bounding box of the word in the format [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]",
    )
    content: str = Field(..., description="Text content of the word")
    direction: str = Field(
        ..., description="Text direction, e.g., 'horizontal' or 'vertical'"
    )
    rec_score: float = Field(
        ..., description="Confidence score of the word recognition"
    )
    det_score: float = Field(
        ...,
        description="Confidence score of the word detection",
    )
|
148
|
+
|
149
|
+
|
150
|
+
class TextDetectorSchema(BaseSchema):
    # Schema for a text detection result: detected quadrilaterals and their
    # confidence scores.
    # NOTE(review): `points` and `scores` look like parallel lists (one score
    # per region); nothing in this schema enforces equal lengths.

    # Each entry is exactly four points, each point exactly two integers.
    points: List[
        conlist(
            conlist(int, min_length=2, max_length=2),
            min_length=4,
            max_length=4,
        )
    ] = Field(
        ...,
        description="List of bounding boxes of detected text regions in the format [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]",
    )
    scores: List[float] = Field(
        ...,
        description="List of confidence scores for each detected text region",
    )
|
165
|
+
|
166
|
+
|
167
|
+
class OCRSchema(BaseSchema):
    # Schema for an OCR result: a required list of WordPrediction entries.

    words: List[WordPrediction] = Field(
        ...,
        description="List of recognized words with their bounding boxes, content, direction, and scores",
    )
|
172
|
+
|
173
|
+
|
174
|
+
class LayoutParserSchema(BaseSchema):
    # Schema for a layout parser result. Unlike LayoutAnalyzerSchema, tables
    # are plain Element entries here, the same as paragraphs and figures.
    # Every field is required (declared with `Field(...)`).

    paragraphs: List[Element] = Field(..., description="List of detected paragraphs")
    tables: List[Element] = Field(..., description="List of detected tables")
    figures: List[Element] = Field(..., description="List of detected figures")
|
178
|
+
|
179
|
+
|
180
|
+
class FigureSchema(BaseSchema):
    # Schema for one figure: its box, reading order, the paragraphs attached
    # to it and a text direction.
    # Every field is required (declared with `Field(...)`).
    # `order` is declared exactly once; keeping it directly after `box`
    # preserves the field order of the model.

    # Exactly four integers (conlist with min_length == max_length == 4).
    box: conlist(int, min_length=4, max_length=4) = Field(
        ..., description="Bounding box of the figure in the format [x1, y1, x2, y2]"
    )
    # The annotation accepts None as well as an integer.
    order: Union[int, None] = Field(
        ..., description="Order of the figure in the document"
    )
    paragraphs: List[ParagraphSchema] = Field(
        ..., description="List of paragraphs associated with the figure"
    )
    # The annotation accepts None as well as a string.
    direction: Union[str, None] = Field(
        ..., description="Text direction, e.g., ['horizontal' or 'vertical']"
    )
|
196
|
+
|
197
|
+
|
198
|
+
class DocumentAnalyzerSchema(BaseSchema):
    # Schema for a full document analysis result (paragraphs, tables, words,
    # figures) with convenience methods that delegate to the export helpers.
    # Every field is required (declared with `Field(...)`).

    paragraphs: List[ParagraphSchema] = Field(
        ..., description="List of detected paragraphs"
    )
    tables: List[TableStructureRecognizerSchema] = Field(
        ..., description="List of detected tables"
    )
    words: List[WordPrediction] = Field(..., description="List of recognized words")
    figures: List[FigureSchema] = Field(..., description="List of detected figures")

    def to_html(self, out_path: str, **kwargs):
        """Export this result through ``export_html``.

        Args:
            out_path: Passed to ``export_html`` as its output path argument.
            **kwargs: Forwarded unchanged to ``export_html``.

        Returns:
            Whatever ``export_html`` returns.
        """
        return export_html(self, out_path, **kwargs)

    def to_markdown(self, out_path: str, **kwargs):
        """Export this result through ``export_markdown``.

        Args:
            out_path: Passed to ``export_markdown`` as its output path argument.
            **kwargs: Forwarded unchanged to ``export_markdown``.

        Returns:
            Whatever ``export_markdown`` returns.
        """
        return export_markdown(self, out_path, **kwargs)

    def to_csv(self, out_path: str, **kwargs):
        """Export this result through ``export_csv``.

        Args:
            out_path: Passed to ``export_csv`` as its output path argument.
            **kwargs: Forwarded unchanged to ``export_csv``.

        Returns:
            Whatever ``export_csv`` returns.
        """
        return export_csv(self, out_path, **kwargs)

    def to_json(self, out_path: str, **kwargs):
        """Export this result through ``export_json``.

        Args:
            out_path: Passed to ``export_json`` as its output path argument.
            **kwargs: Forwarded unchanged to ``export_json``.

        Returns:
            Whatever ``export_json`` returns.
        """
        return export_json(self, out_path, **kwargs)
|
219
|
+
|
220
|
+
|
221
|
+
class TextRecognizerSchema(BaseSchema):
    # Schema for a text recognition result: recognized strings with their
    # directions, scores and quadrilaterals.
    # NOTE(review): the four lists look parallel (one entry per recognized
    # text); nothing in this schema enforces equal lengths.

    contents: List[str] = Field(
        ...,
        description="List of recognized text contents",
    )
    directions: List[str] = Field(
        ..., description="List of text directions, e.g., ['horizontal' or 'vertical']"
    )
    scores: List[float] = Field(
        ..., description="List of confidence scores for each recognized text"
    )
    # Each entry is exactly four points, each point exactly two integers.
    points: List[
        conlist(
            conlist(int, min_length=2, max_length=2),
            min_length=4,
            max_length=4,
        )
    ] = Field(
        ...,
        description="List of bounding boxes of recognized text in the format [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]",
    )
|
@@ -1,5 +1,3 @@
|
|
1
|
-
from typing import List, Union
|
2
|
-
|
3
1
|
import cv2
|
4
2
|
import os
|
5
3
|
import onnx
|
@@ -7,17 +5,17 @@ import onnxruntime
|
|
7
5
|
import torch
|
8
6
|
import torchvision.transforms as T
|
9
7
|
from PIL import Image
|
10
|
-
from pydantic import conlist
|
11
8
|
|
12
9
|
from .constants import ROOT_DIR
|
13
10
|
|
14
|
-
from .base import BaseModelCatalog, BaseModule
|
11
|
+
from .base import BaseModelCatalog, BaseModule
|
15
12
|
from .configs import TableStructureRecognizerRTDETRv2Config
|
16
13
|
from .layout_parser import filter_contained_rectangles_within_category
|
17
14
|
from .models import RTDETRv2
|
18
15
|
from .postprocessor import RTDETRPostProcessor
|
19
16
|
from .utils.misc import calc_intersection, filter_by_flag, is_contained
|
20
17
|
from .utils.visualizer import table_visualizer
|
18
|
+
from .schemas import TableStructureRecognizerSchema
|
21
19
|
|
22
20
|
|
23
21
|
class TableStructureRecognizerModelCatalog(BaseModelCatalog):
|
@@ -26,31 +24,6 @@ class TableStructureRecognizerModelCatalog(BaseModelCatalog):
|
|
26
24
|
self.register("rtdetrv2", TableStructureRecognizerRTDETRv2Config, RTDETRv2)
|
27
25
|
|
28
26
|
|
29
|
-
class TableCellSchema(BaseSchema):
|
30
|
-
col: int
|
31
|
-
row: int
|
32
|
-
col_span: int
|
33
|
-
row_span: int
|
34
|
-
box: conlist(int, min_length=4, max_length=4)
|
35
|
-
contents: Union[str, None]
|
36
|
-
|
37
|
-
|
38
|
-
class TableLineSchema(BaseSchema):
|
39
|
-
box: conlist(int, min_length=4, max_length=4)
|
40
|
-
score: float
|
41
|
-
|
42
|
-
|
43
|
-
class TableStructureRecognizerSchema(BaseSchema):
|
44
|
-
box: conlist(int, min_length=4, max_length=4)
|
45
|
-
n_row: int
|
46
|
-
n_col: int
|
47
|
-
rows: List[TableLineSchema]
|
48
|
-
cols: List[TableLineSchema]
|
49
|
-
cells: List[TableCellSchema]
|
50
|
-
spans: List[TableLineSchema]
|
51
|
-
order: int
|
52
|
-
|
53
|
-
|
54
27
|
def extract_cells(row_boxes, col_boxes):
|
55
28
|
cells = []
|
56
29
|
for i, row_box in enumerate(row_boxes):
|
yomitoku/text_detector.py
CHANGED
@@ -1,11 +1,8 @@
|
|
1
|
-
from typing import List
|
2
|
-
|
3
1
|
import numpy as np
|
4
2
|
import torch
|
5
3
|
import os
|
6
|
-
from pydantic import conlist
|
7
4
|
|
8
|
-
from .base import BaseModelCatalog, BaseModule
|
5
|
+
from .base import BaseModelCatalog, BaseModule
|
9
6
|
from .configs import (
|
10
7
|
TextDetectorDBNetConfig,
|
11
8
|
TextDetectorDBNetV2Config,
|
@@ -19,6 +16,7 @@ from .models import DBNet
|
|
19
16
|
from .postprocessor import DBnetPostProcessor
|
20
17
|
from .utils.visualizer import det_visualizer
|
21
18
|
from .constants import ROOT_DIR
|
19
|
+
from .schemas import TextDetectorSchema
|
22
20
|
|
23
21
|
import onnx
|
24
22
|
import onnxruntime
|
@@ -31,17 +29,6 @@ class TextDetectorModelCatalog(BaseModelCatalog):
|
|
31
29
|
self.register("dbnetv2", TextDetectorDBNetV2Config, DBNet)
|
32
30
|
|
33
31
|
|
34
|
-
class TextDetectorSchema(BaseSchema):
|
35
|
-
points: List[
|
36
|
-
conlist(
|
37
|
-
conlist(int, min_length=2, max_length=2),
|
38
|
-
min_length=4,
|
39
|
-
max_length=4,
|
40
|
-
)
|
41
|
-
]
|
42
|
-
scores: List[float]
|
43
|
-
|
44
|
-
|
45
32
|
class TextDetector(BaseModule):
|
46
33
|
model_catalog = TextDetectorModelCatalog()
|
47
34
|
|
yomitoku/text_recognizer.py
CHANGED
@@ -1,12 +1,9 @@
|
|
1
|
-
from typing import List
|
2
|
-
|
3
1
|
import numpy as np
|
4
2
|
import torch
|
5
3
|
import os
|
6
4
|
import unicodedata
|
7
|
-
from pydantic import conlist
|
8
5
|
|
9
|
-
from .base import BaseModelCatalog, BaseModule
|
6
|
+
from .base import BaseModelCatalog, BaseModule
|
10
7
|
from .configs import (
|
11
8
|
TextRecognizerPARSeqConfig,
|
12
9
|
TextRecognizerPARSeqSmallConfig,
|
@@ -19,6 +16,8 @@ from .utils.misc import load_charset
|
|
19
16
|
from .utils.visualizer import rec_visualizer
|
20
17
|
|
21
18
|
from .constants import ROOT_DIR
|
19
|
+
from .schemas import TextRecognizerSchema
|
20
|
+
|
22
21
|
import onnx
|
23
22
|
import onnxruntime
|
24
23
|
|
@@ -31,19 +30,6 @@ class TextRecognizerModelCatalog(BaseModelCatalog):
|
|
31
30
|
self.register("parseq-small", TextRecognizerPARSeqSmallConfig, PARSeq)
|
32
31
|
|
33
32
|
|
34
|
-
class TextRecognizerSchema(BaseSchema):
|
35
|
-
contents: List[str]
|
36
|
-
directions: List[str]
|
37
|
-
scores: List[float]
|
38
|
-
points: List[
|
39
|
-
conlist(
|
40
|
-
conlist(int, min_length=2, max_length=2),
|
41
|
-
min_length=4,
|
42
|
-
max_length=4,
|
43
|
-
)
|
44
|
-
]
|
45
|
-
|
46
|
-
|
47
33
|
class TextRecognizer(BaseModule):
|
48
34
|
model_catalog = TextRecognizerModelCatalog()
|
49
35
|
|
@@ -94,10 +80,21 @@ class TextRecognizer(BaseModule):
|
|
94
80
|
self.model.to(self.device)
|
95
81
|
|
96
82
|
def preprocess(self, img, polygons):
|
83
|
+
if polygons is None:
|
84
|
+
h, w = img.shape[:2]
|
85
|
+
polygons = [
|
86
|
+
[
|
87
|
+
[0, 0],
|
88
|
+
[w, 0],
|
89
|
+
[w, h],
|
90
|
+
[0, h],
|
91
|
+
]
|
92
|
+
]
|
93
|
+
|
97
94
|
dataset = ParseqDataset(self._cfg, img, polygons)
|
98
95
|
dataloader = self._make_mini_batch(dataset)
|
99
96
|
|
100
|
-
return dataloader
|
97
|
+
return dataloader, polygons
|
101
98
|
|
102
99
|
def _make_mini_batch(self, dataset):
|
103
100
|
mini_batches = []
|
@@ -150,7 +147,7 @@ class TextRecognizer(BaseModule):
|
|
150
147
|
|
151
148
|
return pred, score, directions
|
152
149
|
|
153
|
-
def __call__(self, img, points, vis=None):
|
150
|
+
def __call__(self, img, points=None, vis=None):
|
154
151
|
"""
|
155
152
|
Apply the recognition model to the input image.
|
156
153
|
|
@@ -160,7 +157,7 @@ class TextRecognizer(BaseModule):
|
|
160
157
|
vis (np.ndarray, optional): rendering image. Defaults to None.
|
161
158
|
"""
|
162
159
|
|
163
|
-
dataloader = self.preprocess(img, points)
|
160
|
+
dataloader, points = self.preprocess(img, points)
|
164
161
|
preds = []
|
165
162
|
scores = []
|
166
163
|
directions = []
|
yomitoku/utils/searchable_pdf.py
CHANGED
@@ -72,8 +72,6 @@ def create_searchable_pdf(images, ocr_results, output_path, font_path=None):
|
|
72
72
|
|
73
73
|
for i, (image, ocr_result) in enumerate(zip(images, ocr_results)):
|
74
74
|
image = Image.fromarray(image[:, :, ::-1]) # Convert BGR to RGB
|
75
|
-
pdfmetrics.registerFont(TTFont("MPLUS1p-Medium", FONT_PATH))
|
76
|
-
|
77
75
|
image_path = f"tmp_{i}.png"
|
78
76
|
image.save(image_path)
|
79
77
|
w, h = image.size
|
yomitoku/utils/visualizer.py
CHANGED
@@ -10,17 +10,28 @@ logger = set_logger(__name__, "INFO")
|
|
10
10
|
def _reading_order_visualizer(img, elements, line_color, tip_size):
|
11
11
|
out = img.copy()
|
12
12
|
for i, element in enumerate(elements):
|
13
|
-
if i == 0:
|
14
|
-
continue
|
15
|
-
|
16
|
-
prev_element = elements[i - 1]
|
17
13
|
cur_x1, cur_y1, cur_x2, cur_y2 = element.box
|
18
|
-
prev_x1, prev_y1, prev_x2, prev_y2 = prev_element.box
|
19
14
|
|
20
15
|
cur_center = (
|
21
16
|
cur_x1 + (cur_x2 - cur_x1) / 2,
|
22
17
|
cur_y1 + (cur_y2 - cur_y1) / 2,
|
23
18
|
)
|
19
|
+
|
20
|
+
cv2.putText(
|
21
|
+
out,
|
22
|
+
str(i),
|
23
|
+
(int(cur_center[0]), int(cur_center[1])),
|
24
|
+
cv2.FONT_HERSHEY_SIMPLEX,
|
25
|
+
1,
|
26
|
+
(0, 200, 0),
|
27
|
+
2,
|
28
|
+
)
|
29
|
+
|
30
|
+
if i == 0:
|
31
|
+
continue
|
32
|
+
|
33
|
+
prev_element = elements[i - 1]
|
34
|
+
prev_x1, prev_y1, prev_x2, prev_y2 = prev_element.box
|
24
35
|
prev_center = (
|
25
36
|
prev_x1 + (prev_x2 - prev_x1) / 2,
|
26
37
|
prev_y1 + (prev_y2 - prev_y1) / 2,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: yomitoku
|
3
|
-
Version: 0.9.3
|
3
|
+
Version: 0.10.0
|
4
4
|
Summary: Yomitoku is an AI-powered document image analysis package designed specifically for the Japanese language.
|
5
5
|
Author-email: Kotaro Kinoshita <kotaro.kinoshita@mlism.com>
|
6
6
|
License: CC BY-NC-SA 4.0
|
@@ -41,7 +41,7 @@ Description-Content-Type: text/markdown
|
|
41
41
|
YomiToku は日本語に特化した AI 文章画像解析エンジン(Document AI)です。画像内の文字の全文 OCR およびレイアウト解析機能を有しており、画像内の文字情報や図表を認識、抽出、変換します。
|
42
42
|
|
43
43
|
- 🤖 日本語データセットで学習した 4 種類(文字位置の検知、文字列認識、レイアウト解析、表の構造認識)の AI モデルを搭載しています。4 種類のモデルはすべて独自に学習されたモデルで日本語文書に対して、高精度に推論可能です。
|
44
|
-
- 🇯🇵 各モデルは日本語の文書画像に特化して学習されており、7000
|
44
|
+
- 🇯🇵 各モデルは日本語の文書画像に特化して学習されており、7000 文字を超える日本語文字の認識をサポート、手書き文字、縦書きなど日本語特有のレイアウト構造の文書画像の解析も可能です。(日本語以外にも英語の文書に対しても対応しています)。
|
45
45
|
- 📈 レイアウト解析、表の構造解析, 読み順推定機能により、文書画像のレイアウトの意味的構造を壊さずに情報を抽出することが可能です。
|
46
46
|
- 📄 多様な出力形式をサポートしています。html やマークダウン、json、csv のいずれかのフォーマットに変換可能です。また、文書内に含まれる図表、画像の抽出の出力も可能です。文書画像をサーチャブルPDFに変換する処理もサポートしています。
|
47
47
|
- ⚡ GPU 環境で高速に動作し、効率的に文書の文字起こし解析が可能です。また、VRAM も 8GB 以内で動作し、ハイエンドな GPU を用意する必要はありません。
|
@@ -1,16 +1,17 @@
|
|
1
1
|
yomitoku/__init__.py,sha256=kXOM8RbpwwLABG3p3vPT3dJWBk4JX2MFGrOeBEW0hKM,543
|
2
2
|
yomitoku/base.py,sha256=9U3sfe69O6vuO430JzzKQQNkgPsLM9WdLfOUUhp3Ljs,3878
|
3
3
|
yomitoku/constants.py,sha256=2jya14UflDkMdYWMKc-ZllkWbJW2qh59Cnt2brrgNb4,693
|
4
|
-
yomitoku/document_analyzer.py,sha256=
|
5
|
-
yomitoku/layout_analyzer.py,sha256=
|
6
|
-
yomitoku/layout_parser.py,sha256=
|
7
|
-
yomitoku/ocr.py,sha256=
|
4
|
+
yomitoku/document_analyzer.py,sha256=FyF85m7k-BxzpOKb3sIfBRpxh_4NDPC7EC3x91hxoGo,15959
|
5
|
+
yomitoku/layout_analyzer.py,sha256=soLDcX09NlNicRYenOhFLgq8L8ct9xo7N9Hsj1IWKZw,1643
|
6
|
+
yomitoku/layout_parser.py,sha256=BSWiL8Xl7c0CY2CXNteLye5e-bLdR1hXKtps94kon9w,7440
|
7
|
+
yomitoku/ocr.py,sha256=gKWNciOQIgUcYrNmKhksSK8TSNisK8wY2zG2ZPXh2Fk,1920
|
8
8
|
yomitoku/reading_order.py,sha256=_T09PqT7guk57zWo4HdSazLSQTwM91piyELA_wNHQAQ,7521
|
9
|
-
yomitoku/
|
10
|
-
yomitoku/
|
11
|
-
yomitoku/
|
9
|
+
yomitoku/schemas.py,sha256=azI9iVQ88-JPSuRmDVxCcdr2KNICJmLuMl0AQmfof-0,7582
|
10
|
+
yomitoku/table_structure_recognizer.py,sha256=UjYdzY-9dIClWP9iz0HCLr1DU2UY7n7Rtr7L9vOJwDU,9043
|
11
|
+
yomitoku/text_detector.py,sha256=gXofo7ywFsI3hNMKKfYoOwlYVDerJym2Zg_Eq7NNGv4,4136
|
12
|
+
yomitoku/text_recognizer.py,sha256=hS_spLnINVkMFOWm1bBG3WVfI7rK4o7ONt_nTUnpMLM,5969
|
12
13
|
yomitoku/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
|
-
yomitoku/cli/main.py,sha256=
|
14
|
+
yomitoku/cli/main.py,sha256=s7wxBtgxPu7P-ARtjVmQlOosus3srlfS4-RuV0BFpyM,13821
|
14
15
|
yomitoku/cli/mcp_server.py,sha256=WnWzxd13HaemC3b-5i9B9NVBGc3WGfum2nYhoBolEnk,5641
|
15
16
|
yomitoku/configs/__init__.py,sha256=x5-ccjGiP6xxRtDPT7f1Enl7SsE0hSk0G8f7eF9V85I,886
|
16
17
|
yomitoku/configs/cfg_layout_parser_rtdtrv2.py,sha256=8PRxB2Ar9UF7-DLtbgSokhrzdXb0veWI6Wc-X8qigRw,2329
|
@@ -25,7 +26,7 @@ yomitoku/data/__init__.py,sha256=KAofFc9rk9ZdTKBjemu9RM8Vj9XnKbWC2MPZ2RWtOdE,82
|
|
25
26
|
yomitoku/data/dataset.py,sha256=lpBcpkMuQzRIyLJ4_mqtuhR9s2ZmzgBgc-XYuE_b2Sc,1326
|
26
27
|
yomitoku/data/functions.py,sha256=RExCUxI3-gccIMw-H0ribX2jeGKkrJWhS4fNn_12c3Y,7878
|
27
28
|
yomitoku/export/__init__.py,sha256=gmlikMHRXfzfJ_8q4fyDlnpGms-x1oggQOwJEWHMgBU,508
|
28
|
-
yomitoku/export/export_csv.py,sha256=
|
29
|
+
yomitoku/export/export_csv.py,sha256=4U4KQ2RcBQmyUZ9O7a4uLoB6RUw80HPL1EEJUDwQlcI,4044
|
29
30
|
yomitoku/export/export_html.py,sha256=LQDyZgbzmI0qJ0-FEK-54r9816H3L9hD10ChMcw0KyA,5620
|
30
31
|
yomitoku/export/export_json.py,sha256=iNG37tdIuYG2x3NiiZemKaB6-X45WrhVPZhbX7RUzRI,2410
|
31
32
|
yomitoku/export/export_markdown.py,sha256=KrdxDmKzVP_LbTKuDNGGsT31QOPKVsNNlb6wtLEW-1Q,4705
|
@@ -51,9 +52,9 @@ yomitoku/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
51
52
|
yomitoku/utils/graph.py,sha256=LKNB8ZhSQwOZMfeAimPMF5UCVVr2ZaUWoGDkz8z-uGU,456
|
52
53
|
yomitoku/utils/logger.py,sha256=uOmtQDr0A0JD7wyFshedL08BiNrQorHnpktRXba8bjU,424
|
53
54
|
yomitoku/utils/misc.py,sha256=r92x45kQR8lC5jO1MZaHBDtcCWBkQXg_WS9H4RXJzSY,4127
|
54
|
-
yomitoku/utils/searchable_pdf.py,sha256=
|
55
|
-
yomitoku/utils/visualizer.py,sha256=
|
56
|
-
yomitoku-0.
|
57
|
-
yomitoku-0.
|
58
|
-
yomitoku-0.
|
59
|
-
yomitoku-0.
|
55
|
+
yomitoku/utils/searchable_pdf.py,sha256=taZ-XtXN4RItePMDv4q0fRVlryusdkexA3TCXzwlXRo,3497
|
56
|
+
yomitoku/utils/visualizer.py,sha256=ycC7SGuyXGGnX9KMJecdcEe1PWq30fG-EghB0E0EmWY,5468
|
57
|
+
yomitoku-0.10.0.dist-info/METADATA,sha256=Xd2cOvxpBFl-jSyGK61MLEwwC7CDEIEUIUAVk0L58tI,8870
|
58
|
+
yomitoku-0.10.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
59
|
+
yomitoku-0.10.0.dist-info/entry_points.txt,sha256=n3c8bQSj5Be5GHAOv_NZ8cldJFmWeigQxSmteFTmu_k,96
|
60
|
+
yomitoku-0.10.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|