yomitoku 0.9.0__py3-none-any.whl → 0.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yomitoku/cli/main.py +36 -10
- yomitoku/cli/{mcp.py → mcp_server.py} +37 -6
- yomitoku/data/dataset.py +20 -10
- yomitoku/data/functions.py +19 -20
- yomitoku/document_analyzer.py +21 -6
- yomitoku/reading_order.py +38 -8
- yomitoku/utils/misc.py +49 -2
- yomitoku/utils/searchable_pdf.py +116 -0
- {yomitoku-0.9.0.dist-info → yomitoku-0.9.2.dist-info}/METADATA +6 -2
- {yomitoku-0.9.0.dist-info → yomitoku-0.9.2.dist-info}/RECORD +12 -11
- {yomitoku-0.9.0.dist-info → yomitoku-0.9.2.dist-info}/entry_points.txt +1 -1
- {yomitoku-0.9.0.dist-info → yomitoku-0.9.2.dist-info}/WHEEL +0 -0
yomitoku/cli/main.py
CHANGED
@@ -9,6 +9,7 @@ from ..constants import SUPPORT_OUTPUT_FORMAT
|
|
9
9
|
from ..data.functions import load_image, load_pdf
|
10
10
|
from ..document_analyzer import DocumentAnalyzer
|
11
11
|
from ..utils.logger import set_logger
|
12
|
+
from ..utils.searchable_pdf import create_searchable_pdf
|
12
13
|
|
13
14
|
from ..export import save_csv, save_html, save_json, save_markdown
|
14
15
|
from ..export import convert_json, convert_csv, convert_html, convert_markdown
|
@@ -80,11 +81,13 @@ def process_single_file(args, analyzer, path, format):
|
|
80
81
|
else:
|
81
82
|
imgs = load_image(path)
|
82
83
|
|
84
|
+
format_results = []
|
83
85
|
results = []
|
84
86
|
for page, img in enumerate(imgs):
|
85
87
|
result, ocr, layout = analyzer(img)
|
86
88
|
dirname = path.parent.name
|
87
89
|
filename = path.stem
|
90
|
+
results.append(result)
|
88
91
|
|
89
92
|
# cv2.imwrite(
|
90
93
|
# os.path.join(args.outdir, f"{dirname}_{filename}_p{page+1}.jpg"), img
|
@@ -92,7 +95,7 @@ def process_single_file(args, analyzer, path, format):
|
|
92
95
|
|
93
96
|
if ocr is not None:
|
94
97
|
out_path = os.path.join(
|
95
|
-
args.outdir, f"{dirname}_{filename}_p{page+1}_ocr.jpg"
|
98
|
+
args.outdir, f"{dirname}_{filename}_p{page + 1}_ocr.jpg"
|
96
99
|
)
|
97
100
|
|
98
101
|
save_image(ocr, out_path)
|
@@ -100,13 +103,15 @@ def process_single_file(args, analyzer, path, format):
|
|
100
103
|
|
101
104
|
if layout is not None:
|
102
105
|
out_path = os.path.join(
|
103
|
-
args.outdir, f"{dirname}_{filename}_p{page+1}_layout.jpg"
|
106
|
+
args.outdir, f"{dirname}_{filename}_p{page + 1}_layout.jpg"
|
104
107
|
)
|
105
108
|
|
106
109
|
save_image(layout, out_path)
|
107
110
|
logger.info(f"Output file: {out_path}")
|
108
111
|
|
109
|
-
out_path = os.path.join(
|
112
|
+
out_path = os.path.join(
|
113
|
+
args.outdir, f"{dirname}_{filename}_p{page + 1}.{format}"
|
114
|
+
)
|
110
115
|
|
111
116
|
if format == "json":
|
112
117
|
if args.combine:
|
@@ -128,7 +133,7 @@ def process_single_file(args, analyzer, path, format):
|
|
128
133
|
figure_dir=args.figure_dir,
|
129
134
|
)
|
130
135
|
|
131
|
-
|
136
|
+
format_results.append(
|
132
137
|
{
|
133
138
|
"format": format,
|
134
139
|
"data": json.model_dump(),
|
@@ -155,7 +160,7 @@ def process_single_file(args, analyzer, path, format):
|
|
155
160
|
figure_dir=args.figure_dir,
|
156
161
|
)
|
157
162
|
|
158
|
-
|
163
|
+
format_results.append(
|
159
164
|
{
|
160
165
|
"format": format,
|
161
166
|
"data": csv,
|
@@ -186,7 +191,7 @@ def process_single_file(args, analyzer, path, format):
|
|
186
191
|
encoding=args.encoding,
|
187
192
|
)
|
188
193
|
|
189
|
-
|
194
|
+
format_results.append(
|
190
195
|
{
|
191
196
|
"format": format,
|
192
197
|
"data": html,
|
@@ -217,14 +222,14 @@ def process_single_file(args, analyzer, path, format):
|
|
217
222
|
encoding=args.encoding,
|
218
223
|
)
|
219
224
|
|
220
|
-
|
225
|
+
format_results.append(
|
221
226
|
{
|
222
227
|
"format": format,
|
223
228
|
"data": md,
|
224
229
|
}
|
225
230
|
)
|
226
231
|
|
227
|
-
out = merge_all_pages(
|
232
|
+
out = merge_all_pages(format_results)
|
228
233
|
if args.combine:
|
229
234
|
out_path = os.path.join(args.outdir, f"{dirname}_{filename}.{format}")
|
230
235
|
save_merged_file(
|
@@ -233,6 +238,15 @@ def process_single_file(args, analyzer, path, format):
|
|
233
238
|
out,
|
234
239
|
)
|
235
240
|
|
241
|
+
if args.searchable_pdf:
|
242
|
+
pdf_path = os.path.join(args.outdir, f"{filename}.pdf")
|
243
|
+
create_searchable_pdf(
|
244
|
+
imgs,
|
245
|
+
results,
|
246
|
+
output_path=pdf_path,
|
247
|
+
)
|
248
|
+
logger.info(f"Output SearchablePDF: {pdf_path}")
|
249
|
+
|
236
250
|
|
237
251
|
def main():
|
238
252
|
parser = argparse.ArgumentParser()
|
@@ -341,6 +355,17 @@ def main():
|
|
341
355
|
action="store_true",
|
342
356
|
help="if set, ignore meta information(header, footer) in the output",
|
343
357
|
)
|
358
|
+
parser.add_argument(
|
359
|
+
"--reading_order",
|
360
|
+
default="auto",
|
361
|
+
type=str,
|
362
|
+
choices=["auto", "left2right", "top2bottom", "right2left"],
|
363
|
+
)
|
364
|
+
parser.add_argument(
|
365
|
+
"--searchable_pdf",
|
366
|
+
action="store_true",
|
367
|
+
help="if set, create searchable PDF",
|
368
|
+
)
|
344
369
|
|
345
370
|
args = parser.parse_args()
|
346
371
|
|
@@ -394,6 +419,7 @@ def main():
|
|
394
419
|
visualize=args.vis,
|
395
420
|
device=args.device,
|
396
421
|
ignore_meta=args.ignore_meta,
|
422
|
+
reading_order=args.reading_order,
|
397
423
|
)
|
398
424
|
|
399
425
|
os.makedirs(args.outdir, exist_ok=True)
|
@@ -408,7 +434,7 @@ def main():
|
|
408
434
|
logger.info(f"Processing file: {file_path}")
|
409
435
|
process_single_file(args, analyzer, file_path, format)
|
410
436
|
end = time.time()
|
411
|
-
logger.info(f"Total Processing time: {end-start:.2f} sec")
|
437
|
+
logger.info(f"Total Processing time: {end - start:.2f} sec")
|
412
438
|
except Exception:
|
413
439
|
continue
|
414
440
|
else:
|
@@ -416,7 +442,7 @@ def main():
|
|
416
442
|
logger.info(f"Processing file: {path}")
|
417
443
|
process_single_file(args, analyzer, path, format)
|
418
444
|
end = time.time()
|
419
|
-
logger.info(f"Total Processing time: {end-start:.2f} sec")
|
445
|
+
logger.info(f"Total Processing time: {end - start:.2f} sec")
|
420
446
|
|
421
447
|
|
422
448
|
if __name__ == "__main__":
|
@@ -1,14 +1,20 @@
|
|
1
|
-
import json
|
2
|
-
import io
|
3
1
|
import csv
|
2
|
+
import io
|
3
|
+
import json
|
4
4
|
import os
|
5
|
+
from argparse import ArgumentParser
|
5
6
|
from pathlib import Path
|
6
7
|
|
7
8
|
from mcp.server.fastmcp import Context, FastMCP
|
8
9
|
|
9
10
|
from yomitoku import DocumentAnalyzer
|
10
11
|
from yomitoku.data.functions import load_image, load_pdf
|
11
|
-
from yomitoku.export import
|
12
|
+
from yomitoku.export import (
|
13
|
+
convert_csv,
|
14
|
+
convert_html,
|
15
|
+
convert_json,
|
16
|
+
convert_markdown,
|
17
|
+
)
|
12
18
|
|
13
19
|
try:
|
14
20
|
RESOURCE_DIR = os.environ["RESOURCE_DIR"]
|
@@ -154,12 +160,37 @@ async def get_file_list() -> list[str]:
|
|
154
160
|
return os.listdir(RESOURCE_DIR)
|
155
161
|
|
156
162
|
|
157
|
-
def run_mcp_server():
|
163
|
+
def run_mcp_server(transport="stdio", mount_path=None):
    """
    Run the MCP server.

    Args:
        transport: Transport to serve over, either ``"stdio"`` or ``"sse"``.
        mount_path: Mount path forwarded to the server; only used when
            ``transport`` is ``"sse"``.

    Raises:
        ValueError: If ``transport`` is not one of the supported values.
    """

    if transport == "stdio":
        mcp.run()
    elif transport == "sse":
        mcp.run(transport=transport, mount_path=mount_path)
    else:
        # Previously an unknown transport fell through and the function
        # returned without starting anything; fail loudly instead.
        raise ValueError(
            f"Unsupported transport: {transport!r} (expected 'stdio' or 'sse')"
        )
|
172
|
+
|
173
|
+
|
174
|
+
def main():
    """Command-line entry point: parse the transport options and start the server.

    Options:
        --transport / -t: ``stdio`` (default) or ``sse``.
        --mount_path / -m: value forwarded to ``run_mcp_server``.
    """
    cli = ArgumentParser(description="Run the MCP server.")

    # Each entry is (flags, keyword arguments) for one add_argument call.
    option_specs = (
        (
            ("--transport", "-t"),
            {
                "type": str,
                "default": "stdio",
                "choices": ["stdio", "sse"],
                "help": "Transport method for the MCP server.",
            },
        ),
        (
            ("--mount_path", "-m"),
            {
                "type": str,
                "default": None,
                "help": "Mount path for the MCP server (only used with SSE transport).",
            },
        ),
    )
    for flags, spec in option_specs:
        cli.add_argument(*flags, **spec)

    options = cli.parse_args()
    run_mcp_server(transport=options.transport, mount_path=options.mount_path)
|
162
193
|
|
163
194
|
|
164
195
|
if __name__ == "__main__":
|
165
|
-
|
196
|
+
main()
|
yomitoku/data/dataset.py
CHANGED
@@ -8,9 +8,11 @@ from .functions import (
|
|
8
8
|
validate_quads,
|
9
9
|
)
|
10
10
|
|
11
|
+
from concurrent.futures import ThreadPoolExecutor
|
12
|
+
|
11
13
|
|
12
14
|
class ParseqDataset(Dataset):
|
13
|
-
def __init__(self, cfg, img, quads):
|
15
|
+
def __init__(self, cfg, img, quads, num_workers=8):
|
14
16
|
self.img = img[:, :, ::-1]
|
15
17
|
self.quads = quads
|
16
18
|
self.cfg = cfg
|
@@ -22,19 +24,27 @@ class ParseqDataset(Dataset):
|
|
22
24
|
]
|
23
25
|
)
|
24
26
|
|
25
|
-
|
27
|
+
with ThreadPoolExecutor(max_workers=num_workers) as executor:
|
28
|
+
data = list(executor.map(self.preprocess, self.quads))
|
26
29
|
|
27
|
-
|
28
|
-
|
30
|
+
self.data = [tensor for tensor in data if tensor is not None]
|
31
|
+
|
32
|
+
def preprocess(self, quad):
|
33
|
+
if validate_quads(self.img, quad) is None:
|
34
|
+
return None
|
35
|
+
|
36
|
+
roi_img = extract_roi_with_perspective(self.img, quad)
|
29
37
|
|
30
|
-
def __getitem__(self, index):
|
31
|
-
polygon = self.quads[index]
|
32
|
-
roi_img = extract_roi_with_perspective(self.img, polygon)
|
33
38
|
if roi_img is None:
|
34
|
-
return
|
39
|
+
return None
|
35
40
|
|
36
41
|
roi_img = rotate_text_image(roi_img, thresh_aspect=2)
|
37
42
|
resized = resize_with_padding(roi_img, self.cfg.data.img_size)
|
38
|
-
tensor = self.transform(resized)
|
39
43
|
|
40
|
-
return
|
44
|
+
return resized
|
45
|
+
|
46
|
+
def __len__(self):
|
47
|
+
return len(self.data)
|
48
|
+
|
49
|
+
def __getitem__(self, index):
|
50
|
+
return self.transform(self.data[index])
|
yomitoku/data/functions.py
CHANGED
@@ -191,7 +191,7 @@ def array_to_tensor(img: np.ndarray) -> torch.Tensor:
|
|
191
191
|
return tensor
|
192
192
|
|
193
193
|
|
194
|
-
def validate_quads(img: np.ndarray,
|
194
|
+
def validate_quads(img: np.ndarray, quad: list[list[list[int]]]):
|
195
195
|
"""
|
196
196
|
Validate the vertices of the quadrilateral.
|
197
197
|
|
@@ -204,23 +204,23 @@ def validate_quads(img: np.ndarray, quads: list[list[list[int]]]):
|
|
204
204
|
"""
|
205
205
|
|
206
206
|
h, w = img.shape[:2]
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
207
|
+
if len(quad) != 4:
|
208
|
+
# raise ValueError("The number of vertices must be 4.")
|
209
|
+
return None
|
210
|
+
|
211
|
+
for point in quad:
|
212
|
+
if len(point) != 2:
|
213
|
+
return None
|
214
|
+
|
215
|
+
quad = np.array(quad, dtype=int)
|
216
|
+
x1 = np.min(quad[:, 0])
|
217
|
+
x2 = np.max(quad[:, 0])
|
218
|
+
y1 = np.min(quad[:, 1])
|
219
|
+
y2 = np.max(quad[:, 1])
|
220
|
+
h, w = img.shape[:2]
|
221
221
|
|
222
|
-
|
223
|
-
|
222
|
+
if x1 < 0 or x2 > w or y1 < 0 or y2 > h:
|
223
|
+
return None
|
224
224
|
|
225
225
|
return True
|
226
226
|
|
@@ -237,19 +237,18 @@ def extract_roi_with_perspective(img, quad):
|
|
237
237
|
np.ndarray: extracted image
|
238
238
|
"""
|
239
239
|
dst = img.copy()
|
240
|
-
quad = np.array(quad, dtype=np.
|
240
|
+
quad = np.array(quad, dtype=np.int64)
|
241
|
+
|
241
242
|
width = np.linalg.norm(quad[0] - quad[1])
|
242
243
|
height = np.linalg.norm(quad[1] - quad[2])
|
243
244
|
|
244
245
|
width = int(width)
|
245
246
|
height = int(height)
|
246
|
-
|
247
247
|
pts1 = np.float32(quad)
|
248
248
|
pts2 = np.float32([[0, 0], [width, 0], [width, height], [0, height]])
|
249
249
|
|
250
250
|
M = cv2.getPerspectiveTransform(pts1, pts2)
|
251
251
|
dst = cv2.warpPerspective(dst, M, (width, height))
|
252
|
-
|
253
252
|
return dst
|
254
253
|
|
255
254
|
|
yomitoku/document_analyzer.py
CHANGED
@@ -86,8 +86,12 @@ def extract_paragraph_within_figure(paragraphs, figures):
|
|
86
86
|
check_list[i] = True
|
87
87
|
|
88
88
|
figure["direction"] = judge_page_direction(contained_paragraphs)
|
89
|
+
reading_order = (
|
90
|
+
"left2right" if figure["direction"] == "horizontal" else "right2left"
|
91
|
+
)
|
92
|
+
|
89
93
|
figure_paragraphs = prediction_reading_order(
|
90
|
-
contained_paragraphs,
|
94
|
+
contained_paragraphs, reading_order
|
91
95
|
)
|
92
96
|
figure["paragraphs"] = sorted(figure_paragraphs, key=lambda x: x.order)
|
93
97
|
figure = FigureSchema(**figure)
|
@@ -126,8 +130,8 @@ def extract_words_within_element(pred_words, element):
|
|
126
130
|
cnt_vertical = word_direction.count("vertical")
|
127
131
|
|
128
132
|
element_direction = "horizontal" if cnt_horizontal > cnt_vertical else "vertical"
|
129
|
-
|
130
|
-
prediction_reading_order(contained_words,
|
133
|
+
order = "left2right" if element_direction == "horizontal" else "right2left"
|
134
|
+
prediction_reading_order(contained_words, order)
|
131
135
|
contained_words = sorted(contained_words, key=lambda x: x.order)
|
132
136
|
|
133
137
|
contained_words = "\n".join([content.contents for content in contained_words])
|
@@ -328,6 +332,7 @@ class DocumentAnalyzer:
|
|
328
332
|
device="cuda",
|
329
333
|
visualize=False,
|
330
334
|
ignore_meta=False,
|
335
|
+
reading_order="auto",
|
331
336
|
):
|
332
337
|
default_configs = {
|
333
338
|
"ocr": {
|
@@ -352,6 +357,8 @@ class DocumentAnalyzer:
|
|
352
357
|
},
|
353
358
|
}
|
354
359
|
|
360
|
+
self.reading_order = reading_order
|
361
|
+
|
355
362
|
if isinstance(configs, dict):
|
356
363
|
recursive_update(default_configs, configs)
|
357
364
|
else:
|
@@ -452,9 +459,17 @@ class DocumentAnalyzer:
|
|
452
459
|
|
453
460
|
elements = page_contents + layout_res.tables + figures
|
454
461
|
|
455
|
-
prediction_reading_order(headers,
|
456
|
-
prediction_reading_order(footers,
|
457
|
-
|
462
|
+
prediction_reading_order(headers, "left2right")
|
463
|
+
prediction_reading_order(footers, "left2right")
|
464
|
+
|
465
|
+
if self.reading_order == "auto":
|
466
|
+
reading_order = (
|
467
|
+
"right2left" if page_direction == "vertical" else "top2bottom"
|
468
|
+
)
|
469
|
+
else:
|
470
|
+
reading_order = self.reading_order
|
471
|
+
|
472
|
+
prediction_reading_order(elements, reading_order, self.img)
|
458
473
|
|
459
474
|
for i, element in enumerate(elements):
|
460
475
|
element.order += len(headers)
|
yomitoku/reading_order.py
CHANGED
@@ -17,7 +17,6 @@ def _priority_dfs(nodes, direction):
|
|
17
17
|
|
18
18
|
pending_nodes = sorted(nodes, key=lambda x: x.prop["distance"])
|
19
19
|
visited = [False] * len(nodes)
|
20
|
-
|
21
20
|
start = pending_nodes.pop(0)
|
22
21
|
stack = [start]
|
23
22
|
|
@@ -53,11 +52,11 @@ def _priority_dfs(nodes, direction):
|
|
53
52
|
children.append(node)
|
54
53
|
stack.remove(node)
|
55
54
|
|
56
|
-
if direction
|
55
|
+
if direction in "top2bottom":
|
57
56
|
children = sorted(
|
58
57
|
children, key=lambda x: x.prop["box"][0], reverse=True
|
59
58
|
)
|
60
|
-
|
59
|
+
elif direction in ["right2left", "left2right"]:
|
61
60
|
children = sorted(
|
62
61
|
children, key=lambda x: x.prop["box"][1], reverse=True
|
63
62
|
)
|
@@ -121,7 +120,7 @@ def _exist_other_node_between_horizontal(node, other_node, nodes):
|
|
121
120
|
return False
|
122
121
|
|
123
122
|
|
124
|
-
def
|
123
|
+
def _create_graph_top2bottom(nodes):
|
125
124
|
for i, node in enumerate(nodes):
|
126
125
|
for j, other_node in enumerate(nodes):
|
127
126
|
if i == j:
|
@@ -146,7 +145,7 @@ def _create_graph_horizontal(nodes):
|
|
146
145
|
node.children = sorted(node.children, key=lambda x: x.prop["box"][0])
|
147
146
|
|
148
147
|
|
149
|
-
def
|
148
|
+
def _create_graph_right2left(nodes):
|
150
149
|
max_x = max([node.prop["box"][2] for node in nodes])
|
151
150
|
|
152
151
|
for i, node in enumerate(nodes):
|
@@ -172,15 +171,46 @@ def _create_graph_vertical(nodes):
|
|
172
171
|
node.children = sorted(node.children, key=lambda x: x.prop["box"][1])
|
173
172
|
|
174
173
|
|
174
|
+
def _create_graph_left2right(nodes, x_weight=1, y_weight=5):
    """Link nodes for left-to-right reading and assign each a start priority.

    For every ordered pair of distinct nodes whose boxes pass
    ``is_intersected_horizontal`` and that have no other node between them,
    a link is added from the node with the smaller ``box[2]`` to the other.
    Each node also gets ``prop["distance"]``, a weighted sum of ``box[0]`` and
    ``box[1]``. Finally every node's children are sorted by ``box[1]``.

    Args:
        nodes: Graph nodes exposing ``prop["box"]``, ``add_link`` and ``children``.
        x_weight: Weight applied to ``box[0]`` in the distance.
        y_weight: Weight applied to ``box[1]`` in the distance.
    """
    for src_idx, node in enumerate(nodes):
        for dst_idx, other_node in enumerate(nodes):
            if src_idx == dst_idx:
                continue

            if not is_intersected_horizontal(
                node.prop["box"], other_node.prop["box"]
            ):
                continue

            if _exist_other_node_between_horizontal(node, other_node, nodes):
                continue

            # Compare box[2] of both boxes: the smaller one comes first.
            if other_node.prop["box"][2] < node.prop["box"][2]:
                other_node.add_link(node)
            else:
                node.add_link(other_node)

        node.prop["distance"] = (
            node.prop["box"][0] * x_weight + node.prop["box"][1] * y_weight
        )

    for node in nodes:
        node.children = sorted(node.children, key=lambda child: child.prop["box"][1])
|
199
|
+
|
200
|
+
|
175
201
|
def prediction_reading_order(elements, direction, img=None):
|
176
202
|
if len(elements) < 2:
|
177
203
|
return elements
|
178
204
|
|
179
205
|
nodes = [Node(i, element.dict()) for i, element in enumerate(elements)]
|
180
|
-
if direction == "
|
181
|
-
|
206
|
+
if direction == "top2bottom":
|
207
|
+
_create_graph_top2bottom(nodes)
|
208
|
+
elif direction == "right2left":
|
209
|
+
_create_graph_right2left(nodes)
|
210
|
+
elif direction == "left2right":
|
211
|
+
_create_graph_left2right(nodes)
|
182
212
|
else:
|
183
|
-
|
213
|
+
raise ValueError(f"Invalid direction: {direction}")
|
184
214
|
|
185
215
|
# For debugging
|
186
216
|
# if img is not None:
|
yomitoku/utils/misc.py
CHANGED
@@ -80,7 +80,7 @@ def calc_intersection(rect_a, rect_b):
|
|
80
80
|
return [ix1, iy1, ix2, iy2]
|
81
81
|
|
82
82
|
|
83
|
-
def is_intersected_horizontal(rect_a, rect_b):
|
83
|
+
def is_intersected_horizontal(rect_a, rect_b, threshold=0.5):
|
84
84
|
_, ay1, _, ay2 = map(int, rect_a)
|
85
85
|
_, by1, _, by2 = map(int, rect_b)
|
86
86
|
|
@@ -88,9 +88,11 @@ def is_intersected_horizontal(rect_a, rect_b):
|
|
88
88
|
iy1 = max(ay1, by1)
|
89
89
|
iy2 = min(ay2, by2)
|
90
90
|
|
91
|
+
min_height = min(ay2 - ay1, by2 - by1)
|
92
|
+
|
91
93
|
overlap_height = max(0, iy2 - iy1)
|
92
94
|
|
93
|
-
if overlap_height
|
95
|
+
if (overlap_height / min_height) < threshold:
|
94
96
|
return False
|
95
97
|
|
96
98
|
return True
|
@@ -119,3 +121,48 @@ def quad_to_xyxy(quad):
|
|
119
121
|
y2 = max([y for _, y in quad])
|
120
122
|
|
121
123
|
return x1, y1, x2, y2
|
124
|
+
|
125
|
+
|
126
|
+
def convert_table_array(table):
    """Expand a table object into a dense 2-D list of cell contents.

    The grid has ``table.n_row`` rows and ``table.n_col`` columns, initially
    filled with empty strings. Each cell's ``contents`` is written to every
    grid position covered by its span; ``row`` and ``col`` are 1-based.

    Args:
        table: Object exposing ``n_row``, ``n_col`` and ``cells``; each cell
            exposes ``row``, ``col``, ``row_span``, ``col_span``, ``contents``.

    Returns:
        list[list]: The filled grid, indexed as ``grid[row][col]``.
    """
    grid = [[""] * table.n_col for _ in range(table.n_row)]

    for cell in table.cells:
        top, left = cell.row - 1, cell.col - 1
        height, width, text = cell.row_span, cell.col_span, cell.contents

        for r in range(top, top + height):
            for c in range(left, left + width):
                grid[r][c] = text

    return grid
|
144
|
+
|
145
|
+
|
146
|
+
def convert_table_array_to_dict(table_array, header_row=1):
    """Convert a 2-D table array into a list of per-row dictionaries.

    The first ``header_row`` rows supply the column names: for each column the
    header cells are joined with ``"_"``. When ``header_row`` is 0 the columns
    are named ``col_0``, ``col_1``, ... instead. Columns whose header text is
    identical share one key, so the right-most such column wins.

    Args:
        table_array: List of rows, each a list of cell strings. The column
            count is taken from the first row.
        header_row: Number of leading rows to treat as headers.

    Returns:
        list[dict]: One dict per data row, mapping column name to cell value.
        An empty ``table_array`` yields an empty list.
    """
    # An empty table has no first row to measure; return early instead of
    # failing on table_array[0].
    if not table_array:
        return []

    n_cols = len(table_array[0])
    n_rows = len(table_array)

    header_cols = []
    for col in range(n_cols):
        parts = [table_array[row][col] for row in range(header_row)]
        header_cols.append("_".join(parts) if parts else f"col_{col}")

    return [
        {header_cols[col]: table_array[row][col] for col in range(n_cols)}
        for row in range(header_row, n_rows)
    ]
|
@@ -0,0 +1,116 @@
|
|
1
|
+
import os
|
2
|
+
|
3
|
+
from PIL import Image
|
4
|
+
from io import BytesIO
|
5
|
+
|
6
|
+
from reportlab.pdfgen import canvas
|
7
|
+
from reportlab.pdfbase.ttfonts import TTFont
|
8
|
+
from reportlab.pdfbase import pdfmetrics
|
9
|
+
from reportlab.pdfbase.pdfmetrics import stringWidth
|
10
|
+
|
11
|
+
import numpy as np
|
12
|
+
import jaconv
|
13
|
+
|
14
|
+
from ..constants import ROOT_DIR
|
15
|
+
|
16
|
+
FONT_PATH = ROOT_DIR + "/resource/MPLUS1p-Medium.ttf"
|
17
|
+
pdfmetrics.registerFont(TTFont("MPLUS1p-Medium", FONT_PATH))
|
18
|
+
|
19
|
+
|
20
|
+
def _poly2rect(points):
|
21
|
+
"""
|
22
|
+
Convert a polygon defined by its corner points to a rectangle.
|
23
|
+
The points should be in the format [[x1, y1], [x2, y2], [x3, y3], [x4, y4]].
|
24
|
+
"""
|
25
|
+
points = np.array(points, dtype=int)
|
26
|
+
x_min = points[:, 0].min()
|
27
|
+
x_max = points[:, 0].max()
|
28
|
+
y_min = points[:, 1].min()
|
29
|
+
y_max = points[:, 1].max()
|
30
|
+
|
31
|
+
return [x_min, y_min, x_max, y_max]
|
32
|
+
|
33
|
+
|
34
|
+
def _calc_font_size(content, bbox_height, bbox_width):
    """Pick the font size whose rendered text width best matches the box width.

    Candidate sizes are ``bbox_height`` scaled by each rate in
    ``np.arange(0.5, 1.0, 0.01)``. For each candidate the width of ``content``
    in the "MPLUS1p-Medium" font is measured, and the candidate with the
    smallest absolute difference from ``bbox_width`` is kept (the first one
    on ties).

    Args:
        content: Text to measure.
        bbox_height: Extent of the box along the glyph height.
        bbox_width: Extent of the box along the text run.

    Returns:
        The best candidate size, or None if no candidate compares below
        infinity.
    """
    chosen = None
    smallest_gap = np.inf

    for candidate in bbox_height * np.arange(0.5, 1.0, 0.01):
        rendered_width = stringWidth(content, "MPLUS1p-Medium", candidate)
        gap = abs(rendered_width - bbox_width)
        if gap < smallest_gap:
            chosen, smallest_gap = candidate, gap

    return chosen
|
48
|
+
|
49
|
+
|
50
|
+
def to_full_width(text):
    """Convert ``text`` to full-width characters.

    ``jaconv.h2z`` is applied to kana, ASCII and digits first; a small
    translation table then maps three further characters explicitly.

    Args:
        text: Input string.

    Returns:
        str: The converted string.
    """
    extra_table = str.maketrans(
        {
            "\u00a5": "\uffe5",  # yen sign -> full-width yen sign
            "\u00b7": "\u30fb",  # middle dot -> katakana middle dot
            " ": "\u3000",  # half-width space -> full-width space
        }
    )

    widened = jaconv.h2z(text, kana=True, ascii=True, digit=True)
    return widened.translate(extra_table)
|
63
|
+
|
64
|
+
|
65
|
+
def create_searchable_pdf(images, ocr_results, output_path):
    """Write a PDF with each page image overlaid by invisible OCR text.

    One PDF page is produced per (image, OCR result) pair, sized to the image
    in pixels. The image is drawn as the page background and every recognised
    word is drawn on top with a fully transparent fill, so the text is not
    visible but is present in the page content.

    Args:
        images: Sequence of image arrays indexed as ``[row, col, channel]``.
            The channel axis is reversed before embedding (BGR -> RGB).
        ocr_results: Sequence of results parallel to ``images``; each exposes
            ``words``, and each word exposes ``content``, ``points`` and
            ``direction``. Pairs are formed with ``zip``, so extra items in
            the longer sequence are ignored.
        output_path: Destination path of the PDF file.
    """
    # Local import: ImageReader lets reportlab embed an in-memory PIL image,
    # so no temporary PNG has to be written to the working directory.
    from reportlab.lib.utils import ImageReader

    # The PDF is built in memory and written out only after it is complete.
    packet = BytesIO()
    c = canvas.Canvas(packet)

    # The "MPLUS1p-Medium" font is registered once at module import, so it is
    # not re-registered for every page here.
    for image, ocr_result in zip(images, ocr_results):
        image = Image.fromarray(image[:, :, ::-1])  # Convert BGR to RGB
        w, h = image.size

        c.setPageSize((w, h))
        c.drawImage(ImageReader(image), 0, 0, width=w, height=h)

        for word in ocr_result.words:
            text = word.content
            direction = word.direction

            x1, y1, x2, y2 = _poly2rect(word.points)
            bbox_height = y2 - y1
            bbox_width = x2 - x1

            if direction == "vertical":
                text = to_full_width(text)

            # The glyph height runs along the short side of the box: the box
            # height for horizontal text, the box width otherwise.
            if direction == "horizontal":
                font_size = _calc_font_size(text, bbox_height, bbox_width)
            else:
                font_size = _calc_font_size(text, bbox_width, bbox_height)

            c.setFont("MPLUS1p-Medium", font_size)
            c.setFillColorRGB(1, 1, 1, alpha=0)  # fully transparent fill

            # Image y grows downward while PDF y grows upward, hence h - y2.
            if direction == "vertical":
                base_y = h - y2 + (bbox_height - font_size)
                for j, ch in enumerate(text):
                    c.saveState()
                    # NOTE(review): the (j - 1) offset puts the first glyph one
                    # font_size above base_y; presumably this compensates for
                    # the rotated glyph origin - confirm.
                    c.translate(x1 + font_size * 0.5, base_y - (j - 1) * font_size)
                    c.rotate(-90)
                    c.drawString(0, 0, ch)
                    c.restoreState()
            else:
                base_y = h - y2 + (bbox_height - font_size) * 0.5
                c.drawString(x1, base_y, text)
        c.showPage()
    c.save()

    with open(output_path, "wb") as f:
        f.write(packet.getvalue())
|
@@ -1,12 +1,13 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: yomitoku
|
3
|
-
Version: 0.9.
|
3
|
+
Version: 0.9.2
|
4
4
|
Summary: Yomitoku is an AI-powered document image analysis package designed specifically for the Japanese language.
|
5
5
|
Author-email: Kotaro Kinoshita <kotaro.kinoshita@mlism.com>
|
6
6
|
License: CC BY-NC-SA 4.0
|
7
7
|
Keywords: Deep Learning,Japanese,OCR
|
8
8
|
Requires-Python: <3.13,>=3.10
|
9
9
|
Requires-Dist: huggingface-hub>=0.26.1
|
10
|
+
Requires-Dist: jaconv>=0.4.0
|
10
11
|
Requires-Dist: lxml>=5.3.0
|
11
12
|
Requires-Dist: omegaconf>=2.3.0
|
12
13
|
Requires-Dist: onnx>=1.17.0
|
@@ -15,6 +16,7 @@ Requires-Dist: opencv-python>=4.10.0.84
|
|
15
16
|
Requires-Dist: pyclipper>=1.3.0.post6
|
16
17
|
Requires-Dist: pydantic>=2.9.2
|
17
18
|
Requires-Dist: pypdfium2>=4.30.0
|
19
|
+
Requires-Dist: reportlab>=4.4.1
|
18
20
|
Requires-Dist: shapely>=2.0.6
|
19
21
|
Requires-Dist: timm>=1.0.11
|
20
22
|
Requires-Dist: torch>=2.5.0
|
@@ -41,7 +43,7 @@ YomiToku は日本語に特化した AI 文章画像解析エンジン(Document
|
|
41
43
|
- 🤖 日本語データセットで学習した 4 種類(文字位置の検知、文字列認識、レイアウト解析、表の構造認識)の AI モデルを搭載しています。4 種類のモデルはすべて独自に学習されたモデルで日本語文書に対して、高精度に推論可能です。
|
42
44
|
- 🇯🇵 各モデルは日本語の文書画像に特化して学習されており、7000 文字を超える日本語文字の認識をサポート、手書き文字、縦書きなど日本語特有のレイアウト構造の文書画像の解析も可能です。(日本語以外にも英語の文書に対しても対応しています)。
|
43
45
|
- 📈 レイアウト解析、表の構造解析, 読み順推定機能により、文書画像のレイアウトの意味的構造を壊さずに情報を抽出することが可能です。
|
44
|
-
- 📄 多様な出力形式をサポートしています。html やマークダウン、json、csv
|
46
|
+
- 📄 多様な出力形式をサポートしています。html やマークダウン、json、csv のいずれかのフォーマットに変換可能です。また、文書内に含まれる図表、画像の抽出の出力も可能です。文書画像をサーチャブルPDFに変換する処理もサポートしています。
|
45
47
|
- ⚡ GPU 環境で高速に動作し、効率的に文書の文字起こし解析が可能です。また、VRAM も 8GB 以内で動作し、ハイエンドな GPU を用意する必要はありません。
|
46
48
|
|
47
49
|
## 🖼️ デモ
|
@@ -66,6 +68,7 @@ Markdown でエクスポートした結果は関してはリポジトリ内の[s
|
|
66
68
|
|
67
69
|
## 📣 リリース情報
|
68
70
|
|
71
|
+
- 2025 年 4 月 4 日 YomiToku v0.8.0 手書き文字認識のサポート
|
69
72
|
- 2024 年 11 月 26 日 YomiToku v0.5.1 (beta) を公開
|
70
73
|
|
71
74
|
## 💡 インストールの方法
|
@@ -95,6 +98,7 @@ yomitoku ${path_data} -f md -o results -v --figure --lite
|
|
95
98
|
- `--encoding` エクスポートする出力ファイルの文字エンコーディングを指定します。サポートされていない文字コードが含まれる場合は、その文字を無視します。(utf-8, utf-8-sig, shift-jis, euc-jp, cp932)
|
96
99
|
- `--combine` PDFを入力に与えたときに、複数ページが含まれる場合に、それらの予測結果を一つのファイルに統合してエクスポートします。
|
97
100
|
- `--ignore_meta` 文章のheader, footerなどの文字情報を出力ファイルに含めません。
|
101
|
+
- `--searchable_pdf` 読み取った文字情報をPDFに埋め込み全文検索可能なPDFを出力します。
|
98
102
|
|
99
103
|
その他のオプションに関しては、ヘルプを参照
|
100
104
|
|
@@ -1,17 +1,17 @@
|
|
1
1
|
yomitoku/__init__.py,sha256=kXOM8RbpwwLABG3p3vPT3dJWBk4JX2MFGrOeBEW0hKM,543
|
2
2
|
yomitoku/base.py,sha256=9U3sfe69O6vuO430JzzKQQNkgPsLM9WdLfOUUhp3Ljs,3878
|
3
3
|
yomitoku/constants.py,sha256=zlW5QRc_u_F3C2RAgBFWyHJZexBnJT5N15GC-9d3iLo,686
|
4
|
-
yomitoku/document_analyzer.py,sha256=
|
4
|
+
yomitoku/document_analyzer.py,sha256=xliAelQdfsK64FtVuFvstDBr9uf2TwhqW31g2g91_CY,16888
|
5
5
|
yomitoku/layout_analyzer.py,sha256=VhNf1ZQFoozj6WUGk5ll1p2p1jk5X3j-JPcDbTAoSl4,1856
|
6
6
|
yomitoku/layout_parser.py,sha256=0MgbCsD90srQdsxkGEL0TgKm4rkmGzsQYx0sjKQ03yc,7718
|
7
7
|
yomitoku/ocr.py,sha256=JSTjkupcxHITQm6ERnzU7As0c3KWf8-oxc0AqNoWHXo,2272
|
8
|
-
yomitoku/reading_order.py,sha256=
|
8
|
+
yomitoku/reading_order.py,sha256=_T09PqT7guk57zWo4HdSazLSQTwM91piyELA_wNHQAQ,7521
|
9
9
|
yomitoku/table_structure_recognizer.py,sha256=tHjex6deT_FjRK5ePz9bUXA_QIhgv_vYtK-ynm4ALxg,9625
|
10
10
|
yomitoku/text_detector.py,sha256=6IwEJJKp_F8YH0Oki0QV-Mqi--P2LGbNKo-_kxBB_eo,4383
|
11
11
|
yomitoku/text_recognizer.py,sha256=eaxozNu-Ms6iv8efbKZzn8pJNW1Wo4f86bGhzSMtv3s,5992
|
12
12
|
yomitoku/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
|
-
yomitoku/cli/main.py,sha256=
|
14
|
-
yomitoku/cli/
|
13
|
+
yomitoku/cli/main.py,sha256=7AaaFzMf33ER__XPDBNkrJkKwclne7QyVFWeBvpUYBY,12849
|
14
|
+
yomitoku/cli/mcp_server.py,sha256=WnWzxd13HaemC3b-5i9B9NVBGc3WGfum2nYhoBolEnk,5641
|
15
15
|
yomitoku/configs/__init__.py,sha256=x5-ccjGiP6xxRtDPT7f1Enl7SsE0hSk0G8f7eF9V85I,886
|
16
16
|
yomitoku/configs/cfg_layout_parser_rtdtrv2.py,sha256=8PRxB2Ar9UF7-DLtbgSokhrzdXb0veWI6Wc-X8qigRw,2329
|
17
17
|
yomitoku/configs/cfg_layout_parser_rtdtrv2_v2.py,sha256=nMrL3uvoVmyzZ909Bz2zmfp9b6AEBLKhIprOvQ5yiQE,2324
|
@@ -22,8 +22,8 @@ yomitoku/configs/cfg_text_recognizer_parseq.py,sha256=hpFs3nKqh4XdU3BZMTultegtLE
|
|
22
22
|
yomitoku/configs/cfg_text_recognizer_parseq_small.py,sha256=uCm_VC_G79IbZpOiK8fgYzAJ4b98H5pf328wyQomtfo,1259
|
23
23
|
yomitoku/configs/cfg_text_recognizer_parseq_v2.py,sha256=GfHzbByOKjH21PRTxT8x_fU4r4Mda6F750Z8pjNeb8g,1249
|
24
24
|
yomitoku/data/__init__.py,sha256=KAofFc9rk9ZdTKBjemu9RM8Vj9XnKbWC2MPZ2RWtOdE,82
|
25
|
-
yomitoku/data/dataset.py,sha256
|
26
|
-
yomitoku/data/functions.py,sha256=
|
25
|
+
yomitoku/data/dataset.py,sha256=lpBcpkMuQzRIyLJ4_mqtuhR9s2ZmzgBgc-XYuE_b2Sc,1326
|
26
|
+
yomitoku/data/functions.py,sha256=RExCUxI3-gccIMw-H0ribX2jeGKkrJWhS4fNn_12c3Y,7878
|
27
27
|
yomitoku/export/__init__.py,sha256=gmlikMHRXfzfJ_8q4fyDlnpGms-x1oggQOwJEWHMgBU,508
|
28
28
|
yomitoku/export/export_csv.py,sha256=VY8mntUCPDbDco_dyvq5O0_Q4wga9_GTyjHCS-y4UiQ,3399
|
29
29
|
yomitoku/export/export_html.py,sha256=LQDyZgbzmI0qJ0-FEK-54r9816H3L9hD10ChMcw0KyA,5620
|
@@ -50,9 +50,10 @@ yomitoku/resource/charset.txt,sha256=sU91kSi-9Wk4733bCXy4j_UDmvcsj96sHOq1ppUJlOY
|
|
50
50
|
yomitoku/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
51
51
|
yomitoku/utils/graph.py,sha256=LKNB8ZhSQwOZMfeAimPMF5UCVVr2ZaUWoGDkz8z-uGU,456
|
52
52
|
yomitoku/utils/logger.py,sha256=uOmtQDr0A0JD7wyFshedL08BiNrQorHnpktRXba8bjU,424
|
53
|
-
yomitoku/utils/misc.py,sha256=
|
53
|
+
yomitoku/utils/misc.py,sha256=r92x45kQR8lC5jO1MZaHBDtcCWBkQXg_WS9H4RXJzSY,4127
|
54
|
+
yomitoku/utils/searchable_pdf.py,sha256=40JbcxWrHzYTtzvI9MPYHMrWqLWKiLWo4mWDNRFXwHY,3530
|
54
55
|
yomitoku/utils/visualizer.py,sha256=DjDwHiAu1iFRKh96H3Egq4vuI2s_-9dLCDeykhKi8jo,5251
|
55
|
-
yomitoku-0.9.
|
56
|
-
yomitoku-0.9.
|
57
|
-
yomitoku-0.9.
|
58
|
-
yomitoku-0.9.
|
56
|
+
yomitoku-0.9.2.dist-info/METADATA,sha256=vDEaaXAimCBfVwMeWmfyJBqzb7sXtZk4-ia3PXrtk7c,8966
|
57
|
+
yomitoku-0.9.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
58
|
+
yomitoku-0.9.2.dist-info/entry_points.txt,sha256=n3c8bQSj5Be5GHAOv_NZ8cldJFmWeigQxSmteFTmu_k,96
|
59
|
+
yomitoku-0.9.2.dist-info/RECORD,,
|
File without changes
|