yomitoku 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yomitoku/cli/main.py +34 -2
- yomitoku/document_analyzer.py +228 -26
- yomitoku/export/export_csv.py +41 -2
- yomitoku/export/export_html.py +4 -1
- yomitoku/export/export_json.py +42 -2
- yomitoku/export/export_markdown.py +4 -1
- yomitoku/layout_analyzer.py +1 -5
- yomitoku/layout_parser.py +5 -1
- yomitoku/ocr.py +24 -27
- yomitoku/table_structure_recognizer.py +24 -7
- yomitoku/text_detector.py +8 -3
- yomitoku/text_recognizer.py +22 -7
- yomitoku/utils/misc.py +20 -13
- yomitoku/utils/visualizer.py +5 -5
- {yomitoku-0.6.0.dist-info → yomitoku-0.7.1.dist-info}/METADATA +11 -4
- {yomitoku-0.6.0.dist-info → yomitoku-0.7.1.dist-info}/RECORD +18 -18
- {yomitoku-0.6.0.dist-info → yomitoku-0.7.1.dist-info}/WHEEL +1 -1
- {yomitoku-0.6.0.dist-info → yomitoku-0.7.1.dist-info}/entry_points.txt +0 -0
yomitoku/cli/main.py
CHANGED
@@ -1,5 +1,6 @@
 import argparse
 import os
+import torch
 from pathlib import Path

 import cv2
@@ -13,6 +14,18 @@ from ..utils.logger import set_logger
 logger = set_logger(__name__, "INFO")


+def validate_encoding(encoding):
+    if encoding not in [
+        "utf-8",
+        "utf-8-sig",
+        "shift-jis",
+        "euc-jp",
+        "cp932",
+    ]:
+        raise ValueError(f"Invalid encoding: {encoding}")
+    return True
+
+
 def process_single_file(args, analyzer, path, format):
     if path.suffix[1:].lower() in ["pdf"]:
         imgs = load_pdf(path)
@@ -21,7 +34,6 @@ def process_single_file(args, analyzer, path, format):

     for page, img in enumerate(imgs):
         results, ocr, layout = analyzer(img)
-
         dirname = path.parent.name
         filename = path.stem

@@ -47,11 +59,19 @@ def process_single_file(args, analyzer, path, format):
         results.to_json(
             out_path,
             ignore_line_break=args.ignore_line_break,
+            encoding=args.encoding,
+            img=img,
+            export_figure=args.figure,
+            figure_dir=args.figure_dir,
         )
     elif format == "csv":
         results.to_csv(
             out_path,
             ignore_line_break=args.ignore_line_break,
+            encoding=args.encoding,
+            img=img,
+            export_figure=args.figure,
+            figure_dir=args.figure_dir,
         )
     elif format == "html":
         results.to_html(
@@ -62,6 +82,7 @@ def process_single_file(args, analyzer, path, format):
             export_figure_letter=args.figure_letter,
             figure_width=args.figure_width,
             figure_dir=args.figure_dir,
+            encoding=args.encoding,
         )
     elif format == "md":
         results.to_markdown(
@@ -72,6 +93,7 @@ def process_single_file(args, analyzer, path, format):
             export_figure_letter=args.figure_letter,
             figure_width=args.figure_width,
             figure_dir=args.figure_dir,
+            encoding=args.encoding,
         )

     logger.info(f"Output file: {out_path}")
@@ -168,6 +190,12 @@ def main():
         default="figures",
         help="directory to save figure images",
     )
+    parser.add_argument(
+        "--encoding",
+        type=str,
+        default="utf-8",
+        help="Specifies the character encoding for the output file to be exported. If unsupported characters are included, they will be ignored.",
+    )

     args = parser.parse_args()

@@ -181,6 +209,8 @@ def main():
             f"Invalid output format: {args.format}. Supported formats are {SUPPORT_OUTPUT_FORMAT}"
         )

+    validate_encoding(args.encoding)
+
     if format == "markdown":
         format = "md"

@@ -205,7 +235,9 @@ def main():

     if args.lite:
         configs["ocr"]["text_recognizer"]["model_name"] = "parseq-small"
-
+
+    if args.device == "cpu" or not torch.cuda.is_available():
+        configs["ocr"]["text_detector"]["infer_onnx"] = True

     # Note: Text Detector以外はONNX推論よりもPyTorch推論の方が速いため、ONNX推論は行わない
     # configs["ocr"]["text_recognizer"]["infer_onnx"] = True
yomitoku/document_analyzer.py
CHANGED
@@ -2,17 +2,26 @@ import asyncio
 from concurrent.futures import ThreadPoolExecutor
 from typing import List, Union

+import numpy as np
+
 from pydantic import conlist

 from .base import BaseSchema
 from .export import export_csv, export_html, export_markdown
 from .layout_analyzer import LayoutAnalyzer
-from .ocr import
-from .table_structure_recognizer import TableStructureRecognizerSchema
-from .utils.misc import is_contained, quad_to_xyxy
+from .ocr import OCRSchema, WordPrediction, ocr_aggregate
 from .reading_order import prediction_reading_order
-
+from .table_structure_recognizer import TableStructureRecognizerSchema
+from .utils.misc import (
+    is_contained,
+    quad_to_xyxy,
+    calc_overlap_ratio,
+)
 from .utils.visualizer import reading_order_visualizer
+from yomitoku.text_detector import TextDetector
+from yomitoku.text_recognizer import TextRecognizer
+
+from .utils.visualizer import det_visualizer


 class ParagraphSchema(BaseSchema):
@@ -98,41 +107,56 @@ def extract_words_within_element(pred_words, element):
     word_sum_width = 0
     word_sum_height = 0
     check_list = [False] * len(pred_words)
+
     for i, word in enumerate(pred_words):
         word_box = quad_to_xyxy(word.points)
         if is_contained(element.box, word_box, threshold=0.5):
-            contained_words.append(word)
             word_sum_width += word_box[2] - word_box[0]
             word_sum_height += word_box[3] - word_box[1]
             check_list[i] = True

+            word_element = ParagraphSchema(
+                box=word_box,
+                contents=word.content,
+                direction=word.direction,
+                order=0,
+                role=None,
+            )
+            contained_words.append(word_element)
+
     if len(contained_words) == 0:
         return None, None, check_list

-    # mean_width = word_sum_width / len(contained_words)
-    # mean_height = word_sum_height / len(contained_words)
-
     word_direction = [word.direction for word in contained_words]
     cnt_horizontal = word_direction.count("horizontal")
     cnt_vertical = word_direction.count("vertical")

     element_direction = "horizontal" if cnt_horizontal > cnt_vertical else "vertical"
-    if element_direction == "horizontal":
-        contained_words = sorted(
-            contained_words,
-            key=lambda x: (sum([p[1] for p in x.points]) / 4),
-        )
-    else:
-        contained_words = sorted(
-            contained_words,
-            key=lambda x: (sum([p[0] for p in x.points]) / 4),
-            reverse=True,
-        )

-    contained_words
+    prediction_reading_order(contained_words, element_direction)
+    contained_words = sorted(contained_words, key=lambda x: x.order)
+
+    contained_words = "\n".join([content.contents for content in contained_words])
+
     return (contained_words, element_direction, check_list)


+def is_vertical(quad, thresh_aspect=2):
+    quad = np.array(quad)
+    width = np.linalg.norm(quad[0] - quad[1])
+    height = np.linalg.norm(quad[1] - quad[2])
+
+    return height > width * thresh_aspect
+
+
+def is_noise(quad, thresh=15):
+    quad = np.array(quad)
+    width = np.linalg.norm(quad[0] - quad[1])
+    height = np.linalg.norm(quad[1] - quad[2])
+
+    return width < thresh or height < thresh
+
+
 def recursive_update(original, new_data):
     for key, value in new_data.items():
         # `value`が辞書の場合、再帰的に更新
@@ -148,8 +172,163 @@ def recursive_update(original, new_data):
     return original


+def _extract_words_within_table(words, table, check_list):
+    horizontal_words = []
+    vertical_words = []
+
+    for i, (points, score) in enumerate(zip(words.points, words.scores)):
+        word_box = quad_to_xyxy(points)
+        if is_contained(table.box, word_box, threshold=0.5):
+            if is_vertical(points):
+                vertical_words.append({"points": points, "score": score})
+            else:
+                horizontal_words.append({"points": points, "score": score})
+
+            check_list[i] = True
+
+    return (horizontal_words, vertical_words, check_list)
+
+
+def _calc_overlap_words_on_lines(lines, words):
+    overlap_ratios = [[0 for _ in lines] for _ in words]
+
+    for i, word in enumerate(words):
+        word_box = quad_to_xyxy(word["points"])
+        for j, row in enumerate(lines):
+            overlap_ratio, _ = calc_overlap_ratio(
+                row.box,
+                word_box,
+            )
+            overlap_ratios[i][j] = overlap_ratio
+
+    return overlap_ratios
+
+
+def _correct_vertical_word_boxes(overlap_ratios_vertical, table, table_words_vertical):
+    allocated_cols = [cols.index(max(cols)) for cols in overlap_ratios_vertical]
+
+    new_points = []
+    new_scores = []
+    for i, col_index in enumerate(allocated_cols):
+        col_cells = []
+        for cell in table.cells:
+            if cell.col <= (col_index + 1) < (cell.col + cell.col_span):
+                col_cells.append(cell)
+
+        word_point = table_words_vertical[i]["points"]
+        word_score = table_words_vertical[i]["score"]
+
+        for cell in col_cells:
+            word_box = quad_to_xyxy(word_point)
+
+            _, intersection = calc_overlap_ratio(
+                cell.box,
+                word_box,
+            )
+
+            if intersection is not None:
+                _, y1, _, y2 = intersection
+
+                new_point = [
+                    [word_point[0][0], max(word_point[0][1], y1)],
+                    [word_point[1][0], max(word_point[1][1], y1)],
+                    [word_point[2][0], min(word_point[2][1], y2)],
+                    [word_point[3][0], min(word_point[3][1], y2)],
+                ]
+
+                if not is_noise(new_point):
+                    new_points.append(new_point)
+                    new_scores.append(word_score)
+
+    return new_points, new_scores
+
+
+def _correct_horizontal_word_boxes(
+    overlap_ratios_horizontal, table, table_words_horizontal
+):
+    allocated_rows = [rows.index(max(rows)) for rows in overlap_ratios_horizontal]
+
+    new_points = []
+    new_scores = []
+    for i, row_index in enumerate(allocated_rows):
+        row_cells = []
+        for cell in table.cells:
+            if cell.row <= (row_index + 1) < (cell.row + cell.row_span):
+                row_cells.append(cell)
+
+        word_point = table_words_horizontal[i]["points"]
+        word_score = table_words_horizontal[i]["score"]
+
+        for cell in row_cells:
+            word_box = quad_to_xyxy(word_point)
+
+            _, intersection = calc_overlap_ratio(
+                cell.box,
+                word_box,
+            )
+
+            if intersection is not None:
+                x1, _, x2, _ = intersection
+
+                new_point = [
+                    [max(word_point[0][0], x1), word_point[0][1]],
+                    [min(word_point[1][0], x2), word_point[1][1]],
+                    [min(word_point[2][0], x2), word_point[2][1]],
+                    [max(word_point[3][0], x1), word_point[3][1]],
+                ]
+
+                if not is_noise(new_point):
+                    new_points.append(new_point)
+                    new_scores.append(word_score)
+
+    return new_points, new_scores
+
+
+def _split_text_across_cells(results_det, results_layout):
+    check_list = [False] * len(results_det.points)
+    new_points = []
+    new_scores = []
+    for table in results_layout.tables:
+        table_words_horizontal, table_words_vertical, check_list = (
+            _extract_words_within_table(results_det, table, check_list)
+        )
+
+        overlap_ratios_horizontal = _calc_overlap_words_on_lines(
+            table.rows,
+            table_words_horizontal,
+        )
+
+        overlap_ratios_vertical = _calc_overlap_words_on_lines(
+            table.cols,
+            table_words_vertical,
+        )
+
+        new_points_horizontal, new_scores_horizontal = _correct_horizontal_word_boxes(
+            overlap_ratios_horizontal, table, table_words_horizontal
+        )
+
+        new_points_vertical, new_scores_vertical = _correct_vertical_word_boxes(
+            overlap_ratios_vertical, table, table_words_vertical
+        )
+
+        new_points.extend(new_points_horizontal)
+        new_scores.extend(new_scores_horizontal)
+        new_points.extend(new_points_vertical)
+        new_scores.extend(new_scores_vertical)
+
+    for i, flag in enumerate(check_list):
+        if not flag:
+            new_points.append(results_det.points[i])
+            new_scores.append(results_det.scores[i])
+
+    results_det.points = new_points
+    results_det.scores = new_scores
+
+    return results_det
+
+
 class DocumentAnalyzer:
-    def __init__(self, configs=
+    def __init__(self, configs={}, device="cuda", visualize=False):
         default_configs = {
             "ocr": {
                 "text_detector": {
@@ -180,8 +359,16 @@ class DocumentAnalyzer:
                 "configs must be a dict. See the https://kotaro-kinoshita.github.io/yomitoku-dev/usage/"
             )

-        self.
-
+        self.text_detector = TextDetector(
+            **default_configs["ocr"]["text_detector"],
+        )
+        self.text_recognizer = TextRecognizer(
+            **default_configs["ocr"]["text_recognizer"]
+        )
+
+        self.layout = LayoutAnalyzer(
+            configs=default_configs["layout_analyzer"],
+        )
         self.visualize = visualize

     def aggregate(self, ocr_res, layout_res):
@@ -286,16 +473,31 @@ class DocumentAnalyzer:
         with ThreadPoolExecutor(max_workers=2) as executor:
             loop = asyncio.get_running_loop()
             tasks = [
-                loop.run_in_executor(executor, self.ocr, img),
+                # loop.run_in_executor(executor, self.ocr, img),
+                loop.run_in_executor(executor, self.text_detector, img),
                 loop.run_in_executor(executor, self.layout, img),
             ]

             results = await asyncio.gather(*tasks)

-
+            results_det, _ = results[0]
             results_layout, layout = results[1]

-
+            results_det = _split_text_across_cells(results_det, results_layout)
+
+            vis_det = None
+            if self.visualize:
+                vis_det = det_visualizer(
+                    img,
+                    results_det.points,
+                )
+
+            results_rec, ocr = self.text_recognizer(img, results_det.points, vis_det)
+
+            outputs = {"words": ocr_aggregate(results_det, results_rec)}
+            results_ocr = OCRSchema(**outputs)
+            outputs = self.aggregate(results_ocr, results_layout)
+
         results = DocumentAnalyzerSchema(**outputs)
         return results, ocr, layout

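The table-aware splitting above relies on two small geometry helpers added in this file, `is_vertical` and `is_noise`. A brief sketch of their behavior with made-up quads (four corner points, clockwise from the top-left):

```
from yomitoku.document_analyzer import is_vertical, is_noise

tall_quad = [[0, 0], [20, 0], [20, 100], [0, 100]]  # 20 px wide, 100 px tall
tiny_quad = [[0, 0], [5, 0], [5, 5], [0, 5]]        # both sides under 15 px

print(is_vertical(tall_quad))  # True: height exceeds 2x the width
print(is_noise(tiny_quad))     # True: boxes this small are discarded after clipping
```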
yomitoku/export/export_csv.py
CHANGED
@@ -1,4 +1,6 @@
 import csv
+import cv2
+import os


 def table_to_csv(table, ignore_line_break):
@@ -33,7 +35,36 @@ def paragraph_to_csv(paragraph, ignore_line_break):
     return contents


-def
+def save_figure(
+    figures,
+    img,
+    out_path,
+    figure_dir="figures",
+):
+    assert img is not None, "img is required for saving figures"
+
+    for i, figure in enumerate(figures):
+        x1, y1, x2, y2 = map(int, figure.box)
+        figure_img = img[y1:y2, x1:x2, :]
+        save_dir = os.path.dirname(out_path)
+        save_dir = os.path.join(save_dir, figure_dir)
+        os.makedirs(save_dir, exist_ok=True)
+
+        filename = os.path.splitext(os.path.basename(out_path))[0]
+        figure_name = f"{filename}_figure_{i}.png"
+        figure_path = os.path.join(save_dir, figure_name)
+        cv2.imwrite(figure_path, figure_img)
+
+
+def export_csv(
+    inputs,
+    out_path: str,
+    ignore_line_break: bool = False,
+    encoding: str = "utf-8",
+    img=None,
+    export_figure: bool = True,
+    figure_dir="figures",
+):
     elements = []
     for table in inputs.tables:
         table_csv = table_to_csv(table, ignore_line_break)
@@ -58,9 +89,17 @@ def export_csv(inputs, out_path: str, ignore_line_break: bool = False):
             }
         )

+    if export_figure:
+        save_figure(
+            inputs.figures,
+            img,
+            out_path,
+            figure_dir=figure_dir,
+        )
+
     elements = sorted(elements, key=lambda x: x["order"])

-    with open(out_path, "w", newline="", encoding="
+    with open(out_path, "w", newline="", encoding=encoding, errors="ignore") as f:
         writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
         for element in elements:
             if element["type"] == "table":
yomitoku/export/export_html.py
CHANGED
@@ -110,6 +110,8 @@ def figure_to_html(
     figure_dir="figures",
     width=200,
 ):
+    assert img is not None, "img is required for saving figures"
+
     elements = []
     for i, figure in enumerate(figures):
         x1, y1, x2, y2 = map(int, figure.box)
@@ -154,6 +156,7 @@ def export_html(
     img=None,
     figure_width=200,
     figure_dir="figures",
+    encoding: str = "utf-8",
 ):
     html_string = ""
     elements = []
@@ -184,5 +187,5 @@
     parsed_html = html.fromstring(html_string)
     formatted_html = etree.tostring(parsed_html, pretty_print=True, encoding="unicode")

-    with open(out_path, "w", encoding="
+    with open(out_path, "w", encoding=encoding, errors="ignore") as f:
         f.write(formatted_html)
yomitoku/export/export_json.py
CHANGED
@@ -1,5 +1,8 @@
 import json

+import cv2
+import os
+

 def paragraph_to_json(paragraph, ignore_line_break):
     if ignore_line_break:
@@ -12,7 +15,36 @@ def table_to_json(table, ignore_line_break):
             cell.contents = cell.contents.replace("\n", "")


-def
+def save_figure(
+    figures,
+    img,
+    out_path,
+    figure_dir="figures",
+):
+    assert img is not None, "img is required for saving figures"
+
+    for i, figure in enumerate(figures):
+        x1, y1, x2, y2 = map(int, figure.box)
+        figure_img = img[y1:y2, x1:x2, :]
+        save_dir = os.path.dirname(out_path)
+        save_dir = os.path.join(save_dir, figure_dir)
+        os.makedirs(save_dir, exist_ok=True)
+
+        filename = os.path.splitext(os.path.basename(out_path))[0]
+        figure_name = f"{filename}_figure_{i}.png"
+        figure_path = os.path.join(save_dir, figure_name)
+        cv2.imwrite(figure_path, figure_img)
+
+
+def export_json(
+    inputs,
+    out_path,
+    ignore_line_break=False,
+    encoding: str = "utf-8",
+    img=None,
+    export_figure=False,
+    figure_dir="figures",
+):
     from yomitoku.document_analyzer import DocumentAnalyzerSchema

     if isinstance(inputs, DocumentAnalyzerSchema):
@@ -23,7 +55,15 @@ def export_json(inputs, out_path, ignore_line_break=False):
     for paragraph in inputs.paragraphs:
         paragraph_to_json(paragraph, ignore_line_break)

-
+    if export_figure:
+        save_figure(
+            inputs.figures,
+            img,
+            out_path,
+            figure_dir=figure_dir,
+        )
+
+    with open(out_path, "w", encoding=encoding, errors="ignore") as f:
         json.dump(
             inputs.model_dump(),
             f,

yomitoku/export/export_markdown.py
CHANGED
@@ -75,6 +75,8 @@ def figure_to_md(
     width=200,
     figure_dir="figures",
 ):
+    assert img is not None, "img is required for saving figures"
+
     elements = []
     for i, figure in enumerate(figures):
         x1, y1, x2, y2 = map(int, figure.box)
@@ -117,6 +119,7 @@ def export_markdown(
     export_figure=True,
     figure_width=200,
     figure_dir="figures",
+    encoding: str = "utf-8",
 ):
     elements = []
     for table in inputs.tables:
@@ -141,5 +144,5 @@
     elements = sorted(elements, key=lambda x: x["order"])
     markdown = "\n".join([element["md"] for element in elements])

-    with open(out_path, "w", encoding="
+    with open(out_path, "w", encoding=encoding, errors="ignore") as f:
         f.write(markdown)
yomitoku/layout_analyzer.py
CHANGED
@@ -15,7 +15,7 @@ class LayoutAnalyzerSchema(BaseSchema):


 class LayoutAnalyzer:
-    def __init__(self, configs=
+    def __init__(self, configs={}, device="cuda", visualize=False):
         layout_parser_kwargs = {
             "device": device,
             "visualize": visualize,
@@ -26,10 +26,6 @@ class LayoutAnalyzer:
         }

         if isinstance(configs, dict):
-            assert (
-                "layout_parser" in configs or "table_structure_recognizer" in configs
-            ), "Invalid config key. Please check the config keys."
-
             if "layout_parser" in configs:
                 layout_parser_kwargs.update(configs["layout_parser"])

yomitoku/layout_parser.py
CHANGED
@@ -104,7 +104,6 @@ class LayoutParser(BaseModule):
         self.visualize = visualize

         self.model.eval()
-        self.model.to(self.device)

         self.postprocessor = RTDETRPostProcessor(
             num_classes=self._cfg.RTDETRTransformerv2.num_classes,
@@ -132,6 +131,8 @@ class LayoutParser(BaseModule):
             if not os.path.exists(path_onnx):
                 self.convert_onnx(path_onnx)

+            self.model = None
+
             model = onnx.load(path_onnx)
             if torch.cuda.is_available() and device == "cuda":
                 self.sess = onnxruntime.InferenceSession(
@@ -140,6 +141,9 @@ class LayoutParser(BaseModule):
             else:
                 self.sess = onnxruntime.InferenceSession(model.SerializeToString())

+        if self.model is not None:
+            self.model.to(self.device)
+
     def convert_onnx(self, path_onnx):
         dynamic_axes = {
             "input": {0: "batch_size"},
yomitoku/ocr.py
CHANGED
@@ -16,16 +16,37 @@ class WordPrediction(BaseSchema):
     )
     content: str
     direction: str
-    det_score: float
     rec_score: float
+    det_score: float


 class OCRSchema(BaseSchema):
     words: List[WordPrediction]


+def ocr_aggregate(det_outputs, rec_outputs):
+    words = []
+    for points, det_score, pred, rec_score, direction in zip(
+        det_outputs.points,
+        det_outputs.scores,
+        rec_outputs.contents,
+        rec_outputs.scores,
+        rec_outputs.directions,
+    ):
+        words.append(
+            {
+                "points": points,
+                "content": pred,
+                "direction": direction,
+                "det_score": det_score,
+                "rec_score": rec_score,
+            }
+        )
+    return words
+
+
 class OCR:
-    def __init__(self, configs=
+    def __init__(self, configs={}, device="cuda", visualize=False):
         text_detector_kwargs = {
             "device": device,
             "visualize": visualize,
@@ -36,10 +57,6 @@ class OCR:
         }

         if isinstance(configs, dict):
-            assert (
-                "text_detector" in configs or "text_recognizer" in configs
-            ), "Invalid config key. Please check the config keys."
-
             if "text_detector" in configs:
                 text_detector_kwargs.update(configs["text_detector"])
             if "text_recognizer" in configs:
@@ -52,26 +69,6 @@ class OCR:
         self.detector = TextDetector(**text_detector_kwargs)
         self.recognizer = TextRecognizer(**text_recognizer_kwargs)

-    def aggregate(self, det_outputs, rec_outputs):
-        words = []
-        for points, det_score, pred, rec_score, direction in zip(
-            det_outputs.points,
-            det_outputs.scores,
-            rec_outputs.contents,
-            rec_outputs.scores,
-            rec_outputs.directions,
-        ):
-            words.append(
-                {
-                    "points": points,
-                    "content": pred,
-                    "direction": direction,
-                    "det_score": det_score,
-                    "rec_score": rec_score,
-                }
-            )
-        return words
-
     def __call__(self, img):
         """_summary_

@@ -82,6 +79,6 @@
         det_outputs, vis = self.detector(img)
         rec_outputs, vis = self.recognizer(img, det_outputs.points, vis=vis)

-        outputs = {"words":
+        outputs = {"words": ocr_aggregate(det_outputs, rec_outputs)}
         results = OCRSchema(**outputs)
         return results, vis
yomitoku/table_structure_recognizer.py
CHANGED
@@ -35,10 +35,17 @@ class TableCellSchema(BaseSchema):
     contents: Union[str, None]


+class TableLineSchema(BaseSchema):
+    box: conlist(int, min_length=4, max_length=4)
+    score: float
+
+
 class TableStructureRecognizerSchema(BaseSchema):
     box: conlist(int, min_length=4, max_length=4)
     n_row: int
     n_col: int
+    rows: List[TableLineSchema]
+    cols: List[TableLineSchema]
     cells: List[TableCellSchema]
     order: int

@@ -133,8 +140,6 @@ class TableStructureRecognizer(BaseModule):
             num_top_queries=self._cfg.RTDETRTransformerv2.num_queries,
         )

-        self.save_config("table_structure_recognitizer.yaml")
-
         self.transforms = T.Compose(
             [
                 T.Resize(self._cfg.data.img_size),
@@ -155,6 +160,8 @@ class TableStructureRecognizer(BaseModule):
             if not os.path.exists(path_onnx):
                 self.convert_onnx(path_onnx)

+            self.model = None
+
             model = onnx.load(path_onnx)
             if torch.cuda.is_available() and device == "cuda":
                 self.sess = onnxruntime.InferenceSession(
@@ -163,6 +170,9 @@ class TableStructureRecognizer(BaseModule):
             else:
                 self.sess = onnxruntime.InferenceSession(model.SerializeToString())

+        if self.model is not None:
+            self.model.to(self.device)
+
     def convert_onnx(self, path_onnx):
         dynamic_axes = {
             "input": {0: "batch_size"},
@@ -232,7 +242,7 @@ class TableStructureRecognizer(BaseModule):
                 category_elements
             )

-            cells,
+            cells, rows, cols = self.extract_cell_elements(category_elements)

             table_x, table_y = data["offset"]
             table_x2 = table_x + data["size"][1]
@@ -241,8 +251,10 @@

             table = {
                 "box": table_box,
-                "n_row":
-                "n_col":
+                "n_row": len(rows),
+                "n_col": len(cols),
+                "rows": rows,
+                "cols": cols,
                 "cells": cells,
                 "order": 0,
             }
@@ -262,7 +274,10 @@ class TableStructureRecognizer(BaseModule):
         cells = extract_cells(row_boxes, col_boxes)
         cells = filter_contained_cells_within_spancell(cells, span_boxes)

-
+        rows = sorted(elements["row"], key=lambda x: x["box"][1])
+        cols = sorted(elements["col"], key=lambda x: x["box"][0])
+
+        return cells, rows, cols

     def __call__(self, img, table_boxes, vis=None):
         img_tensors = self.preprocess(img, table_boxes)
@@ -282,7 +297,9 @@ class TableStructureRecognizer(BaseModule):
             pred = self.model(data["tensor"])

             table = self.postprocess(pred, data)
-
+
+            if table.n_row > 0 and table.n_col > 0:
+                outputs.append(table)

         if vis is None and self.visualize:
             vis = img.copy()
yomitoku/text_detector.py
CHANGED
@@ -61,8 +61,6 @@ class TextDetector(BaseModule):
         self.visualize = visualize

         self.model.eval()
-        self.model.to(self.device)
-
         self.post_processor = DBnetPostProcessor(**self._cfg.post_process)
         self.infer_onnx = infer_onnx

@@ -72,6 +70,8 @@ class TextDetector(BaseModule):
             if not os.path.exists(path_onnx):
                 self.convert_onnx(path_onnx)

+            self.model = None
+
             model = onnx.load(path_onnx)
             if torch.cuda.is_available() and device == "cuda":
                 self.sess = onnxruntime.InferenceSession(
@@ -80,6 +80,11 @@ class TextDetector(BaseModule):
             else:
                 self.sess = onnxruntime.InferenceSession(model.SerializeToString())

+            self.model = None
+
+        if self.model is not None:
+            self.model.to(self.device)
+
     def convert_onnx(self, path_onnx):
         dynamic_axes = {
             "input": {0: "batch_size", 2: "height", 3: "width"},
@@ -138,9 +143,9 @@ class TextDetector(BaseModule):
         vis = None
         if self.visualize:
             vis = det_visualizer(
-                preds,
                 img,
                 quads,
+                preds=preds,
                 vis_heatmap=self._cfg.visualize.heatmap,
                 line_color=tuple(self._cfg.visualize.color[::-1]),
             )
yomitoku/text_recognizer.py
CHANGED
@@ -64,7 +64,6 @@ class TextRecognizer(BaseModule):

         self.model.tokenizer = self.tokenizer
         self.model.eval()
-        self.model.to(self.device)

         self.visualize = visualize

@@ -76,6 +75,8 @@ class TextRecognizer(BaseModule):
             if not os.path.exists(path_onnx):
                 self.convert_onnx(path_onnx)

+            self.model = None
+
             model = onnx.load(path_onnx)
             if torch.cuda.is_available() and device == "cuda":
                 self.sess = onnxruntime.InferenceSession(
@@ -84,17 +85,31 @@ class TextRecognizer(BaseModule):
             else:
                 self.sess = onnxruntime.InferenceSession(model.SerializeToString())

+        if self.model is not None:
+            self.model.to(self.device)
+
     def preprocess(self, img, polygons):
         dataset = ParseqDataset(self._cfg, img, polygons)
-        dataloader =
-            dataset,
-            batch_size=self._cfg.data.batch_size,
-            shuffle=False,
-            num_workers=self._cfg.data.num_workers,
-        )
+        dataloader = self._make_mini_batch(dataset)

         return dataloader

+    def _make_mini_batch(self, dataset):
+        mini_batches = []
+        mini_batch = []
+        for data in dataset:
+            data = torch.unsqueeze(data, 0)
+            mini_batch.append(data)
+
+            if len(mini_batch) == self._cfg.data.batch_size:
+                mini_batches.append(torch.cat(mini_batch, 0))
+                mini_batch = []
+        else:
+            if len(mini_batch) > 0:
+                mini_batches.append(torch.cat(mini_batch, 0))
+
+        return mini_batches
+
     def convert_onnx(self, path_onnx):
         img_size = self._cfg.data.img_size
         input = torch.randn(1, 3, *img_size, requires_grad=True)
yomitoku/utils/misc.py
CHANGED
@@ -9,6 +9,24 @@ def filter_by_flag(elements, flags):
     return [element for element, flag in zip(elements, flags) if flag]


+def calc_overlap_ratio(rect_a, rect_b):
+    intersection = calc_intersection(rect_a, rect_b)
+    if intersection is None:
+        return 0, None
+
+    ix1, iy1, ix2, iy2 = intersection
+
+    overlap_width = ix2 - ix1
+    overlap_height = iy2 - iy1
+    bx1, by1, bx2, by2 = rect_b
+
+    b_area = (bx2 - bx1) * (by2 - by1)
+    overlap_area = overlap_width * overlap_height
+
+    overlap_ratio = overlap_area / b_area
+    return overlap_ratio, intersection
+
+
 def is_contained(rect_a, rect_b, threshold=0.8):
     """二つの矩形A, Bが与えられたとき、矩形Bが矩形Aに含まれるかどうかを判定する。
     ずれを許容するため、重複率求め、thresholdを超える場合にTrueを返す。
@@ -23,20 +41,9 @@ def is_contained(rect_a, rect_b, threshold=0.8):
         bool: 矩形Bが矩形Aに含まれる場合True
     """

-
-    if intersection is None:
-        return False
-
-    ix1, iy1, ix2, iy2 = intersection
-
-    overlap_width = ix2 - ix1
-    overlap_height = iy2 - iy1
-    bx1, by1, bx2, by2 = rect_b
-
-    b_area = (bx2 - bx1) * (by2 - by1)
-    overlap_area = overlap_width * overlap_height
+    overlap_ratio, _ = calc_overlap_ratio(rect_a, rect_b)

-    if
+    if overlap_ratio > threshold:
         return True

     return False
yomitoku/utils/visualizer.py
CHANGED
@@ -66,14 +66,14 @@ def reading_order_visualizer(
     return out


-def det_visualizer(
-    preds = preds["binary"][0]
-    binary = preds.detach().cpu().numpy()
+def det_visualizer(img, quads, preds=None, vis_heatmap=False, line_color=(0, 255, 0)):
     out = img.copy()
     h, w = out.shape[:2]
-    binary = binary.squeeze(0)
-    binary = (binary * 255).astype(np.uint8)
     if vis_heatmap:
+        preds = preds["binary"][0]
+        binary = preds.detach().cpu().numpy()
+        binary = binary.squeeze(0)
+        binary = (binary * 255).astype(np.uint8)
         binary = cv2.resize(binary, (w, h), interpolation=cv2.INTER_LINEAR)
         heatmap = cv2.applyColorMap(binary, cv2.COLORMAP_JET)
         out = cv2.addWeighted(out, 0.5, heatmap, 0.5, 0)
{yomitoku-0.6.0.dist-info → yomitoku-0.7.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: yomitoku
-Version: 0.
+Version: 0.7.1
 Summary: Yomitoku is an AI-powered document image analysis package designed specifically for the Japanese language.
 Author-email: Kotaro Kinoshita <kotaro.kinoshita@mlism.com>
 License: CC BY-NC-SA 4.0
@@ -10,7 +10,6 @@ Requires-Dist: huggingface-hub>=0.26.1
 Requires-Dist: lxml>=5.3.0
 Requires-Dist: omegaconf>=2.3.0
 Requires-Dist: onnx>=1.17.0
-Requires-Dist: onnxruntime-gpu>=1.20.1
 Requires-Dist: onnxruntime>=1.20.1
 Requires-Dist: opencv-python>=4.10.0.84
 Requires-Dist: pyclipper>=1.3.0.post6
@@ -20,6 +19,8 @@ Requires-Dist: shapely>=2.0.6
 Requires-Dist: timm>=1.0.11
 Requires-Dist: torch>=2.5.0
 Requires-Dist: torchvision>=0.20.0
+Provides-Extra: gpu
+Requires-Dist: onnxruntime-gpu>=1.20.1; extra == 'gpu'
 Description-Content-Type: text/markdown

 日本語版 | [English](README_EN.md)
@@ -72,6 +73,11 @@ Markdown でエクスポートした結果は関してはリポジトリ内の[s
 pip install yomitoku
 ```

+onnxruntimeの実行にGPUを使用する場合
+```
+pip install yomitoku[gpu]
+```
+
 - pytorch はご自身の CUDA のバージョンにあったものをインストールしてください。デフォルトでは CUDA12.4 以上に対応したものがインストールされます。
 - pytorch は 2.5 以上のバージョンに対応しています。その関係で CUDA11.8 以上のバージョンが必要になります。対応できない場合は、リポジトリ内の Dockerfile を利用してください。

@@ -89,7 +95,8 @@ yomitoku ${path_data} -f md -o results -v --figure --lite
 - `-d`, `--device` モデルを実行するためのデバイスを指定します。gpu が利用できない場合は cpu で推論が実行されます。(デフォルト: cuda)
 - `--ignore_line_break` 画像の改行位置を無視して、段落内の文章を連結して返します。(デフォルト:画像通りの改行位置位置で改行します。)
 - `--figure_letter` 検出した図表に含まれる文字も出力ファイルにエクスポートします。
-- `--figure` 検出した図、画像を出力ファイルにエクスポートします。
+- `--figure` 検出した図、画像を出力ファイルにエクスポートします。
+- `--encoding` エクスポートする出力ファイルの文字エンコーディングを指定します。サポートされていない文字コードが含まれる場合は、その文字を無視します。(utf-8, utf-8-sig, shift-jis, enc-jp, cp932)

 その他のオプションに関しては、ヘルプを参照

{yomitoku-0.6.0.dist-info → yomitoku-0.7.1.dist-info}/RECORD
CHANGED
@@ -1,16 +1,16 @@
 yomitoku/__init__.py,sha256=kXOM8RbpwwLABG3p3vPT3dJWBk4JX2MFGrOeBEW0hKM,543
 yomitoku/base.py,sha256=lzR_V8t87aRasmFdFwD-8KAeSahSTI3AZaEn6g8sOv8,3871
 yomitoku/constants.py,sha256=zlW5QRc_u_F3C2RAgBFWyHJZexBnJT5N15GC-9d3iLo,686
-yomitoku/document_analyzer.py,sha256=
-yomitoku/layout_analyzer.py,sha256=
-yomitoku/layout_parser.py,sha256=
-yomitoku/ocr.py,sha256=
+yomitoku/document_analyzer.py,sha256=B2F_MXFKbq58ePDCgcZKk_bgQUkno1ehYb6CZmAekCk,16234
+yomitoku/layout_analyzer.py,sha256=VhNf1ZQFoozj6WUGk5ll1p2p1jk5X3j-JPcDbTAoSl4,1856
+yomitoku/layout_parser.py,sha256=V_mAkZxke1gwHfnxBFMTOJ8hnz2X_kfZu2lLiMd8cAs,7610
+yomitoku/ocr.py,sha256=JSTjkupcxHITQm6ERnzU7As0c3KWf8-oxc0AqNoWHXo,2272
 yomitoku/reading_order.py,sha256=OfhOS9ttPDoPSuHrIRKyOzG19GGeRufbuSKDqhsohh4,6404
-yomitoku/table_structure_recognizer.py,sha256=
-yomitoku/text_detector.py,sha256=
-yomitoku/text_recognizer.py,sha256=
+yomitoku/table_structure_recognizer.py,sha256=Eam9t7OjW4a-UWk_dl-ylbOcinN_Te_ovuri2naldL0,9482
+yomitoku/text_detector.py,sha256=XgqhtbNcJww2x3BrH8EFz45qC6kqPKCX9hsa-dzRoIA,4274
+yomitoku/text_recognizer.py,sha256=LVMjy-PaGlDQqfJrjKX_7vOQXDyFg6FaCeIQIyWUJX8,5833
 yomitoku/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-yomitoku/cli/main.py,sha256=
+yomitoku/cli/main.py,sha256=Dcu-jBql5dP4DiN0QWbPAQscnJ7tTtbWUnaohz3FFbs,7868
 yomitoku/configs/__init__.py,sha256=e1Alss5QJLZSNfD6zLEG6xu5vDQDw-4Jayiqq8bq52s,571
 yomitoku/configs/cfg_layout_parser_rtdtrv2.py,sha256=8PRxB2Ar9UF7-DLtbgSokhrzdXb0veWI6Wc-X8qigRw,2329
 yomitoku/configs/cfg_table_structure_recognizer_rtdtrv2.py,sha256=o70GMHD8k-zeBeJtuhPS8x7vVB-ffucnJXeSyn-0AXo,2116
@@ -21,10 +21,10 @@ yomitoku/data/__init__.py,sha256=KAofFc9rk9ZdTKBjemu9RM8Vj9XnKbWC2MPZ2RWtOdE,82
 yomitoku/data/dataset.py,sha256=-I4f-FDtgsPnJ2MnXB7FtwihMW3koDaSI1OEoqKneIg,1014
 yomitoku/data/functions.py,sha256=eOyxo8S6EoAf1xGSPLWQFb9-t5Rg52NggD9MFIrOSpY,7506
 yomitoku/export/__init__.py,sha256=aANEfuovH2aevFjb2pGrBLFP-4iRzEzD9wcriCR-M7I,229
-yomitoku/export/export_csv.py,sha256
-yomitoku/export/export_html.py,sha256=
-yomitoku/export/export_json.py,sha256
-yomitoku/export/export_markdown.py,sha256=
+yomitoku/export/export_csv.py,sha256=gKIhhFKOyZaxwmjbtxk8rkFOv3uPaRwfOnKwTC8d7K0,2935
+yomitoku/export/export_html.py,sha256=RsFU-IMtBOJWCdqHhp5btswf2fzfow01ypY1h6E6Vvo,4979
+yomitoku/export/export_json.py,sha256=-57hcT2ENTa1HcT7YMjXiv6tZZf_Y_1q1xu2Jt9T1P4,1976
+yomitoku/export/export_markdown.py,sha256=kGOyopq_vT2NqM4LoAu4JIPbjV_SLg9H0MIOAKpS-Gk,4099
 yomitoku/models/__init__.py,sha256=Enxq9sjJWusZuxecTori8IQa8NEYKaiiptDluHX1avg,144
 yomitoku/models/dbnet_plus.py,sha256=jeWJZm0ihbxoJeAXBFK7uVIwoosx2IUNk7Ut5wRH0vA,7998
 yomitoku/models/parseq.py,sha256=-DQMQuON2jwtb4Ib2V0O19un9w-WG4rXS0SiscydrXU,8593
@@ -46,9 +46,9 @@ yomitoku/resource/charset.txt,sha256=sU91kSi-9Wk4733bCXy4j_UDmvcsj96sHOq1ppUJlOY
 yomitoku/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 yomitoku/utils/graph.py,sha256=LKNB8ZhSQwOZMfeAimPMF5UCVVr2ZaUWoGDkz8z-uGU,456
 yomitoku/utils/logger.py,sha256=uOmtQDr0A0JD7wyFshedL08BiNrQorHnpktRXba8bjU,424
-yomitoku/utils/misc.py,sha256=
-yomitoku/utils/visualizer.py,sha256=
-yomitoku-0.
-yomitoku-0.
-yomitoku-0.
-yomitoku-0.
+yomitoku/utils/misc.py,sha256=FbwPLeIYYBvNf9wQh2RoEonTM5BF7_IwaEqmRsYHKA8,2673
+yomitoku/utils/visualizer.py,sha256=DjDwHiAu1iFRKh96H3Egq4vuI2s_-9dLCDeykhKi8jo,5251
+yomitoku-0.7.1.dist-info/METADATA,sha256=NgweyozOKBkIn9-yxX0hOV313-dBh29SsFQ22BSvoD8,8488
+yomitoku-0.7.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+yomitoku-0.7.1.dist-info/entry_points.txt,sha256=nFV3S11zgBNW0Qq_D0XQNg2R4lNXU_9XUFr6rdJoyF8,52
+yomitoku-0.7.1.dist-info/RECORD,,
{yomitoku-0.6.0.dist-info → yomitoku-0.7.1.dist-info}/entry_points.txt
File without changes