yomitoku 0.5.3__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yomitoku/cli/main.py +47 -1
- yomitoku/configs/__init__.py +2 -0
- yomitoku/configs/cfg_text_recognizer_parseq_small.py +51 -0
- yomitoku/document_analyzer.py +229 -26
- yomitoku/export/export_csv.py +39 -2
- yomitoku/export/export_html.py +2 -1
- yomitoku/export/export_json.py +40 -2
- yomitoku/export/export_markdown.py +2 -1
- yomitoku/layout_analyzer.py +1 -5
- yomitoku/layout_parser.py +58 -4
- yomitoku/models/layers/rtdetr_backbone.py +5 -15
- yomitoku/models/layers/rtdetr_hybrid_encoder.py +6 -18
- yomitoku/models/layers/rtdetrv2_decoder.py +17 -42
- yomitoku/models/parseq.py +9 -9
- yomitoku/ocr.py +24 -27
- yomitoku/onnx/.gitkeep +0 -0
- yomitoku/postprocessor/rtdetr_postprocessor.py +4 -13
- yomitoku/table_structure_recognizer.py +79 -9
- yomitoku/text_detector.py +57 -7
- yomitoku/text_recognizer.py +80 -16
- yomitoku/utils/misc.py +20 -13
- yomitoku/utils/visualizer.py +5 -5
- {yomitoku-0.5.3.dist-info → yomitoku-0.7.0.dist-info}/METADATA +21 -9
- {yomitoku-0.5.3.dist-info → yomitoku-0.7.0.dist-info}/RECORD +26 -24
- {yomitoku-0.5.3.dist-info → yomitoku-0.7.0.dist-info}/WHEEL +1 -1
- {yomitoku-0.5.3.dist-info → yomitoku-0.7.0.dist-info}/entry_points.txt +0 -0
yomitoku/table_structure_recognizer.py
CHANGED
@@ -1,11 +1,16 @@
 from typing import List, Union
 
 import cv2
+import os
+import onnx
+import onnxruntime
 import torch
 import torchvision.transforms as T
 from PIL import Image
 from pydantic import conlist
 
+from .constants import ROOT_DIR
+
 from .base import BaseModelCatalog, BaseModule, BaseSchema
 from .configs import TableStructureRecognizerRTDETRv2Config
 from .layout_parser import filter_contained_rectangles_within_category
@@ -30,10 +35,17 @@ class TableCellSchema(BaseSchema):
     contents: Union[str, None]
 
 
+class TableLineSchema(BaseSchema):
+    box: conlist(int, min_length=4, max_length=4)
+    score: float
+
+
 class TableStructureRecognizerSchema(BaseSchema):
     box: conlist(int, min_length=4, max_length=4)
     n_row: int
     n_col: int
+    rows: List[TableLineSchema]
+    cols: List[TableLineSchema]
     cells: List[TableCellSchema]
     order: int
 
@@ -109,12 +121,13 @@ class TableStructureRecognizer(BaseModule):
         device="cuda",
         visualize=False,
         from_pretrained=True,
+        infer_onnx=False,
     ):
         super().__init__()
         self.load_model(
             model_name,
             path_cfg,
-            from_pretrained=
+            from_pretrained=from_pretrained,
         )
         self.device = device
         self.visualize = visualize
@@ -140,6 +153,45 @@ class TableStructureRecognizer(BaseModule):
             id: category for id, category in enumerate(self._cfg.category)
         }
 
+        self.infer_onnx = infer_onnx
+        if infer_onnx:
+            name = self._cfg.hf_hub_repo.split("/")[-1]
+            path_onnx = f"{ROOT_DIR}/onnx/{name}.onnx"
+            if not os.path.exists(path_onnx):
+                self.convert_onnx(path_onnx)
+
+            self.model = None
+
+            model = onnx.load(path_onnx)
+            if torch.cuda.is_available() and device == "cuda":
+                self.sess = onnxruntime.InferenceSession(
+                    model.SerializeToString(), providers=["CUDAExecutionProvider"]
+                )
+            else:
+                self.sess = onnxruntime.InferenceSession(model.SerializeToString())
+
+        if self.model is not None:
+            self.model.to(self.device)
+
+    def convert_onnx(self, path_onnx):
+        dynamic_axes = {
+            "input": {0: "batch_size"},
+            "output": {0: "batch_size"},
+        }
+
+        img_size = self._cfg.data.img_size
+        dummy_input = torch.randn(1, 3, *img_size, requires_grad=True)
+
+        torch.onnx.export(
+            self.model,
+            dummy_input,
+            path_onnx,
+            opset_version=16,
+            input_names=["input"],
+            output_names=["pred_logits", "pred_boxes"],
+            dynamic_axes=dynamic_axes,
+        )
+
     def preprocess(self, img, boxes):
         cv_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
 
@@ -149,7 +201,7 @@ class TableStructureRecognizer(BaseModule):
             table_img = cv_img[y1:y2, x1:x2, :]
             th, hw = table_img.shape[:2]
             table_img = Image.fromarray(table_img)
-            img_tensor = self.transforms(table_img)[None]
+            img_tensor = self.transforms(table_img)[None]
             table_imgs.append(
                 {
                     "tensor": img_tensor,
@@ -190,7 +242,7 @@ class TableStructureRecognizer(BaseModule):
                 category_elements
             )
 
-        cells,
+        cells, rows, cols = self.extract_cell_elements(category_elements)
 
         table_x, table_y = data["offset"]
         table_x2 = table_x + data["size"][1]
@@ -199,8 +251,10 @@ class TableStructureRecognizer(BaseModule):
 
         table = {
             "box": table_box,
-            "n_row":
-            "n_col":
+            "n_row": len(rows),
+            "n_col": len(cols),
+            "rows": rows,
+            "cols": cols,
             "cells": cells,
             "order": 0,
         }
@@ -220,16 +274,32 @@ class TableStructureRecognizer(BaseModule):
         cells = extract_cells(row_boxes, col_boxes)
         cells = filter_contained_cells_within_spancell(cells, span_boxes)
 
-
+        rows = sorted(elements["row"], key=lambda x: x["box"][1])
+        cols = sorted(elements["col"], key=lambda x: x["box"][0])
+
+        return cells, rows, cols
 
     def __call__(self, img, table_boxes, vis=None):
        img_tensors = self.preprocess(img, table_boxes)
        outputs = []
        for data in img_tensors:
-
-
+            if self.infer_onnx:
+                input = data["tensor"].numpy()
+                results = self.sess.run(None, {"input": input})
+                pred = {
+                    "pred_logits": torch.tensor(results[0]).to(self.device),
+                    "pred_boxes": torch.tensor(results[1]).to(self.device),
+                }
+
+            else:
+                with torch.inference_mode():
+                    data["tensor"] = data["tensor"].to(self.device)
+                    pred = self.model(data["tensor"])
+
            table = self.postprocess(pred, data)
-
+
+            if table.n_row > 0 and table.n_col > 0:
+                outputs.append(table)
 
        if vis is None and self.visualize:
            vis = img.copy()
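The ONNX branch above always routes through the same provider-selection logic before inference. The snippet below isolates that pattern as a standalone sketch; `make_session` is a name introduced here for illustration only and is not part of the yomitoku API.

```python
import onnx
import onnxruntime
import torch


def make_session(path_onnx, device="cuda"):
    # Mirrors the 0.7.0 behaviour: prefer the CUDA execution provider only
    # when torch reports a usable GPU and the caller asked for "cuda",
    # otherwise fall back to onnxruntime's default (CPU) provider.
    model = onnx.load(path_onnx)
    if torch.cuda.is_available() and device == "cuda":
        return onnxruntime.InferenceSession(
            model.SerializeToString(), providers=["CUDAExecutionProvider"]
        )
    return onnxruntime.InferenceSession(model.SerializeToString())
```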
yomitoku/text_detector.py
CHANGED
@@ -2,6 +2,7 @@ from typing import List
 
 import numpy as np
 import torch
+import os
 from pydantic import conlist
 
 from .base import BaseModelCatalog, BaseModule, BaseSchema
@@ -14,6 +15,10 @@ from .data.functions import (
 from .models import DBNet
 from .postprocessor import DBnetPostProcessor
 from .utils.visualizer import det_visualizer
+from .constants import ROOT_DIR
+
+import onnx
+import onnxruntime
 
 
 class TextDetectorModelCatalog(BaseModelCatalog):
@@ -43,21 +48,60 @@ class TextDetector(BaseModule):
         device="cuda",
         visualize=False,
         from_pretrained=True,
+        infer_onnx=False,
     ):
         super().__init__()
         self.load_model(
             model_name,
             path_cfg,
-            from_pretrained=
+            from_pretrained=from_pretrained,
         )
 
         self.device = device
         self.visualize = visualize
 
         self.model.eval()
-        self.model.to(self.device)
-
         self.post_processor = DBnetPostProcessor(**self._cfg.post_process)
+        self.infer_onnx = infer_onnx
+
+        if infer_onnx:
+            name = self._cfg.hf_hub_repo.split("/")[-1]
+            path_onnx = f"{ROOT_DIR}/onnx/{name}.onnx"
+            if not os.path.exists(path_onnx):
+                self.convert_onnx(path_onnx)
+
+            self.model = None
+
+            model = onnx.load(path_onnx)
+            if torch.cuda.is_available() and device == "cuda":
+                self.sess = onnxruntime.InferenceSession(
+                    model.SerializeToString(), providers=["CUDAExecutionProvider"]
+                )
+            else:
+                self.sess = onnxruntime.InferenceSession(model.SerializeToString())
+
+            self.model = None
+
+        if self.model is not None:
+            self.model.to(self.device)
+
+    def convert_onnx(self, path_onnx):
+        dynamic_axes = {
+            "input": {0: "batch_size", 2: "height", 3: "width"},
+            "output": {0: "batch_size", 2: "height", 3: "width"},
+        }
+
+        dummy_input = torch.randn(1, 3, 256, 256, requires_grad=True)
+
+        torch.onnx.export(
+            self.model,
+            dummy_input,
+            path_onnx,
+            opset_version=14,
+            input_names=["input"],
+            output_names=["output"],
+            dynamic_axes=dynamic_axes,
+        )
 
     def preprocess(self, img):
         img = img.copy()
@@ -81,9 +125,15 @@ class TextDetector(BaseModule):
 
         ori_h, ori_w = img.shape[:2]
         tensor = self.preprocess(img)
-
-
-
+
+        if self.infer_onnx:
+            input = tensor.numpy()
+            results = self.sess.run(["output"], {"input": input})
+            preds = {"binary": torch.tensor(results[0])}
+        else:
+            with torch.inference_mode():
+                tensor = tensor.to(self.device)
+                preds = self.model(tensor)
 
         quads, scores = self.postprocess(preds, (ori_h, ori_w))
         outputs = {"points": quads, "scores": scores}
@@ -93,9 +143,9 @@ class TextDetector(BaseModule):
         vis = None
         if self.visualize:
             vis = det_visualizer(
-                preds,
                 img,
                 quads,
+                preds=preds,
                 vis_heatmap=self._cfg.visualize.heatmap,
                 line_color=tuple(self._cfg.visualize.color[::-1]),
             )
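TextDetector.convert_onnx declares the batch, height, and width axes as dynamic, so one exported file can serve pages of any resolution. A minimal self-contained sketch of the same torch.onnx.export call pattern, with a toy convolutional network standing in for DBNet:

```python
import torch
import torch.nn as nn

# Toy stand-in for the detector; only the export call pattern matters here.
net = nn.Sequential(nn.Conv2d(3, 1, 3, padding=1), nn.Sigmoid()).eval()
dummy = torch.randn(1, 3, 256, 256)

torch.onnx.export(
    net,
    dummy,
    "toy_detector.onnx",
    opset_version=14,
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={
        "input": {0: "batch_size", 2: "height", 3: "width"},
        "output": {0: "batch_size", 2: "height", 3: "width"},
    },
)
```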
yomitoku/text_recognizer.py
CHANGED
@@ -2,22 +2,28 @@ from typing import List
 
 import numpy as np
 import torch
+import os
 import unicodedata
 from pydantic import conlist
 
 from .base import BaseModelCatalog, BaseModule, BaseSchema
-from .configs import TextRecognizerPARSeqConfig
+from .configs import TextRecognizerPARSeqConfig, TextRecognizerPARSeqSmallConfig
 from .data.dataset import ParseqDataset
 from .models import PARSeq
 from .postprocessor import ParseqTokenizer as Tokenizer
 from .utils.misc import load_charset
 from .utils.visualizer import rec_visualizer
 
+from .constants import ROOT_DIR
+import onnx
+import onnxruntime
+
 
 class TextRecognizerModelCatalog(BaseModelCatalog):
     def __init__(self):
         super().__init__()
         self.register("parseq", TextRecognizerPARSeqConfig, PARSeq)
+        self.register("parseq-small", TextRecognizerPARSeqSmallConfig, PARSeq)
 
 
 class TextRecognizerSchema(BaseSchema):
@@ -43,34 +49,86 @@ class TextRecognizer(BaseModule):
         device="cuda",
         visualize=False,
         from_pretrained=True,
+        infer_onnx=False,
     ):
         super().__init__()
         self.load_model(
             model_name,
             path_cfg,
-            from_pretrained=
+            from_pretrained=from_pretrained,
        )
        self.charset = load_charset(self._cfg.charset)
        self.tokenizer = Tokenizer(self.charset)
 
        self.device = device
 
+        self.model.tokenizer = self.tokenizer
        self.model.eval()
-        self.model.to(self.device)
 
        self.visualize = visualize
 
+        self.infer_onnx = infer_onnx
+
+        if infer_onnx:
+            name = self._cfg.hf_hub_repo.split("/")[-1]
+            path_onnx = f"{ROOT_DIR}/onnx/{name}.onnx"
+            if not os.path.exists(path_onnx):
+                self.convert_onnx(path_onnx)
+
+            self.model = None
+
+            model = onnx.load(path_onnx)
+            if torch.cuda.is_available() and device == "cuda":
+                self.sess = onnxruntime.InferenceSession(
+                    model.SerializeToString(), providers=["CUDAExecutionProvider"]
+                )
+            else:
+                self.sess = onnxruntime.InferenceSession(model.SerializeToString())
+
+        if self.model is not None:
+            self.model.to(self.device)
+
    def preprocess(self, img, polygons):
        dataset = ParseqDataset(self._cfg, img, polygons)
-        dataloader =
-            dataset,
-            batch_size=self._cfg.data.batch_size,
-            shuffle=False,
-            num_workers=self._cfg.data.num_workers,
-        )
+        dataloader = self._make_mini_batch(dataset)
 
        return dataloader
 
+    def _make_mini_batch(self, dataset):
+        mini_batches = []
+        mini_batch = []
+        for data in dataset:
+            data = torch.unsqueeze(data, 0)
+            mini_batch.append(data)
+
+            if len(mini_batch) == self._cfg.data.batch_size:
+                mini_batches.append(torch.cat(mini_batch, 0))
+                mini_batch = []
+        else:
+            if len(mini_batch) > 0:
+                mini_batches.append(torch.cat(mini_batch, 0))
+
+        return mini_batches
+
+    def convert_onnx(self, path_onnx):
+        img_size = self._cfg.data.img_size
+        input = torch.randn(1, 3, *img_size, requires_grad=True)
+        dynamic_axes = {
+            "input": {0: "batch_size"},
+            "output": {0: "batch_size"},
+        }
+
+        torch.onnx.export(
+            self.model,
+            input,
+            path_onnx,
+            opset_version=14,
+            input_names=["input"],
+            output_names=["output"],
+            do_constant_folding=True,
+            dynamic_axes=dynamic_axes,
+        )
+
    def postprocess(self, p, points):
        pred, score = self.tokenizer.decode(p)
        pred = [unicodedata.normalize("NFKC", x) for x in pred]
@@ -101,13 +159,19 @@ class TextRecognizer(BaseModule):
        scores = []
        directions = []
        for data in dataloader:
-
-
-
-
-
-
-
+            if self.infer_onnx:
+                input = data.numpy()
+                results = self.sess.run(["output"], {"input": input})
+                p = torch.tensor(results[0])
+            else:
+                with torch.inference_mode():
+                    data = data.to(self.device)
+                    p = self.model(data).softmax(-1)
+
+            pred, score, direction = self.postprocess(p, points)
+            preds.extend(pred)
+            scores.extend(score)
+            directions.extend(direction)
 
        outputs = {
            "contents": preds,
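The recognizer now batches cropped line images itself instead of going through torch.utils.data.DataLoader. A standalone sketch of the same chunking logic follows; the function name and the toy shapes are illustrative, not part of the package:

```python
import torch


def make_mini_batches(dataset, batch_size):
    # Stack items into fixed-size batches; the final batch may be smaller.
    mini_batches, mini_batch = [], []
    for data in dataset:
        mini_batch.append(torch.unsqueeze(data, 0))
        if len(mini_batch) == batch_size:
            mini_batches.append(torch.cat(mini_batch, 0))
            mini_batch = []
    if mini_batch:
        mini_batches.append(torch.cat(mini_batch, 0))
    return mini_batches


# 10 crops with batch_size=4 -> batches of 4, 4 and 2.
crops = [torch.zeros(3, 32, 128) for _ in range(10)]
print([b.shape[0] for b in make_mini_batches(crops, 4)])  # [4, 4, 2]
```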
yomitoku/utils/misc.py
CHANGED
@@ -9,6 +9,24 @@ def filter_by_flag(elements, flags):
     return [element for element, flag in zip(elements, flags) if flag]
 
 
+def calc_overlap_ratio(rect_a, rect_b):
+    intersection = calc_intersection(rect_a, rect_b)
+    if intersection is None:
+        return 0, None
+
+    ix1, iy1, ix2, iy2 = intersection
+
+    overlap_width = ix2 - ix1
+    overlap_height = iy2 - iy1
+    bx1, by1, bx2, by2 = rect_b
+
+    b_area = (bx2 - bx1) * (by2 - by1)
+    overlap_area = overlap_width * overlap_height
+
+    overlap_ratio = overlap_area / b_area
+    return overlap_ratio, intersection
+
+
 def is_contained(rect_a, rect_b, threshold=0.8):
     """Given two rectangles A and B, determine whether rectangle B is contained in rectangle A.
     To tolerate small misalignments, the overlap ratio is computed and True is returned when it exceeds the threshold.
@@ -23,20 +41,9 @@ def is_contained(rect_a, rect_b, threshold=0.8):
         bool: True if rectangle B is contained in rectangle A
     """
 
-
-    if intersection is None:
-        return False
-
-    ix1, iy1, ix2, iy2 = intersection
-
-    overlap_width = ix2 - ix1
-    overlap_height = iy2 - iy1
-    bx1, by1, bx2, by2 = rect_b
-
-    b_area = (bx2 - bx1) * (by2 - by1)
-    overlap_area = overlap_width * overlap_height
+    overlap_ratio, _ = calc_overlap_ratio(rect_a, rect_b)
 
-    if
+    if overlap_ratio > threshold:
         return True
 
     return False
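calc_overlap_ratio reports how much of rectangle B is covered by its intersection with rectangle A, which is_contained then compares against the threshold. A self-contained illustration follows; yomitoku's own calc_intersection lives elsewhere in utils/misc.py, so the stand-in below assumes the usual (x1, y1, x2, y2) convention.

```python
def calc_intersection(rect_a, rect_b):
    # Stand-in for yomitoku's helper: intersection box, or None if disjoint.
    ix1, iy1 = max(rect_a[0], rect_b[0]), max(rect_a[1], rect_b[1])
    ix2, iy2 = min(rect_a[2], rect_b[2]), min(rect_a[3], rect_b[3])
    if ix1 >= ix2 or iy1 >= iy2:
        return None
    return ix1, iy1, ix2, iy2


def calc_overlap_ratio(rect_a, rect_b):
    # Ratio of B's area covered by the A∩B intersection, as in the diff above.
    intersection = calc_intersection(rect_a, rect_b)
    if intersection is None:
        return 0, None
    ix1, iy1, ix2, iy2 = intersection
    bx1, by1, bx2, by2 = rect_b
    overlap_area = (ix2 - ix1) * (iy2 - iy1)
    return overlap_area / ((bx2 - bx1) * (by2 - by1)), intersection


# B overlaps A by 75% of B's area, below the 0.8 default threshold of
# is_contained, so B would not be treated as contained in A.
print(calc_overlap_ratio((0, 0, 100, 100), (25, 0, 125, 40)))
# (0.75, (25, 0, 100, 40))
```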
yomitoku/utils/visualizer.py
CHANGED
@@ -66,14 +66,14 @@ def reading_order_visualizer(
     return out
 
 
-def det_visualizer(
-    preds = preds["binary"][0]
-    binary = preds.detach().cpu().numpy()
+def det_visualizer(img, quads, preds=None, vis_heatmap=False, line_color=(0, 255, 0)):
     out = img.copy()
     h, w = out.shape[:2]
-    binary = binary.squeeze(0)
-    binary = (binary * 255).astype(np.uint8)
     if vis_heatmap:
+        preds = preds["binary"][0]
+        binary = preds.detach().cpu().numpy()
+        binary = binary.squeeze(0)
+        binary = (binary * 255).astype(np.uint8)
         binary = cv2.resize(binary, (w, h), interpolation=cv2.INTER_LINEAR)
         heatmap = cv2.applyColorMap(binary, cv2.COLORMAP_JET)
         out = cv2.addWeighted(out, 0.5, heatmap, 0.5, 0)
{yomitoku-0.5.3.dist-info → yomitoku-0.7.0.dist-info}/METADATA
CHANGED
@@ -1,14 +1,16 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: yomitoku
-Version: 0.
+Version: 0.7.0
 Summary: Yomitoku is an AI-powered document image analysis package designed specifically for the Japanese language.
 Author-email: Kotaro Kinoshita <kotaro.kinoshita@mlism.com>
 License: CC BY-NC-SA 4.0
 Keywords: Deep Learning,Japanese,OCR
-Requires-Python: <3.13,>=3.
+Requires-Python: <3.13,>=3.10
 Requires-Dist: huggingface-hub>=0.26.1
 Requires-Dist: lxml>=5.3.0
 Requires-Dist: omegaconf>=2.3.0
+Requires-Dist: onnx>=1.17.0
+Requires-Dist: onnxruntime>=1.20.1
 Requires-Dist: opencv-python>=4.10.0.84
 Requires-Dist: pyclipper>=1.3.0.post6
 Requires-Dist: pydantic>=2.9.2
@@ -17,13 +19,15 @@ Requires-Dist: shapely>=2.0.6
 Requires-Dist: timm>=1.0.11
 Requires-Dist: torch>=2.5.0
 Requires-Dist: torchvision>=0.20.0
+Provides-Extra: gpu
+Requires-Dist: onnxruntime-gpu>=1.20.1; extra == 'gpu'
 Description-Content-Type: text/markdown
 
 Japanese | [English](README_EN.md)
 
 <img src="static/logo/horizontal.png" width="800px">
 
-
 
 
 
@@ -69,23 +73,30 @@ Markdown でエクスポートした結果は関してはリポジトリ内の[s
 pip install yomitoku
 ```
 
-
-
+To run onnxruntime on a GPU:
+```
+pip install yomitoku[gpu]
+```
+
+- Install a pytorch build that matches your CUDA version. By default, a build for CUDA 12.4 or later is installed.
+- pytorch 2.5 or later is required, which in turn requires CUDA 11.8 or later. If you cannot meet this requirement, use the Dockerfile in the repository.
 
 ## 🚀 Usage
 
 ```
-yomitoku ${path_data} -f md -o results -v --figure
+yomitoku ${path_data} -f md -o results -v --figure --lite
 ```
 
 - `${path_data}`: specify the path to a directory containing the images to analyze, or to a single image file. When a directory is given, images in its subdirectories are processed as well.
 - `-f`, `--format`: output file format. (json, csv, html, and md are supported)
 - `-o`, `--outdir`: output directory name; it is created if it does not exist.
 - `-v`, `--vis`: also output images visualizing the analysis results.
+- `-l`, `--lite`: run inference with the lightweight models. Inference is faster than usual, but accuracy may drop slightly.
 - `-d`, `--device`: device to run the models on. If a GPU is not available, inference runs on the CPU. (default: cuda)
 - `--ignore_line_break`: ignore the line-break positions in the image and concatenate the text within each paragraph. (default: keep the line breaks as they appear in the image.)
 - `--figure_letter`: also export the text contained in detected figures and tables to the output file.
-- `--figure`: export detected figures and images to the output file.
+- `--figure`: export detected figures and images to the output file.
+- `--encoding`: character encoding of the exported output files. Characters not supported by the chosen encoding are ignored. (utf-8, utf-8-sig, shift-jis, enc-jp, cp932)
 
 For other options, see the help output.
 
@@ -94,6 +105,7 @@ yomitoku --help
 ```
 
 **NOTE**
+
 - Running on a GPU is recommended. The models are not optimized for CPU inference, so processing takes a long time.
 - Only printed text recognition is supported. Handwritten text may sometimes be readable, but it is not officially supported.
 - Yomitoku is optimized for document OCR and is not tuned for scene OCR (reading text printed on signs or other non-paper surfaces).
@@ -107,6 +119,6 @@ yomitoku --help
 
 The source code stored in this repository and the model weight files on Hugging Face Hub related to this project are licensed under CC BY-NC-SA 4.0.
 They may be used freely for non-commercial personal use and for research purposes.
-
+For commercial use, a separate commercial license is available; please contact https://www.mlism.com/.
 
 YomiToku © 2024 by Kotaro Kinoshita is licensed under CC BY-NC-SA 4.0. To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/
{yomitoku-0.5.3.dist-info → yomitoku-0.7.0.dist-info}/RECORD
CHANGED
@@ -1,52 +1,54 @@
 yomitoku/__init__.py,sha256=kXOM8RbpwwLABG3p3vPT3dJWBk4JX2MFGrOeBEW0hKM,543
 yomitoku/base.py,sha256=lzR_V8t87aRasmFdFwD-8KAeSahSTI3AZaEn6g8sOv8,3871
 yomitoku/constants.py,sha256=zlW5QRc_u_F3C2RAgBFWyHJZexBnJT5N15GC-9d3iLo,686
-yomitoku/document_analyzer.py,sha256=
-yomitoku/layout_analyzer.py,sha256=
-yomitoku/layout_parser.py,sha256=
-yomitoku/ocr.py,sha256=
+yomitoku/document_analyzer.py,sha256=85j93l-6rvvRZsL0FD7EQG--84ZLPiKoNm2CE1Ss8LM,16271
+yomitoku/layout_analyzer.py,sha256=VhNf1ZQFoozj6WUGk5ll1p2p1jk5X3j-JPcDbTAoSl4,1856
+yomitoku/layout_parser.py,sha256=V_mAkZxke1gwHfnxBFMTOJ8hnz2X_kfZu2lLiMd8cAs,7610
+yomitoku/ocr.py,sha256=JSTjkupcxHITQm6ERnzU7As0c3KWf8-oxc0AqNoWHXo,2272
 yomitoku/reading_order.py,sha256=OfhOS9ttPDoPSuHrIRKyOzG19GGeRufbuSKDqhsohh4,6404
-yomitoku/table_structure_recognizer.py,sha256=
-yomitoku/text_detector.py,sha256=
-yomitoku/text_recognizer.py,sha256=
+yomitoku/table_structure_recognizer.py,sha256=Eam9t7OjW4a-UWk_dl-ylbOcinN_Te_ovuri2naldL0,9482
+yomitoku/text_detector.py,sha256=XgqhtbNcJww2x3BrH8EFz45qC6kqPKCX9hsa-dzRoIA,4274
+yomitoku/text_recognizer.py,sha256=LVMjy-PaGlDQqfJrjKX_7vOQXDyFg6FaCeIQIyWUJX8,5833
 yomitoku/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-yomitoku/cli/main.py,sha256=
-yomitoku/configs/__init__.py,sha256=
+yomitoku/cli/main.py,sha256=N0X4-z_jfFM5_buUpiLHHA68B5oPVVdmvwzXWn7qoUs,7822
+yomitoku/configs/__init__.py,sha256=e1Alss5QJLZSNfD6zLEG6xu5vDQDw-4Jayiqq8bq52s,571
 yomitoku/configs/cfg_layout_parser_rtdtrv2.py,sha256=8PRxB2Ar9UF7-DLtbgSokhrzdXb0veWI6Wc-X8qigRw,2329
 yomitoku/configs/cfg_table_structure_recognizer_rtdtrv2.py,sha256=o70GMHD8k-zeBeJtuhPS8x7vVB-ffucnJXeSyn-0AXo,2116
 yomitoku/configs/cfg_text_detector_dbnet.py,sha256=U9k48PON7haoOaytiELhbZRpv9RMiUm6nnfHmdxIa9Q,1153
 yomitoku/configs/cfg_text_recognizer_parseq.py,sha256=hpFs3nKqh4XdU3BZMTultegtLEGahEsCaZdjfKC_MO8,1247
+yomitoku/configs/cfg_text_recognizer_parseq_small.py,sha256=uCm_VC_G79IbZpOiK8fgYzAJ4b98H5pf328wyQomtfo,1259
 yomitoku/data/__init__.py,sha256=KAofFc9rk9ZdTKBjemu9RM8Vj9XnKbWC2MPZ2RWtOdE,82
 yomitoku/data/dataset.py,sha256=-I4f-FDtgsPnJ2MnXB7FtwihMW3koDaSI1OEoqKneIg,1014
 yomitoku/data/functions.py,sha256=eOyxo8S6EoAf1xGSPLWQFb9-t5Rg52NggD9MFIrOSpY,7506
 yomitoku/export/__init__.py,sha256=aANEfuovH2aevFjb2pGrBLFP-4iRzEzD9wcriCR-M7I,229
-yomitoku/export/export_csv.py,sha256
-yomitoku/export/export_html.py,sha256=
-yomitoku/export/export_json.py,sha256=
-yomitoku/export/export_markdown.py,sha256=
+yomitoku/export/export_csv.py,sha256=MzGS1Y6kiHo7vZV3heKkd_v5gdxJBrpa8Zt9gFMwG88,2869
+yomitoku/export/export_html.py,sha256=ezj96wQNqkBOCUOIPHFJW_BCh1I4Ij_8RDiKUxqaFok,4913
+yomitoku/export/export_json.py,sha256=Kz8MgWM0bd6SNaSiHZjs-IjhsvX19Y0ovlIxGcm1vIw,1910
+yomitoku/export/export_markdown.py,sha256=w9jT-A0__4rw1PaeGtRicuLu1rqeZO-ZLwyJm5F5PXQ,4033
 yomitoku/models/__init__.py,sha256=Enxq9sjJWusZuxecTori8IQa8NEYKaiiptDluHX1avg,144
 yomitoku/models/dbnet_plus.py,sha256=jeWJZm0ihbxoJeAXBFK7uVIwoosx2IUNk7Ut5wRH0vA,7998
-yomitoku/models/parseq.py,sha256
+yomitoku/models/parseq.py,sha256=-DQMQuON2jwtb4Ib2V0O19un9w-WG4rXS0SiscydrXU,8593
 yomitoku/models/rtdetr.py,sha256=oJsr8RHz3frslhLfXdVJve47lUsrmqLjfdTrZ41tlQ0,687
 yomitoku/models/layers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 yomitoku/models/layers/activate.py,sha256=S54GPssZBMloM2oFAXeDVMmBBZOWyjwU98Niq758txE,1244
 yomitoku/models/layers/dbnet_feature_attention.py,sha256=Vpp_PiLVuI7Zs30TTg4RNRn16KTb81ewonADpUHd4aE,6060
 yomitoku/models/layers/parseq_transformer.py,sha256=33eroJf8rmgIptP-NpZLJMhG7XOTwV4rXsq674VrKnU,6704
-yomitoku/models/layers/rtdetr_backbone.py,sha256=
-yomitoku/models/layers/rtdetr_hybrid_encoder.py,sha256=
-yomitoku/models/layers/rtdetrv2_decoder.py,sha256=
+yomitoku/models/layers/rtdetr_backbone.py,sha256=VOWFW7XFfJl4cvPaupqqP4-I-YHdwlVltQEgliD69As,9904
+yomitoku/models/layers/rtdetr_hybrid_encoder.py,sha256=ZnpEzJLzHgu_hrx7YK6myXZ4F1CDHRM501RbAPQdzdQ,14125
+yomitoku/models/layers/rtdetrv2_decoder.py,sha256=ggUwTdWpBfyYHnZuLx8vyH8n0XfZkQFtxgpY-1YI2sI,28070
+yomitoku/onnx/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 yomitoku/postprocessor/__init__.py,sha256=W4vUuqBaFtH5dlSBIYgyaCroGLMjpV6RrNGIBQ8NFVw,243
 yomitoku/postprocessor/dbnet_postporcessor.py,sha256=o_y8b5REd2dFEdIpRcr6o-XBfOCHo9rBYGwokP_uhTc,4948
 yomitoku/postprocessor/parseq_tokenizer.py,sha256=e89_g_bc4Au3SchuxoJfJNATJTxFmVYetzXyAzPWm28,4315
-yomitoku/postprocessor/rtdetr_postprocessor.py,sha256=
+yomitoku/postprocessor/rtdetr_postprocessor.py,sha256=TCv1t1zCxg2rSirsLm4sXlaltGubH-roVdEqnUoRs-8,3905
 yomitoku/resource/MPLUS1p-Medium.ttf,sha256=KLL1KkCumIBkgQtx1n4SffdaFuCNffThktEAbkB1OU8,1758908
 yomitoku/resource/charset.txt,sha256=sU91kSi-9Wk4733bCXy4j_UDmvcsj96sHOq1ppUJlOY,21672
 yomitoku/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 yomitoku/utils/graph.py,sha256=LKNB8ZhSQwOZMfeAimPMF5UCVVr2ZaUWoGDkz8z-uGU,456
 yomitoku/utils/logger.py,sha256=uOmtQDr0A0JD7wyFshedL08BiNrQorHnpktRXba8bjU,424
-yomitoku/utils/misc.py,sha256=
-yomitoku/utils/visualizer.py,sha256=
-yomitoku-0.
-yomitoku-0.
-yomitoku-0.
-yomitoku-0.
+yomitoku/utils/misc.py,sha256=FbwPLeIYYBvNf9wQh2RoEonTM5BF7_IwaEqmRsYHKA8,2673
+yomitoku/utils/visualizer.py,sha256=DjDwHiAu1iFRKh96H3Egq4vuI2s_-9dLCDeykhKi8jo,5251
+yomitoku-0.7.0.dist-info/METADATA,sha256=Yvpxy_oWORSz_db4yzledIhFHbuQbORz0DrMisf59zQ,8488
+yomitoku-0.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+yomitoku-0.7.0.dist-info/entry_points.txt,sha256=nFV3S11zgBNW0Qq_D0XQNg2R4lNXU_9XUFr6rdJoyF8,52
+yomitoku-0.7.0.dist-info/RECORD,,

File without changes