yomitoku 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yomitoku/cli/main.py +15 -0
- yomitoku/configs/__init__.py +2 -0
- yomitoku/configs/cfg_text_recognizer_parseq_small.py +51 -0
- yomitoku/layout_parser.py +53 -3
- yomitoku/models/layers/rtdetr_backbone.py +5 -15
- yomitoku/models/layers/rtdetr_hybrid_encoder.py +6 -18
- yomitoku/models/layers/rtdetrv2_decoder.py +17 -42
- yomitoku/models/parseq.py +9 -9
- yomitoku/onnx/.gitkeep +0 -0
- yomitoku/postprocessor/rtdetr_postprocessor.py +4 -13
- yomitoku/table_structure_recognizer.py +57 -4
- yomitoku/text_detector.py +49 -4
- yomitoku/text_recognizer.py +58 -9
- {yomitoku-0.5.3.dist-info → yomitoku-0.6.0.dist-info}/METADATA +12 -7
- {yomitoku-0.5.3.dist-info → yomitoku-0.6.0.dist-info}/RECORD +17 -15
- {yomitoku-0.5.3.dist-info → yomitoku-0.6.0.dist-info}/WHEEL +0 -0
- {yomitoku-0.5.3.dist-info → yomitoku-0.6.0.dist-info}/entry_points.txt +0 -0
yomitoku/cli/main.py
CHANGED
```diff
@@ -104,6 +104,12 @@ def main():
         default="results",
         help="output directory",
     )
+    parser.add_argument(
+        "-l",
+        "--lite",
+        action="store_true",
+        help="if set, use lite model",
+    )
     parser.add_argument(
         "-d",
         "--device",
@@ -197,6 +203,15 @@ def main():
         },
     }
 
+    if args.lite:
+        configs["ocr"]["text_recognizer"]["model_name"] = "parseq-small"
+        configs["ocr"]["text_detector"]["infer_onnx"] = True
+
+        # Note: Text Detector以外はONNX推論よりもPyTorch推論の方が速いため、ONNX推論は行わない
+        # configs["ocr"]["text_recognizer"]["infer_onnx"] = True
+        # configs["layout_analyzer"]["table_structure_recognizer"]["infer_onnx"] = True
+        # configs["layout_analyzer"]["layout_parser"]["infer_onnx"] = True
+
     analyzer = DocumentAnalyzer(
         configs=configs,
         visualize=args.vis,
```
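
The `--lite` switch therefore only rewrites two config entries before `DocumentAnalyzer` is constructed. Below is a minimal sketch of driving the same lite setup from Python, assuming the config keys shown in the hunk above; the `from yomitoku import DocumentAnalyzer` import path and the three-value return are assumptions about the package's public API rather than something this diff shows.

```python
# Sketch: programmatic equivalent of `yomitoku <input> --lite` (assumed API).
import cv2
from yomitoku import DocumentAnalyzer  # assumed import path

configs = {
    "ocr": {
        "text_recognizer": {"model_name": "parseq-small"},  # lite recognizer (see config below)
        "text_detector": {"infer_onnx": True},              # detector runs via onnxruntime
    }
}

analyzer = DocumentAnalyzer(configs=configs, visualize=False, device="cuda")
img = cv2.imread("sample.jpg")                # placeholder input image
results, ocr_vis, layout_vis = analyzer(img)  # assumed return signature
```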
yomitoku/configs/__init__.py
CHANGED
```diff
@@ -4,10 +4,12 @@ from .cfg_table_structure_recognizer_rtdtrv2 import (
 )
 from .cfg_text_detector_dbnet import TextDetectorDBNetConfig
 from .cfg_text_recognizer_parseq import TextRecognizerPARSeqConfig
+from .cfg_text_recognizer_parseq_small import TextRecognizerPARSeqSmallConfig
 
 __all__ = [
     "TextDetectorDBNetConfig",
     "TextRecognizerPARSeqConfig",
     "LayoutParserRTDETRv2Config",
     "TableStructureRecognizerRTDETRv2Config",
+    "TextRecognizerPARSeqSmallConfig",
 ]
```
yomitoku/configs/cfg_text_recognizer_parseq_small.py
ADDED
```diff
@@ -0,0 +1,51 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from ..constants import ROOT_DIR
+
+
+@dataclass
+class Data:
+    num_workers: int = 4
+    batch_size: int = 128
+    img_size: List[int] = field(default_factory=lambda: [32, 800])
+
+
+@dataclass
+class Encoder:
+    patch_size: List[int] = field(default_factory=lambda: [16, 16])
+    num_heads: int = 8
+    embed_dim: int = 384
+    mlp_ratio: int = 4
+    depth: int = 9
+
+
+@dataclass
+class Decoder:
+    embed_dim: int = 384
+    num_heads: int = 8
+    mlp_ratio: int = 4
+    depth: int = 1
+
+
+@dataclass
+class Visualize:
+    font: str = str(ROOT_DIR + "/resource/MPLUS1p-Medium.ttf")
+    color: List[int] = field(default_factory=lambda: [0, 0, 255])  # RGB
+    font_size: int = 18
+
+
+@dataclass
+class TextRecognizerPARSeqSmallConfig:
+    hf_hub_repo: str = "KotaroKinoshita/yomitoku-text-recognizer-parseq-small-open-beta"
+    charset: str = str(ROOT_DIR + "/resource/charset.txt")
+    num_tokens: int = 7312
+    max_label_length: int = 100
+    decode_ar: int = 1
+    refine_iters: int = 1
+
+    data: Data = field(default_factory=Data)
+    encoder: Encoder = field(default_factory=Encoder)
+    decoder: Decoder = field(default_factory=Decoder)
+
+    visualize: Visualize = field(default_factory=Visualize)
```
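
The new config defines a 9-layer encoder and a 1-layer decoder at `embed_dim` 384 and points at the `parseq-small` weights on Hugging Face Hub. A small sketch of reading those values back, assuming the config is importable from `yomitoku.configs` as the `__init__.py` hunk above suggests:

```python
# Sketch: inspecting the new lite recognizer config (values from the file above).
from dataclasses import asdict

from yomitoku.configs import TextRecognizerPARSeqSmallConfig

cfg = TextRecognizerPARSeqSmallConfig()
print(cfg.hf_hub_repo)                        # KotaroKinoshita/yomitoku-text-recognizer-parseq-small-open-beta
print(cfg.encoder.depth, cfg.decoder.depth)   # 9 1
print(cfg.data.img_size)                      # [32, 800]
print(asdict(cfg)["encoder"])                 # nested dataclasses flatten to a plain dict
```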
yomitoku/layout_parser.py
CHANGED
```diff
@@ -1,11 +1,16 @@
 from typing import List, Union
 
 import cv2
+import os
+import onnx
+import onnxruntime
 import torch
 import torchvision.transforms as T
 from PIL import Image
 from pydantic import conlist
 
+from .constants import ROOT_DIR
+
 from .base import BaseModelCatalog, BaseModule, BaseSchema
 from .configs import LayoutParserRTDETRv2Config
 from .models import RTDETRv2
@@ -91,6 +96,7 @@ class LayoutParser(BaseModule):
         device="cuda",
         visualize=False,
         from_pretrained=True,
+        infer_onnx=False,
     ):
         super().__init__()
         self.load_model(model_name, path_cfg, from_pretrained)
@@ -119,11 +125,44 @@ class LayoutParser(BaseModule):
         }
 
         self.role = self._cfg.role
+        self.infer_onnx = infer_onnx
+        if infer_onnx:
+            name = self._cfg.hf_hub_repo.split("/")[-1]
+            path_onnx = f"{ROOT_DIR}/onnx/{name}.onnx"
+            if not os.path.exists(path_onnx):
+                self.convert_onnx(path_onnx)
+
+            model = onnx.load(path_onnx)
+            if torch.cuda.is_available() and device == "cuda":
+                self.sess = onnxruntime.InferenceSession(
+                    model.SerializeToString(), providers=["CUDAExecutionProvider"]
+                )
+            else:
+                self.sess = onnxruntime.InferenceSession(model.SerializeToString())
+
+    def convert_onnx(self, path_onnx):
+        dynamic_axes = {
+            "input": {0: "batch_size"},
+            "output": {0: "batch_size"},
+        }
+
+        img_size = self._cfg.data.img_size
+        dummy_input = torch.randn(1, 3, *img_size, requires_grad=True)
+
+        torch.onnx.export(
+            self.model,
+            dummy_input,
+            path_onnx,
+            opset_version=16,
+            input_names=["input"],
+            output_names=["pred_logits", "pred_boxes"],
+            dynamic_axes=dynamic_axes,
+        )
 
     def preprocess(self, img):
         cv_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
         img = Image.fromarray(cv_img)
-        img_tensor = self.transforms(img)[None]
+        img_tensor = self.transforms(img)[None]
         return img_tensor
 
     def postprocess(self, preds, image_size):
@@ -175,8 +214,19 @@ class LayoutParser(BaseModule):
         ori_h, ori_w = img.shape[:2]
         img_tensor = self.preprocess(img)
 
-
-
+        if self.infer_onnx:
+            input = img_tensor.numpy()
+            results = self.sess.run(None, {"input": input})
+            preds = {
+                "pred_logits": torch.tensor(results[0]).to(self.device),
+                "pred_boxes": torch.tensor(results[1]).to(self.device),
+            }
+
+        else:
+            with torch.inference_mode():
+                img_tensor = img_tensor.to(self.device)
+                preds = self.model(img_tensor)
+
         results = self.postprocess(preds, (ori_h, ori_w))
 
         vis = None
```
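
The same export-once-then-run pattern (write `{ROOT_DIR}/onnx/<hf_hub_repo name>.onnx` on first use, then prefer CUDAExecutionProvider when CUDA is available) reappears in the table structure recognizer, text detector, and text recognizer below. The following is a self-contained sketch of that mechanism with a toy model instead of RT-DETRv2, so the moving parts can be run in isolation:

```python
# Minimal sketch of the export-once / run-with-onnxruntime pattern used above,
# with a toy torch model rather than yomitoku's RT-DETRv2.
import os

import onnx
import onnxruntime
import torch
import torch.nn as nn

model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(), nn.Conv2d(8, 2, 1))
model.eval()

path_onnx = "toy_model.onnx"
if not os.path.exists(path_onnx):
    dummy_input = torch.randn(1, 3, 640, 640)
    torch.onnx.export(
        model,
        dummy_input,
        path_onnx,
        opset_version=16,
        input_names=["input"],
        output_names=["output"],
        dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},
    )

proto = onnx.load(path_onnx)
providers = (
    ["CUDAExecutionProvider"]
    if "CUDAExecutionProvider" in onnxruntime.get_available_providers()
    else ["CPUExecutionProvider"]
)
sess = onnxruntime.InferenceSession(proto.SerializeToString(), providers=providers)

out = sess.run(None, {"input": torch.randn(2, 3, 640, 640).numpy()})[0]
print(out.shape)  # (2, 2, 640, 640) -- the batch axis is dynamic
```

yomitoku's modules gate the provider choice on `torch.cuda.is_available()` and the `device` argument instead; checking `onnxruntime.get_available_providers()` as above is simply a convenient way to keep this toy runnable on CPU-only installs.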
yomitoku/models/layers/rtdetr_backbone.py
CHANGED
```diff
@@ -59,9 +59,7 @@ class ConvNormLayer(nn.Module):
 class BasicBlock(nn.Module):
     expansion = 1
 
-    def __init__(
-        self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
-    ):
+    def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
         super().__init__()
 
         self.shortcut = shortcut
@@ -100,9 +98,7 @@ class BasicBlock(nn.Module):
 class BottleNeck(nn.Module):
     expansion = 4
 
-    def __init__(
-        self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
-    ):
+    def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
         super().__init__()
 
         if variant == "a":
@@ -125,17 +121,13 @@ class BottleNeck(nn.Module):
                         ("pool", nn.AvgPool2d(2, 2, 0, ceil_mode=True)),
                         (
                             "conv",
-                            ConvNormLayer(
-                                ch_in, ch_out * self.expansion, 1, 1
-                            ),
+                            ConvNormLayer(ch_in, ch_out * self.expansion, 1, 1),
                         ),
                     ]
                 )
             )
         else:
-            self.short = ConvNormLayer(
-                ch_in, ch_out * self.expansion, 1, stride
-            )
+            self.short = ConvNormLayer(ch_in, ch_out * self.expansion, 1, stride)
 
         self.act = nn.Identity() if act is None else get_activation(act)
 
@@ -156,9 +148,7 @@ class BottleNeck(nn.Module):
 
 
 class Blocks(nn.Module):
-    def __init__(
-        self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"
-    ):
+    def __init__(self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"):
         super().__init__()
 
         self.blocks = nn.ModuleList()
```
yomitoku/models/layers/rtdetr_hybrid_encoder.py
CHANGED
```diff
@@ -252,9 +252,7 @@ class HybridEncoder(nn.Module):
         for in_channel in in_channels:
             if version == "v1":
                 proj = nn.Sequential(
-                    nn.Conv2d(
-                        in_channel, hidden_dim, kernel_size=1, bias=False
-                    ),
+                    nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False),
                     nn.BatchNorm2d(hidden_dim),
                 )
             elif version == "v2":
@@ -290,9 +288,7 @@ class HybridEncoder(nn.Module):
 
         self.encoder = nn.ModuleList(
             [
-                TransformerEncoder(
-                    copy.deepcopy(encoder_layer), num_encoder_layers
-                )
+                TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers)
                 for _ in range(len(use_encoder_idx))
             ]
         )
@@ -347,9 +343,7 @@ class HybridEncoder(nn.Module):
         # self.register_buffer(f'pos_embed{idx}', pos_embed)
 
     @staticmethod
-    def build_2d_sincos_position_embedding(
-        w, h, embed_dim=256, temperature=10000.0
-    ):
+    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
         """ """
         grid_w = torch.arange(int(w), dtype=torch.float32)
         grid_h = torch.arange(int(h), dtype=torch.float32)
@@ -387,9 +381,7 @@ class HybridEncoder(nn.Module):
                     src_flatten.device
                 )
 
-                memory: torch.Tensor = self.encoder[i](
-                    src_flatten, pos_embed=pos_embed
-                )
+                memory: torch.Tensor = self.encoder[i](src_flatten, pos_embed=pos_embed)
                 proj_feats[enc_ind] = (
                     memory.permute(0, 2, 1)
                     .reshape(-1, self.hidden_dim, h, w)
@@ -401,13 +393,9 @@ class HybridEncoder(nn.Module):
         for idx in range(len(self.in_channels) - 1, 0, -1):
             feat_heigh = inner_outs[0]
             feat_low = proj_feats[idx - 1]
-            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](
-                feat_heigh
-            )
+            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_heigh)
             inner_outs[0] = feat_heigh
-            upsample_feat = F.interpolate(
-                feat_heigh, scale_factor=2.0, mode="nearest"
-            )
+            upsample_feat = F.interpolate(feat_heigh, scale_factor=2.0, mode="nearest")
             inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](
                 torch.concat([upsample_feat, feat_low], dim=1)
             )
```
yomitoku/models/layers/rtdetrv2_decoder.py
CHANGED
```diff
@@ -40,9 +40,7 @@ def inverse_sigmoid(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
 
 
 class MLP(nn.Module):
-    def __init__(
-        self, input_dim, hidden_dim, output_dim, num_layers, act="relu"
-    ):
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act="relu"):
         super().__init__()
         self.num_layers = num_layers
         h = [hidden_dim] * (num_layers - 1)
@@ -193,9 +191,7 @@ class MSDeformableAttention(nn.Module):
         elif reference_points.shape[-1] == 4:
             # reference_points [8, 480, None, 1, 4]
             # sampling_offsets [8, 480, 8, 12, 2]
-            num_points_scale = self.num_points_scale.to(
-                dtype=query.dtype
-            ).unsqueeze(-1)
+            num_points_scale = self.num_points_scale.to(dtype=query.dtype).unsqueeze(-1)
             offset = (
                 sampling_offsets
                 * num_points_scale
@@ -330,9 +326,7 @@ def deformable_attention_core_func_v2(
     _, Len_q, _, _, _ = sampling_locations.shape
 
     split_shape = [h * w for h, w in value_spatial_shapes]
-    value_list = (
-        value.permute(0, 2, 3, 1).flatten(0, 1).split(split_shape, dim=-1)
-    )
+    value_list = value.permute(0, 2, 3, 1).flatten(0, 1).split(split_shape, dim=-1)
 
     # sampling_offsets [8, 480, 8, 12, 2]
     if method == "default":
@@ -361,8 +355,7 @@ def deformable_attention_core_func_v2(
     elif method == "discrete":
         # n * m, seq, n, 2
         sampling_coord = (
-            sampling_grid_l * torch.tensor([[w, h]], device=value.device)
-            + 0.5
+            sampling_grid_l * torch.tensor([[w, h]], device=value.device) + 0.5
         ).to(torch.int64)
 
         # FIX ME? for rectangle input
@@ -389,9 +382,7 @@ def deformable_attention_core_func_v2(
     attn_weights = attention_weights.permute(0, 2, 1, 3).reshape(
         bs * n_head, 1, Len_q, sum(num_points_list)
    )
-    weighted_sample_locs = (
-        torch.concat(sampling_value_list, dim=-1) * attn_weights
-    )
+    weighted_sample_locs = torch.concat(sampling_value_list, dim=-1) * attn_weights
     output = weighted_sample_locs.sum(-1).reshape(bs, n_head * c, Len_q)
 
     return output.permute(0, 2, 1)
@@ -606,9 +597,7 @@ class RTDETRTransformerv2(nn.Module):
                     [
                         (
                             "conv",
-                            nn.Conv2d(
-                                in_channels, self.hidden_dim, 1, bias=False
-                            ),
+                            nn.Conv2d(in_channels, self.hidden_dim, 1, bias=False),
                         ),
                         (
                             "norm",
@@ -689,13 +678,9 @@ class RTDETRTransformerv2(nn.Module):
                 torch.arange(h), torch.arange(w), indexing="ij"
             )
             grid_xy = torch.stack([grid_x, grid_y], dim=-1)
-            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / torch.tensor(
-                [w, h], dtype=dtype
-            )
+            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / torch.tensor([w, h], dtype=dtype)
             wh = torch.ones_like(grid_xy) * grid_size * (2.0**lvl)
-            lvl_anchors = torch.concat([grid_xy, wh], dim=-1).reshape(
-                -1, h * w, 4
-            )
+            lvl_anchors = torch.concat([grid_xy, wh], dim=-1).reshape(-1, h * w, 4)
             anchors.append(lvl_anchors)
 
         anchors = torch.concat(anchors, dim=1).to(device)
@@ -729,22 +714,18 @@ class RTDETRTransformerv2(nn.Module):
         )
 
         enc_topk_bboxes_list, enc_topk_logits_list = [], []
-        enc_topk_memory, enc_topk_logits, enc_topk_bbox_unact = (
-
-
-
-
-            self.num_queries,
-        )
+        enc_topk_memory, enc_topk_logits, enc_topk_bbox_unact = self._select_topk(
+            output_memory,
+            enc_outputs_logits,
+            enc_outputs_coord_unact,
+            self.num_queries,
         )
 
         # if self.num_select_queries != self.num_queries:
         #     raise NotImplementedError('')
 
         if self.learn_query_content:
-            content = self.tgt_embed.weight.unsqueeze(0).tile(
-                [memory.shape[0], 1, 1]
-            )
+            content = self.tgt_embed.weight.unsqueeze(0).tile([memory.shape[0], 1, 1])
         else:
             content = enc_topk_memory.detach()
 
@@ -771,9 +752,7 @@ class RTDETRTransformerv2(nn.Module):
         topk: int,
     ):
         if self.query_select_method == "default":
-            _, topk_ind = torch.topk(
-                outputs_logits.max(-1).values, topk, dim=-1
-            )
+            _, topk_ind = torch.topk(outputs_logits.max(-1).values, topk, dim=-1)
 
         elif self.query_select_method == "one2many":
             _, topk_ind = torch.topk(outputs_logits.flatten(1), topk, dim=-1)
@@ -786,16 +765,12 @@ class RTDETRTransformerv2(nn.Module):
 
         topk_coords = outputs_coords_unact.gather(
             dim=1,
-            index=topk_ind.unsqueeze(-1).repeat(
-                1, 1, outputs_coords_unact.shape[-1]
-            ),
+            index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_coords_unact.shape[-1]),
         )
 
         topk_logits = outputs_logits.gather(
             dim=1,
-            index=topk_ind.unsqueeze(-1).repeat(
-                1, 1, outputs_logits.shape[-1]
-            ),
+            index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_logits.shape[-1]),
         )
 
         topk_memory = memory.gather(
```
yomitoku/models/parseq.py
CHANGED
```diff
@@ -22,7 +22,6 @@ from huggingface_hub import PyTorchModelHubMixin
 from timm.models.helpers import named_apply
 from torch import Tensor
 
-from ..postprocessor import ParseqTokenizer as Tokenizer
 from .layers.parseq_transformer import Decoder, Encoder, TokenEmbedding
 
 
@@ -123,7 +122,6 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
 
     def forward(
         self,
-        tokenizer: Tokenizer,
         images: Tensor,
         max_length: Optional[int] = None,
     ) -> Tensor:
@@ -150,11 +148,11 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
         if self.decode_ar:
             tgt_in = torch.full(
                 (bs, num_steps),
-                tokenizer.pad_id,
+                self.tokenizer.pad_id,
                 dtype=torch.long,
                 device=self._device,
             )
-            tgt_in[:, 0] = tokenizer.bos_id
+            tgt_in[:, 0] = self.tokenizer.bos_id
 
             logits = []
             for i in range(num_steps):
@@ -177,7 +175,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
                     # greedy decode. add the next token index to the target input
                     tgt_in[:, j] = p_i.squeeze().argmax(-1)
                     # Efficient batch decoding: If all output words have at least one EOS token, end decoding.
-                    if testing and (tgt_in == tokenizer.eos_id).any(dim=-1).all():
+                    if testing and (tgt_in == self.tokenizer.eos_id).any(dim=-1).all():
                         break
 
             logits = torch.cat(logits, dim=1)
@@ -185,7 +183,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
             # No prior context, so input is just <bos>. We query all positions.
             tgt_in = torch.full(
                 (bs, 1),
-                tokenizer.bos_id,
+                self.tokenizer.bos_id,
                 dtype=torch.long,
                 device=self._device,
             )
@@ -200,7 +198,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
                 torch.ones(
                     num_steps,
                     num_steps,
-                    dtype=torch.
+                    dtype=torch.int64,
                     device=self._device,
                 ),
                 2,
@@ -208,7 +206,7 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
             ] = 0
             bos = torch.full(
                 (bs, 1),
-                tokenizer.bos_id,
+                self.tokenizer.bos_id,
                 dtype=torch.long,
                 device=self._device,
             )
@@ -216,7 +214,9 @@ class PARSeq(nn.Module, PyTorchModelHubMixin):
                 # Prior context is the previous output.
                 tgt_in = torch.cat([bos, logits[:, :-1].argmax(-1)], dim=1)
                 # Mask tokens beyond the first EOS token.
-                tgt_padding_mask = (tgt_in == tokenizer.eos_id).int().cumsum(
+                tgt_padding_mask = (tgt_in == self.tokenizer.eos_id).int().cumsum(
+                    -1
+                ) > 0
                 tgt_out = self.decode(
                     tgt_in,
                     memory,
```
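
The net effect of these hunks, together with `self.model.tokenizer = self.tokenizer` in text_recognizer.py below, is that the tokenizer becomes a model attribute instead of a `forward()` argument, leaving `forward()` with tensor-only inputs (the kind of signature `torch.onnx.export` can trace). A hedged sketch of that shape of change, using hypothetical `Dummy*` classes rather than yomitoku's own code:

```python
# Sketch: why attaching the tokenizer as an attribute matters for export.
# DummyTokenizer / DummyRecognizer are hypothetical stand-ins, not yomitoku code;
# they only mirror the shape of the change in this hunk.
import torch
import torch.nn as nn


class DummyTokenizer:
    pad_id, bos_id, eos_id = 0, 1, 2


class DummyRecognizer(nn.Module):
    def __init__(self):
        super().__init__()
        self.tokenizer = None          # set from outside, as TextRecognizer now does
        self.proj = nn.Linear(8, 4)

    def forward(self, images: torch.Tensor) -> torch.Tensor:
        # forward() takes tensors only, so ONNX export / tracing can handle it;
        # the old signature forward(tokenizer, images) could not be traced.
        return self.proj(images)


model = DummyRecognizer()
model.tokenizer = DummyTokenizer()     # 0.6.0 style: attribute, not a forward() argument
logits = model(torch.randn(2, 8))
print(logits.shape)                    # torch.Size([2, 4])
```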
yomitoku/onnx/.gitkeep
ADDED
File without changes
yomitoku/postprocessor/rtdetr_postprocessor.py
CHANGED
```diff
@@ -54,16 +54,12 @@ class RTDETRPostProcessor(nn.Module):
         logits, boxes = outputs["pred_logits"], outputs["pred_boxes"]
         # orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
 
-        bbox_pred = torchvision.ops.box_convert(
-            boxes, in_fmt="cxcywh", out_fmt="xyxy"
-        )
+        bbox_pred = torchvision.ops.box_convert(boxes, in_fmt="cxcywh", out_fmt="xyxy")
         bbox_pred *= orig_target_sizes.repeat(1, 2).unsqueeze(1)
 
         if self.use_focal_loss:
             scores = F.sigmoid(logits)
-            scores, index = torch.topk(
-                scores.flatten(1), self.num_top_queries, dim=-1
-            )
+            scores, index = torch.topk(scores.flatten(1), self.num_top_queries, dim=-1)
             # TODO for older tensorrt
             # labels = index % self.num_classes
             labels = mod(index, self.num_classes)
@@ -77,9 +73,7 @@ class RTDETRPostProcessor(nn.Module):
             scores = F.softmax(logits)[:, :, :-1]
             scores, labels = scores.max(dim=-1)
             if scores.shape[1] > self.num_top_queries:
-                scores, index = torch.topk(
-                    scores, self.num_top_queries, dim=-1
-                )
+                scores, index = torch.topk(scores, self.num_top_queries, dim=-1)
                 labels = torch.gather(labels, dim=1, index=index)
                 boxes = torch.gather(
                     boxes,
@@ -97,10 +91,7 @@ class RTDETRPostProcessor(nn.Module):
 
             labels = (
                 torch.tensor(
-                    [
-                        mscoco_label2category[int(x.item())]
-                        for x in labels.flatten()
-                    ]
+                    [mscoco_label2category[int(x.item())] for x in labels.flatten()]
                 )
                 .to(boxes.device)
                 .reshape(labels.shape)
```
yomitoku/table_structure_recognizer.py
CHANGED
```diff
@@ -1,11 +1,16 @@
 from typing import List, Union
 
 import cv2
+import os
+import onnx
+import onnxruntime
 import torch
 import torchvision.transforms as T
 from PIL import Image
 from pydantic import conlist
 
+from .constants import ROOT_DIR
+
 from .base import BaseModelCatalog, BaseModule, BaseSchema
 from .configs import TableStructureRecognizerRTDETRv2Config
 from .layout_parser import filter_contained_rectangles_within_category
@@ -109,12 +114,13 @@ class TableStructureRecognizer(BaseModule):
         device="cuda",
         visualize=False,
         from_pretrained=True,
+        infer_onnx=False,
     ):
         super().__init__()
         self.load_model(
             model_name,
             path_cfg,
-            from_pretrained=
+            from_pretrained=from_pretrained,
         )
         self.device = device
         self.visualize = visualize
@@ -127,6 +133,8 @@ class TableStructureRecognizer(BaseModule):
             num_top_queries=self._cfg.RTDETRTransformerv2.num_queries,
         )
 
+        self.save_config("table_structure_recognitizer.yaml")
+
         self.transforms = T.Compose(
             [
                 T.Resize(self._cfg.data.img_size),
@@ -140,6 +148,40 @@ class TableStructureRecognizer(BaseModule):
             id: category for id, category in enumerate(self._cfg.category)
         }
 
+        self.infer_onnx = infer_onnx
+        if infer_onnx:
+            name = self._cfg.hf_hub_repo.split("/")[-1]
+            path_onnx = f"{ROOT_DIR}/onnx/{name}.onnx"
+            if not os.path.exists(path_onnx):
+                self.convert_onnx(path_onnx)
+
+            model = onnx.load(path_onnx)
+            if torch.cuda.is_available() and device == "cuda":
+                self.sess = onnxruntime.InferenceSession(
+                    model.SerializeToString(), providers=["CUDAExecutionProvider"]
+                )
+            else:
+                self.sess = onnxruntime.InferenceSession(model.SerializeToString())
+
+    def convert_onnx(self, path_onnx):
+        dynamic_axes = {
+            "input": {0: "batch_size"},
+            "output": {0: "batch_size"},
+        }
+
+        img_size = self._cfg.data.img_size
+        dummy_input = torch.randn(1, 3, *img_size, requires_grad=True)
+
+        torch.onnx.export(
+            self.model,
+            dummy_input,
+            path_onnx,
+            opset_version=16,
+            input_names=["input"],
+            output_names=["pred_logits", "pred_boxes"],
+            dynamic_axes=dynamic_axes,
+        )
+
     def preprocess(self, img, boxes):
         cv_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
 
@@ -149,7 +191,7 @@ class TableStructureRecognizer(BaseModule):
             table_img = cv_img[y1:y2, x1:x2, :]
             th, hw = table_img.shape[:2]
             table_img = Image.fromarray(table_img)
-            img_tensor = self.transforms(table_img)[None]
+            img_tensor = self.transforms(table_img)[None]
             table_imgs.append(
                 {
                     "tensor": img_tensor,
@@ -226,8 +268,19 @@ class TableStructureRecognizer(BaseModule):
         img_tensors = self.preprocess(img, table_boxes)
         outputs = []
         for data in img_tensors:
-
-
+            if self.infer_onnx:
+                input = data["tensor"].numpy()
+                results = self.sess.run(None, {"input": input})
+                pred = {
+                    "pred_logits": torch.tensor(results[0]).to(self.device),
+                    "pred_boxes": torch.tensor(results[1]).to(self.device),
+                }
+
+            else:
+                with torch.inference_mode():
+                    data["tensor"] = data["tensor"].to(self.device)
+                    pred = self.model(data["tensor"])
+
             table = self.postprocess(pred, data)
             outputs.append(table)
 
```
yomitoku/text_detector.py
CHANGED
```diff
@@ -2,6 +2,7 @@ from typing import List
 
 import numpy as np
 import torch
+import os
 from pydantic import conlist
 
 from .base import BaseModelCatalog, BaseModule, BaseSchema
@@ -14,6 +15,10 @@ from .data.functions import (
 from .models import DBNet
 from .postprocessor import DBnetPostProcessor
 from .utils.visualizer import det_visualizer
+from .constants import ROOT_DIR
+
+import onnx
+import onnxruntime
 
 
 class TextDetectorModelCatalog(BaseModelCatalog):
@@ -43,12 +48,13 @@ class TextDetector(BaseModule):
         device="cuda",
         visualize=False,
         from_pretrained=True,
+        infer_onnx=False,
     ):
         super().__init__()
         self.load_model(
             model_name,
             path_cfg,
-            from_pretrained=
+            from_pretrained=from_pretrained,
         )
 
         self.device = device
@@ -58,6 +64,39 @@ class TextDetector(BaseModule):
         self.model.to(self.device)
 
         self.post_processor = DBnetPostProcessor(**self._cfg.post_process)
+        self.infer_onnx = infer_onnx
+
+        if infer_onnx:
+            name = self._cfg.hf_hub_repo.split("/")[-1]
+            path_onnx = f"{ROOT_DIR}/onnx/{name}.onnx"
+            if not os.path.exists(path_onnx):
+                self.convert_onnx(path_onnx)
+
+            model = onnx.load(path_onnx)
+            if torch.cuda.is_available() and device == "cuda":
+                self.sess = onnxruntime.InferenceSession(
+                    model.SerializeToString(), providers=["CUDAExecutionProvider"]
+                )
+            else:
+                self.sess = onnxruntime.InferenceSession(model.SerializeToString())
+
+    def convert_onnx(self, path_onnx):
+        dynamic_axes = {
+            "input": {0: "batch_size", 2: "height", 3: "width"},
+            "output": {0: "batch_size", 2: "height", 3: "width"},
+        }
+
+        dummy_input = torch.randn(1, 3, 256, 256, requires_grad=True)
+
+        torch.onnx.export(
+            self.model,
+            dummy_input,
+            path_onnx,
+            opset_version=14,
+            input_names=["input"],
+            output_names=["output"],
+            dynamic_axes=dynamic_axes,
+        )
 
     def preprocess(self, img):
         img = img.copy()
@@ -81,9 +120,15 @@ class TextDetector(BaseModule):
 
         ori_h, ori_w = img.shape[:2]
         tensor = self.preprocess(img)
-
-
-
+
+        if self.infer_onnx:
+            input = tensor.numpy()
+            results = self.sess.run(["output"], {"input": input})
+            preds = {"binary": torch.tensor(results[0])}
+        else:
+            with torch.inference_mode():
+                tensor = tensor.to(self.device)
+                preds = self.model(tensor)
 
         quads, scores = self.postprocess(preds, (ori_h, ori_w))
         outputs = {"points": quads, "scores": scores}
```
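
Unlike the layout parser and table recognizer exports above (batch axis only), the detector export also declares height and width as dynamic, since DBNet runs on whole pages of arbitrary size. Below is a small, self-contained sketch of that kind of export using a toy fully-convolutional model, not yomitoku's DBNet:

```python
# Sketch: exporting with dynamic spatial axes, as the detector does above.
import torch
import torch.nn as nn
import onnxruntime

net = nn.Sequential(nn.Conv2d(3, 4, 3, padding=1), nn.Sigmoid())
net.eval()

torch.onnx.export(
    net,
    torch.randn(1, 3, 256, 256),   # the dummy size only fixes the trace input
    "toy_detector.onnx",
    opset_version=14,
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={
        "input": {0: "batch_size", 2: "height", 3: "width"},
        "output": {0: "batch_size", 2: "height", 3: "width"},
    },
)

sess = onnxruntime.InferenceSession("toy_detector.onnx")
# A page size different from the dummy input still runs, thanks to the dynamic axes.
out = sess.run(["output"], {"input": torch.randn(1, 3, 480, 640).numpy()})[0]
print(out.shape)  # (1, 4, 480, 640)
```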
yomitoku/text_recognizer.py
CHANGED
```diff
@@ -2,22 +2,28 @@ from typing import List
 
 import numpy as np
 import torch
+import os
 import unicodedata
 from pydantic import conlist
 
 from .base import BaseModelCatalog, BaseModule, BaseSchema
-from .configs import TextRecognizerPARSeqConfig
+from .configs import TextRecognizerPARSeqConfig, TextRecognizerPARSeqSmallConfig
 from .data.dataset import ParseqDataset
 from .models import PARSeq
 from .postprocessor import ParseqTokenizer as Tokenizer
 from .utils.misc import load_charset
 from .utils.visualizer import rec_visualizer
 
+from .constants import ROOT_DIR
+import onnx
+import onnxruntime
+
 
 class TextRecognizerModelCatalog(BaseModelCatalog):
     def __init__(self):
         super().__init__()
         self.register("parseq", TextRecognizerPARSeqConfig, PARSeq)
+        self.register("parseq-small", TextRecognizerPARSeqSmallConfig, PARSeq)
 
 
 class TextRecognizerSchema(BaseSchema):
@@ -43,23 +49,41 @@ class TextRecognizer(BaseModule):
         device="cuda",
         visualize=False,
         from_pretrained=True,
+        infer_onnx=False,
     ):
         super().__init__()
         self.load_model(
             model_name,
             path_cfg,
-            from_pretrained=
+            from_pretrained=from_pretrained,
         )
         self.charset = load_charset(self._cfg.charset)
         self.tokenizer = Tokenizer(self.charset)
 
         self.device = device
 
+        self.model.tokenizer = self.tokenizer
         self.model.eval()
         self.model.to(self.device)
 
         self.visualize = visualize
 
+        self.infer_onnx = infer_onnx
+
+        if infer_onnx:
+            name = self._cfg.hf_hub_repo.split("/")[-1]
+            path_onnx = f"{ROOT_DIR}/onnx/{name}.onnx"
+            if not os.path.exists(path_onnx):
+                self.convert_onnx(path_onnx)
+
+            model = onnx.load(path_onnx)
+            if torch.cuda.is_available() and device == "cuda":
+                self.sess = onnxruntime.InferenceSession(
+                    model.SerializeToString(), providers=["CUDAExecutionProvider"]
+                )
+            else:
+                self.sess = onnxruntime.InferenceSession(model.SerializeToString())
+
     def preprocess(self, img, polygons):
         dataset = ParseqDataset(self._cfg, img, polygons)
         dataloader = torch.utils.data.DataLoader(
@@ -71,6 +95,25 @@ class TextRecognizer(BaseModule):
 
         return dataloader
 
+    def convert_onnx(self, path_onnx):
+        img_size = self._cfg.data.img_size
+        input = torch.randn(1, 3, *img_size, requires_grad=True)
+        dynamic_axes = {
+            "input": {0: "batch_size"},
+            "output": {0: "batch_size"},
+        }
+
+        torch.onnx.export(
+            self.model,
+            input,
+            path_onnx,
+            opset_version=14,
+            input_names=["input"],
+            output_names=["output"],
+            do_constant_folding=True,
+            dynamic_axes=dynamic_axes,
+        )
+
     def postprocess(self, p, points):
         pred, score = self.tokenizer.decode(p)
         pred = [unicodedata.normalize("NFKC", x) for x in pred]
@@ -101,13 +144,19 @@ class TextRecognizer(BaseModule):
         scores = []
         directions = []
         for data in dataloader:
-
-
-
-
-
-
-
+            if self.infer_onnx:
+                input = data.numpy()
+                results = self.sess.run(["output"], {"input": input})
+                p = torch.tensor(results[0])
+            else:
+                with torch.inference_mode():
+                    data = data.to(self.device)
+                    p = self.model(data).softmax(-1)
+
+            pred, score, direction = self.postprocess(p, points)
+            preds.extend(pred)
+            scores.extend(score)
+            directions.extend(direction)
 
         outputs = {
             "contents": preds,
```
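
Putting the recognizer pieces together: the constructor now accepts `infer_onnx`, and the catalog accepts `"parseq-small"`. A sketch of constructing it directly, assuming the keyword names shown in `__init__` above and a defaultable `model_name` parameter; the first instantiation downloads the weights and, with `infer_onnx=True`, writes the exported model into `yomitoku/onnx/` (the directory reserved by the new `.gitkeep`) for reuse on later runs:

```python
# Sketch (assumed constructor keywords; not verified beyond this diff).
from yomitoku.text_recognizer import TextRecognizer

recognizer = TextRecognizer(
    model_name="parseq-small",  # registered in TextRecognizerModelCatalog above
    device="cpu",
    visualize=False,
    infer_onnx=True,            # run through onnxruntime instead of PyTorch
)
print(recognizer.infer_onnx)    # True
print(recognizer.sess)          # onnxruntime InferenceSession built from the exported model
```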
{yomitoku-0.5.3.dist-info → yomitoku-0.6.0.dist-info}/METADATA
CHANGED
````diff
@@ -1,14 +1,17 @@
 Metadata-Version: 2.3
 Name: yomitoku
-Version: 0.
+Version: 0.6.0
 Summary: Yomitoku is an AI-powered document image analysis package designed specifically for the Japanese language.
 Author-email: Kotaro Kinoshita <kotaro.kinoshita@mlism.com>
 License: CC BY-NC-SA 4.0
 Keywords: Deep Learning,Japanese,OCR
-Requires-Python: <3.13,>=3.
+Requires-Python: <3.13,>=3.10
 Requires-Dist: huggingface-hub>=0.26.1
 Requires-Dist: lxml>=5.3.0
 Requires-Dist: omegaconf>=2.3.0
+Requires-Dist: onnx>=1.17.0
+Requires-Dist: onnxruntime-gpu>=1.20.1
+Requires-Dist: onnxruntime>=1.20.1
 Requires-Dist: opencv-python>=4.10.0.84
 Requires-Dist: pyclipper>=1.3.0.post6
 Requires-Dist: pydantic>=2.9.2
@@ -23,7 +26,7 @@ Description-Content-Type: text/markdown
 
 <img src="static/logo/horizontal.png" width="800px">
 
-
+
 
 
 
@@ -69,19 +72,20 @@ Markdown でエクスポートした結果は関してはリポジトリ内の[s
 pip install yomitoku
 ```
 
-- pytorch はご自身の CUDAのバージョンにあったものをインストールしてください。デフォルトではCUDA12.4以上に対応したものがインストールされます。
-- pytorch は2.5以上のバージョンに対応しています。その関係でCUDA11.8以上のバージョンが必要になります。対応できない場合は、リポジトリ内のDockerfileを利用してください。
+- pytorch はご自身の CUDA のバージョンにあったものをインストールしてください。デフォルトでは CUDA12.4 以上に対応したものがインストールされます。
+- pytorch は 2.5 以上のバージョンに対応しています。その関係で CUDA11.8 以上のバージョンが必要になります。対応できない場合は、リポジトリ内の Dockerfile を利用してください。
 
 ## 🚀 実行方法
 
 ```
-yomitoku ${path_data} -f md -o results -v --figure
+yomitoku ${path_data} -f md -o results -v --figure --lite
 ```
 
 - `${path_data}` 解析対象の画像が含まれたディレクトリか画像ファイルのパスを直接して指定してください。ディレクトリを対象とした場合はディレクトリのサブディレクトリ内の画像も含めて処理を実行します。
 - `-f`, `--format` 出力形式のファイルフォーマットを指定します。(json, csv, html, md をサポート)
 - `-o`, `--outdir` 出力先のディレクトリ名を指定します。存在しない場合は新規で作成されます。
 - `-v`, `--vis` を指定すると解析結果を可視化した画像を出力します。
+- `-l`, `--lite` を指定すると軽量モデルで推論を実行します。通常より高速に推論できますが、若干、精度が低下する可能性があります。
 - `-d`, `--device` モデルを実行するためのデバイスを指定します。gpu が利用できない場合は cpu で推論が実行されます。(デフォルト: cuda)
 - `--ignore_line_break` 画像の改行位置を無視して、段落内の文章を連結して返します。(デフォルト:画像通りの改行位置位置で改行します。)
 - `--figure_letter` 検出した図表に含まれる文字も出力ファイルにエクスポートします。
@@ -94,6 +98,7 @@ yomitoku --help
 ```
 
 **NOTE**
+
 - GPU での実行を推奨します。CPU を用いての推論向けに最適化されておらず、処理時間が長くなります。
 - 活字のみ識別をサポートしております。手書き文字に関しては、読み取れる場合もありますが、公式にはサポートしておりません。
 - Yomitoku は文書 OCR 向けに最適化されており、情景 OCR(看板など紙以外にプリントされた文字の読み取り)向けには最適化されていません。
@@ -107,6 +112,6 @@ yomitoku --help
 
 本リポジトリ内に格納されているソースコードおよび本プロジェクトに関連する HuggingFaceHub 上のモデルの重みファイルのライセンスは CC BY-NC-SA 4.0 に従います。
 非商用での個人利用、研究目的での利用はご自由にお使いください。
-
+商用目的での利用に関しては、別途、商用ライセンスを提供しますので、https://www.mlism.com/ にお問い合わせください。
 
 YomiToku © 2024 by Kotaro Kinoshita is licensed under CC BY-NC-SA 4.0. To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/
````
{yomitoku-0.5.3.dist-info → yomitoku-0.6.0.dist-info}/RECORD
CHANGED
```diff
@@ -3,19 +3,20 @@ yomitoku/base.py,sha256=lzR_V8t87aRasmFdFwD-8KAeSahSTI3AZaEn6g8sOv8,3871
 yomitoku/constants.py,sha256=zlW5QRc_u_F3C2RAgBFWyHJZexBnJT5N15GC-9d3iLo,686
 yomitoku/document_analyzer.py,sha256=HIg-nVzDhJIP-h-tn4uU86KakgHdlAhosEqK_i-SWe4,9906
 yomitoku/layout_analyzer.py,sha256=QTeRcVd8aySz8u6dg2ikET77ar3sqlukRLBwYfTyMPM,2033
-yomitoku/layout_parser.py,sha256=
+yomitoku/layout_parser.py,sha256=Yni1C_7j4fzHcdmBNNGRZPc23W_6J6HwPPQVjYvaztM,7539
 yomitoku/ocr.py,sha256=Rcojw0aGA6yDF2RjqfK23_rMw-xm61KGd8JmTCTOOVU,2516
 yomitoku/reading_order.py,sha256=OfhOS9ttPDoPSuHrIRKyOzG19GGeRufbuSKDqhsohh4,6404
-yomitoku/table_structure_recognizer.py,sha256=
-yomitoku/text_detector.py,sha256=
-yomitoku/text_recognizer.py,sha256=
+yomitoku/table_structure_recognizer.py,sha256=Wf_Ehmf6V27iVLmw2o9i7kJnbwEOhuExI-ljIO3a8NE,9043
+yomitoku/text_detector.py,sha256=fbwKelsVfwCt5YL4h-WEf4qkniv5cXmyaLR6oSYz0eA,4167
+yomitoku/text_recognizer.py,sha256=Iu-IzwaziNjmrTeSw9aoN9BDTHkNOzsZhViCv45yiN8,5422
 yomitoku/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-yomitoku/cli/main.py,sha256=
-yomitoku/configs/__init__.py,sha256=
+yomitoku/cli/main.py,sha256=qDB_YNK7abstIr9tYLiJjNU3xLSCd5x1UNDKqwUi2Rk,6885
+yomitoku/configs/__init__.py,sha256=e1Alss5QJLZSNfD6zLEG6xu5vDQDw-4Jayiqq8bq52s,571
 yomitoku/configs/cfg_layout_parser_rtdtrv2.py,sha256=8PRxB2Ar9UF7-DLtbgSokhrzdXb0veWI6Wc-X8qigRw,2329
 yomitoku/configs/cfg_table_structure_recognizer_rtdtrv2.py,sha256=o70GMHD8k-zeBeJtuhPS8x7vVB-ffucnJXeSyn-0AXo,2116
 yomitoku/configs/cfg_text_detector_dbnet.py,sha256=U9k48PON7haoOaytiELhbZRpv9RMiUm6nnfHmdxIa9Q,1153
 yomitoku/configs/cfg_text_recognizer_parseq.py,sha256=hpFs3nKqh4XdU3BZMTultegtLEGahEsCaZdjfKC_MO8,1247
+yomitoku/configs/cfg_text_recognizer_parseq_small.py,sha256=uCm_VC_G79IbZpOiK8fgYzAJ4b98H5pf328wyQomtfo,1259
 yomitoku/data/__init__.py,sha256=KAofFc9rk9ZdTKBjemu9RM8Vj9XnKbWC2MPZ2RWtOdE,82
 yomitoku/data/dataset.py,sha256=-I4f-FDtgsPnJ2MnXB7FtwihMW3koDaSI1OEoqKneIg,1014
 yomitoku/data/functions.py,sha256=eOyxo8S6EoAf1xGSPLWQFb9-t5Rg52NggD9MFIrOSpY,7506
@@ -26,19 +27,20 @@ yomitoku/export/export_json.py,sha256=1ChvCAHfCmMQvCfcAb1p3fSpr4elNAs3xBSIbpfn3b
 yomitoku/export/export_markdown.py,sha256=mCcsXUWBLrYc1NcRSBFfBT28d6eCddAF1oHp0qdBEnE,3986
 yomitoku/models/__init__.py,sha256=Enxq9sjJWusZuxecTori8IQa8NEYKaiiptDluHX1avg,144
 yomitoku/models/dbnet_plus.py,sha256=jeWJZm0ihbxoJeAXBFK7uVIwoosx2IUNk7Ut5wRH0vA,7998
-yomitoku/models/parseq.py,sha256
+yomitoku/models/parseq.py,sha256=-DQMQuON2jwtb4Ib2V0O19un9w-WG4rXS0SiscydrXU,8593
 yomitoku/models/rtdetr.py,sha256=oJsr8RHz3frslhLfXdVJve47lUsrmqLjfdTrZ41tlQ0,687
 yomitoku/models/layers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 yomitoku/models/layers/activate.py,sha256=S54GPssZBMloM2oFAXeDVMmBBZOWyjwU98Niq758txE,1244
 yomitoku/models/layers/dbnet_feature_attention.py,sha256=Vpp_PiLVuI7Zs30TTg4RNRn16KTb81ewonADpUHd4aE,6060
 yomitoku/models/layers/parseq_transformer.py,sha256=33eroJf8rmgIptP-NpZLJMhG7XOTwV4rXsq674VrKnU,6704
-yomitoku/models/layers/rtdetr_backbone.py,sha256=
-yomitoku/models/layers/rtdetr_hybrid_encoder.py,sha256=
-yomitoku/models/layers/rtdetrv2_decoder.py,sha256=
+yomitoku/models/layers/rtdetr_backbone.py,sha256=VOWFW7XFfJl4cvPaupqqP4-I-YHdwlVltQEgliD69As,9904
+yomitoku/models/layers/rtdetr_hybrid_encoder.py,sha256=ZnpEzJLzHgu_hrx7YK6myXZ4F1CDHRM501RbAPQdzdQ,14125
+yomitoku/models/layers/rtdetrv2_decoder.py,sha256=ggUwTdWpBfyYHnZuLx8vyH8n0XfZkQFtxgpY-1YI2sI,28070
+yomitoku/onnx/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 yomitoku/postprocessor/__init__.py,sha256=W4vUuqBaFtH5dlSBIYgyaCroGLMjpV6RrNGIBQ8NFVw,243
 yomitoku/postprocessor/dbnet_postporcessor.py,sha256=o_y8b5REd2dFEdIpRcr6o-XBfOCHo9rBYGwokP_uhTc,4948
 yomitoku/postprocessor/parseq_tokenizer.py,sha256=e89_g_bc4Au3SchuxoJfJNATJTxFmVYetzXyAzPWm28,4315
-yomitoku/postprocessor/rtdetr_postprocessor.py,sha256=
+yomitoku/postprocessor/rtdetr_postprocessor.py,sha256=TCv1t1zCxg2rSirsLm4sXlaltGubH-roVdEqnUoRs-8,3905
 yomitoku/resource/MPLUS1p-Medium.ttf,sha256=KLL1KkCumIBkgQtx1n4SffdaFuCNffThktEAbkB1OU8,1758908
 yomitoku/resource/charset.txt,sha256=sU91kSi-9Wk4733bCXy4j_UDmvcsj96sHOq1ppUJlOY,21672
 yomitoku/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -46,7 +48,7 @@ yomitoku/utils/graph.py,sha256=LKNB8ZhSQwOZMfeAimPMF5UCVVr2ZaUWoGDkz8z-uGU,456
 yomitoku/utils/logger.py,sha256=uOmtQDr0A0JD7wyFshedL08BiNrQorHnpktRXba8bjU,424
 yomitoku/utils/misc.py,sha256=2Eyy7-9K_h4Mal1zGXq6OlxubfNzhS0mEYwn_xt7xl8,2497
 yomitoku/utils/visualizer.py,sha256=2pSmbhUPylzVVJ0bXtGDoNmMdArAByab4Py7Xavvs_A,5230
-yomitoku-0.
-yomitoku-0.
-yomitoku-0.
-yomitoku-0.
+yomitoku-0.6.0.dist-info/METADATA,sha256=XDmMBtDx9MjXPuzcARwOwJXRN7PMCsQDwc38jDSwX5g,8134
+yomitoku-0.6.0.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
+yomitoku-0.6.0.dist-info/entry_points.txt,sha256=nFV3S11zgBNW0Qq_D0XQNg2R4lNXU_9XUFr6rdJoyF8,52
+yomitoku-0.6.0.dist-info/RECORD,,
```
{yomitoku-0.5.3.dist-info → yomitoku-0.6.0.dist-info}/WHEEL
File without changes
{yomitoku-0.5.3.dist-info → yomitoku-0.6.0.dist-info}/entry_points.txt
File without changes