yomitoku-0.4.0.post1.dev0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yomitoku/__init__.py +20 -0
- yomitoku/base.py +136 -0
- yomitoku/cli/__init__.py +0 -0
- yomitoku/cli/main.py +230 -0
- yomitoku/configs/__init__.py +13 -0
- yomitoku/configs/cfg_layout_parser_rtdtrv2.py +89 -0
- yomitoku/configs/cfg_table_structure_recognizer_rtdtrv2.py +80 -0
- yomitoku/configs/cfg_text_detector_dbnet.py +49 -0
- yomitoku/configs/cfg_text_recognizer_parseq.py +51 -0
- yomitoku/constants.py +32 -0
- yomitoku/data/__init__.py +3 -0
- yomitoku/data/dataset.py +40 -0
- yomitoku/data/functions.py +279 -0
- yomitoku/document_analyzer.py +315 -0
- yomitoku/export/__init__.py +6 -0
- yomitoku/export/export_csv.py +71 -0
- yomitoku/export/export_html.py +188 -0
- yomitoku/export/export_json.py +34 -0
- yomitoku/export/export_markdown.py +145 -0
- yomitoku/layout_analyzer.py +66 -0
- yomitoku/layout_parser.py +189 -0
- yomitoku/models/__init__.py +9 -0
- yomitoku/models/dbnet_plus.py +272 -0
- yomitoku/models/layers/__init__.py +0 -0
- yomitoku/models/layers/activate.py +38 -0
- yomitoku/models/layers/dbnet_feature_attention.py +160 -0
- yomitoku/models/layers/parseq_transformer.py +218 -0
- yomitoku/models/layers/rtdetr_backbone.py +333 -0
- yomitoku/models/layers/rtdetr_hybrid_encoder.py +433 -0
- yomitoku/models/layers/rtdetrv2_decoder.py +811 -0
- yomitoku/models/parseq.py +243 -0
- yomitoku/models/rtdetr.py +22 -0
- yomitoku/ocr.py +87 -0
- yomitoku/postprocessor/__init__.py +9 -0
- yomitoku/postprocessor/dbnet_postporcessor.py +137 -0
- yomitoku/postprocessor/parseq_tokenizer.py +128 -0
- yomitoku/postprocessor/rtdetr_postprocessor.py +107 -0
- yomitoku/reading_order.py +214 -0
- yomitoku/resource/MPLUS1p-Medium.ttf +0 -0
- yomitoku/resource/charset.txt +1 -0
- yomitoku/table_structure_recognizer.py +244 -0
- yomitoku/text_detector.py +103 -0
- yomitoku/text_recognizer.py +128 -0
- yomitoku/utils/__init__.py +0 -0
- yomitoku/utils/graph.py +20 -0
- yomitoku/utils/logger.py +15 -0
- yomitoku/utils/misc.py +102 -0
- yomitoku/utils/visualizer.py +179 -0
- yomitoku-0.4.0.post1.dev0.dist-info/METADATA +127 -0
- yomitoku-0.4.0.post1.dev0.dist-info/RECORD +52 -0
- yomitoku-0.4.0.post1.dev0.dist-info/WHEEL +4 -0
- yomitoku-0.4.0.post1.dev0.dist-info/entry_points.txt +2 -0
yomitoku/models/layers/rtdetr_hybrid_encoder.py

@@ -0,0 +1,433 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import copy
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .activate import get_activation
+
+
+class ConvNormLayer(nn.Module):
+    def __init__(
+        self,
+        ch_in,
+        ch_out,
+        kernel_size,
+        stride,
+        padding=None,
+        bias=False,
+        act=None,
+    ):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            ch_in,
+            ch_out,
+            kernel_size,
+            stride,
+            padding=(kernel_size - 1) // 2 if padding is None else padding,
+            bias=bias,
+        )
+        self.norm = nn.BatchNorm2d(ch_out)
+        self.act = nn.Identity() if act is None else get_activation(act)
+
+    def forward(self, x):
+        return self.act(self.norm(self.conv(x)))
+
+
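A minimal sketch (not part of the package) of how `ConvNormLayer` composes conv, batch norm, and activation; it assumes the wheel is installed so the import path from the file list above resolves:

```python
# Illustrative sketch, not part of the package.
import torch

from yomitoku.models.layers.rtdetr_hybrid_encoder import ConvNormLayer

# conv -> BatchNorm2d -> activation; padding defaults to (kernel_size - 1) // 2,
# so spatial size is preserved up to the stride.
layer = ConvNormLayer(ch_in=64, ch_out=128, kernel_size=3, stride=2, act="silu")
x = torch.randn(1, 64, 32, 32)
print(layer(x).shape)  # torch.Size([1, 128, 16, 16])
```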
+class TransformerEncoder(nn.Module):
+    def __init__(self, encoder_layer, num_layers, norm=None):
+        super(TransformerEncoder, self).__init__()
+        self.layers = nn.ModuleList(
+            [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
+        )
+        self.num_layers = num_layers
+        self.norm = norm
+
+    def forward(self, src, src_mask=None, pos_embed=None) -> torch.Tensor:
+        output = src
+        for layer in self.layers:
+            output = layer(output, src_mask=src_mask, pos_embed=pos_embed)
+
+        if self.norm is not None:
+            output = self.norm(output)
+
+        return output
+
+
+class TransformerEncoderLayer(nn.Module):
+    def __init__(
+        self,
+        d_model,
+        nhead,
+        dim_feedforward=2048,
+        dropout=0.1,
+        activation="relu",
+        normalize_before=False,
+    ):
+        super().__init__()
+        self.normalize_before = normalize_before
+
+        self.self_attn = nn.MultiheadAttention(
+            d_model, nhead, dropout, batch_first=True
+        )
+
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        self.activation = get_activation(activation)
+
+    @staticmethod
+    def with_pos_embed(tensor, pos_embed):
+        return tensor if pos_embed is None else tensor + pos_embed
+
+    def forward(self, src, src_mask=None, pos_embed=None) -> torch.Tensor:
+        residual = src
+        if self.normalize_before:
+            src = self.norm1(src)
+        q = k = self.with_pos_embed(src, pos_embed)
+        src, _ = self.self_attn(q, k, value=src, attn_mask=src_mask)
+
+        src = residual + self.dropout1(src)
+        if not self.normalize_before:
+            src = self.norm1(src)
+
+        residual = src
+        if self.normalize_before:
+            src = self.norm2(src)
+        src = self.linear2(self.dropout(self.activation(self.linear1(src))))
+        src = residual + self.dropout2(src)
+        if not self.normalize_before:
+            src = self.norm2(src)
+        return src
+
+
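The two classes above form a standard pre/post-norm Transformer encoder (post-norm by default, `normalize_before=False`); positional embeddings are added to queries and keys only via `with_pos_embed`, never to the values. A hedged sketch, assuming the wheel is installed:

```python
# Illustrative sketch, not part of the package: a 1-layer encoder over a
# flattened 20x20 feature map.
import torch

from yomitoku.models.layers.rtdetr_hybrid_encoder import (
    TransformerEncoder,
    TransformerEncoderLayer,
)

layer = TransformerEncoderLayer(
    d_model=256, nhead=8, dim_feedforward=1024, dropout=0.0
)
encoder = TransformerEncoder(layer, num_layers=1)

src = torch.randn(2, 400, 256)  # [B, HxW, C]
pos = torch.randn(1, 400, 256)  # broadcasts over the batch
print(encoder(src, pos_embed=pos).shape)  # torch.Size([2, 400, 256])
```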
+class RepVggBlock(nn.Module):
+    def __init__(self, ch_in, ch_out, act="relu"):
+        super().__init__()
+        self.ch_in = ch_in
+        self.ch_out = ch_out
+        self.conv1 = ConvNormLayer(ch_in, ch_out, 3, 1, padding=1, act=None)
+        self.conv2 = ConvNormLayer(ch_in, ch_out, 1, 1, padding=0, act=None)
+        self.act = nn.Identity() if act is None else get_activation(act)
+
+    def forward(self, x):
+        if hasattr(self, "conv"):
+            y = self.conv(x)
+        else:
+            y = self.conv1(x) + self.conv2(x)
+
+        return self.act(y)
+
+    def convert_to_deploy(self):
+        if not hasattr(self, "conv"):
+            self.conv = nn.Conv2d(self.ch_in, self.ch_out, 3, 1, padding=1)
+
+        kernel, bias = self.get_equivalent_kernel_bias()
+        self.conv.weight.data = kernel
+        self.conv.bias.data = bias
+
+    def get_equivalent_kernel_bias(self):
+        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
+        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
+
+        return (
+            kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1),
+            bias3x3 + bias1x1,
+        )
+
+    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+        if kernel1x1 is None:
+            return 0
+        else:
+            return F.pad(kernel1x1, [1, 1, 1, 1])
+
+    def _fuse_bn_tensor(self, branch: ConvNormLayer):
+        if branch is None:
+            return 0, 0
+        kernel = branch.conv.weight
+        running_mean = branch.norm.running_mean
+        running_var = branch.norm.running_var
+        gamma = branch.norm.weight
+        beta = branch.norm.bias
+        eps = branch.norm.eps
+        std = (running_var + eps).sqrt()
+        t = (gamma / std).reshape(-1, 1, 1, 1)
+        return kernel * t, beta - running_mean * gamma / std
+
+
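`RepVggBlock` uses the RepVGG reparameterization trick: at deploy time the 3x3 and 1x1 conv+BN branches are folded into a single 3x3 convolution (`_fuse_bn_tensor` folds each BN into the kernel and bias, `_pad_1x1_to_3x3_tensor` zero-pads the 1x1 kernel). A sketch checking the equivalence, not part of the package; BN must be in eval mode since the fusion uses running statistics:

```python
# Illustrative sketch, not part of the package: verifying the fused
# deploy-time conv matches the two-branch forward.
import torch

from yomitoku.models.layers.rtdetr_hybrid_encoder import RepVggBlock

block = RepVggBlock(ch_in=64, ch_out=64).eval()
x = torch.randn(1, 64, 16, 16)

with torch.no_grad():
    y_branches = block(x)      # conv1(x) + conv2(x), then activation
    block.convert_to_deploy()  # folds BN, pads the 1x1 kernel to 3x3
    y_fused = block(x)         # single self.conv path

print(torch.allclose(y_branches, y_fused, atol=1e-5))  # True up to float error
```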
+class CSPRepLayer(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        num_blocks=3,
+        expansion=1.0,
+        bias=None,
+        act="silu",
+    ):
+        super(CSPRepLayer, self).__init__()
+        hidden_channels = int(out_channels * expansion)
+        self.conv1 = ConvNormLayer(
+            in_channels, hidden_channels, 1, 1, bias=bias, act=act
+        )
+        self.conv2 = ConvNormLayer(
+            in_channels, hidden_channels, 1, 1, bias=bias, act=act
+        )
+        self.bottlenecks = nn.Sequential(
+            *[
+                RepVggBlock(hidden_channels, hidden_channels, act=act)
+                for _ in range(num_blocks)
+            ]
+        )
+        if hidden_channels != out_channels:
+            self.conv3 = ConvNormLayer(
+                hidden_channels, out_channels, 1, 1, bias=bias, act=act
+            )
+        else:
+            self.conv3 = nn.Identity()
+
+    def forward(self, x):
+        x_1 = self.conv1(x)
+        x_1 = self.bottlenecks(x_1)
+        x_2 = self.conv2(x)
+        return self.conv3(x_1 + x_2)
+
+
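`CSPRepLayer` splits the input into a bottleneck path (`conv1` plus RepVgg blocks) and a shortcut path (`conv2`), sums them, and projects with `conv3` only when the expanded width differs from the output width. An illustrative shape check, assuming the wheel is installed:

```python
# Illustrative sketch, not part of the package: the FPN/PAN blocks below feed
# CSPRepLayer a concatenation of two hidden_dim maps (2 * 256 channels) and
# fuse it back down to hidden_dim.
import torch

from yomitoku.models.layers.rtdetr_hybrid_encoder import CSPRepLayer

fuse = CSPRepLayer(in_channels=512, out_channels=256, num_blocks=3)
x = torch.randn(1, 512, 20, 20)
print(fuse(x).shape)  # torch.Size([1, 256, 20, 20])
```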
+class HybridEncoder(nn.Module):
+    __share__ = [
+        "eval_spatial_size",
+    ]
+
+    def __init__(
+        self,
+        in_channels=[512, 1024, 2048],
+        feat_strides=[8, 16, 32],
+        hidden_dim=256,
+        nhead=8,
+        dim_feedforward=1024,
+        dropout=0.0,
+        enc_act="gelu",
+        use_encoder_idx=[2],
+        num_encoder_layers=1,
+        pe_temperature=10000,
+        expansion=1.0,
+        depth_mult=1.0,
+        act="silu",
+        eval_spatial_size=None,
+        version="v2",
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.feat_strides = feat_strides
+        self.hidden_dim = hidden_dim
+        self.use_encoder_idx = use_encoder_idx
+        self.num_encoder_layers = num_encoder_layers
+        self.pe_temperature = pe_temperature
+        self.eval_spatial_size = eval_spatial_size
+        self.out_channels = [hidden_dim for _ in range(len(in_channels))]
+        self.out_strides = feat_strides
+
+        # channel projection
+        self.input_proj = nn.ModuleList()
+        for in_channel in in_channels:
+            if version == "v1":
+                proj = nn.Sequential(
+                    nn.Conv2d(
+                        in_channel, hidden_dim, kernel_size=1, bias=False
+                    ),
+                    nn.BatchNorm2d(hidden_dim),
+                )
+            elif version == "v2":
+                proj = nn.Sequential(
+                    OrderedDict(
+                        [
+                            (
+                                "conv",
+                                nn.Conv2d(
+                                    in_channel,
+                                    hidden_dim,
+                                    kernel_size=1,
+                                    bias=False,
+                                ),
+                            ),
+                            ("norm", nn.BatchNorm2d(hidden_dim)),
+                        ]
+                    )
+                )
+            else:
+                raise AttributeError()
+
+            self.input_proj.append(proj)
+
+        # encoder transformer
+        encoder_layer = TransformerEncoderLayer(
+            hidden_dim,
+            nhead=nhead,
+            dim_feedforward=dim_feedforward,
+            dropout=dropout,
+            activation=enc_act,
+        )
+
+        self.encoder = nn.ModuleList(
+            [
+                TransformerEncoder(
+                    copy.deepcopy(encoder_layer), num_encoder_layers
+                )
+                for _ in range(len(use_encoder_idx))
+            ]
+        )
+
+        # top-down fpn
+        self.lateral_convs = nn.ModuleList()
+        self.fpn_blocks = nn.ModuleList()
+        for _ in range(len(in_channels) - 1, 0, -1):
+            self.lateral_convs.append(
+                ConvNormLayer(hidden_dim, hidden_dim, 1, 1, act=act)
+            )
+            self.fpn_blocks.append(
+                CSPRepLayer(
+                    hidden_dim * 2,
+                    hidden_dim,
+                    round(3 * depth_mult),
+                    act=act,
+                    expansion=expansion,
+                )
+            )
+
+        # bottom-up pan
+        self.downsample_convs = nn.ModuleList()
+        self.pan_blocks = nn.ModuleList()
+        for _ in range(len(in_channels) - 1):
+            self.downsample_convs.append(
+                ConvNormLayer(hidden_dim, hidden_dim, 3, 2, act=act)
+            )
+            self.pan_blocks.append(
+                CSPRepLayer(
+                    hidden_dim * 2,
+                    hidden_dim,
+                    round(3 * depth_mult),
+                    act=act,
+                    expansion=expansion,
+                )
+            )
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        if self.eval_spatial_size:
+            for idx in self.use_encoder_idx:
+                stride = self.feat_strides[idx]
+                pos_embed = self.build_2d_sincos_position_embedding(
+                    self.eval_spatial_size[1] // stride,
+                    self.eval_spatial_size[0] // stride,
+                    self.hidden_dim,
+                    self.pe_temperature,
+                )
+                setattr(self, f"pos_embed{idx}", pos_embed)
+                # self.register_buffer(f'pos_embed{idx}', pos_embed)
+
+    @staticmethod
+    def build_2d_sincos_position_embedding(
+        w, h, embed_dim=256, temperature=10000.0
+    ):
+        """ """
+        grid_w = torch.arange(int(w), dtype=torch.float32)
+        grid_h = torch.arange(int(h), dtype=torch.float32)
+        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
+        assert (
+            embed_dim % 4 == 0
+        ), "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
+        pos_dim = embed_dim // 4
+        omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
+        omega = 1.0 / (temperature**omega)
+
+        out_w = grid_w.flatten()[..., None] @ omega[None]
+        out_h = grid_h.flatten()[..., None] @ omega[None]
+
+        return torch.concat(
+            [out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1
+        )[None, :, :]
+
+    def forward(self, feats):
+        assert len(feats) == len(self.in_channels)
+        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
+
+        # encoder
+        if self.num_encoder_layers > 0:
+            for i, enc_ind in enumerate(self.use_encoder_idx):
+                h, w = proj_feats[enc_ind].shape[2:]
+                # flatten [B, C, H, W] to [B, HxW, C]
+                src_flatten = proj_feats[enc_ind].flatten(2).permute(0, 2, 1)
+                if self.training or self.eval_spatial_size is None:
+                    pos_embed = self.build_2d_sincos_position_embedding(
+                        w, h, self.hidden_dim, self.pe_temperature
+                    ).to(src_flatten.device)
+                else:
+                    pos_embed = getattr(self, f"pos_embed{enc_ind}", None).to(
+                        src_flatten.device
+                    )
+
+                memory: torch.Tensor = self.encoder[i](
+                    src_flatten, pos_embed=pos_embed
+                )
+                proj_feats[enc_ind] = (
+                    memory.permute(0, 2, 1)
+                    .reshape(-1, self.hidden_dim, h, w)
+                    .contiguous()
+                )
+
+        # broadcasting and fusion
+        inner_outs = [proj_feats[-1]]
+        for idx in range(len(self.in_channels) - 1, 0, -1):
+            feat_heigh = inner_outs[0]
+            feat_low = proj_feats[idx - 1]
+            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](
+                feat_heigh
+            )
+            inner_outs[0] = feat_heigh
+            upsample_feat = F.interpolate(
+                feat_heigh, scale_factor=2.0, mode="nearest"
+            )
+            inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](
+                torch.concat([upsample_feat, feat_low], dim=1)
+            )
+            inner_outs.insert(0, inner_out)
+
+        outs = [inner_outs[0]]
+        for idx in range(len(self.in_channels) - 1):
+            feat_low = outs[-1]
+            feat_height = inner_outs[idx + 1]
+            downsample_feat = self.downsample_convs[idx](feat_low)
+            out = self.pan_blocks[idx](
+                torch.concat([downsample_feat, feat_height], dim=1)
+            )
+            outs.append(out)
+
+        return outs
+
+
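`build_2d_sincos_position_embedding` allocates `embed_dim // 4` frequencies per axis and concatenates sin and cos of each, so the result has shape `[1, w*h, embed_dim]`. A quick check, an illustration rather than package code:

```python
# Illustrative sketch, not part of the package.
from yomitoku.models.layers.rtdetr_hybrid_encoder import HybridEncoder

pe = HybridEncoder.build_2d_sincos_position_embedding(w=20, h=20, embed_dim=256)
print(pe.shape)  # torch.Size([1, 400, 256])
```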
+# model = HybridEncoder(
+#     in_channels=[512, 1024, 2048],
+#     feat_strides=[8, 16, 32],
+#     hidden_dim=256,
+#     use_encoder_idx=[2],
+#     num_encoder_layers=1,
+#     nhead=8,
+#     dim_feedforward=1024,
+#     dropout=0.0,
+#     enc_act="gelu",
+#     expansion=1.0,
+#     depth_mult=1.0,
+#     act="silu",
+# )
+
+# print(model)
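The commented-out block above sketches the default construction. A runnable version of the same idea (an illustration, not package code; input sizes chosen for a 640x640 image) showing that the encoder returns one `hidden_dim`-channel map per input stride:

```python
# Illustrative sketch, not part of the package: default HybridEncoder on
# backbone maps at strides 8/16/32. Each scale comes back at its input
# resolution with hidden_dim=256 channels.
import torch

from yomitoku.models.layers.rtdetr_hybrid_encoder import HybridEncoder

encoder = HybridEncoder().eval()  # in_channels=[512, 1024, 2048] by default
feats = [
    torch.randn(1, 512, 80, 80),   # stride 8
    torch.randn(1, 1024, 40, 40),  # stride 16
    torch.randn(1, 2048, 20, 20),  # stride 32
]
with torch.no_grad():
    outs = encoder(feats)
print([tuple(o.shape) for o in outs])
# [(1, 256, 80, 80), (1, 256, 40, 40), (1, 256, 20, 20)]
```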