yomitoku-0.4.0.post1.dev0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. yomitoku/__init__.py +20 -0
  2. yomitoku/base.py +136 -0
  3. yomitoku/cli/__init__.py +0 -0
  4. yomitoku/cli/main.py +230 -0
  5. yomitoku/configs/__init__.py +13 -0
  6. yomitoku/configs/cfg_layout_parser_rtdtrv2.py +89 -0
  7. yomitoku/configs/cfg_table_structure_recognizer_rtdtrv2.py +80 -0
  8. yomitoku/configs/cfg_text_detector_dbnet.py +49 -0
  9. yomitoku/configs/cfg_text_recognizer_parseq.py +51 -0
  10. yomitoku/constants.py +32 -0
  11. yomitoku/data/__init__.py +3 -0
  12. yomitoku/data/dataset.py +40 -0
  13. yomitoku/data/functions.py +279 -0
  14. yomitoku/document_analyzer.py +315 -0
  15. yomitoku/export/__init__.py +6 -0
  16. yomitoku/export/export_csv.py +71 -0
  17. yomitoku/export/export_html.py +188 -0
  18. yomitoku/export/export_json.py +34 -0
  19. yomitoku/export/export_markdown.py +145 -0
  20. yomitoku/layout_analyzer.py +66 -0
  21. yomitoku/layout_parser.py +189 -0
  22. yomitoku/models/__init__.py +9 -0
  23. yomitoku/models/dbnet_plus.py +272 -0
  24. yomitoku/models/layers/__init__.py +0 -0
  25. yomitoku/models/layers/activate.py +38 -0
  26. yomitoku/models/layers/dbnet_feature_attention.py +160 -0
  27. yomitoku/models/layers/parseq_transformer.py +218 -0
  28. yomitoku/models/layers/rtdetr_backbone.py +333 -0
  29. yomitoku/models/layers/rtdetr_hybrid_encoder.py +433 -0
  30. yomitoku/models/layers/rtdetrv2_decoder.py +811 -0
  31. yomitoku/models/parseq.py +243 -0
  32. yomitoku/models/rtdetr.py +22 -0
  33. yomitoku/ocr.py +87 -0
  34. yomitoku/postprocessor/__init__.py +9 -0
  35. yomitoku/postprocessor/dbnet_postporcessor.py +137 -0
  36. yomitoku/postprocessor/parseq_tokenizer.py +128 -0
  37. yomitoku/postprocessor/rtdetr_postprocessor.py +107 -0
  38. yomitoku/reading_order.py +214 -0
  39. yomitoku/resource/MPLUS1p-Medium.ttf +0 -0
  40. yomitoku/resource/charset.txt +1 -0
  41. yomitoku/table_structure_recognizer.py +244 -0
  42. yomitoku/text_detector.py +103 -0
  43. yomitoku/text_recognizer.py +128 -0
  44. yomitoku/utils/__init__.py +0 -0
  45. yomitoku/utils/graph.py +20 -0
  46. yomitoku/utils/logger.py +15 -0
  47. yomitoku/utils/misc.py +102 -0
  48. yomitoku/utils/visualizer.py +179 -0
  49. yomitoku-0.4.0.post1.dev0.dist-info/METADATA +127 -0
  50. yomitoku-0.4.0.post1.dev0.dist-info/RECORD +52 -0
  51. yomitoku-0.4.0.post1.dev0.dist-info/WHEEL +4 -0
  52. yomitoku-0.4.0.post1.dev0.dist-info/entry_points.txt +2 -0
yomitoku/models/layers/rtdetr_hybrid_encoder.py
@@ -0,0 +1,433 @@
+ """Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+ """
+
+ import copy
+ from collections import OrderedDict
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from .activate import get_activation
+
+
+ class ConvNormLayer(nn.Module):
+     def __init__(
+         self,
+         ch_in,
+         ch_out,
+         kernel_size,
+         stride,
+         padding=None,
+         bias=False,
+         act=None,
+     ):
+         super().__init__()
+         self.conv = nn.Conv2d(
+             ch_in,
+             ch_out,
+             kernel_size,
+             stride,
+             padding=(kernel_size - 1) // 2 if padding is None else padding,
+             bias=bias,
+         )
+         self.norm = nn.BatchNorm2d(ch_out)
+         self.act = nn.Identity() if act is None else get_activation(act)
+
+     def forward(self, x):
+         return self.act(self.norm(self.conv(x)))
+
+
+ class TransformerEncoder(nn.Module):
+     def __init__(self, encoder_layer, num_layers, norm=None):
+         super(TransformerEncoder, self).__init__()
+         self.layers = nn.ModuleList(
+             [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
+         )
+         self.num_layers = num_layers
+         self.norm = norm
+
+     def forward(self, src, src_mask=None, pos_embed=None) -> torch.Tensor:
+         output = src
+         for layer in self.layers:
+             output = layer(output, src_mask=src_mask, pos_embed=pos_embed)
+
+         if self.norm is not None:
+             output = self.norm(output)
+
+         return output
+
+
+ class TransformerEncoderLayer(nn.Module):
+     def __init__(
+         self,
+         d_model,
+         nhead,
+         dim_feedforward=2048,
+         dropout=0.1,
+         activation="relu",
+         normalize_before=False,
+     ):
+         super().__init__()
+         self.normalize_before = normalize_before
+
+         self.self_attn = nn.MultiheadAttention(
+             d_model, nhead, dropout, batch_first=True
+         )
+
+         self.linear1 = nn.Linear(d_model, dim_feedforward)
+         self.dropout = nn.Dropout(dropout)
+         self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.dropout1 = nn.Dropout(dropout)
+         self.dropout2 = nn.Dropout(dropout)
+         self.activation = get_activation(activation)
+
+     @staticmethod
+     def with_pos_embed(tensor, pos_embed):
+         return tensor if pos_embed is None else tensor + pos_embed
+
+     def forward(self, src, src_mask=None, pos_embed=None) -> torch.Tensor:
+         residual = src
+         if self.normalize_before:
+             src = self.norm1(src)
+         q = k = self.with_pos_embed(src, pos_embed)
+         src, _ = self.self_attn(q, k, value=src, attn_mask=src_mask)
+
+         src = residual + self.dropout1(src)
+         if not self.normalize_before:
+             src = self.norm1(src)
+
+         residual = src
+         if self.normalize_before:
+             src = self.norm2(src)
+         src = self.linear2(self.dropout(self.activation(self.linear1(src))))
+         src = residual + self.dropout2(src)
+         if not self.normalize_before:
+             src = self.norm2(src)
+         return src
+
+
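For orientation, here is a minimal sketch of how the two transformer classes above fit together, assuming PyTorch is installed and this module is importable; the batch and sequence sizes are illustrative, not taken from the package:

import torch

layer = TransformerEncoderLayer(d_model=256, nhead=8, dim_feedforward=1024, dropout=0.0)
encoder = TransformerEncoder(layer, num_layers=1)

tokens = torch.randn(2, 400, 256)   # [batch, H*W, d_model], e.g. a flattened 20x20 feature map
out = encoder(tokens)               # self-attention + FFN per layer; shape stays [2, 400, 256]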
+ class RepVggBlock(nn.Module):
+     def __init__(self, ch_in, ch_out, act="relu"):
+         super().__init__()
+         self.ch_in = ch_in
+         self.ch_out = ch_out
+         self.conv1 = ConvNormLayer(ch_in, ch_out, 3, 1, padding=1, act=None)
+         self.conv2 = ConvNormLayer(ch_in, ch_out, 1, 1, padding=0, act=None)
+         self.act = nn.Identity() if act is None else get_activation(act)
+
+     def forward(self, x):
+         if hasattr(self, "conv"):
+             y = self.conv(x)
+         else:
+             y = self.conv1(x) + self.conv2(x)
+
+         return self.act(y)
+
+     def convert_to_deploy(self):
+         if not hasattr(self, "conv"):
+             self.conv = nn.Conv2d(self.ch_in, self.ch_out, 3, 1, padding=1)
+
+         kernel, bias = self.get_equivalent_kernel_bias()
+         self.conv.weight.data = kernel
+         self.conv.bias.data = bias
+
+     def get_equivalent_kernel_bias(self):
+         kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
+         kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
+
+         return (
+             kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1),
+             bias3x3 + bias1x1,
+         )
+
+     def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+         if kernel1x1 is None:
+             return 0
+         else:
+             return F.pad(kernel1x1, [1, 1, 1, 1])
+
+     def _fuse_bn_tensor(self, branch: ConvNormLayer):
+         if branch is None:
+             return 0, 0
+         kernel = branch.conv.weight
+         running_mean = branch.norm.running_mean
+         running_var = branch.norm.running_var
+         gamma = branch.norm.weight
+         beta = branch.norm.bias
+         eps = branch.norm.eps
+         std = (running_var + eps).sqrt()
+         t = (gamma / std).reshape(-1, 1, 1, 1)
+         return kernel * t, beta - running_mean * gamma / std
+
+
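RepVggBlock is a structural re-parameterization block: during training it sums a 3x3 and a 1x1 conv branch (each followed by BatchNorm), and convert_to_deploy() folds both branches into a single 3x3 convolution. A hedged equivalence check, with illustrative sizes and eval mode so BatchNorm uses its running statistics:

import torch

block = RepVggBlock(64, 64, act="relu").eval()
x = torch.randn(1, 64, 32, 32)

with torch.no_grad():
    y_two_branch = block(x)     # conv1 (3x3 + BN) + conv2 (1x1 + BN)
    block.convert_to_deploy()   # fuse both branches into a single self.conv
    y_fused = block(x)          # forward now takes the hasattr(self, "conv") path

print(torch.allclose(y_two_branch, y_fused, atol=1e-5))   # expected: True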
+ class CSPRepLayer(nn.Module):
+     def __init__(
+         self,
+         in_channels,
+         out_channels,
+         num_blocks=3,
+         expansion=1.0,
+         bias=None,
+         act="silu",
+     ):
+         super(CSPRepLayer, self).__init__()
+         hidden_channels = int(out_channels * expansion)
+         self.conv1 = ConvNormLayer(
+             in_channels, hidden_channels, 1, 1, bias=bias, act=act
+         )
+         self.conv2 = ConvNormLayer(
+             in_channels, hidden_channels, 1, 1, bias=bias, act=act
+         )
+         self.bottlenecks = nn.Sequential(
+             *[
+                 RepVggBlock(hidden_channels, hidden_channels, act=act)
+                 for _ in range(num_blocks)
+             ]
+         )
+         if hidden_channels != out_channels:
+             self.conv3 = ConvNormLayer(
+                 hidden_channels, out_channels, 1, 1, bias=bias, act=act
+             )
+         else:
+             self.conv3 = nn.Identity()
+
+     def forward(self, x):
+         x_1 = self.conv1(x)
+         x_1 = self.bottlenecks(x_1)
+         x_2 = self.conv2(x)
+         return self.conv3(x_1 + x_2)
+
+
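CSPRepLayer projects the input through two parallel 1x1 convs, runs one path through a stack of RepVggBlocks, sums the two paths, and optionally projects to out_channels. A quick shape sketch under assumed sizes:

import torch

csp = CSPRepLayer(in_channels=512, out_channels=256, num_blocks=3, act="silu")
feat = torch.randn(1, 512, 20, 20)
print(csp(feat).shape)   # torch.Size([1, 256, 20, 20]) -- spatial size unchanged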
+ class HybridEncoder(nn.Module):
+     __share__ = [
+         "eval_spatial_size",
+     ]
+
+     def __init__(
+         self,
+         in_channels=[512, 1024, 2048],
+         feat_strides=[8, 16, 32],
+         hidden_dim=256,
+         nhead=8,
+         dim_feedforward=1024,
+         dropout=0.0,
+         enc_act="gelu",
+         use_encoder_idx=[2],
+         num_encoder_layers=1,
+         pe_temperature=10000,
+         expansion=1.0,
+         depth_mult=1.0,
+         act="silu",
+         eval_spatial_size=None,
+         version="v2",
+     ):
+         super().__init__()
+         self.in_channels = in_channels
+         self.feat_strides = feat_strides
+         self.hidden_dim = hidden_dim
+         self.use_encoder_idx = use_encoder_idx
+         self.num_encoder_layers = num_encoder_layers
+         self.pe_temperature = pe_temperature
+         self.eval_spatial_size = eval_spatial_size
+         self.out_channels = [hidden_dim for _ in range(len(in_channels))]
+         self.out_strides = feat_strides
+
+         # channel projection
+         self.input_proj = nn.ModuleList()
+         for in_channel in in_channels:
+             if version == "v1":
+                 proj = nn.Sequential(
+                     nn.Conv2d(
+                         in_channel, hidden_dim, kernel_size=1, bias=False
+                     ),
+                     nn.BatchNorm2d(hidden_dim),
+                 )
+             elif version == "v2":
+                 proj = nn.Sequential(
+                     OrderedDict(
+                         [
+                             (
+                                 "conv",
+                                 nn.Conv2d(
+                                     in_channel,
+                                     hidden_dim,
+                                     kernel_size=1,
+                                     bias=False,
+                                 ),
+                             ),
+                             ("norm", nn.BatchNorm2d(hidden_dim)),
+                         ]
+                     )
+                 )
+             else:
+                 raise AttributeError()
+
+             self.input_proj.append(proj)
+
+         # encoder transformer
+         encoder_layer = TransformerEncoderLayer(
+             hidden_dim,
+             nhead=nhead,
+             dim_feedforward=dim_feedforward,
+             dropout=dropout,
+             activation=enc_act,
+         )
+
+         self.encoder = nn.ModuleList(
+             [
+                 TransformerEncoder(
+                     copy.deepcopy(encoder_layer), num_encoder_layers
+                 )
+                 for _ in range(len(use_encoder_idx))
+             ]
+         )
+
+         # top-down fpn
+         self.lateral_convs = nn.ModuleList()
+         self.fpn_blocks = nn.ModuleList()
+         for _ in range(len(in_channels) - 1, 0, -1):
+             self.lateral_convs.append(
+                 ConvNormLayer(hidden_dim, hidden_dim, 1, 1, act=act)
+             )
+             self.fpn_blocks.append(
+                 CSPRepLayer(
+                     hidden_dim * 2,
+                     hidden_dim,
+                     round(3 * depth_mult),
+                     act=act,
+                     expansion=expansion,
+                 )
+             )
+
+         # bottom-up pan
+         self.downsample_convs = nn.ModuleList()
+         self.pan_blocks = nn.ModuleList()
+         for _ in range(len(in_channels) - 1):
+             self.downsample_convs.append(
+                 ConvNormLayer(hidden_dim, hidden_dim, 3, 2, act=act)
+             )
+             self.pan_blocks.append(
+                 CSPRepLayer(
+                     hidden_dim * 2,
+                     hidden_dim,
+                     round(3 * depth_mult),
+                     act=act,
+                     expansion=expansion,
+                 )
+             )
+
+         self._reset_parameters()
+
+     def _reset_parameters(self):
+         if self.eval_spatial_size:
+             for idx in self.use_encoder_idx:
+                 stride = self.feat_strides[idx]
+                 pos_embed = self.build_2d_sincos_position_embedding(
+                     self.eval_spatial_size[1] // stride,
+                     self.eval_spatial_size[0] // stride,
+                     self.hidden_dim,
+                     self.pe_temperature,
+                 )
+                 setattr(self, f"pos_embed{idx}", pos_embed)
+                 # self.register_buffer(f'pos_embed{idx}', pos_embed)
+
+     @staticmethod
+     def build_2d_sincos_position_embedding(
+         w, h, embed_dim=256, temperature=10000.0
+     ):
+         """Build a [1, w*h, embed_dim] 2D sine-cosine positional embedding."""
+         grid_w = torch.arange(int(w), dtype=torch.float32)
+         grid_h = torch.arange(int(h), dtype=torch.float32)
+         grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
+         assert (
+             embed_dim % 4 == 0
+         ), "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
+         pos_dim = embed_dim // 4
+         omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
+         omega = 1.0 / (temperature**omega)
+
+         out_w = grid_w.flatten()[..., None] @ omega[None]
+         out_h = grid_h.flatten()[..., None] @ omega[None]
+
+         return torch.concat(
+             [out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1
+         )[None, :, :]
+
+     def forward(self, feats):
+         assert len(feats) == len(self.in_channels)
+         proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
+
+         # encoder
+         if self.num_encoder_layers > 0:
+             for i, enc_ind in enumerate(self.use_encoder_idx):
+                 h, w = proj_feats[enc_ind].shape[2:]
+                 # flatten [B, C, H, W] to [B, HxW, C]
+                 src_flatten = proj_feats[enc_ind].flatten(2).permute(0, 2, 1)
+                 if self.training or self.eval_spatial_size is None:
+                     pos_embed = self.build_2d_sincos_position_embedding(
+                         w, h, self.hidden_dim, self.pe_temperature
+                     ).to(src_flatten.device)
+                 else:
+                     pos_embed = getattr(self, f"pos_embed{enc_ind}", None).to(
+                         src_flatten.device
+                     )
+
+                 memory: torch.Tensor = self.encoder[i](
+                     src_flatten, pos_embed=pos_embed
+                 )
+                 proj_feats[enc_ind] = (
+                     memory.permute(0, 2, 1)
+                     .reshape(-1, self.hidden_dim, h, w)
+                     .contiguous()
+                 )
+
+         # broadcasting and fusion
+         inner_outs = [proj_feats[-1]]
+         for idx in range(len(self.in_channels) - 1, 0, -1):
+             feat_high = inner_outs[0]
+             feat_low = proj_feats[idx - 1]
+             feat_high = self.lateral_convs[len(self.in_channels) - 1 - idx](
+                 feat_high
+             )
+             inner_outs[0] = feat_high
+             upsample_feat = F.interpolate(
+                 feat_high, scale_factor=2.0, mode="nearest"
+             )
+             inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](
+                 torch.concat([upsample_feat, feat_low], dim=1)
+             )
+             inner_outs.insert(0, inner_out)
+
+         outs = [inner_outs[0]]
+         for idx in range(len(self.in_channels) - 1):
+             feat_low = outs[-1]
+             feat_high = inner_outs[idx + 1]
+             downsample_feat = self.downsample_convs[idx](feat_low)
+             out = self.pan_blocks[idx](
+                 torch.concat([downsample_feat, feat_high], dim=1)
+             )
+             outs.append(out)
+
+         return outs
+
+
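The encoder attends over flattened feature maps, so build_2d_sincos_position_embedding produces one embedding row per spatial position. A small sketch of its output shape, using an illustrative 20x20 grid:

pe = HybridEncoder.build_2d_sincos_position_embedding(w=20, h=20, embed_dim=256)
print(pe.shape)   # torch.Size([1, 400, 256]) -- one row per flattened position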
+ # model = HybridEncoder(
+ #     in_channels=[512, 1024, 2048],
+ #     feat_strides=[8, 16, 32],
+ #     hidden_dim=256,
+ #     use_encoder_idx=[2],
+ #     num_encoder_layers=1,
+ #     nhead=8,
+ #     dim_feedforward=1024,
+ #     dropout=0.0,
+ #     enc_act="gelu",
+ #     expansion=1.0,
+ #     depth_mult=1.0,
+ #     act="silu",
+ # )
+
+ # print(model)
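Finally, a hedged end-to-end sketch of HybridEncoder on dummy backbone features; the channel counts and map sizes below are illustrative, and in the package these maps would normally come from the backbone (see rtdetr_backbone.py):

import torch

encoder = HybridEncoder(in_channels=[512, 1024, 2048], feat_strides=[8, 16, 32], hidden_dim=256)
feats = [
    torch.randn(1, 512, 80, 80),    # stride-8 map
    torch.randn(1, 1024, 40, 40),   # stride-16 map
    torch.randn(1, 2048, 20, 20),   # stride-32 map
]
outs = encoder(feats)
print([tuple(o.shape) for o in outs])
# [(1, 256, 80, 80), (1, 256, 40, 40), (1, 256, 20, 20)] -- all maps projected to hidden_dim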