magic-pdf 1.3.11__py3-none-any.whl → 1.3.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/data/utils.py +4 -4
- magic_pdf/dict2md/ocr_mkcontent.py +36 -22
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/batch_analyze.py +14 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +1 -1
- magic_pdf/model/sub_modules/model_utils.py +4 -4
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py +2 -1
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_pphgnetv2.py +810 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/rnn.py +18 -5
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml +68 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_dict.txt +18383 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml +8 -0
- {magic_pdf-1.3.11.dist-info → magic_pdf-1.3.12.dist-info}/METADATA +15 -1
- {magic_pdf-1.3.11.dist-info → magic_pdf-1.3.12.dist-info}/RECORD +18 -16
- {magic_pdf-1.3.11.dist-info → magic_pdf-1.3.12.dist-info}/LICENSE.md +0 -0
- {magic_pdf-1.3.11.dist-info → magic_pdf-1.3.12.dist-info}/WHEEL +0 -0
- {magic_pdf-1.3.11.dist-info → magic_pdf-1.3.12.dist-info}/entry_points.txt +0 -0
- {magic_pdf-1.3.11.dist-info → magic_pdf-1.3.12.dist-info}/top_level.txt +0 -0
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_pphgnetv2.py
ADDED
@@ -0,0 +1,810 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


class AdaptiveAvgPool2D(nn.AdaptiveAvgPool2d):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        if isinstance(self.output_size, int) and self.output_size == 1:
            self._gap = True
        elif (
            isinstance(self.output_size, tuple)
            and self.output_size[0] == 1
            and self.output_size[1] == 1
        ):
            self._gap = True
        else:
            self._gap = False

    def forward(self, x):
        if self._gap:
            # Global Average Pooling
            N, C, _, _ = x.shape
            x_mean = torch.mean(x, dim=[2, 3])
            x_mean = torch.reshape(x_mean, [N, C, 1, 1])
            return x_mean
        else:
            return F.adaptive_avg_pool2d(
                x,
                output_size=self.output_size
            )

class LearnableAffineBlock(nn.Module):
    """
    Create a learnable affine block module. This module can significantly improve accuracy on smaller models.

    Args:
        scale_value (float): The initial value of the scale parameter, default is 1.0.
        bias_value (float): The initial value of the bias parameter, default is 0.0.
        lr_mult (float): The learning rate multiplier, default is 1.0.
        lab_lr (float): The learning rate, default is 0.01.
    """

    def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0, lab_lr=0.01):
        super().__init__()
        self.scale = nn.Parameter(torch.Tensor([scale_value]))
        self.bias = nn.Parameter(torch.Tensor([bias_value]))

    def forward(self, x):
        return self.scale * x + self.bias


class ConvBNAct(nn.Module):
    """
    ConvBNAct is a combination of convolution and batchnorm layers.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Size of the convolution kernel. Defaults to 3.
        stride (int): Stride of the convolution. Defaults to 1.
        padding (int/str): Padding or padding type for the convolution. Defaults to 1.
        groups (int): Number of groups for the convolution. Defaults to 1.
        use_act (bool): Whether to use activation function. Defaults to True.
        use_lab (bool): Whether to use the LAB operation. Defaults to False.
        lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=3,
        stride=1,
        padding=1,
        groups=1,
        use_act=True,
        use_lab=False,
        lr_mult=1.0,
    ):
        super().__init__()
        self.use_act = use_act
        self.use_lab = use_lab

        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding=padding if isinstance(padding, str) else (kernel_size - 1) // 2,
            # padding=(kernel_size - 1) // 2,
            groups=groups,
            bias=False,
        )
        self.bn = nn.BatchNorm2d(
            out_channels,
        )
        if self.use_act:
            self.act = nn.ReLU()
        if self.use_lab:
            self.lab = LearnableAffineBlock(lr_mult=lr_mult)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        if self.use_act:
            x = self.act(x)
        if self.use_lab:
            x = self.lab(x)
        return x


class LightConvBNAct(nn.Module):
    """
    LightConvBNAct is a combination of pw and dw layers.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Size of the depth-wise convolution kernel.
        use_lab (bool): Whether to use the LAB operation. Defaults to False.
        lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        use_lab=False,
        lr_mult=1.0,
        **kwargs,
    ):
        super().__init__()
        self.conv1 = ConvBNAct(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            use_act=False,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        self.conv2 = ConvBNAct(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            groups=out_channels,
            use_act=True,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        return x


class CustomMaxPool2d(nn.Module):
    def __init__(
        self,
        kernel_size,
        stride=None,
        padding=0,
        dilation=1,
        return_indices=False,
        ceil_mode=False,
        data_format="NCHW",
    ):
        super(CustomMaxPool2d, self).__init__()
        self.kernel_size = kernel_size if isinstance(kernel_size, (tuple, list)) else (kernel_size, kernel_size)
        self.stride = stride if stride is not None else self.kernel_size
        self.stride = self.stride if isinstance(self.stride, (tuple, list)) else (self.stride, self.stride)
        self.dilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
        self.return_indices = return_indices
        self.ceil_mode = ceil_mode
        self.padding_mode = padding

        # Use the standard MaxPool2d when padding is not "same"
        if padding != "same":
            self.padding = padding if isinstance(padding, (tuple, list)) else (padding, padding)
            self.pool = nn.MaxPool2d(
                kernel_size=self.kernel_size,
                stride=self.stride,
                padding=self.padding,
                dilation=self.dilation,
                return_indices=self.return_indices,
                ceil_mode=self.ceil_mode
            )

    def forward(self, x):
        # Handle "same" padding
        if self.padding_mode == "same":
            input_height, input_width = x.size(2), x.size(3)

            # Compute the expected output size
            out_height = math.ceil(input_height / self.stride[0])
            out_width = math.ceil(input_width / self.stride[1])

            # Compute the required padding
            pad_height = max((out_height - 1) * self.stride[0] + self.kernel_size[0] - input_height, 0)
            pad_width = max((out_width - 1) * self.stride[1] + self.kernel_size[1] - input_width, 0)

            # Split the padding between the two sides
            pad_top = pad_height // 2
            pad_bottom = pad_height - pad_top
            pad_left = pad_width // 2
            pad_right = pad_width - pad_left

            # Apply the padding
            x = F.pad(x, (pad_left, pad_right, pad_top, pad_bottom))

            # Use the standard max_pool2d functions
            if self.return_indices:
                return F.max_pool2d_with_indices(
                    x,
                    kernel_size=self.kernel_size,
                    stride=self.stride,
                    padding=0,  # the input was already padded manually above
                    dilation=self.dilation,
                    ceil_mode=self.ceil_mode
                )
            else:
                return F.max_pool2d(
                    x,
                    kernel_size=self.kernel_size,
                    stride=self.stride,
                    padding=0,  # the input was already padded manually above
                    dilation=self.dilation,
                    ceil_mode=self.ceil_mode
                )
        else:
            # Use the pre-built MaxPool2d
            return self.pool(x)

class StemBlock(nn.Module):
    """
    StemBlock for PP-HGNetV2.

    Args:
        in_channels (int): Number of input channels.
        mid_channels (int): Number of middle channels.
        out_channels (int): Number of output channels.
        use_lab (bool): Whether to use the LAB operation. Defaults to False.
        lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0.
    """

    def __init__(
        self,
        in_channels,
        mid_channels,
        out_channels,
        use_lab=False,
        lr_mult=1.0,
        text_rec=False,
    ):
        super().__init__()
        self.stem1 = ConvBNAct(
            in_channels=in_channels,
            out_channels=mid_channels,
            kernel_size=3,
            stride=2,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        self.stem2a = ConvBNAct(
            in_channels=mid_channels,
            out_channels=mid_channels // 2,
            kernel_size=2,
            stride=1,
            padding="same",
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        self.stem2b = ConvBNAct(
            in_channels=mid_channels // 2,
            out_channels=mid_channels,
            kernel_size=2,
            stride=1,
            padding="same",
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        self.stem3 = ConvBNAct(
            in_channels=mid_channels * 2,
            out_channels=mid_channels,
            kernel_size=3,
            stride=1 if text_rec else 2,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        self.stem4 = ConvBNAct(
            in_channels=mid_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        self.pool = CustomMaxPool2d(
            kernel_size=2, stride=1, ceil_mode=True, padding="same"
        )
        # self.pool = nn.MaxPool2d(
        #     kernel_size=2, stride=1, ceil_mode=True, padding=1
        # )

    def forward(self, x):
        x = self.stem1(x)
        x2 = self.stem2a(x)
        x2 = self.stem2b(x2)
        x1 = self.pool(x)

        # if x1.shape[2:] != x2.shape[2:]:
        #     x1 = F.interpolate(x1, size=x2.shape[2:], mode='bilinear', align_corners=False)

        x = torch.cat([x1, x2], 1)
        x = self.stem3(x)
        x = self.stem4(x)

        return x


class HGV2_Block(nn.Module):
    """
    HGV2_Block, the basic unit that constitutes the HGV2_Stage.

    Args:
        in_channels (int): Number of input channels.
        mid_channels (int): Number of middle channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Size of the convolution kernel. Defaults to 3.
        layer_num (int): Number of layers in the HGV2 block. Defaults to 6.
        stride (int): Stride of the convolution. Defaults to 1.
        padding (int/str): Padding or padding type for the convolution. Defaults to 1.
        groups (int): Number of groups for the convolution. Defaults to 1.
        use_act (bool): Whether to use activation function. Defaults to True.
        use_lab (bool): Whether to use the LAB operation. Defaults to False.
        lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0.
    """

    def __init__(
        self,
        in_channels,
        mid_channels,
        out_channels,
        kernel_size=3,
        layer_num=6,
        identity=False,
        light_block=True,
        use_lab=False,
        lr_mult=1.0,
    ):
        super().__init__()
        self.identity = identity

        self.layers = nn.ModuleList()
        block_type = "LightConvBNAct" if light_block else "ConvBNAct"
        for i in range(layer_num):
            self.layers.append(
                eval(block_type)(
                    in_channels=in_channels if i == 0 else mid_channels,
                    out_channels=mid_channels,
                    stride=1,
                    kernel_size=kernel_size,
                    use_lab=use_lab,
                    lr_mult=lr_mult,
                )
            )
        # feature aggregation
        total_channels = in_channels + layer_num * mid_channels
        self.aggregation_squeeze_conv = ConvBNAct(
            in_channels=total_channels,
            out_channels=out_channels // 2,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        self.aggregation_excitation_conv = ConvBNAct(
            in_channels=out_channels // 2,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )

    def forward(self, x):
        identity = x
        output = []
        output.append(x)
        for layer in self.layers:
            x = layer(x)
            output.append(x)
        x = torch.cat(output, dim=1)
        x = self.aggregation_squeeze_conv(x)
        x = self.aggregation_excitation_conv(x)
        if self.identity:
            x += identity
        return x


class HGV2_Stage(nn.Module):
    """
    HGV2_Stage, the basic unit that constitutes the PPHGNetV2.

    Args:
        in_channels (int): Number of input channels.
        mid_channels (int): Number of middle channels.
        out_channels (int): Number of output channels.
        block_num (int): Number of blocks in the HGV2 stage.
        layer_num (int): Number of layers in the HGV2 block. Defaults to 6.
        is_downsample (bool): Whether to use downsampling operation. Defaults to False.
        light_block (bool): Whether to use light block. Defaults to True.
        kernel_size (int): Size of the convolution kernel. Defaults to 3.
        use_lab (bool, optional): Whether to use the LAB operation. Defaults to False.
        lr_mult (float, optional): Learning rate multiplier for the layer. Defaults to 1.0.
    """

    def __init__(
        self,
        in_channels,
        mid_channels,
        out_channels,
        block_num,
        layer_num=6,
        is_downsample=True,
        light_block=True,
        kernel_size=3,
        use_lab=False,
        stride=2,
        lr_mult=1.0,
    ):

        super().__init__()
        self.is_downsample = is_downsample
        if self.is_downsample:
            self.downsample = ConvBNAct(
                in_channels=in_channels,
                out_channels=in_channels,
                kernel_size=3,
                stride=stride,
                groups=in_channels,
                use_act=False,
                use_lab=use_lab,
                lr_mult=lr_mult,
            )

        blocks_list = []
        for i in range(block_num):
            blocks_list.append(
                HGV2_Block(
                    in_channels=in_channels if i == 0 else out_channels,
                    mid_channels=mid_channels,
                    out_channels=out_channels,
                    kernel_size=kernel_size,
                    layer_num=layer_num,
                    identity=False if i == 0 else True,
                    light_block=light_block,
                    use_lab=use_lab,
                    lr_mult=lr_mult,
                )
            )
        self.blocks = nn.Sequential(*blocks_list)

    def forward(self, x):
        if self.is_downsample:
            x = self.downsample(x)
        x = self.blocks(x)
        return x


class DropoutInferDownscale(nn.Module):
    """
    Dropout equivalent to Paddle's mode="downscale_in_infer".
    Training mode: out = input * mask (the mask is applied directly, without upscaling).
    Inference mode: out = input * (1.0 - p) (the output is scaled down by the dropout probability at inference time).
    """

    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, x):
        if self.training:
            # Training: apply a random mask but do not upscale
            return F.dropout(x, self.p, training=True) * (1.0 - self.p)
        else:
            # Inference: scale the output down by the dropout probability
            return x * (1.0 - self.p)

class PPHGNetV2(nn.Module):
    """
    PPHGNetV2

    Args:
        stage_config (dict): Config for PPHGNetV2 stages. such as the number of channels, stride, etc.
        stem_channels: (list): Number of channels of the stem of the PPHGNetV2.
        use_lab (bool): Whether to use the LAB operation. Defaults to False.
        use_last_conv (bool): Whether to use the last conv layer as the output channel. Defaults to True.
        class_expand (int): Number of channels for the last 1x1 convolutional layer.
        drop_prob (float): Dropout probability for the last 1x1 convolutional layer. Defaults to 0.0.
        class_num (int): The number of classes for the classification layer. Defaults to 1000.
        lr_mult_list (list): Learning rate multiplier for the stages. Defaults to [1.0, 1.0, 1.0, 1.0, 1.0].
    Returns:
        model: nn.Layer. Specific PPHGNetV2 model depends on args.
    """

    def __init__(
        self,
        stage_config,
        stem_channels=[3, 32, 64],
        use_lab=False,
        use_last_conv=True,
        class_expand=2048,
        dropout_prob=0.0,
        class_num=1000,
        lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
        det=False,
        text_rec=False,
        out_indices=None,
        **kwargs,
    ):
        super().__init__()
        self.det = det
        self.text_rec = text_rec
        self.use_lab = use_lab
        self.use_last_conv = use_last_conv
        self.class_expand = class_expand
        self.class_num = class_num
        self.out_indices = out_indices if out_indices is not None else [0, 1, 2, 3]
        self.out_channels = []

        # stem
        self.stem = StemBlock(
            in_channels=stem_channels[0],
            mid_channels=stem_channels[1],
            out_channels=stem_channels[2],
            use_lab=use_lab,
            lr_mult=lr_mult_list[0],
            text_rec=text_rec,
        )

        # stages
        self.stages = nn.ModuleList()
        for i, k in enumerate(stage_config):
            (
                in_channels,
                mid_channels,
                out_channels,
                block_num,
                is_downsample,
                light_block,
                kernel_size,
                layer_num,
                stride,
            ) = stage_config[k]
            self.stages.append(
                HGV2_Stage(
                    in_channels,
                    mid_channels,
                    out_channels,
                    block_num,
                    layer_num,
                    is_downsample,
                    light_block,
                    kernel_size,
                    use_lab,
                    stride,
                    lr_mult=lr_mult_list[i + 1],
                )
            )
            if i in self.out_indices:
                self.out_channels.append(out_channels)
        if not self.det:
            self.out_channels = stage_config["stage4"][2]

        self.avg_pool = AdaptiveAvgPool2D(1)

        if self.use_last_conv:
            self.last_conv = nn.Conv2d(
                in_channels=out_channels,
                out_channels=self.class_expand,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=False,
            )
            self.act = nn.ReLU()
            if self.use_lab:
                self.lab = LearnableAffineBlock()
            self.dropout = DropoutInferDownscale(p=dropout_prob)

        self.flatten = nn.Flatten(start_dim=1, end_dim=-1)
        if not self.det:
            self.fc = nn.Linear(
                self.class_expand if self.use_last_conv else out_channels,
                self.class_num,
            )

        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.zeros_(m.bias)

    def forward(self, x):
        x = self.stem(x)
        out = []
        for i, stage in enumerate(self.stages):
            x = stage(x)
            if self.det and i in self.out_indices:
                out.append(x)
        if self.det:
            return out

        if self.text_rec:
            if self.training:
                x = F.adaptive_avg_pool2d(x, [1, 40])
            else:
                x = F.avg_pool2d(x, [3, 2])
        return x


def PPHGNetV2_B0(pretrained=False, use_ssld=False, **kwargs):
    """
    PPHGNetV2_B0
    Args:
        pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise.
                    If str, means the path of the pretrained model.
        use_ssld (bool) Whether using ssld pretrained model when pretrained is True.
    Returns:
        model: nn.Layer. Specific `PPHGNetV2_B0` model depends on args.
    """
    stage_config = {
        # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num
        "stage1": [16, 16, 64, 1, False, False, 3, 3],
        "stage2": [64, 32, 256, 1, True, False, 3, 3],
        "stage3": [256, 64, 512, 2, True, True, 5, 3],
        "stage4": [512, 128, 1024, 1, True, True, 5, 3],
    }

    model = PPHGNetV2(
        stem_channels=[3, 16, 16], stage_config=stage_config, use_lab=True, **kwargs
    )
    return model


def PPHGNetV2_B1(pretrained=False, use_ssld=False, **kwargs):
    """
    PPHGNetV2_B1
    Args:
        pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise.
                    If str, means the path of the pretrained model.
        use_ssld (bool) Whether using ssld pretrained model when pretrained is True.
    Returns:
        model: nn.Layer. Specific `PPHGNetV2_B1` model depends on args.
    """
    stage_config = {
        # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num
        "stage1": [32, 32, 64, 1, False, False, 3, 3],
        "stage2": [64, 48, 256, 1, True, False, 3, 3],
        "stage3": [256, 96, 512, 2, True, True, 5, 3],
        "stage4": [512, 192, 1024, 1, True, True, 5, 3],
    }

    model = PPHGNetV2(
        stem_channels=[3, 24, 32], stage_config=stage_config, use_lab=True, **kwargs
    )
    return model


def PPHGNetV2_B2(pretrained=False, use_ssld=False, **kwargs):
    """
    PPHGNetV2_B2
    Args:
        pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise.
                    If str, means the path of the pretrained model.
        use_ssld (bool) Whether using ssld pretrained model when pretrained is True.
    Returns:
        model: nn.Layer. Specific `PPHGNetV2_B2` model depends on args.
    """
    stage_config = {
        # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num
        "stage1": [32, 32, 96, 1, False, False, 3, 4],
        "stage2": [96, 64, 384, 1, True, False, 3, 4],
        "stage3": [384, 128, 768, 3, True, True, 5, 4],
        "stage4": [768, 256, 1536, 1, True, True, 5, 4],
    }

    model = PPHGNetV2(
        stem_channels=[3, 24, 32], stage_config=stage_config, use_lab=True, **kwargs
    )
    return model


def PPHGNetV2_B3(pretrained=False, use_ssld=False, **kwargs):
    """
    PPHGNetV2_B3
    Args:
        pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise.
                    If str, means the path of the pretrained model.
        use_ssld (bool) Whether using ssld pretrained model when pretrained is True.
    Returns:
        model: nn.Layer. Specific `PPHGNetV2_B3` model depends on args.
    """
    stage_config = {
        # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num
        "stage1": [32, 32, 128, 1, False, False, 3, 5],
        "stage2": [128, 64, 512, 1, True, False, 3, 5],
        "stage3": [512, 128, 1024, 3, True, True, 5, 5],
        "stage4": [1024, 256, 2048, 1, True, True, 5, 5],
    }

    model = PPHGNetV2(
        stem_channels=[3, 24, 32], stage_config=stage_config, use_lab=True, **kwargs
    )
    return model


def PPHGNetV2_B4(pretrained=False, use_ssld=False, det=False, text_rec=False, **kwargs):
    """
    PPHGNetV2_B4
    Args:
        pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise.
                    If str, means the path of the pretrained model.
        use_ssld (bool) Whether using ssld pretrained model when pretrained is True.
    Returns:
        model: nn.Layer. Specific `PPHGNetV2_B4` model depends on args.
    """
    stage_config_rec = {
        # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num, stride
        "stage1": [48, 48, 128, 1, True, False, 3, 6, [2, 1]],
        "stage2": [128, 96, 512, 1, True, False, 3, 6, [1, 2]],
        "stage3": [512, 192, 1024, 3, True, True, 5, 6, [2, 1]],
        "stage4": [1024, 384, 2048, 1, True, True, 5, 6, [2, 1]],
    }

    stage_config_det = {
        # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num
        "stage1": [48, 48, 128, 1, False, False, 3, 6, 2],
        "stage2": [128, 96, 512, 1, True, False, 3, 6, 2],
        "stage3": [512, 192, 1024, 3, True, True, 5, 6, 2],
        "stage4": [1024, 384, 2048, 1, True, True, 5, 6, 2],
    }
    model = PPHGNetV2(
        stem_channels=[3, 32, 48],
        stage_config=stage_config_det if det else stage_config_rec,
        use_lab=False,
        det=det,
        text_rec=text_rec,
        **kwargs,
    )
    return model


def PPHGNetV2_B5(pretrained=False, use_ssld=False, **kwargs):
    """
    PPHGNetV2_B5
    Args:
        pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise.
                    If str, means the path of the pretrained model.
        use_ssld (bool) Whether using ssld pretrained model when pretrained is True.
    Returns:
        model: nn.Layer. Specific `PPHGNetV2_B5` model depends on args.
    """
    stage_config = {
        # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num
        "stage1": [64, 64, 128, 1, False, False, 3, 6],
        "stage2": [128, 128, 512, 2, True, False, 3, 6],
        "stage3": [512, 256, 1024, 5, True, True, 5, 6],
        "stage4": [1024, 512, 2048, 2, True, True, 5, 6],
    }

    model = PPHGNetV2(
        stem_channels=[3, 32, 64], stage_config=stage_config, use_lab=False, **kwargs
    )
    return model


def PPHGNetV2_B6(pretrained=False, use_ssld=False, **kwargs):
    """
    PPHGNetV2_B6
    Args:
        pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise.
                    If str, means the path of the pretrained model.
        use_ssld (bool) Whether using ssld pretrained model when pretrained is True.
    Returns:
        model: nn.Layer. Specific `PPHGNetV2_B6` model depends on args.
    """
    stage_config = {
        # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num
        "stage1": [96, 96, 192, 2, False, False, 3, 6],
        "stage2": [192, 192, 512, 3, True, False, 3, 6],
        "stage3": [512, 384, 1024, 6, True, True, 5, 6],
        "stage4": [1024, 768, 2048, 3, True, True, 5, 6],
    }

    model = PPHGNetV2(
        stem_channels=[3, 48, 96], stage_config=stage_config, use_lab=False, **kwargs
    )
    return model
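
For orientation, a minimal smoke test of the new backbone follows. It is not part of the diff; it assumes the 1.3.12 wheel is installed and importable under the module path shown above, and the 48x320 text-line input size is an assumption for illustration, not something this diff prescribes.

# Hypothetical smoke test -- not part of the released package contents.
import torch

from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.pytorchocr.modeling.backbones.rec_pphgnetv2 import (
    PPHGNetV2_B4,
)

# Build the recognition variant added in this diff (det=False, text_rec=True).
backbone = PPHGNetV2_B4(det=False, text_rec=True).eval()

# Dummy batch: 1 image, 3 channels, 48x320 (assumed text-line input size).
dummy = torch.randn(1, 3, 48, 320)
with torch.no_grad():
    feats = backbone(dummy)

# In eval mode the text_rec branch applies F.avg_pool2d(x, [3, 2]) before returning,
# so feats is a 4-D feature map; backbone.out_channels reports the stage4 width (2048).
print(feats.shape, backbone.out_channels)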