magic-pdf 1.3.10__py3-none-any.whl → 1.3.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,810 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+
7
class AdaptiveAvgPool2D(nn.AdaptiveAvgPool2d):
    """Adaptive average pooling with a dedicated global-average-pooling path.

    When the requested output size is 1x1, the result is computed as a plain
    mean over the two trailing (spatial) dimensions. Any other output size is
    delegated to ``F.adaptive_avg_pool2d``.

    Args:
        *args: Positional arguments forwarded to ``nn.AdaptiveAvgPool2d``.
        **kwargs: Keyword arguments forwarded to ``nn.AdaptiveAvgPool2d``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        size = self.output_size
        if isinstance(size, int):
            self._gap = size == 1
        elif isinstance(size, (tuple, list)):
            # Accept both (1, 1) and [1, 1] as "global" pooling.
            self._gap = len(size) == 2 and size[0] == 1 and size[1] == 1
        else:
            self._gap = False

    def forward(self, x):
        """Pool ``x`` over its two trailing dimensions.

        Args:
            x (torch.Tensor): Input whose last two dimensions are pooled.
                Both batched (N, C, H, W) and unbatched (C, H, W) inputs are
                handled.

        Returns:
            torch.Tensor: The pooled tensor; the pooled dimensions are kept.
        """
        if self._gap:
            # Reducing the last two dims with keepdim=True yields the same
            # (..., 1, 1) layout as mean + reshape, without requiring a
            # four-dimensional input.
            return x.mean(dim=(-2, -1), keepdim=True)
        return F.adaptive_avg_pool2d(x, output_size=self.output_size)
34
+
35
class LearnableAffineBlock(nn.Module):
    """
    Learnable affine transform ``scale * x + bias`` with one scalar each.

    Args:
        scale_value (float): Initial value of the scale parameter. Defaults to 1.0.
        bias_value (float): Initial value of the bias parameter. Defaults to 0.0.
        lr_mult (float): Accepted for signature compatibility; not used by
            this module. Defaults to 1.0.
        lab_lr (float): Accepted for signature compatibility; not used by
            this module. Defaults to 0.01.
    """

    def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0, lab_lr=0.01):
        super().__init__()
        # Both parameters are one-element tensors that broadcast over the input.
        initial_scale = torch.Tensor([scale_value])
        initial_bias = torch.Tensor([bias_value])
        self.scale = nn.Parameter(initial_scale)
        self.bias = nn.Parameter(initial_bias)

    def forward(self, x):
        """Return ``scale * x + bias``."""
        scaled = self.scale * x
        return scaled + self.bias
53
+
54
+
55
class ConvBNAct(nn.Module):
    """
    Convolution followed by batch normalisation, then an optional ReLU and an
    optional learnable affine block (LAB).

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Size of the convolution kernel. Defaults to 3.
        stride (int): Stride of the convolution. Defaults to 1.
        padding (int/str): A string is handed to the convolution unchanged;
            any other value is ignored and ``(kernel_size - 1) // 2`` is used
            instead. Defaults to 1.
        groups (int): Number of groups for the convolution. Defaults to 1.
        use_act (bool): Whether to apply the ReLU activation. Defaults to True.
        use_lab (bool): Whether to apply the LAB operation. Defaults to False.
        lr_mult (float): Forwarded to the LAB block. Defaults to 1.0.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=3,
        stride=1,
        padding=1,
        groups=1,
        use_act=True,
        use_lab=False,
        lr_mult=1.0,
    ):
        super().__init__()
        self.use_act = use_act
        self.use_lab = use_lab

        # Only string paddings (e.g. "same") are honoured; numeric values are
        # replaced by the kernel-derived padding.
        if isinstance(padding, str):
            conv_padding = padding
        else:
            conv_padding = (kernel_size - 1) // 2

        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding=conv_padding,
            groups=groups,
            bias=False,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        if use_act:
            self.act = nn.ReLU()
        if use_lab:
            self.lab = LearnableAffineBlock(lr_mult=lr_mult)

    def forward(self, x):
        """Apply conv -> bn -> (act) -> (lab) to ``x``."""
        out = self.bn(self.conv(x))
        if self.use_act:
            out = self.act(out)
        if self.use_lab:
            out = self.lab(out)
        return out
113
+
114
+
115
class LightConvBNAct(nn.Module):
    """
    Lightweight block: a 1x1 (point-wise) ConvBNAct without activation,
    followed by a depth-wise ConvBNAct with activation.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Size of the depth-wise convolution kernel.
        use_lab (bool): Whether to use the LAB operation. Defaults to False.
        lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0.
        **kwargs: Accepted and ignored.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        use_lab=False,
        lr_mult=1.0,
        **kwargs,
    ):
        super().__init__()
        # Settings common to both sub-layers.
        shared = dict(use_lab=use_lab, lr_mult=lr_mult)

        # Point-wise projection to the output width, no activation.
        self.conv1 = ConvBNAct(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            use_act=False,
            **shared,
        )
        # Depth-wise convolution: one group per channel.
        self.conv2 = ConvBNAct(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            groups=out_channels,
            use_act=True,
            **shared,
        )

    def forward(self, x):
        """Run the point-wise layer, then the depth-wise layer."""
        return self.conv2(self.conv1(x))
159
+
160
+
161
class CustomMaxPool2d(nn.Module):
    """2-D max pooling that additionally supports ``padding="same"``.

    For any padding other than the string ``"same"`` the module wraps a
    regular ``nn.MaxPool2d``. With ``padding="same"`` the input is padded so
    that the output spatial size is ``ceil(input / stride)``, then pooled
    with zero implicit padding.

    Args:
        kernel_size (int or sequence): Pooling window size.
        stride (int or sequence, optional): Pooling stride. Defaults to
            ``kernel_size`` when None.
        padding (int, sequence or "same"): Padding amount, or ``"same"``.
            Defaults to 0.
        dilation (int or sequence): Spacing between window elements.
            Defaults to 1.
        return_indices (bool): Whether to also return the argmax indices. In
            ``"same"`` mode the indices refer to the padded input.
            Defaults to False.
        ceil_mode (bool): Forwarded to the pooling operation. Defaults to False.
        data_format (str): Accepted for signature compatibility; not used.
    """

    def __init__(
        self,
        kernel_size,
        stride=None,
        padding=0,
        dilation=1,
        return_indices=False,
        ceil_mode=False,
        data_format="NCHW",
    ):
        super().__init__()
        self.kernel_size = self._pair(kernel_size)
        self.stride = self._pair(stride if stride is not None else self.kernel_size)
        self.dilation = self._pair(dilation)
        self.return_indices = return_indices
        self.ceil_mode = ceil_mode
        self.padding_mode = padding

        # Any padding other than "same" is handled by the standard MaxPool2d.
        if padding != "same":
            self.padding = self._pair(padding)
            self.pool = nn.MaxPool2d(
                kernel_size=self.kernel_size,
                stride=self.stride,
                padding=self.padding,
                dilation=self.dilation,
                return_indices=self.return_indices,
                ceil_mode=self.ceil_mode,
            )

    @staticmethod
    def _pair(value):
        """Return ``value`` unchanged if it is a tuple/list, else duplicate it."""
        return value if isinstance(value, (tuple, list)) else (value, value)

    @staticmethod
    def _same_padding(size, kernel, stride, dilation):
        """Total padding along one axis so the output length is ceil(size / stride)."""
        out_size = math.ceil(size / stride)
        # A dilated window spans dilation * (kernel - 1) + 1 input positions.
        effective_kernel = dilation * (kernel - 1) + 1
        return max((out_size - 1) * stride + effective_kernel - size, 0)

    def forward(self, x):
        """Pool ``x``; expects spatial dimensions at indices 2 and 3."""
        if self.padding_mode != "same":
            # Use the pre-built MaxPool2d.
            return self.pool(x)

        pad_height = self._same_padding(
            x.size(2), self.kernel_size[0], self.stride[0], self.dilation[0]
        )
        pad_width = self._same_padding(
            x.size(3), self.kernel_size[1], self.stride[1], self.dilation[1]
        )

        # Split the padding across both sides; the extra element (if any)
        # goes to the bottom / right.
        pad_top = pad_height // 2
        pad_bottom = pad_height - pad_top
        pad_left = pad_width // 2
        pad_right = pad_width - pad_left

        # Pad with -inf rather than zeros so padded cells can never win the
        # max: zero padding would override windows whose real values are all
        # negative.
        x = F.pad(
            x,
            (pad_left, pad_right, pad_top, pad_bottom),
            value=float("-inf"),
        )

        # The input has already been padded manually, hence padding=0.
        return F.max_pool2d(
            x,
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=0,
            dilation=self.dilation,
            ceil_mode=self.ceil_mode,
            return_indices=self.return_indices,
        )
237
+
238
class StemBlock(nn.Module):
    """
    StemBlock for PP-HGNetV2.

    Args:
        in_channels (int): Number of input channels.
        mid_channels (int): Number of middle channels.
        out_channels (int): Number of output channels.
        use_lab (bool): Whether to use the LAB operation. Defaults to False.
        lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0.
        text_rec (bool): When True the third stem convolution uses stride 1
            instead of 2. Defaults to False.
    """

    def __init__(
        self,
        in_channels,
        mid_channels,
        out_channels,
        use_lab=False,
        lr_mult=1.0,
        text_rec=False,
    ):
        super().__init__()
        # Options shared by every convolution of the stem.
        common = dict(use_lab=use_lab, lr_mult=lr_mult)
        half_channels = mid_channels // 2

        self.stem1 = ConvBNAct(
            in_channels=in_channels,
            out_channels=mid_channels,
            kernel_size=3,
            stride=2,
            **common,
        )
        # Two 2x2 "same" convolutions form the convolutional branch.
        self.stem2a = ConvBNAct(
            in_channels=mid_channels,
            out_channels=half_channels,
            kernel_size=2,
            stride=1,
            padding="same",
            **common,
        )
        self.stem2b = ConvBNAct(
            in_channels=half_channels,
            out_channels=mid_channels,
            kernel_size=2,
            stride=1,
            padding="same",
            **common,
        )
        # Consumes the concatenation of the pooled and convolutional branches.
        self.stem3 = ConvBNAct(
            in_channels=mid_channels * 2,
            out_channels=mid_channels,
            kernel_size=3,
            stride=1 if text_rec else 2,
            **common,
        )
        self.stem4 = ConvBNAct(
            in_channels=mid_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            **common,
        )
        self.pool = CustomMaxPool2d(
            kernel_size=2, stride=1, ceil_mode=True, padding="same"
        )

    def forward(self, x):
        """Run the stem: stem1, two parallel branches, concat, stem3, stem4."""
        stem_out = self.stem1(x)
        conv_branch = self.stem2b(self.stem2a(stem_out))
        pool_branch = self.pool(stem_out)

        # Channel-wise concatenation: pooled branch first, conv branch second.
        merged = torch.cat([pool_branch, conv_branch], 1)
        return self.stem4(self.stem3(merged))
323
+
324
+
325
class HGV2_Block(nn.Module):
    """
    HGV2_Block, the basic unit that constitutes the HGV2_Stage.

    A chain of ``layer_num`` convolution layers is applied; the block input
    and every intermediate output are concatenated along the channel axis and
    reduced by two 1x1 convolutions (squeeze, then excitation).

    Args:
        in_channels (int): Number of input channels.
        mid_channels (int): Number of channels produced by each inner layer.
        out_channels (int): Number of output channels.
        kernel_size (int): Size of the convolution kernel. Defaults to 3.
        layer_num (int): Number of layers in the HGV2 block. Defaults to 6.
        identity (bool): Whether to add the block input to the output.
            Defaults to False.
        light_block (bool): Use LightConvBNAct layers when True, ConvBNAct
            layers otherwise. Defaults to True.
        use_lab (bool): Whether to use the LAB operation. Defaults to False.
        lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0.
    """

    def __init__(
        self,
        in_channels,
        mid_channels,
        out_channels,
        kernel_size=3,
        layer_num=6,
        identity=False,
        light_block=True,
        use_lab=False,
        lr_mult=1.0,
    ):
        super().__init__()
        self.identity = identity

        # Select the layer class directly instead of eval()-ing its name.
        block_cls = LightConvBNAct if light_block else ConvBNAct
        self.layers = nn.ModuleList(
            [
                block_cls(
                    # Only the first layer sees the block's input width.
                    in_channels=in_channels if i == 0 else mid_channels,
                    out_channels=mid_channels,
                    stride=1,
                    kernel_size=kernel_size,
                    use_lab=use_lab,
                    lr_mult=lr_mult,
                )
                for i in range(layer_num)
            ]
        )

        # Feature aggregation over the input plus every layer output.
        total_channels = in_channels + layer_num * mid_channels
        self.aggregation_squeeze_conv = ConvBNAct(
            in_channels=total_channels,
            out_channels=out_channels // 2,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        self.aggregation_excitation_conv = ConvBNAct(
            in_channels=out_channels // 2,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )

    def forward(self, x):
        """Return the aggregated features, plus the input when ``identity`` is set."""
        identity = x
        features = [x]
        for layer in self.layers:
            x = layer(x)
            features.append(x)

        x = torch.cat(features, dim=1)
        x = self.aggregation_squeeze_conv(x)
        x = self.aggregation_excitation_conv(x)
        if self.identity:
            # Out-of-place add: the tensor returned by the last layer must
            # not be modified in place, since autograd may still need it for
            # the backward pass of the preceding activation.
            x = x + identity
        return x
403
+
404
+
405
class HGV2_Stage(nn.Module):
    """
    HGV2_Stage, the basic unit that constitutes the PPHGNetV2.

    An optional depth-wise down-sampling convolution followed by a sequence
    of ``block_num`` HGV2_Block modules.

    Args:
        in_channels (int): Number of input channels.
        mid_channels (int): Number of middle channels.
        out_channels (int): Number of output channels.
        block_num (int): Number of blocks in the HGV2 stage.
        layer_num (int): Number of layers in the HGV2 block. Defaults to 6.
        is_downsample (bool): Whether to use downsampling operation. Defaults to True.
        light_block (bool): Whether to use light block. Defaults to True.
        kernel_size (int): Size of the convolution kernel. Defaults to 3.
        use_lab (bool, optional): Whether to use the LAB operation. Defaults to False.
        stride (int or sequence): Stride of the down-sampling convolution. Defaults to 2.
        lr_mult (float, optional): Learning rate multiplier for the layer. Defaults to 1.0.
    """

    def __init__(
        self,
        in_channels,
        mid_channels,
        out_channels,
        block_num,
        layer_num=6,
        is_downsample=True,
        light_block=True,
        kernel_size=3,
        use_lab=False,
        stride=2,
        lr_mult=1.0,
    ):
        super().__init__()
        self.is_downsample = is_downsample
        if is_downsample:
            # Depth-wise 3x3 convolution (groups == channels), no activation.
            self.downsample = ConvBNAct(
                in_channels=in_channels,
                out_channels=in_channels,
                kernel_size=3,
                stride=stride,
                groups=in_channels,
                use_act=False,
                use_lab=use_lab,
                lr_mult=lr_mult,
            )

        # The first block adapts the channel count and has no residual path;
        # every following block keeps the width and adds its input.
        stage_blocks = [
            HGV2_Block(
                in_channels=out_channels if index else in_channels,
                mid_channels=mid_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                layer_num=layer_num,
                identity=index != 0,
                light_block=light_block,
                use_lab=use_lab,
                lr_mult=lr_mult,
            )
            for index in range(block_num)
        ]
        self.blocks = nn.Sequential(*stage_blocks)

    def forward(self, x):
        """Optionally down-sample ``x``, then run all blocks in order."""
        features = self.downsample(x) if self.is_downsample else x
        return self.blocks(features)
473
+
474
+
475
class DropoutInferDownscale(nn.Module):
    """
    Dropout equivalent to Paddle's mode="downscale_in_infer".

    Training mode:  out = input * mask (the mask is applied without up-scaling)
    Inference mode: out = input * (1.0 - p) (outputs are scaled down instead)

    Args:
        p (float): Dropout probability. Defaults to 0.5.
    """

    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, x):
        """Apply the mode-dependent dropout described on the class."""
        keep_ratio = 1.0 - self.p
        if not self.training:
            # Inference: scale the output down by the keep ratio.
            return x * keep_ratio
        # Training: F.dropout rescales kept values by 1 / (1 - p); multiplying
        # by the keep ratio afterwards leaves just the random mask.
        return F.dropout(x, self.p, training=True) * keep_ratio
493
+
494
class PPHGNetV2(nn.Module):
    """
    PPHGNetV2

    Args:
        stage_config (dict): Config for PPHGNetV2 stages. Each value is a
            sequence ``(in_channels, mid_channels, out_channels, block_num,
            is_downsample, light_block, kernel_size, layer_num[, stride])``.
            When ``stride`` is omitted it defaults to 2. When ``det`` is
            False the dict must contain a ``"stage4"`` entry.
        stem_channels (list): Input, middle and output channels of the stem.
        use_lab (bool): Whether to use the LAB operation. Defaults to False.
        use_last_conv (bool): Whether to create the last 1x1 conv layer. Defaults to True.
        class_expand (int): Number of channels for the last 1x1 convolutional layer.
        dropout_prob (float): Dropout probability for the last 1x1 convolutional layer. Defaults to 0.0.
        class_num (int): The number of classes for the classification layer. Defaults to 1000.
        lr_mult_list (list): Learning rate multipliers for the stem and the
            stages. Defaults to [1.0, 1.0, 1.0, 1.0, 1.0].
        det (bool): When True, ``forward`` returns the list of stage outputs
            selected by ``out_indices``. Defaults to False.
        text_rec (bool): When True, the stem uses stride 1 in its third
            convolution and ``forward`` applies a final average pooling.
            Defaults to False.
        out_indices (list, optional): Indices of the stages whose outputs are
            collected when ``det`` is True. Defaults to [0, 1, 2, 3].
        **kwargs: Accepted and ignored.
    Returns:
        model: nn.Module. Specific PPHGNetV2 model depends on args.
    """

    # Stride used for a stage whose config omits the trailing stride entry;
    # matches the default stride of HGV2_Stage.
    _DEFAULT_STAGE_STRIDE = 2

    def __init__(
        self,
        stage_config,
        stem_channels=[3, 32, 64],
        use_lab=False,
        use_last_conv=True,
        class_expand=2048,
        dropout_prob=0.0,
        class_num=1000,
        lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
        det=False,
        text_rec=False,
        out_indices=None,
        **kwargs,
    ):
        # NOTE: the list defaults above are only read, never mutated.
        super().__init__()
        self.det = det
        self.text_rec = text_rec
        self.use_lab = use_lab
        self.use_last_conv = use_last_conv
        self.class_expand = class_expand
        self.class_num = class_num
        self.out_indices = out_indices if out_indices is not None else [0, 1, 2, 3]
        self.out_channels = []

        # stem
        self.stem = StemBlock(
            in_channels=stem_channels[0],
            mid_channels=stem_channels[1],
            out_channels=stem_channels[2],
            use_lab=use_lab,
            lr_mult=lr_mult_list[0],
            text_rec=text_rec,
        )

        # stages
        self.stages = nn.ModuleList()
        for i, k in enumerate(stage_config):
            (
                in_channels,
                mid_channels,
                out_channels,
                block_num,
                is_downsample,
                light_block,
                kernel_size,
                layer_num,
                stride,
            ) = self._normalize_stage_config(k, stage_config[k])
            self.stages.append(
                HGV2_Stage(
                    in_channels,
                    mid_channels,
                    out_channels,
                    block_num,
                    layer_num,
                    is_downsample,
                    light_block,
                    kernel_size,
                    use_lab,
                    stride,
                    lr_mult=lr_mult_list[i + 1],
                )
            )
            if i in self.out_indices:
                self.out_channels.append(out_channels)
        if not self.det:
            # Outside detection mode only the width of the last stage is exposed.
            self.out_channels = stage_config["stage4"][2]

        self.avg_pool = AdaptiveAvgPool2D(1)

        if self.use_last_conv:
            self.last_conv = nn.Conv2d(
                in_channels=out_channels,
                out_channels=self.class_expand,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=False,
            )
            self.act = nn.ReLU()
            if self.use_lab:
                self.lab = LearnableAffineBlock()
            self.dropout = DropoutInferDownscale(p=dropout_prob)

        self.flatten = nn.Flatten(start_dim=1, end_dim=-1)
        if not self.det:
            self.fc = nn.Linear(
                self.class_expand if self.use_last_conv else out_channels,
                self.class_num,
            )

        self._init_weights()

    @classmethod
    def _normalize_stage_config(cls, name, config):
        """Return a 9-item stage config, appending the default stride if absent."""
        values = list(config)
        if len(values) == 8:
            # Configs without an explicit stride use the default stride.
            values.append(cls._DEFAULT_STAGE_STRIDE)
        elif len(values) != 9:
            raise ValueError(
                f"stage config {name!r} must have 8 or 9 entries, got {len(values)}"
            )
        return values

    def _init_weights(self):
        """Initialise conv weights (Kaiming normal), BN (ones/zeros) and linear biases (zeros)."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.zeros_(m.bias)

    def forward(self, x):
        """Run the stem and all stages.

        Returns:
            The list of selected stage outputs when ``det`` is True; otherwise
            the last stage output, average-pooled first when ``text_rec`` is
            True. The classification head modules created in ``__init__`` are
            not applied here.
        """
        x = self.stem(x)
        out = []
        for i, stage in enumerate(self.stages):
            x = stage(x)
            if self.det and i in self.out_indices:
                out.append(x)
        if self.det:
            return out

        if self.text_rec:
            if self.training:
                x = F.adaptive_avg_pool2d(x, [1, 40])
            else:
                x = F.avg_pool2d(x, [3, 2])
        return x
631
+
632
+
633
def PPHGNetV2_B0(pretrained=False, use_ssld=False, **kwargs):
    """
    PPHGNetV2_B0
    Args:
        pretrained (bool/str): Accepted for API compatibility; this function
            does not load any weights.
        use_ssld (bool): Accepted for API compatibility; not used here.
        **kwargs: Forwarded to `PPHGNetV2`.
    Returns:
        model: nn.Module. Specific `PPHGNetV2_B0` model depends on args.
    """
    stage_config = {
        # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num, stride
        # The explicit stride is required: PPHGNetV2 unpacks nine values per stage.
        "stage1": [16, 16, 64, 1, False, False, 3, 3, 2],
        "stage2": [64, 32, 256, 1, True, False, 3, 3, 2],
        "stage3": [256, 64, 512, 2, True, True, 5, 3, 2],
        "stage4": [512, 128, 1024, 1, True, True, 5, 3, 2],
    }

    model = PPHGNetV2(
        stem_channels=[3, 16, 16], stage_config=stage_config, use_lab=True, **kwargs
    )
    return model
655
+
656
+
657
def PPHGNetV2_B1(pretrained=False, use_ssld=False, **kwargs):
    """
    PPHGNetV2_B1
    Args:
        pretrained (bool/str): Accepted for API compatibility; this function
            does not load any weights.
        use_ssld (bool): Accepted for API compatibility; not used here.
        **kwargs: Forwarded to `PPHGNetV2`.
    Returns:
        model: nn.Module. Specific `PPHGNetV2_B1` model depends on args.
    """
    stage_config = {
        # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num, stride
        # The explicit stride is required: PPHGNetV2 unpacks nine values per stage.
        "stage1": [32, 32, 64, 1, False, False, 3, 3, 2],
        "stage2": [64, 48, 256, 1, True, False, 3, 3, 2],
        "stage3": [256, 96, 512, 2, True, True, 5, 3, 2],
        "stage4": [512, 192, 1024, 1, True, True, 5, 3, 2],
    }

    model = PPHGNetV2(
        stem_channels=[3, 24, 32], stage_config=stage_config, use_lab=True, **kwargs
    )
    return model
679
+
680
+
681
def PPHGNetV2_B2(pretrained=False, use_ssld=False, **kwargs):
    """
    PPHGNetV2_B2
    Args:
        pretrained (bool/str): Accepted for API compatibility; this function
            does not load any weights.
        use_ssld (bool): Accepted for API compatibility; not used here.
        **kwargs: Forwarded to `PPHGNetV2`.
    Returns:
        model: nn.Module. Specific `PPHGNetV2_B2` model depends on args.
    """
    stage_config = {
        # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num, stride
        # The explicit stride is required: PPHGNetV2 unpacks nine values per stage.
        "stage1": [32, 32, 96, 1, False, False, 3, 4, 2],
        "stage2": [96, 64, 384, 1, True, False, 3, 4, 2],
        "stage3": [384, 128, 768, 3, True, True, 5, 4, 2],
        "stage4": [768, 256, 1536, 1, True, True, 5, 4, 2],
    }

    model = PPHGNetV2(
        stem_channels=[3, 24, 32], stage_config=stage_config, use_lab=True, **kwargs
    )
    return model
703
+
704
+
705
def PPHGNetV2_B3(pretrained=False, use_ssld=False, **kwargs):
    """
    PPHGNetV2_B3
    Args:
        pretrained (bool/str): Accepted for API compatibility; this function
            does not load any weights.
        use_ssld (bool): Accepted for API compatibility; not used here.
        **kwargs: Forwarded to `PPHGNetV2`.
    Returns:
        model: nn.Module. Specific `PPHGNetV2_B3` model depends on args.
    """
    stage_config = {
        # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num, stride
        # The explicit stride is required: PPHGNetV2 unpacks nine values per stage.
        "stage1": [32, 32, 128, 1, False, False, 3, 5, 2],
        "stage2": [128, 64, 512, 1, True, False, 3, 5, 2],
        "stage3": [512, 128, 1024, 3, True, True, 5, 5, 2],
        "stage4": [1024, 256, 2048, 1, True, True, 5, 5, 2],
    }

    model = PPHGNetV2(
        stem_channels=[3, 24, 32], stage_config=stage_config, use_lab=True, **kwargs
    )
    return model
727
+
728
+
729
def PPHGNetV2_B4(pretrained=False, use_ssld=False, det=False, text_rec=False, **kwargs):
    """
    PPHGNetV2_B4
    Args:
        pretrained (bool/str): Accepted for API compatibility; this function
            does not load any weights.
        use_ssld (bool): Accepted for API compatibility; not used here.
        det (bool): Select the detection stage layout and forward it to
            `PPHGNetV2`; otherwise the recognition layout is used.
        text_rec (bool): Forwarded to `PPHGNetV2`.
        **kwargs: Forwarded to `PPHGNetV2`.
    Returns:
        model: nn.Module. Specific `PPHGNetV2_B4` model depends on args.
    """
    stage_names = ("stage1", "stage2", "stage3", "stage4")

    # Row layout for both tables:
    # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num, stride
    rec_rows = (
        [48, 48, 128, 1, True, False, 3, 6, [2, 1]],
        [128, 96, 512, 1, True, False, 3, 6, [1, 2]],
        [512, 192, 1024, 3, True, True, 5, 6, [2, 1]],
        [1024, 384, 2048, 1, True, True, 5, 6, [2, 1]],
    )
    det_rows = (
        [48, 48, 128, 1, False, False, 3, 6, 2],
        [128, 96, 512, 1, True, False, 3, 6, 2],
        [512, 192, 1024, 3, True, True, 5, 6, 2],
        [1024, 384, 2048, 1, True, True, 5, 6, 2],
    )

    # Detection uses scalar strides; recognition uses per-axis strides.
    selected_rows = det_rows if det else rec_rows
    return PPHGNetV2(
        stem_channels=[3, 32, 48],
        stage_config=dict(zip(stage_names, selected_rows)),
        use_lab=False,
        det=det,
        text_rec=text_rec,
        **kwargs,
    )
763
+
764
+
765
def PPHGNetV2_B5(pretrained=False, use_ssld=False, **kwargs):
    """
    PPHGNetV2_B5
    Args:
        pretrained (bool/str): Accepted for API compatibility; this function
            does not load any weights.
        use_ssld (bool): Accepted for API compatibility; not used here.
        **kwargs: Forwarded to `PPHGNetV2`.
    Returns:
        model: nn.Module. Specific `PPHGNetV2_B5` model depends on args.
    """
    stage_config = {
        # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num, stride
        # The explicit stride is required: PPHGNetV2 unpacks nine values per stage.
        "stage1": [64, 64, 128, 1, False, False, 3, 6, 2],
        "stage2": [128, 128, 512, 2, True, False, 3, 6, 2],
        "stage3": [512, 256, 1024, 5, True, True, 5, 6, 2],
        "stage4": [1024, 512, 2048, 2, True, True, 5, 6, 2],
    }

    model = PPHGNetV2(
        stem_channels=[3, 32, 64], stage_config=stage_config, use_lab=False, **kwargs
    )
    return model
787
+
788
+
789
def PPHGNetV2_B6(pretrained=False, use_ssld=False, **kwargs):
    """
    PPHGNetV2_B6
    Args:
        pretrained (bool/str): Accepted for API compatibility; this function
            does not load any weights.
        use_ssld (bool): Accepted for API compatibility; not used here.
        **kwargs: Forwarded to `PPHGNetV2`.
    Returns:
        model: nn.Module. Specific `PPHGNetV2_B6` model depends on args.
    """
    stage_config = {
        # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num, stride
        # The explicit stride is required: PPHGNetV2 unpacks nine values per stage.
        "stage1": [96, 96, 192, 2, False, False, 3, 6, 2],
        "stage2": [192, 192, 512, 3, True, False, 3, 6, 2],
        "stage3": [512, 384, 1024, 6, True, True, 5, 6, 2],
        "stage4": [1024, 768, 2048, 3, True, True, 5, 6, 2],
    }

    model = PPHGNetV2(
        stem_channels=[3, 48, 96], stage_config=stage_config, use_lab=False, **kwargs
    )
    return model