python-doctr 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
- doctr/contrib/__init__.py +1 -0
- doctr/contrib/artefacts.py +7 -9
- doctr/contrib/base.py +8 -17
- doctr/datasets/cord.py +8 -7
- doctr/datasets/datasets/__init__.py +4 -4
- doctr/datasets/datasets/base.py +16 -16
- doctr/datasets/datasets/pytorch.py +12 -12
- doctr/datasets/datasets/tensorflow.py +10 -10
- doctr/datasets/detection.py +6 -9
- doctr/datasets/doc_artefacts.py +3 -4
- doctr/datasets/funsd.py +7 -6
- doctr/datasets/generator/__init__.py +4 -4
- doctr/datasets/generator/base.py +16 -17
- doctr/datasets/generator/pytorch.py +1 -3
- doctr/datasets/generator/tensorflow.py +1 -3
- doctr/datasets/ic03.py +4 -5
- doctr/datasets/ic13.py +4 -5
- doctr/datasets/iiit5k.py +6 -5
- doctr/datasets/iiithws.py +4 -5
- doctr/datasets/imgur5k.py +6 -5
- doctr/datasets/loader.py +4 -7
- doctr/datasets/mjsynth.py +6 -5
- doctr/datasets/ocr.py +3 -4
- doctr/datasets/orientation.py +3 -4
- doctr/datasets/recognition.py +3 -4
- doctr/datasets/sroie.py +6 -5
- doctr/datasets/svhn.py +6 -5
- doctr/datasets/svt.py +4 -5
- doctr/datasets/synthtext.py +4 -5
- doctr/datasets/utils.py +34 -29
- doctr/datasets/vocabs.py +17 -7
- doctr/datasets/wildreceipt.py +14 -10
- doctr/file_utils.py +2 -7
- doctr/io/elements.py +59 -79
- doctr/io/html.py +1 -3
- doctr/io/image/__init__.py +3 -3
- doctr/io/image/base.py +2 -5
- doctr/io/image/pytorch.py +3 -12
- doctr/io/image/tensorflow.py +2 -11
- doctr/io/pdf.py +5 -7
- doctr/io/reader.py +5 -11
- doctr/models/_utils.py +14 -22
- doctr/models/builder.py +30 -48
- doctr/models/classification/magc_resnet/__init__.py +3 -3
- doctr/models/classification/magc_resnet/pytorch.py +10 -13
- doctr/models/classification/magc_resnet/tensorflow.py +8 -11
- doctr/models/classification/mobilenet/__init__.py +3 -3
- doctr/models/classification/mobilenet/pytorch.py +5 -17
- doctr/models/classification/mobilenet/tensorflow.py +8 -21
- doctr/models/classification/predictor/__init__.py +4 -4
- doctr/models/classification/predictor/pytorch.py +6 -8
- doctr/models/classification/predictor/tensorflow.py +6 -8
- doctr/models/classification/resnet/__init__.py +4 -4
- doctr/models/classification/resnet/pytorch.py +21 -31
- doctr/models/classification/resnet/tensorflow.py +20 -31
- doctr/models/classification/textnet/__init__.py +3 -3
- doctr/models/classification/textnet/pytorch.py +10 -17
- doctr/models/classification/textnet/tensorflow.py +8 -15
- doctr/models/classification/vgg/__init__.py +3 -3
- doctr/models/classification/vgg/pytorch.py +5 -7
- doctr/models/classification/vgg/tensorflow.py +9 -12
- doctr/models/classification/vit/__init__.py +3 -3
- doctr/models/classification/vit/pytorch.py +8 -14
- doctr/models/classification/vit/tensorflow.py +6 -12
- doctr/models/classification/zoo.py +19 -14
- doctr/models/core.py +3 -3
- doctr/models/detection/_utils/__init__.py +4 -4
- doctr/models/detection/_utils/base.py +4 -7
- doctr/models/detection/_utils/pytorch.py +1 -5
- doctr/models/detection/_utils/tensorflow.py +1 -5
- doctr/models/detection/core.py +2 -8
- doctr/models/detection/differentiable_binarization/__init__.py +4 -4
- doctr/models/detection/differentiable_binarization/base.py +7 -17
- doctr/models/detection/differentiable_binarization/pytorch.py +27 -30
- doctr/models/detection/differentiable_binarization/tensorflow.py +15 -25
- doctr/models/detection/fast/__init__.py +4 -4
- doctr/models/detection/fast/base.py +6 -14
- doctr/models/detection/fast/pytorch.py +24 -31
- doctr/models/detection/fast/tensorflow.py +14 -26
- doctr/models/detection/linknet/__init__.py +4 -4
- doctr/models/detection/linknet/base.py +6 -15
- doctr/models/detection/linknet/pytorch.py +24 -27
- doctr/models/detection/linknet/tensorflow.py +14 -23
- doctr/models/detection/predictor/__init__.py +5 -5
- doctr/models/detection/predictor/pytorch.py +6 -7
- doctr/models/detection/predictor/tensorflow.py +5 -6
- doctr/models/detection/zoo.py +27 -7
- doctr/models/factory/hub.py +3 -7
- doctr/models/kie_predictor/__init__.py +5 -5
- doctr/models/kie_predictor/base.py +4 -5
- doctr/models/kie_predictor/pytorch.py +18 -19
- doctr/models/kie_predictor/tensorflow.py +13 -14
- doctr/models/modules/layers/__init__.py +3 -3
- doctr/models/modules/layers/pytorch.py +6 -9
- doctr/models/modules/layers/tensorflow.py +5 -7
- doctr/models/modules/transformer/__init__.py +3 -3
- doctr/models/modules/transformer/pytorch.py +12 -13
- doctr/models/modules/transformer/tensorflow.py +9 -10
- doctr/models/modules/vision_transformer/__init__.py +3 -3
- doctr/models/modules/vision_transformer/pytorch.py +2 -3
- doctr/models/modules/vision_transformer/tensorflow.py +3 -3
- doctr/models/predictor/__init__.py +5 -5
- doctr/models/predictor/base.py +28 -29
- doctr/models/predictor/pytorch.py +12 -13
- doctr/models/predictor/tensorflow.py +8 -9
- doctr/models/preprocessor/__init__.py +4 -4
- doctr/models/preprocessor/pytorch.py +13 -17
- doctr/models/preprocessor/tensorflow.py +10 -14
- doctr/models/recognition/core.py +3 -7
- doctr/models/recognition/crnn/__init__.py +4 -4
- doctr/models/recognition/crnn/pytorch.py +20 -28
- doctr/models/recognition/crnn/tensorflow.py +11 -23
- doctr/models/recognition/master/__init__.py +3 -3
- doctr/models/recognition/master/base.py +3 -7
- doctr/models/recognition/master/pytorch.py +22 -24
- doctr/models/recognition/master/tensorflow.py +12 -22
- doctr/models/recognition/parseq/__init__.py +3 -3
- doctr/models/recognition/parseq/base.py +3 -7
- doctr/models/recognition/parseq/pytorch.py +26 -26
- doctr/models/recognition/parseq/tensorflow.py +16 -22
- doctr/models/recognition/predictor/__init__.py +5 -5
- doctr/models/recognition/predictor/_utils.py +7 -10
- doctr/models/recognition/predictor/pytorch.py +6 -6
- doctr/models/recognition/predictor/tensorflow.py +5 -6
- doctr/models/recognition/sar/__init__.py +4 -4
- doctr/models/recognition/sar/pytorch.py +20 -21
- doctr/models/recognition/sar/tensorflow.py +12 -21
- doctr/models/recognition/utils.py +5 -10
- doctr/models/recognition/vitstr/__init__.py +4 -4
- doctr/models/recognition/vitstr/base.py +3 -7
- doctr/models/recognition/vitstr/pytorch.py +18 -20
- doctr/models/recognition/vitstr/tensorflow.py +12 -20
- doctr/models/recognition/zoo.py +22 -11
- doctr/models/utils/__init__.py +4 -4
- doctr/models/utils/pytorch.py +14 -17
- doctr/models/utils/tensorflow.py +17 -16
- doctr/models/zoo.py +1 -5
- doctr/transforms/functional/__init__.py +3 -3
- doctr/transforms/functional/base.py +4 -11
- doctr/transforms/functional/pytorch.py +20 -28
- doctr/transforms/functional/tensorflow.py +10 -22
- doctr/transforms/modules/__init__.py +4 -4
- doctr/transforms/modules/base.py +48 -55
- doctr/transforms/modules/pytorch.py +58 -22
- doctr/transforms/modules/tensorflow.py +18 -32
- doctr/utils/common_types.py +8 -9
- doctr/utils/data.py +8 -12
- doctr/utils/fonts.py +2 -7
- doctr/utils/geometry.py +16 -47
- doctr/utils/metrics.py +17 -37
- doctr/utils/multithreading.py +4 -6
- doctr/utils/reconstitution.py +9 -13
- doctr/utils/repr.py +2 -3
- doctr/utils/visualization.py +16 -29
- doctr/version.py +1 -1
- {python_doctr-0.10.0.dist-info → python_doctr-0.11.0.dist-info}/METADATA +54 -52
- python_doctr-0.11.0.dist-info/RECORD +173 -0
- {python_doctr-0.10.0.dist-info → python_doctr-0.11.0.dist-info}/WHEEL +1 -1
- python_doctr-0.10.0.dist-info/RECORD +0 -173
- {python_doctr-0.10.0.dist-info → python_doctr-0.11.0.dist-info}/LICENSE +0 -0
- {python_doctr-0.10.0.dist-info → python_doctr-0.11.0.dist-info}/top_level.txt +0 -0
- {python_doctr-0.10.0.dist-info → python_doctr-0.11.0.dist-info}/zip-safe +0 -0
```diff
--- a/doctr/models/classification/resnet/tensorflow.py
+++ b/doctr/models/classification/resnet/tensorflow.py
@@ -1,10 +1,11 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
+from collections.abc import Callable
 from copy import deepcopy
-from typing import Any
+from typing import Any
 
 import tensorflow as tf
 from tensorflow.keras import layers
@@ -18,7 +19,7 @@ from ...utils import _build_model, conv_sequence, load_pretrained_params
 __all__ = ["ResNet", "resnet18", "resnet31", "resnet34", "resnet50", "resnet34_wide"]
 
 
-default_cfgs:
+default_cfgs: dict[str, dict[str, Any]] = {
     "resnet18": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
@@ -61,7 +62,6 @@ class ResnetBlock(layers.Layer):
     """Implements a resnet31 block with shortcut
 
     Args:
-    ----
         conv_shortcut: Use of shortcut
         output_channels: number of channels to use in Conv2D
         kernel_size: size of square kernels
@@ -92,7 +92,7 @@ class ResnetBlock(layers.Layer):
         output_channels: int,
         kernel_size: int,
         strides: int = 1,
-    ) ->
+    ) -> list[layers.Layer]:
         return [
             *conv_sequence(output_channels, "relu", bn=True, strides=strides, kernel_size=kernel_size),
             *conv_sequence(output_channels, None, bn=True, kernel_size=kernel_size),
@@ -108,8 +108,8 @@ class ResnetBlock(layers.Layer):
 
 def resnet_stage(
     num_blocks: int, out_channels: int, shortcut: bool = False, downsample: bool = False
-) ->
-    _layers:
+) -> list[layers.Layer]:
+    _layers: list[layers.Layer] = [ResnetBlock(out_channels, conv_shortcut=shortcut, strides=2 if downsample else 1)]
 
     for _ in range(1, num_blocks):
         _layers.append(ResnetBlock(out_channels, conv_shortcut=False))
@@ -121,7 +121,6 @@ class ResNet(Sequential):
     """Implements a ResNet architecture
 
     Args:
-    ----
         num_blocks: number of resnet block in each stage
         output_channels: number of channels in each stage
         stage_downsample: whether the first residual block of a stage should downsample
@@ -137,18 +136,18 @@ class ResNet(Sequential):
 
     def __init__(
         self,
-        num_blocks:
-        output_channels:
-        stage_downsample:
-        stage_conv:
-        stage_pooling:
+        num_blocks: list[int],
+        output_channels: list[int],
+        stage_downsample: list[bool],
+        stage_conv: list[bool],
+        stage_pooling: list[tuple[int, int] | None],
         origin_stem: bool = True,
         stem_channels: int = 64,
-        attn_module:
+        attn_module: Callable[[int], layers.Layer] | None = None,
         include_top: bool = True,
         num_classes: int = 1000,
-        cfg:
-        input_shape:
+        cfg: dict[str, Any] | None = None,
+        input_shape: tuple[int, int, int] | None = None,
     ) -> None:
         inplanes = stem_channels
         if origin_stem:
@@ -188,11 +187,11 @@ class ResNet(Sequential):
 def _resnet(
     arch: str,
     pretrained: bool,
-    num_blocks:
-    output_channels:
-    stage_downsample:
-    stage_conv:
-    stage_pooling:
+    num_blocks: list[int],
+    output_channels: list[int],
+    stage_downsample: list[bool],
+    stage_conv: list[bool],
+    stage_pooling: list[tuple[int, int] | None],
     origin_stem: bool = True,
     **kwargs: Any,
 ) -> ResNet:
@@ -234,12 +233,10 @@ def resnet18(pretrained: bool = False, **kwargs: Any) -> ResNet:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained: boolean, True if model is pretrained
         **kwargs: keyword arguments of the ResNet architecture
 
     Returns:
-    -------
         A classification model
     """
     return _resnet(
@@ -267,12 +264,10 @@ def resnet31(pretrained: bool = False, **kwargs: Any) -> ResNet:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained: boolean, True if model is pretrained
         **kwargs: keyword arguments of the ResNet architecture
 
     Returns:
-    -------
         A classification model
     """
     return _resnet(
@@ -300,12 +295,10 @@ def resnet34(pretrained: bool = False, **kwargs: Any) -> ResNet:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained: boolean, True if model is pretrained
         **kwargs: keyword arguments of the ResNet architecture
 
     Returns:
-    -------
         A classification model
     """
     return _resnet(
@@ -332,12 +325,10 @@ def resnet50(pretrained: bool = False, **kwargs: Any) -> ResNet:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained: boolean, True if model is pretrained
         **kwargs: keyword arguments of the ResNet architecture
 
     Returns:
-    -------
         A classification model
     """
     kwargs["num_classes"] = kwargs.get("num_classes", len(default_cfgs["resnet50"]["classes"]))
@@ -386,12 +377,10 @@ def resnet34_wide(pretrained: bool = False, **kwargs: Any) -> ResNet:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained: boolean, True if model is pretrained
         **kwargs: keyword arguments of the ResNet architecture
 
     Returns:
-    -------
         A classification model
     """
     return _resnet(
```
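The hunks above are representative of the whole release: annotations built on `typing.List`, `typing.Dict`, and `typing.Optional` are replaced with PEP 585 built-in generics and PEP 604 `|` unions, the numpy-style `----`/`-------` underlines are dropped from docstrings, and the copyright year is bumped to 2025. A minimal before/after sketch of the annotation change (the function is hypothetical, for illustration only; the new style requires Python ≥ 3.10):

```python
# 0.10.0-era style: generics and Optional imported from typing
from typing import Dict, List, Optional

def mean_scores_old(scores: List[float], weights: Optional[Dict[int, float]] = None) -> float:
    ws = weights or {}
    return sum(s * ws.get(i, 1.0) for i, s in enumerate(scores)) / len(scores)

# 0.11.0-era style: built-in generics (PEP 585) and | unions (PEP 604)
def mean_scores_new(scores: list[float], weights: dict[int, float] | None = None) -> float:
    ws = weights or {}
    return sum(s * ws.get(i, 1.0) for i, s in enumerate(scores)) / len(scores)
```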
```diff
--- a/doctr/models/classification/textnet/__init__.py
+++ b/doctr/models/classification/textnet/__init__.py
@@ -1,6 +1,6 @@
 from doctr.file_utils import is_tf_available, is_torch_available
 
-if
+if is_torch_available():
+    from .pytorch import *
+elif is_tf_available():
     from .tensorflow import *
-elif is_torch_available():
-    from .pytorch import * # type: ignore[assignment]
```
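Beyond the annotation cleanup, this `__init__.py` hunk (and the matching one in `vit` further down) swaps the backend priority: 0.10.0 imported the TensorFlow implementation first, while 0.11.0 tries PyTorch first and falls back to TensorFlow. A simplified sketch of the dispatch pattern (`is_torch_available`/`is_tf_available` are the real `doctr.file_utils` helpers; this standalone reimplementation of them is an assumption for illustration):

```python
import importlib.util

# Stand-ins for doctr.file_utils.is_torch_available / is_tf_available,
# approximated here with a simple module lookup
def is_torch_available() -> bool:
    return importlib.util.find_spec("torch") is not None

def is_tf_available() -> bool:
    return importlib.util.find_spec("tensorflow") is not None

# 0.11.0 order: PyTorch wins when both backends are installed
if is_torch_available():
    backend = "pytorch"
elif is_tf_available():
    backend = "tensorflow"
else:
    backend = None
print(f"selected backend: {backend}")
```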
```diff
--- a/doctr/models/classification/textnet/pytorch.py
+++ b/doctr/models/classification/textnet/pytorch.py
@@ -1,11 +1,11 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 
 from copy import deepcopy
-from typing import Any
+from typing import Any
 
 from torch import nn
 
@@ -16,7 +16,7 @@ from ...utils import conv_sequence_pt, load_pretrained_params
 
 __all__ = ["textnet_tiny", "textnet_small", "textnet_base"]
 
-default_cfgs:
+default_cfgs: dict[str, dict[str, Any]] = {
     "textnet_tiny": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
@@ -47,22 +47,21 @@ class TextNet(nn.Sequential):
     Implementation based on the official Pytorch implementation: <https://github.com/czczup/FAST>`_.
 
     Args:
-
-        stages (List[Dict[str, List[int]]]): List of dictionaries containing the parameters of each stage.
+        stages (list[dict[str, list[int]]]): list of dictionaries containing the parameters of each stage.
         include_top (bool, optional): Whether to include the classifier head. Defaults to True.
         num_classes (int, optional): Number of output classes. Defaults to 1000.
-        cfg (
+        cfg (dict[str, Any], optional): Additional configuration. Defaults to None.
     """
 
     def __init__(
         self,
-        stages:
-        input_shape:
+        stages: list[dict[str, list[int]]],
+        input_shape: tuple[int, int, int] = (3, 32, 32),
         num_classes: int = 1000,
         include_top: bool = True,
-        cfg:
+        cfg: dict[str, Any] | None = None,
     ) -> None:
-        _layers:
+        _layers: list[nn.Module] = [
             *conv_sequence_pt(
                 in_channels=3, out_channels=64, relu=True, bn=True, kernel_size=3, stride=2, padding=(1, 1)
             ),
@@ -98,7 +97,7 @@ class TextNet(nn.Sequential):
 def _textnet(
     arch: str,
     pretrained: bool,
-    ignore_keys:
+    ignore_keys: list[str] | None = None,
     **kwargs: Any,
 ) -> TextNet:
     kwargs["num_classes"] = kwargs.get("num_classes", len(default_cfgs[arch]["classes"]))
@@ -135,12 +134,10 @@ def textnet_tiny(pretrained: bool = False, **kwargs: Any) -> TextNet:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained: boolean, True if model is pretrained
         **kwargs: keyword arguments of the TextNet architecture
 
     Returns:
-    -------
         A textnet tiny model
     """
     return _textnet(
@@ -184,12 +181,10 @@ def textnet_small(pretrained: bool = False, **kwargs: Any) -> TextNet:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained: boolean, True if model is pretrained
         **kwargs: keyword arguments of the TextNet architecture
 
     Returns:
-    -------
         A TextNet small model
     """
     return _textnet(
@@ -233,12 +228,10 @@ def textnet_base(pretrained: bool = False, **kwargs: Any) -> TextNet:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained: boolean, True if model is pretrained
         **kwargs: keyword arguments of the TextNet architecture
 
     Returns:
-    -------
         A TextNet base model
     """
     return _textnet(
```
```diff
--- a/doctr/models/classification/textnet/tensorflow.py
+++ b/doctr/models/classification/textnet/tensorflow.py
@@ -1,11 +1,11 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 
 from copy import deepcopy
-from typing import Any
+from typing import Any
 
 from tensorflow.keras import Sequential, layers
 
@@ -16,7 +16,7 @@ from ...utils import _build_model, conv_sequence, load_pretrained_params
 
 __all__ = ["textnet_tiny", "textnet_small", "textnet_base"]
 
-default_cfgs:
+default_cfgs: dict[str, dict[str, Any]] = {
     "textnet_tiny": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
@@ -47,20 +47,19 @@ class TextNet(Sequential):
     Implementation based on the official Pytorch implementation: <https://github.com/czczup/FAST>`_.
 
     Args:
-
-        stages (List[Dict[str, List[int]]]): List of dictionaries containing the parameters of each stage.
+        stages (list[dict[str, list[int]]]): list of dictionaries containing the parameters of each stage.
         include_top (bool, optional): Whether to include the classifier head. Defaults to True.
         num_classes (int, optional): Number of output classes. Defaults to 1000.
-        cfg (
+        cfg (dict[str, Any], optional): Additional configuration. Defaults to None.
     """
 
     def __init__(
         self,
-        stages:
-        input_shape:
+        stages: list[dict[str, list[int]]],
+        input_shape: tuple[int, int, int] = (32, 32, 3),
         num_classes: int = 1000,
         include_top: bool = True,
-        cfg:
+        cfg: dict[str, Any] | None = None,
     ) -> None:
         _layers = [
             *conv_sequence(
@@ -136,12 +135,10 @@ def textnet_tiny(pretrained: bool = False, **kwargs: Any) -> TextNet:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained: boolean, True if model is pretrained
         **kwargs: keyword arguments of the TextNet architecture
 
     Returns:
-    -------
         A textnet tiny model
     """
     return _textnet(
@@ -184,12 +181,10 @@ def textnet_small(pretrained: bool = False, **kwargs: Any) -> TextNet:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained: boolean, True if model is pretrained
         **kwargs: keyword arguments of the TextNet architecture
 
     Returns:
-    -------
         A TextNet small model
     """
     return _textnet(
@@ -232,12 +227,10 @@ def textnet_base(pretrained: bool = False, **kwargs: Any) -> TextNet:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained: boolean, True if model is pretrained
         **kwargs: keyword arguments of the TextNet architecture
 
     Returns:
-    -------
         A TextNet base model
     """
     return _textnet(
```
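One detail worth noticing between the two `TextNet` constructors: the PyTorch default `input_shape` is channels-first `(3, 32, 32)`, while the TensorFlow default is channels-last `(32, 32, 3)`. A small illustration of the two layouts (numpy only; the array names are made up for this example):

```python
import numpy as np

# The same 32x32 RGB image in the two conventions used by the constructors above
img_hwc = np.zeros((32, 32, 3), dtype=np.float32)  # TensorFlow TextNet: channels-last
img_chw = img_hwc.transpose(2, 0, 1)               # PyTorch TextNet: channels-first
assert img_chw.shape == (3, 32, 32)
```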
```diff
--- a/doctr/models/classification/vgg/pytorch.py
+++ b/doctr/models/classification/vgg/pytorch.py
@@ -1,10 +1,10 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
-from typing import Any
+from typing import Any
 
 from torch import nn
 from torchvision.models import vgg as tv_vgg
@@ -16,7 +16,7 @@ from ...utils import load_pretrained_params
 __all__ = ["vgg16_bn_r"]
 
 
-default_cfgs:
+default_cfgs: dict[str, dict[str, Any]] = {
     "vgg16_bn_r": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
@@ -32,7 +32,7 @@ def _vgg(
     pretrained: bool,
     tv_arch: str,
     num_rect_pools: int = 3,
-    ignore_keys:
+    ignore_keys: list[str] | None = None,
     **kwargs: Any,
 ) -> tv_vgg.VGG:
     kwargs["num_classes"] = kwargs.get("num_classes", len(default_cfgs[arch]["classes"]))
@@ -45,7 +45,7 @@ def _vgg(
 
     # Build the model
     model = tv_vgg.__dict__[tv_arch](**kwargs, weights=None)
-    #
+    # list the MaxPool2d
     pool_idcs = [idx for idx, m in enumerate(model.features) if isinstance(m, nn.MaxPool2d)]
     # Replace their kernel with rectangular ones
     for idx in pool_idcs[-num_rect_pools:]:
@@ -77,12 +77,10 @@ def vgg16_bn_r(pretrained: bool = False, **kwargs: Any) -> tv_vgg.VGG:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained (bool): If True, returns a model pre-trained on ImageNet
         **kwargs: keyword arguments of the VGG architecture
 
     Returns:
-    -------
         VGG feature extractor
     """
     return _vgg(
```
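The comment completed in `_vgg` above documents the one structural tweak doctr applies to the stock torchvision VGG: it finds the `MaxPool2d` layers in `model.features` and gives the last `num_rect_pools` of them rectangular kernels, so feature maps lose height faster than width, which suits wide text crops. A standalone sketch of the same idea; the `(2, 1)` kernel and stride are assumptions for illustration, since the diff above only shows the lookup:

```python
import torch
from torch import nn
from torchvision.models import vgg as tv_vgg

model = tv_vgg.vgg16_bn(weights=None)
# Locate the MaxPool2d layers, exactly as in the diff above
pool_idcs = [idx for idx, m in enumerate(model.features) if isinstance(m, nn.MaxPool2d)]
# Replace the kernels of the last 3 pools with rectangular ones (assumed values)
for idx in pool_idcs[-3:]:
    model.features[idx] = nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1))

out = model.features(torch.rand(1, 3, 64, 256))
print(out.shape)  # height pooled 5 times, width only twice: (1, 512, 2, 64)
```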
```diff
--- a/doctr/models/classification/vgg/tensorflow.py
+++ b/doctr/models/classification/vgg/tensorflow.py
@@ -1,10 +1,10 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
-from typing import Any
+from typing import Any
 
 from tensorflow.keras import layers
 from tensorflow.keras.models import Sequential
@@ -16,7 +16,7 @@ from ...utils import _build_model, conv_sequence, load_pretrained_params
 __all__ = ["VGG", "vgg16_bn_r"]
 
 
-default_cfgs:
+default_cfgs: dict[str, dict[str, Any]] = {
     "vgg16_bn_r": {
         "mean": (0.5, 0.5, 0.5),
         "std": (1.0, 1.0, 1.0),
@@ -32,7 +32,6 @@ class VGG(Sequential):
     <https://arxiv.org/pdf/1409.1556.pdf>`_.
 
     Args:
-    ----
         num_blocks: number of convolutional block in each stage
         planes: number of output channels in each stage
         rect_pools: whether pooling square kernels should be replace with rectangular ones
@@ -43,13 +42,13 @@ class VGG(Sequential):
 
     def __init__(
         self,
-        num_blocks:
-        planes:
-        rect_pools:
+        num_blocks: list[int],
+        planes: list[int],
+        rect_pools: list[bool],
         include_top: bool = False,
         num_classes: int = 1000,
-        input_shape:
-        cfg:
+        input_shape: tuple[int, int, int] | None = None,
+        cfg: dict[str, Any] | None = None,
     ) -> None:
         _layers = []
         # Specify input_shape only for the first layer
@@ -67,7 +66,7 @@ class VGG(Sequential):
 
 
 def _vgg(
-    arch: str, pretrained: bool, num_blocks:
+    arch: str, pretrained: bool, num_blocks: list[int], planes: list[int], rect_pools: list[bool], **kwargs: Any
 ) -> VGG:
     kwargs["num_classes"] = kwargs.get("num_classes", len(default_cfgs[arch]["classes"]))
     kwargs["input_shape"] = kwargs.get("input_shape", default_cfgs[arch]["input_shape"])
@@ -106,12 +105,10 @@ def vgg16_bn_r(pretrained: bool = False, **kwargs: Any) -> VGG:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained (bool): If True, returns a model pre-trained on ImageNet
         **kwargs: keyword arguments of the VGG architecture
 
     Returns:
-    -------
         VGG feature extractor
     """
     return _vgg(
```
```diff
--- a/doctr/models/classification/vit/__init__.py
+++ b/doctr/models/classification/vit/__init__.py
@@ -1,6 +1,6 @@
 from doctr.file_utils import is_tf_available, is_torch_available
 
-if
+if is_torch_available():
+    from .pytorch import *
+elif is_tf_available():
     from .tensorflow import *
-elif is_torch_available():
-    from .pytorch import * # type: ignore[assignment]
```
```diff
--- a/doctr/models/classification/vit/pytorch.py
+++ b/doctr/models/classification/vit/pytorch.py
@@ -1,10 +1,10 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
-from typing import Any
+from typing import Any
 
 import torch
 from torch import nn
@@ -18,7 +18,7 @@ from ...utils.pytorch import load_pretrained_params
 __all__ = ["vit_s", "vit_b"]
 
 
-default_cfgs:
+default_cfgs: dict[str, dict[str, Any]] = {
     "vit_s": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
@@ -40,7 +40,6 @@ class ClassifierHead(nn.Module):
     """Classifier head for Vision Transformer
 
     Args:
-    ----
         in_channels: number of input channels
         num_classes: number of output classes
     """
@@ -65,7 +64,6 @@ class VisionTransformer(nn.Sequential):
     <https://arxiv.org/pdf/2010.11929.pdf>`_.
 
     Args:
-    ----
         d_model: dimension of the transformer layers
         num_layers: number of transformer layers
         num_heads: number of attention heads
@@ -83,14 +81,14 @@ class VisionTransformer(nn.Sequential):
         num_layers: int,
         num_heads: int,
         ffd_ratio: int,
-        patch_size:
-        input_shape:
+        patch_size: tuple[int, int] = (4, 4),
+        input_shape: tuple[int, int, int] = (3, 32, 32),
         dropout: float = 0.0,
         num_classes: int = 1000,
         include_top: bool = True,
-        cfg:
+        cfg: dict[str, Any] | None = None,
     ) -> None:
-        _layers:
+        _layers: list[nn.Module] = [
             PatchEmbedding(input_shape, d_model, patch_size),
             EncoderBlock(num_layers, num_heads, d_model, d_model * ffd_ratio, dropout, nn.GELU()),
         ]
@@ -104,7 +102,7 @@ class VisionTransformer(nn.Sequential):
 def _vit(
     arch: str,
     pretrained: bool,
-    ignore_keys:
+    ignore_keys: list[str] | None = None,
     **kwargs: Any,
 ) -> VisionTransformer:
     kwargs["num_classes"] = kwargs.get("num_classes", len(default_cfgs[arch]["classes"]))
@@ -143,12 +141,10 @@ def vit_s(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained: boolean, True if model is pretrained
         **kwargs: keyword arguments of the VisionTransformer architecture
 
     Returns:
-    -------
         A feature extractor model
     """
     return _vit(
@@ -175,12 +171,10 @@ def vit_b(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained: boolean, True if model is pretrained
         **kwargs: keyword arguments of the VisionTransformer architecture
 
     Returns:
-    -------
         A feature extractor model
     """
     return _vit(
```
```diff
--- a/doctr/models/classification/vit/tensorflow.py
+++ b/doctr/models/classification/vit/tensorflow.py
@@ -1,10 +1,10 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
-from typing import Any
+from typing import Any
 
 import tensorflow as tf
 from tensorflow.keras import Sequential, layers
@@ -19,7 +19,7 @@ from ...utils import _build_model, load_pretrained_params
 __all__ = ["vit_s", "vit_b"]
 
 
-default_cfgs:
+default_cfgs: dict[str, dict[str, Any]] = {
     "vit_s": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
@@ -41,7 +41,6 @@ class ClassifierHead(layers.Layer, NestedObject):
     """Classifier head for Vision Transformer
 
     Args:
-    ----
         num_classes: number of output classes
     """
 
@@ -61,7 +60,6 @@ class VisionTransformer(Sequential):
     <https://arxiv.org/pdf/2010.11929.pdf>`_.
 
     Args:
-    ----
         d_model: dimension of the transformer layers
         num_layers: number of transformer layers
         num_heads: number of attention heads
@@ -79,12 +77,12 @@ class VisionTransformer(Sequential):
         num_layers: int,
         num_heads: int,
         ffd_ratio: int,
-        patch_size:
-        input_shape:
+        patch_size: tuple[int, int] = (4, 4),
+        input_shape: tuple[int, int, int] = (32, 32, 3),
         dropout: float = 0.0,
         num_classes: int = 1000,
         include_top: bool = True,
-        cfg:
+        cfg: dict[str, Any] | None = None,
     ) -> None:
         _layers = [
             PatchEmbedding(input_shape, d_model, patch_size),
@@ -148,12 +146,10 @@ def vit_s(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained: boolean, True if model is pretrained
         **kwargs: keyword arguments of the VisionTransformer architecture
 
     Returns:
-    -------
         A feature extractor model
     """
     return _vit(
@@ -179,12 +175,10 @@ def vit_b(pretrained: bool = False, **kwargs: Any) -> VisionTransformer:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained: boolean, True if model is pretrained
         **kwargs: keyword arguments of the VisionTransformer architecture
 
     Returns:
-    -------
         A feature extractor model
     """
     return _vit(
```
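The surviving `>>>` lines in the docstring hunks all end with `out = model(input_tensor)`. A plausible reconstruction of the full PyTorch `vit_s` usage, assuming the `(3, 32, 32)` default `input_shape` shown above (hedged, since the docstrings are truncated in this view):

```python
import torch
from doctr.models import vit_s

model = vit_s(pretrained=False)
# Batched NCHW input matching the assumed default input_shape of (3, 32, 32)
input_tensor = torch.rand((1, 3, 32, 32), dtype=torch.float32)
out = model(input_tensor)  # classification logits
```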