python-doctr 0.12.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- doctr/__init__.py +0 -1
- doctr/contrib/artefacts.py +1 -1
- doctr/contrib/base.py +1 -1
- doctr/datasets/__init__.py +0 -5
- doctr/datasets/coco_text.py +1 -1
- doctr/datasets/cord.py +1 -1
- doctr/datasets/datasets/__init__.py +1 -6
- doctr/datasets/datasets/base.py +1 -1
- doctr/datasets/datasets/pytorch.py +3 -3
- doctr/datasets/detection.py +1 -1
- doctr/datasets/doc_artefacts.py +1 -1
- doctr/datasets/funsd.py +1 -1
- doctr/datasets/generator/__init__.py +1 -6
- doctr/datasets/generator/base.py +1 -1
- doctr/datasets/generator/pytorch.py +1 -1
- doctr/datasets/ic03.py +1 -1
- doctr/datasets/ic13.py +1 -1
- doctr/datasets/iiit5k.py +1 -1
- doctr/datasets/iiithws.py +1 -1
- doctr/datasets/imgur5k.py +1 -1
- doctr/datasets/mjsynth.py +1 -1
- doctr/datasets/ocr.py +1 -1
- doctr/datasets/orientation.py +1 -1
- doctr/datasets/recognition.py +1 -1
- doctr/datasets/sroie.py +1 -1
- doctr/datasets/svhn.py +1 -1
- doctr/datasets/svt.py +1 -1
- doctr/datasets/synthtext.py +1 -1
- doctr/datasets/utils.py +1 -1
- doctr/datasets/vocabs.py +1 -3
- doctr/datasets/wildreceipt.py +1 -1
- doctr/file_utils.py +3 -102
- doctr/io/elements.py +1 -1
- doctr/io/html.py +1 -1
- doctr/io/image/__init__.py +1 -7
- doctr/io/image/base.py +1 -1
- doctr/io/image/pytorch.py +2 -2
- doctr/io/pdf.py +1 -1
- doctr/io/reader.py +1 -1
- doctr/models/_utils.py +56 -18
- doctr/models/builder.py +1 -1
- doctr/models/classification/magc_resnet/__init__.py +1 -6
- doctr/models/classification/magc_resnet/pytorch.py +3 -3
- doctr/models/classification/mobilenet/__init__.py +1 -6
- doctr/models/classification/mobilenet/pytorch.py +1 -1
- doctr/models/classification/predictor/__init__.py +1 -6
- doctr/models/classification/predictor/pytorch.py +2 -2
- doctr/models/classification/resnet/__init__.py +1 -6
- doctr/models/classification/resnet/pytorch.py +1 -1
- doctr/models/classification/textnet/__init__.py +1 -6
- doctr/models/classification/textnet/pytorch.py +2 -2
- doctr/models/classification/vgg/__init__.py +1 -6
- doctr/models/classification/vgg/pytorch.py +1 -1
- doctr/models/classification/vip/__init__.py +1 -4
- doctr/models/classification/vip/layers/__init__.py +1 -4
- doctr/models/classification/vip/layers/pytorch.py +2 -2
- doctr/models/classification/vip/pytorch.py +1 -1
- doctr/models/classification/vit/__init__.py +1 -6
- doctr/models/classification/vit/pytorch.py +3 -3
- doctr/models/classification/zoo.py +7 -12
- doctr/models/core.py +1 -1
- doctr/models/detection/_utils/__init__.py +1 -6
- doctr/models/detection/_utils/base.py +1 -1
- doctr/models/detection/_utils/pytorch.py +1 -1
- doctr/models/detection/core.py +2 -2
- doctr/models/detection/differentiable_binarization/__init__.py +1 -6
- doctr/models/detection/differentiable_binarization/base.py +5 -13
- doctr/models/detection/differentiable_binarization/pytorch.py +4 -4
- doctr/models/detection/fast/__init__.py +1 -6
- doctr/models/detection/fast/base.py +5 -15
- doctr/models/detection/fast/pytorch.py +5 -5
- doctr/models/detection/linknet/__init__.py +1 -6
- doctr/models/detection/linknet/base.py +4 -13
- doctr/models/detection/linknet/pytorch.py +3 -3
- doctr/models/detection/predictor/__init__.py +1 -6
- doctr/models/detection/predictor/pytorch.py +2 -2
- doctr/models/detection/zoo.py +16 -33
- doctr/models/factory/hub.py +26 -34
- doctr/models/kie_predictor/__init__.py +1 -6
- doctr/models/kie_predictor/base.py +1 -1
- doctr/models/kie_predictor/pytorch.py +3 -7
- doctr/models/modules/layers/__init__.py +1 -6
- doctr/models/modules/layers/pytorch.py +4 -4
- doctr/models/modules/transformer/__init__.py +1 -6
- doctr/models/modules/transformer/pytorch.py +3 -3
- doctr/models/modules/vision_transformer/__init__.py +1 -6
- doctr/models/modules/vision_transformer/pytorch.py +1 -1
- doctr/models/predictor/__init__.py +1 -6
- doctr/models/predictor/base.py +4 -9
- doctr/models/predictor/pytorch.py +3 -6
- doctr/models/preprocessor/__init__.py +1 -6
- doctr/models/preprocessor/pytorch.py +28 -33
- doctr/models/recognition/core.py +1 -1
- doctr/models/recognition/crnn/__init__.py +1 -6
- doctr/models/recognition/crnn/pytorch.py +7 -7
- doctr/models/recognition/master/__init__.py +1 -6
- doctr/models/recognition/master/base.py +1 -1
- doctr/models/recognition/master/pytorch.py +6 -6
- doctr/models/recognition/parseq/__init__.py +1 -6
- doctr/models/recognition/parseq/base.py +1 -1
- doctr/models/recognition/parseq/pytorch.py +6 -6
- doctr/models/recognition/predictor/__init__.py +1 -6
- doctr/models/recognition/predictor/_utils.py +8 -17
- doctr/models/recognition/predictor/pytorch.py +2 -3
- doctr/models/recognition/sar/__init__.py +1 -6
- doctr/models/recognition/sar/pytorch.py +4 -4
- doctr/models/recognition/utils.py +1 -1
- doctr/models/recognition/viptr/__init__.py +1 -4
- doctr/models/recognition/viptr/pytorch.py +4 -4
- doctr/models/recognition/vitstr/__init__.py +1 -6
- doctr/models/recognition/vitstr/base.py +1 -1
- doctr/models/recognition/vitstr/pytorch.py +4 -4
- doctr/models/recognition/zoo.py +14 -14
- doctr/models/utils/__init__.py +1 -6
- doctr/models/utils/pytorch.py +3 -2
- doctr/models/zoo.py +1 -1
- doctr/transforms/functional/__init__.py +1 -6
- doctr/transforms/functional/base.py +3 -2
- doctr/transforms/functional/pytorch.py +5 -5
- doctr/transforms/modules/__init__.py +1 -7
- doctr/transforms/modules/base.py +28 -94
- doctr/transforms/modules/pytorch.py +29 -27
- doctr/utils/common_types.py +1 -1
- doctr/utils/data.py +1 -2
- doctr/utils/fonts.py +1 -1
- doctr/utils/geometry.py +7 -11
- doctr/utils/metrics.py +1 -1
- doctr/utils/multithreading.py +1 -1
- doctr/utils/reconstitution.py +1 -1
- doctr/utils/repr.py +1 -1
- doctr/utils/visualization.py +2 -2
- doctr/version.py +1 -1
- {python_doctr-0.12.0.dist-info → python_doctr-1.0.1.dist-info}/METADATA +30 -80
- python_doctr-1.0.1.dist-info/RECORD +149 -0
- {python_doctr-0.12.0.dist-info → python_doctr-1.0.1.dist-info}/WHEEL +1 -1
- doctr/datasets/datasets/tensorflow.py +0 -59
- doctr/datasets/generator/tensorflow.py +0 -58
- doctr/datasets/loader.py +0 -94
- doctr/io/image/tensorflow.py +0 -101
- doctr/models/classification/magc_resnet/tensorflow.py +0 -196
- doctr/models/classification/mobilenet/tensorflow.py +0 -442
- doctr/models/classification/predictor/tensorflow.py +0 -60
- doctr/models/classification/resnet/tensorflow.py +0 -418
- doctr/models/classification/textnet/tensorflow.py +0 -275
- doctr/models/classification/vgg/tensorflow.py +0 -125
- doctr/models/classification/vit/tensorflow.py +0 -201
- doctr/models/detection/_utils/tensorflow.py +0 -34
- doctr/models/detection/differentiable_binarization/tensorflow.py +0 -421
- doctr/models/detection/fast/tensorflow.py +0 -427
- doctr/models/detection/linknet/tensorflow.py +0 -377
- doctr/models/detection/predictor/tensorflow.py +0 -70
- doctr/models/kie_predictor/tensorflow.py +0 -187
- doctr/models/modules/layers/tensorflow.py +0 -171
- doctr/models/modules/transformer/tensorflow.py +0 -235
- doctr/models/modules/vision_transformer/tensorflow.py +0 -100
- doctr/models/predictor/tensorflow.py +0 -155
- doctr/models/preprocessor/tensorflow.py +0 -122
- doctr/models/recognition/crnn/tensorflow.py +0 -317
- doctr/models/recognition/master/tensorflow.py +0 -320
- doctr/models/recognition/parseq/tensorflow.py +0 -516
- doctr/models/recognition/predictor/tensorflow.py +0 -79
- doctr/models/recognition/sar/tensorflow.py +0 -423
- doctr/models/recognition/vitstr/tensorflow.py +0 -285
- doctr/models/utils/tensorflow.py +0 -189
- doctr/transforms/functional/tensorflow.py +0 -254
- doctr/transforms/modules/tensorflow.py +0 -562
- python_doctr-0.12.0.dist-info/RECORD +0 -180
- {python_doctr-0.12.0.dist-info → python_doctr-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {python_doctr-0.12.0.dist-info → python_doctr-1.0.1.dist-info}/top_level.txt +0 -0
- {python_doctr-0.12.0.dist-info → python_doctr-1.0.1.dist-info}/zip-safe +0 -0
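The headline change: every TensorFlow module (`*/tensorflow.py`) is deleted while the `pytorch.py` counterparts remain, so the 1.0.1 wheel ships a PyTorch-only backend (the heavily slimmed `doctr/file_utils.py`, which appears to have handled framework selection, points the same way). Assuming the documented high-level doctr API is otherwise unchanged, a minimal usage sketch against the new wheel (the image path is illustrative):

    from doctr.io import DocumentFile
    from doctr.models import ocr_predictor

    doc = DocumentFile.from_images("page.png")   # illustrative input file
    predictor = ocr_predictor(pretrained=True)   # PyTorch-backed detection + recognition
    result = predictor(doc)
    print(result.render())                       # plain-text export of the OCR result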
doctr/models/modules/layers/tensorflow.py (deleted)

@@ -1,171 +0,0 @@
-# Copyright (C) 2021-2025, Mindee.
-
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-
-from typing import Any
-
-import numpy as np
-import tensorflow as tf
-from tensorflow.keras import layers
-
-from doctr.utils.repr import NestedObject
-
-__all__ = ["FASTConvLayer"]
-
-
-class FASTConvLayer(layers.Layer, NestedObject):
-    """Convolutional layer used in the TextNet and FAST architectures"""
-
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        kernel_size: int | tuple[int, int],
-        stride: int = 1,
-        dilation: int = 1,
-        groups: int = 1,
-        bias: bool = False,
-    ) -> None:
-        super().__init__()
-
-        self.groups = groups
-        self.in_channels = in_channels
-        self.converted_ks = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
-
-        self.hor_conv, self.hor_bn = None, None
-        self.ver_conv, self.ver_bn = None, None
-
-        padding = ((self.converted_ks[0] - 1) * dilation // 2, (self.converted_ks[1] - 1) * dilation // 2)
-
-        self.activation = layers.ReLU()
-        self.conv_pad = layers.ZeroPadding2D(padding=padding)
-
-        self.conv = layers.Conv2D(
-            filters=out_channels,
-            kernel_size=self.converted_ks,
-            strides=stride,
-            dilation_rate=dilation,
-            groups=groups,
-            use_bias=bias,
-        )
-
-        self.bn = layers.BatchNormalization()
-
-        if self.converted_ks[1] != 1:
-            self.ver_pad = layers.ZeroPadding2D(
-                padding=(int(((self.converted_ks[0] - 1) * dilation) / 2), 0),
-            )
-            self.ver_conv = layers.Conv2D(
-                filters=out_channels,
-                kernel_size=(self.converted_ks[0], 1),
-                strides=stride,
-                dilation_rate=dilation,
-                groups=groups,
-                use_bias=bias,
-            )
-            self.ver_bn = layers.BatchNormalization()
-
-        if self.converted_ks[0] != 1:
-            self.hor_pad = layers.ZeroPadding2D(
-                padding=(0, int(((self.converted_ks[1] - 1) * dilation) / 2)),
-            )
-            self.hor_conv = layers.Conv2D(
-                filters=out_channels,
-                kernel_size=(1, self.converted_ks[1]),
-                strides=stride,
-                dilation_rate=dilation,
-                groups=groups,
-                use_bias=bias,
-            )
-            self.hor_bn = layers.BatchNormalization()
-
-        self.rbr_identity = layers.BatchNormalization() if out_channels == in_channels and stride == 1 else None
-
-    def call(self, x: tf.Tensor, **kwargs: Any) -> tf.Tensor:
-        if hasattr(self, "fused_conv"):
-            return self.activation(self.fused_conv(self.conv_pad(x, **kwargs), **kwargs))
-
-        main_outputs = self.bn(self.conv(self.conv_pad(x, **kwargs), **kwargs), **kwargs)
-        vertical_outputs = (
-            self.ver_bn(self.ver_conv(self.ver_pad(x, **kwargs), **kwargs), **kwargs)
-            if self.ver_conv is not None and self.ver_bn is not None
-            else 0
-        )
-        horizontal_outputs = (
-            self.hor_bn(self.hor_conv(self.hor_pad(x, **kwargs), **kwargs), **kwargs)
-            if self.hor_bn is not None and self.hor_conv is not None
-            else 0
-        )
-        id_out = self.rbr_identity(x, **kwargs) if self.rbr_identity is not None else 0
-
-        return self.activation(main_outputs + vertical_outputs + horizontal_outputs + id_out)
-
-    # The following logic is used to reparametrize the layer
-    # Adapted from: https://github.com/mindee/doctr/blob/main/doctr/models/modules/layers/pytorch.py
-    def _identity_to_conv(self, identity: layers.BatchNormalization) -> tuple[tf.Tensor, tf.Tensor] | tuple[int, int]:
-        if identity is None or not hasattr(identity, "moving_mean") or not hasattr(identity, "moving_variance"):
-            return 0, 0
-        if not hasattr(self, "id_tensor"):
-            input_dim = self.in_channels // self.groups
-            kernel_value = np.zeros((1, 1, input_dim, self.in_channels), dtype=np.float32)
-            for i in range(self.in_channels):
-                kernel_value[0, 0, i % input_dim, i] = 1
-            id_tensor = tf.constant(kernel_value, dtype=tf.float32)
-            self.id_tensor = self._pad_to_mxn_tensor(id_tensor)
-        kernel = self.id_tensor
-        std = tf.sqrt(identity.moving_variance + identity.epsilon)
-        t = tf.reshape(identity.gamma / std, (1, 1, 1, -1))
-        return kernel * t, identity.beta - identity.moving_mean * identity.gamma / std
-
-    def _fuse_bn_tensor(self, conv: layers.Conv2D, bn: layers.BatchNormalization) -> tuple[tf.Tensor, tf.Tensor]:
-        kernel = conv.kernel
-        kernel = self._pad_to_mxn_tensor(kernel)
-        std = tf.sqrt(bn.moving_variance + bn.epsilon)
-        t = tf.reshape(bn.gamma / std, (1, 1, 1, -1))
-        return kernel * t, bn.beta - bn.moving_mean * bn.gamma / std
-
-    def _get_equivalent_kernel_bias(self):
-        kernel_mxn, bias_mxn = self._fuse_bn_tensor(self.conv, self.bn)
-        if self.ver_conv is not None:
-            kernel_mx1, bias_mx1 = self._fuse_bn_tensor(self.ver_conv, self.ver_bn)
-        else:
-            kernel_mx1, bias_mx1 = 0, 0
-        if self.hor_conv is not None:
-            kernel_1xn, bias_1xn = self._fuse_bn_tensor(self.hor_conv, self.hor_bn)
-        else:
-            kernel_1xn, bias_1xn = 0, 0
-        kernel_id, bias_id = self._identity_to_conv(self.rbr_identity)
-        kernel_mxn = kernel_mxn + kernel_mx1 + kernel_1xn + kernel_id
-        bias_mxn = bias_mxn + bias_mx1 + bias_1xn + bias_id
-        return kernel_mxn, bias_mxn
-
-    def _pad_to_mxn_tensor(self, kernel: tf.Tensor) -> tf.Tensor:
-        kernel_height, kernel_width = self.converted_ks
-        height, width = kernel.shape[:2]
-        pad_left_right = tf.maximum(0, (kernel_width - width) // 2)
-        pad_top_down = tf.maximum(0, (kernel_height - height) // 2)
-        return tf.pad(kernel, [[pad_top_down, pad_top_down], [pad_left_right, pad_left_right], [0, 0], [0, 0]])
-
-    def reparameterize_layer(self):
-        kernel, bias = self._get_equivalent_kernel_bias()
-        self.fused_conv = layers.Conv2D(
-            filters=self.conv.filters,
-            kernel_size=self.conv.kernel_size,
-            strides=self.conv.strides,
-            padding=self.conv.padding,
-            dilation_rate=self.conv.dilation_rate,
-            groups=self.conv.groups,
-            use_bias=True,
-        )
-        # build layer to initialize weights and biases
-        self.fused_conv.build(input_shape=(None, None, None, kernel.shape[-2]))
-        self.fused_conv.set_weights([kernel.numpy(), bias.numpy()])
-        for para in self.trainable_variables:
-            para._trainable = False
-        for attr in ["conv", "bn", "ver_conv", "ver_bn", "hor_conv", "hor_bn"]:
-            if hasattr(self, attr):
-                delattr(self, attr)
-
-        if hasattr(self, "rbr_identity"):
-            delattr(self, "rbr_identity")
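The `_fuse_bn_tensor`/`reparameterize_layer` logic removed above folds each BatchNorm into the preceding convolution at inference time, so the multi-branch block collapses into a single conv. A minimal NumPy sketch of that folding identity (names and shapes are illustrative, not doctr API; HWIO kernel layout as in the TF code):

    import numpy as np

    def fuse_conv_bn(kernel, gamma, beta, moving_mean, moving_var, eps=1e-3):
        # y = gamma * (conv(x) - mean) / std + beta  becomes
        # y = conv'(x) + bias'  with a per-output-channel rescaled kernel
        std = np.sqrt(moving_var + eps)
        fused_kernel = kernel * (gamma / std).reshape(1, 1, 1, -1)
        fused_bias = beta - moving_mean * gamma / std
        return fused_kernel, fused_bias

    # Check the identity on a single pixel with a 1x1 conv
    rng = np.random.default_rng(0)
    x = rng.standard_normal(4)                    # 4 input channels
    k = rng.standard_normal((1, 1, 4, 8))         # 8 output channels (HWIO)
    gamma, beta = rng.standard_normal(8), rng.standard_normal(8)
    mean, var = rng.standard_normal(8), rng.random(8) + 0.5

    y_bn = gamma * (x @ k[0, 0] - mean) / np.sqrt(var + 1e-3) + beta
    fk, fb = fuse_conv_bn(k, gamma, beta, mean, var)
    assert np.allclose(y_bn, x @ fk[0, 0] + fb)   # fused conv matches conv + BN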
doctr/models/modules/transformer/tensorflow.py (deleted)

@@ -1,235 +0,0 @@
-# Copyright (C) 2021-2025, Mindee.
-
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-
-import math
-from collections.abc import Callable
-from typing import Any
-
-import tensorflow as tf
-from tensorflow.keras import layers
-
-from doctr.utils.repr import NestedObject
-
-__all__ = ["Decoder", "PositionalEncoding", "EncoderBlock", "PositionwiseFeedForward", "MultiHeadAttention"]
-
-
-class PositionalEncoding(layers.Layer, NestedObject):
-    """Compute positional encoding"""
-
-    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000) -> None:
-        super(PositionalEncoding, self).__init__()
-        self.dropout = layers.Dropout(rate=dropout)
-
-        # Compute the positional encodings once in log space.
-        pe = tf.Variable(tf.zeros((max_len, d_model)))
-        position = tf.cast(
-            tf.expand_dims(tf.experimental.numpy.arange(start=0, stop=max_len), axis=1), dtype=tf.float32
-        )
-        div_term = tf.math.exp(
-            tf.cast(tf.experimental.numpy.arange(start=0, stop=d_model, step=2), dtype=tf.float32)
-            * -(math.log(10000.0) / d_model)
-        )
-        pe = pe.numpy()
-        pe[:, 0::2] = tf.math.sin(position * div_term)
-        pe[:, 1::2] = tf.math.cos(position * div_term)
-        self.pe = tf.expand_dims(tf.convert_to_tensor(pe), axis=0)
-
-    def call(
-        self,
-        x: tf.Tensor,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-        """Forward pass
-
-        Args:
-            x: embeddings (batch, max_len, d_model)
-            **kwargs: additional arguments
-
-        Returns:
-            positional embeddings (batch, max_len, d_model)
-        """
-        if x.dtype == tf.float16:  # amp fix: cast to half
-            x = x + tf.cast(self.pe[:, : x.shape[1]], dtype=tf.half)
-        else:
-            x = x + self.pe[:, : x.shape[1]]
-        return self.dropout(x, **kwargs)
-
-
-@tf.function
-def scaled_dot_product_attention(
-    query: tf.Tensor, key: tf.Tensor, value: tf.Tensor, mask: tf.Tensor | None = None
-) -> tuple[tf.Tensor, tf.Tensor]:
-    """Scaled Dot-Product Attention"""
-    scores = tf.matmul(query, tf.transpose(key, perm=[0, 1, 3, 2])) / math.sqrt(query.shape[-1])
-    if mask is not None:
-        # NOTE: to ensure the ONNX compatibility, tf.where works only with bool type condition
-        scores = tf.where(mask == False, float("-inf"), scores)  # noqa: E712
-    p_attn = tf.nn.softmax(scores, axis=-1)
-    return tf.matmul(p_attn, value), p_attn
-
-
-class PositionwiseFeedForward(layers.Layer, NestedObject):
-    """Position-wise Feed-Forward Network"""
-
-    def __init__(
-        self, d_model: int, ffd: int, dropout=0.1, activation_fct: Callable[[Any], Any] = layers.ReLU()
-    ) -> None:
-        super(PositionwiseFeedForward, self).__init__()
-        self.activation_fct = activation_fct
-
-        self.first_linear = layers.Dense(ffd, kernel_initializer=tf.initializers.he_uniform())
-        self.sec_linear = layers.Dense(d_model, kernel_initializer=tf.initializers.he_uniform())
-        self.dropout = layers.Dropout(rate=dropout)
-
-    def call(self, x: tf.Tensor, **kwargs: Any) -> tf.Tensor:
-        x = self.first_linear(x, **kwargs)
-        x = self.activation_fct(x)
-        x = self.dropout(x, **kwargs)
-        x = self.sec_linear(x, **kwargs)
-        x = self.dropout(x, **kwargs)
-        return x
-
-
-class MultiHeadAttention(layers.Layer, NestedObject):
-    """Multi-Head Attention"""
-
-    def __init__(self, num_heads: int, d_model: int, dropout: float = 0.1) -> None:
-        super().__init__()
-        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
-
-        self.d_k = d_model // num_heads
-        self.num_heads = num_heads
-
-        self.linear_layers = [layers.Dense(d_model, kernel_initializer=tf.initializers.he_uniform()) for _ in range(3)]
-        self.output_linear = layers.Dense(d_model, kernel_initializer=tf.initializers.he_uniform())
-
-    def call(
-        self,
-        query: tf.Tensor,
-        key: tf.Tensor,
-        value: tf.Tensor,
-        mask: tf.Tensor = None,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-        batch_size = query.shape[0]
-
-        # linear projections of Q, K, V
-        query, key, value = [
-            tf.transpose(
-                tf.reshape(linear(x, **kwargs), shape=[batch_size, -1, self.num_heads, self.d_k]), perm=[0, 2, 1, 3]
-            )
-            for linear, x in zip(self.linear_layers, (query, key, value))
-        ]
-
-        # apply attention on all the projected vectors in batch
-        x, attn = scaled_dot_product_attention(query, key, value, mask=mask)
-
-        # Concat attention heads
-        x = tf.transpose(x, perm=[0, 2, 1, 3])
-        x = tf.reshape(x, shape=[batch_size, -1, self.num_heads * self.d_k])
-
-        return self.output_linear(x, **kwargs)
-
-
-class EncoderBlock(layers.Layer, NestedObject):
-    """Transformer Encoder Block"""
-
-    def __init__(
-        self,
-        num_layers: int,
-        num_heads: int,
-        d_model: int,
-        dff: int,  # hidden dimension of the feedforward network
-        dropout: float,
-        activation_fct: Callable[[Any], Any] = layers.ReLU(),
-    ) -> None:
-        super().__init__()
-
-        self.num_layers = num_layers
-
-        self.layer_norm_input = layers.LayerNormalization(epsilon=1e-5)
-        self.layer_norm_attention = layers.LayerNormalization(epsilon=1e-5)
-        self.layer_norm_output = layers.LayerNormalization(epsilon=1e-5)
-        self.dropout = layers.Dropout(rate=dropout)
-
-        self.attention = [MultiHeadAttention(num_heads, d_model, dropout) for _ in range(self.num_layers)]
-        self.position_feed_forward = [
-            PositionwiseFeedForward(d_model, dff, dropout, activation_fct) for _ in range(self.num_layers)
-        ]
-
-    def call(self, x: tf.Tensor, mask: tf.Tensor | None = None, **kwargs: Any) -> tf.Tensor:
-        output = x
-
-        for i in range(self.num_layers):
-            normed_output = self.layer_norm_input(output, **kwargs)
-            output = output + self.dropout(
-                self.attention[i](normed_output, normed_output, normed_output, mask, **kwargs),
-                **kwargs,
-            )
-            normed_output = self.layer_norm_attention(output, **kwargs)
-            output = output + self.dropout(self.position_feed_forward[i](normed_output, **kwargs), **kwargs)
-
-        # (batch_size, seq_len, d_model)
-        return self.layer_norm_output(output, **kwargs)
-
-
-class Decoder(layers.Layer, NestedObject):
-    """Transformer Decoder"""
-
-    def __init__(
-        self,
-        num_layers: int,
-        num_heads: int,
-        d_model: int,
-        vocab_size: int,
-        dropout: float = 0.2,
-        dff: int = 2048,  # hidden dimension of the feedforward network
-        maximum_position_encoding: int = 50,
-    ) -> None:
-        super(Decoder, self).__init__()
-        self.num_layers = num_layers
-        self.d_model = d_model
-
-        self.layer_norm_input = layers.LayerNormalization(epsilon=1e-5)
-        self.layer_norm_masked_attention = layers.LayerNormalization(epsilon=1e-5)
-        self.layer_norm_attention = layers.LayerNormalization(epsilon=1e-5)
-        self.layer_norm_output = layers.LayerNormalization(epsilon=1e-5)
-
-        self.dropout = layers.Dropout(rate=dropout)
-        self.embed = layers.Embedding(vocab_size, d_model)
-        self.positional_encoding = PositionalEncoding(d_model, dropout, maximum_position_encoding)
-
-        self.attention = [MultiHeadAttention(num_heads, d_model, dropout) for _ in range(self.num_layers)]
-        self.source_attention = [MultiHeadAttention(num_heads, d_model, dropout) for _ in range(self.num_layers)]
-        self.position_feed_forward = [PositionwiseFeedForward(d_model, dff, dropout) for _ in range(self.num_layers)]
-
-    def call(
-        self,
-        tgt: tf.Tensor,
-        memory: tf.Tensor,
-        source_mask: tf.Tensor | None = None,
-        target_mask: tf.Tensor | None = None,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-        tgt = self.embed(tgt, **kwargs) * math.sqrt(self.d_model)
-        pos_enc_tgt = self.positional_encoding(tgt, **kwargs)
-        output = pos_enc_tgt
-
-        for i in range(self.num_layers):
-            normed_output = self.layer_norm_input(output, **kwargs)
-            output = output + self.dropout(
-                self.attention[i](normed_output, normed_output, normed_output, target_mask, **kwargs),
-                **kwargs,
-            )
-            normed_output = self.layer_norm_masked_attention(output, **kwargs)
-            output = output + self.dropout(
-                self.source_attention[i](normed_output, memory, memory, source_mask, **kwargs),
-                **kwargs,
-            )
-            normed_output = self.layer_norm_attention(output, **kwargs)
-            output = output + self.dropout(self.position_feed_forward[i](normed_output, **kwargs), **kwargs)
-
-        # (batch_size, seq_len, d_model)
-        return self.layer_norm_output(output, **kwargs)
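The attention math in the deleted `scaled_dot_product_attention` is framework-agnostic. A NumPy sketch of the same computation (illustrative, not doctr API; assumes at least one unmasked position per row, as the TF version does):

    import numpy as np

    def sdpa(query, key, value, mask=None):
        # query/key/value: (..., seq_len, d_k); mask: boolean, True = attend
        scores = query @ np.swapaxes(key, -2, -1) / np.sqrt(query.shape[-1])
        if mask is not None:
            scores = np.where(mask, scores, -np.inf)   # masked positions get zero weight
        weights = np.exp(scores - scores.max(axis=-1, keepdims=True))  # stable softmax
        weights /= weights.sum(axis=-1, keepdims=True)
        return weights @ value, weights

    q = np.random.default_rng(0).standard_normal((2, 4, 5, 8))  # (batch, heads, seq, d_k)
    out, attn = sdpa(q, q, q)
    assert out.shape == (2, 4, 5, 8) and np.allclose(attn.sum(-1), 1.0)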
doctr/models/modules/vision_transformer/tensorflow.py (deleted)

@@ -1,100 +0,0 @@
-# Copyright (C) 2021-2025, Mindee.
-
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-
-import math
-from typing import Any
-
-import tensorflow as tf
-from tensorflow.keras import layers
-
-from doctr.utils.repr import NestedObject
-
-__all__ = ["PatchEmbedding"]
-
-
-class PatchEmbedding(layers.Layer, NestedObject):
-    """Compute 2D patch embeddings with cls token and positional encoding"""
-
-    def __init__(self, input_shape: tuple[int, int, int], embed_dim: int, patch_size: tuple[int, int]) -> None:
-        super().__init__()
-        height, width, _ = input_shape
-        self.patch_size = patch_size
-        self.interpolate = True if patch_size[0] == patch_size[1] else False
-        self.grid_size = tuple(s // p for s, p in zip((height, width), self.patch_size))
-        self.num_patches = self.grid_size[0] * self.grid_size[1]
-
-        self.cls_token = self.add_weight(shape=(1, 1, embed_dim), initializer="zeros", trainable=True, name="cls_token")
-        self.positions = self.add_weight(
-            shape=(1, self.num_patches + 1, embed_dim),
-            initializer="zeros",
-            trainable=True,
-            name="positions",
-        )
-        self.projection = layers.Conv2D(
-            filters=embed_dim,
-            kernel_size=self.patch_size,
-            strides=self.patch_size,
-            padding="valid",
-            data_format="channels_last",
-            use_bias=True,
-            kernel_initializer="glorot_uniform",
-            bias_initializer="zeros",
-            name="projection",
-        )
-
-    def interpolate_pos_encoding(self, embeddings: tf.Tensor, height: int, width: int) -> tf.Tensor:
-        """100 % borrowed from:
-        https://github.com/huggingface/transformers/blob/main/src/transformers/models/vit/modeling_tf_vit.py
-
-        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
-        resolution images.
-
-        Source:
-        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py
-        """
-        seq_len, dim = embeddings.shape[1:]
-        num_patches = seq_len - 1
-
-        num_positions = self.positions.shape[1] - 1
-
-        if num_patches == num_positions and height == width:
-            return self.positions
-        class_pos_embed = self.positions[:, :1]
-        patch_pos_embed = self.positions[:, 1:]
-        h0 = height // self.patch_size[0]
-        w0 = width // self.patch_size[1]
-        patch_pos_embed = tf.image.resize(
-            images=tf.reshape(
-                patch_pos_embed, shape=(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim)
-            ),
-            size=(h0, w0),
-            method="bilinear",
-        )
-
-        shape = patch_pos_embed.shape
-        assert h0 == shape[-3], "height of interpolated patch embedding doesn't match"
-        assert w0 == shape[-2], "width of interpolated patch embedding doesn't match"
-
-        patch_pos_embed = tf.reshape(tensor=patch_pos_embed, shape=(1, -1, dim))
-        return tf.concat(values=(class_pos_embed, patch_pos_embed), axis=1)
-
-    def call(self, x: tf.Tensor, **kwargs: Any) -> tf.Tensor:
-        B, H, W, C = x.shape
-        assert H % self.patch_size[0] == 0, "Image height must be divisible by patch height"
-        assert W % self.patch_size[1] == 0, "Image width must be divisible by patch width"
-        # patchify image
-        patches = self.projection(x, **kwargs)  # (batch_size, num_patches, d_model)
-        patches = tf.reshape(patches, (B, self.num_patches, -1))  # (batch_size, num_patches, d_model)
-
-        cls_tokens = tf.repeat(self.cls_token, B, axis=0)  # (batch_size, 1, d_model)
-        # concate cls_tokens to patches
-        embeddings = tf.concat([cls_tokens, patches], axis=1)  # (batch_size, num_patches + 1, d_model)
-        # add positions to embeddings
-        if self.interpolate:
-            embeddings += self.interpolate_pos_encoding(embeddings, H, W)
-        else:
-            embeddings += self.positions
-
-        return embeddings  # (batch_size, num_patches + 1, d_model)
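The deleted `PatchEmbedding` turns an image into a token sequence via a strided convolution; its sequence-length arithmetic mirrors the `grid_size`/`num_patches` computation above. With illustrative numbers (not taken from any doctr config):

    height, width = 32, 128                        # input size (H, W), channels_last
    patch_h, patch_w = 4, 8                        # patch size
    grid = (height // patch_h, width // patch_w)   # (8, 16) patches per axis
    num_patches = grid[0] * grid[1]                # 128 patch tokens
    seq_len = num_patches + 1                      # 129 with the cls token prepended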
doctr/models/predictor/tensorflow.py (deleted)

@@ -1,155 +0,0 @@
-# Copyright (C) 2021-2025, Mindee.
-
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-
-from typing import Any
-
-import numpy as np
-import tensorflow as tf
-
-from doctr.io.elements import Document
-from doctr.models._utils import get_language
-from doctr.models.detection.predictor import DetectionPredictor
-from doctr.models.recognition.predictor import RecognitionPredictor
-from doctr.utils.geometry import detach_scores
-from doctr.utils.repr import NestedObject
-
-from .base import _OCRPredictor
-
-__all__ = ["OCRPredictor"]
-
-
-class OCRPredictor(NestedObject, _OCRPredictor):
-    """Implements an object able to localize and identify text elements in a set of documents
-
-    Args:
-        det_predictor: detection module
-        reco_predictor: recognition module
-        assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages
-            without rotated textual elements.
-        straighten_pages: if True, estimates the page general orientation based on the median line orientation.
-            Then, rotates page before passing it to the deep learning modules. The final predictions will be remapped
-            accordingly. Doing so will improve performances for documents with page-uniform rotations.
-        detect_orientation: if True, the estimated general page orientation will be added to the predictions for each
-            page. Doing so will slightly deteriorate the overall latency.
-        detect_language: if True, the language prediction will be added to the predictions for each
-            page. Doing so will slightly deteriorate the overall latency.
-        **kwargs: keyword args of `DocumentBuilder`
-    """
-
-    _children_names = ["det_predictor", "reco_predictor", "doc_builder"]
-
-    def __init__(
-        self,
-        det_predictor: DetectionPredictor,
-        reco_predictor: RecognitionPredictor,
-        assume_straight_pages: bool = True,
-        straighten_pages: bool = False,
-        preserve_aspect_ratio: bool = True,
-        symmetric_pad: bool = True,
-        detect_orientation: bool = False,
-        detect_language: bool = False,
-        **kwargs: Any,
-    ) -> None:
-        self.det_predictor = det_predictor
-        self.reco_predictor = reco_predictor
-        _OCRPredictor.__init__(
-            self,
-            assume_straight_pages,
-            straighten_pages,
-            preserve_aspect_ratio,
-            symmetric_pad,
-            detect_orientation,
-            **kwargs,
-        )
-        self.detect_orientation = detect_orientation
-        self.detect_language = detect_language
-
-    def __call__(
-        self,
-        pages: list[np.ndarray | tf.Tensor],
-        **kwargs: Any,
-    ) -> Document:
-        # Dimension check
-        if any(page.ndim != 3 for page in pages):
-            raise ValueError("incorrect input shape: all pages are expected to be multi-channel 2D images.")
-
-        origin_page_shapes = [page.shape[:2] for page in pages]
-
-        # Localize text elements
-        loc_preds_dict, out_maps = self.det_predictor(pages, return_maps=True, **kwargs)
-
-        # Detect document rotation and rotate pages
-        seg_maps = [
-            np.where(out_map > getattr(self.det_predictor.model.postprocessor, "bin_thresh"), 255, 0).astype(np.uint8)
-            for out_map in out_maps
-        ]
-        if self.detect_orientation:
-            general_pages_orientations, origin_pages_orientations = self._get_orientations(pages, seg_maps)
-            orientations = [
-                {"value": orientation_page, "confidence": None} for orientation_page in origin_pages_orientations
-            ]
-        else:
-            orientations = None
-            general_pages_orientations = None
-            origin_pages_orientations = None
-        if self.straighten_pages:
-            pages = self._straighten_pages(pages, seg_maps, general_pages_orientations, origin_pages_orientations)
-            # update page shapes after straightening
-            origin_page_shapes = [page.shape[:2] for page in pages]
-
-            # forward again to get predictions on straight pages
-            loc_preds_dict = self.det_predictor(pages, **kwargs)
-
-        assert all(len(loc_pred) == 1 for loc_pred in loc_preds_dict), (
-            "Detection Model in ocr_predictor should output only one class"
-        )
-        loc_preds: list[np.ndarray] = [list(loc_pred.values())[0] for loc_pred in loc_preds_dict]
-        # Detach objectness scores from loc_preds
-        loc_preds, objectness_scores = detach_scores(loc_preds)
-
-        # Apply hooks to loc_preds if any
-        for hook in self.hooks:
-            loc_preds = hook(loc_preds)
-
-        # Crop images
-        crops, loc_preds = self._prepare_crops(
-            pages,
-            loc_preds,
-            channels_last=True,
-            assume_straight_pages=self.assume_straight_pages,
-            assume_horizontal=self._page_orientation_disabled,
-        )
-        # Rectify crop orientation and get crop orientation predictions
-        crop_orientations: Any = []
-        if not self.assume_straight_pages:
-            crops, loc_preds, _crop_orientations = self._rectify_crops(crops, loc_preds)
-            crop_orientations = [
-                {"value": orientation[0], "confidence": orientation[1]} for orientation in _crop_orientations
-            ]
-
-        # Identify character sequences
-        word_preds = self.reco_predictor([crop for page_crops in crops for crop in page_crops], **kwargs)
-        if not crop_orientations:
-            crop_orientations = [{"value": 0, "confidence": None} for _ in word_preds]
-
-        boxes, text_preds, crop_orientations = self._process_predictions(loc_preds, word_preds, crop_orientations)
-
-        if self.detect_language:
-            languages = [get_language(" ".join([item[0] for item in text_pred])) for text_pred in text_preds]
-            languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages]
-        else:
-            languages_dict = None
-
-        out = self.doc_builder(
-            pages,
-            boxes,
-            objectness_scores,
-            text_preds,
-            origin_page_shapes,
-            crop_orientations,
-            orientations,
-            languages_dict,
-        )
-        return out
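Before estimating page orientation, the pipeline above binarizes the detector's probability maps into uint8 segmentation masks. A minimal sketch of that step (the 0.3 threshold and random map are illustrative; doctr reads `bin_thresh` from the model's postprocessor):

    import numpy as np

    out_map = np.random.default_rng(0).random((256, 256))  # stand-in probability map
    bin_thresh = 0.3
    seg_map = np.where(out_map > bin_thresh, 255, 0).astype(np.uint8)  # uint8 mask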