python-doctr 0.12.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. doctr/__init__.py +0 -1
  2. doctr/datasets/__init__.py +0 -5
  3. doctr/datasets/datasets/__init__.py +1 -6
  4. doctr/datasets/datasets/pytorch.py +2 -2
  5. doctr/datasets/generator/__init__.py +1 -6
  6. doctr/datasets/vocabs.py +0 -2
  7. doctr/file_utils.py +2 -101
  8. doctr/io/image/__init__.py +1 -7
  9. doctr/io/image/pytorch.py +1 -1
  10. doctr/models/_utils.py +3 -3
  11. doctr/models/classification/magc_resnet/__init__.py +1 -6
  12. doctr/models/classification/magc_resnet/pytorch.py +2 -2
  13. doctr/models/classification/mobilenet/__init__.py +1 -6
  14. doctr/models/classification/predictor/__init__.py +1 -6
  15. doctr/models/classification/predictor/pytorch.py +1 -1
  16. doctr/models/classification/resnet/__init__.py +1 -6
  17. doctr/models/classification/textnet/__init__.py +1 -6
  18. doctr/models/classification/textnet/pytorch.py +1 -1
  19. doctr/models/classification/vgg/__init__.py +1 -6
  20. doctr/models/classification/vip/__init__.py +1 -4
  21. doctr/models/classification/vip/layers/__init__.py +1 -4
  22. doctr/models/classification/vip/layers/pytorch.py +1 -1
  23. doctr/models/classification/vit/__init__.py +1 -6
  24. doctr/models/classification/vit/pytorch.py +2 -2
  25. doctr/models/classification/zoo.py +6 -11
  26. doctr/models/detection/_utils/__init__.py +1 -6
  27. doctr/models/detection/core.py +1 -1
  28. doctr/models/detection/differentiable_binarization/__init__.py +1 -6
  29. doctr/models/detection/differentiable_binarization/base.py +4 -12
  30. doctr/models/detection/differentiable_binarization/pytorch.py +3 -3
  31. doctr/models/detection/fast/__init__.py +1 -6
  32. doctr/models/detection/fast/base.py +4 -14
  33. doctr/models/detection/fast/pytorch.py +4 -4
  34. doctr/models/detection/linknet/__init__.py +1 -6
  35. doctr/models/detection/linknet/base.py +3 -12
  36. doctr/models/detection/linknet/pytorch.py +2 -2
  37. doctr/models/detection/predictor/__init__.py +1 -6
  38. doctr/models/detection/predictor/pytorch.py +1 -1
  39. doctr/models/detection/zoo.py +15 -32
  40. doctr/models/factory/hub.py +8 -21
  41. doctr/models/kie_predictor/__init__.py +1 -6
  42. doctr/models/kie_predictor/pytorch.py +2 -6
  43. doctr/models/modules/layers/__init__.py +1 -6
  44. doctr/models/modules/layers/pytorch.py +3 -3
  45. doctr/models/modules/transformer/__init__.py +1 -6
  46. doctr/models/modules/transformer/pytorch.py +2 -2
  47. doctr/models/modules/vision_transformer/__init__.py +1 -6
  48. doctr/models/predictor/__init__.py +1 -6
  49. doctr/models/predictor/base.py +3 -8
  50. doctr/models/predictor/pytorch.py +2 -5
  51. doctr/models/preprocessor/__init__.py +1 -6
  52. doctr/models/preprocessor/pytorch.py +27 -32
  53. doctr/models/recognition/crnn/__init__.py +1 -6
  54. doctr/models/recognition/crnn/pytorch.py +6 -6
  55. doctr/models/recognition/master/__init__.py +1 -6
  56. doctr/models/recognition/master/pytorch.py +5 -5
  57. doctr/models/recognition/parseq/__init__.py +1 -6
  58. doctr/models/recognition/parseq/pytorch.py +5 -5
  59. doctr/models/recognition/predictor/__init__.py +1 -6
  60. doctr/models/recognition/predictor/_utils.py +7 -16
  61. doctr/models/recognition/predictor/pytorch.py +1 -2
  62. doctr/models/recognition/sar/__init__.py +1 -6
  63. doctr/models/recognition/sar/pytorch.py +3 -3
  64. doctr/models/recognition/viptr/__init__.py +1 -4
  65. doctr/models/recognition/viptr/pytorch.py +3 -3
  66. doctr/models/recognition/vitstr/__init__.py +1 -6
  67. doctr/models/recognition/vitstr/pytorch.py +3 -3
  68. doctr/models/recognition/zoo.py +13 -13
  69. doctr/models/utils/__init__.py +1 -6
  70. doctr/models/utils/pytorch.py +1 -1
  71. doctr/transforms/functional/__init__.py +1 -6
  72. doctr/transforms/functional/pytorch.py +4 -4
  73. doctr/transforms/modules/__init__.py +1 -7
  74. doctr/transforms/modules/base.py +26 -92
  75. doctr/transforms/modules/pytorch.py +28 -26
  76. doctr/utils/geometry.py +6 -10
  77. doctr/utils/visualization.py +1 -1
  78. doctr/version.py +1 -1
  79. {python_doctr-0.12.0.dist-info → python_doctr-1.0.0.dist-info}/METADATA +18 -75
  80. python_doctr-1.0.0.dist-info/RECORD +149 -0
  81. doctr/datasets/datasets/tensorflow.py +0 -59
  82. doctr/datasets/generator/tensorflow.py +0 -58
  83. doctr/datasets/loader.py +0 -94
  84. doctr/io/image/tensorflow.py +0 -101
  85. doctr/models/classification/magc_resnet/tensorflow.py +0 -196
  86. doctr/models/classification/mobilenet/tensorflow.py +0 -442
  87. doctr/models/classification/predictor/tensorflow.py +0 -60
  88. doctr/models/classification/resnet/tensorflow.py +0 -418
  89. doctr/models/classification/textnet/tensorflow.py +0 -275
  90. doctr/models/classification/vgg/tensorflow.py +0 -125
  91. doctr/models/classification/vit/tensorflow.py +0 -201
  92. doctr/models/detection/_utils/tensorflow.py +0 -34
  93. doctr/models/detection/differentiable_binarization/tensorflow.py +0 -421
  94. doctr/models/detection/fast/tensorflow.py +0 -427
  95. doctr/models/detection/linknet/tensorflow.py +0 -377
  96. doctr/models/detection/predictor/tensorflow.py +0 -70
  97. doctr/models/kie_predictor/tensorflow.py +0 -187
  98. doctr/models/modules/layers/tensorflow.py +0 -171
  99. doctr/models/modules/transformer/tensorflow.py +0 -235
  100. doctr/models/modules/vision_transformer/tensorflow.py +0 -100
  101. doctr/models/predictor/tensorflow.py +0 -155
  102. doctr/models/preprocessor/tensorflow.py +0 -122
  103. doctr/models/recognition/crnn/tensorflow.py +0 -317
  104. doctr/models/recognition/master/tensorflow.py +0 -320
  105. doctr/models/recognition/parseq/tensorflow.py +0 -516
  106. doctr/models/recognition/predictor/tensorflow.py +0 -79
  107. doctr/models/recognition/sar/tensorflow.py +0 -423
  108. doctr/models/recognition/vitstr/tensorflow.py +0 -285
  109. doctr/models/utils/tensorflow.py +0 -189
  110. doctr/transforms/functional/tensorflow.py +0 -254
  111. doctr/transforms/modules/tensorflow.py +0 -562
  112. python_doctr-0.12.0.dist-info/RECORD +0 -180
  113. {python_doctr-0.12.0.dist-info → python_doctr-1.0.0.dist-info}/WHEEL +0 -0
  114. {python_doctr-0.12.0.dist-info → python_doctr-1.0.0.dist-info}/licenses/LICENSE +0 -0
  115. {python_doctr-0.12.0.dist-info → python_doctr-1.0.0.dist-info}/top_level.txt +0 -0
  116. {python_doctr-0.12.0.dist-info → python_doctr-1.0.0.dist-info}/zip-safe +0 -0
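Note on the pattern above: every file deleted outright (entries 81-111) is a `tensorflow.py` module whose `pytorch.py` sibling is kept, and `doctr/file_utils.py` loses roughly 100 lines, consistent with the removal of backend-selection logic. In other words, 1.0.0 drops the TensorFlow backend and ships PyTorch-only. A minimal sketch of the unchanged public entry point, assuming the long-standing doctr API ("sample.pdf" is a placeholder path, not part of the diff):

# Runs the same on 0.12.0 and 1.0.0; on 1.0.0 it always executes on PyTorch.
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

predictor = ocr_predictor(pretrained=True)  # default detection + recognition checkpoints
document = predictor(DocumentFile.from_pdf("sample.pdf"))  # placeholder input file
print(document.render())  # plain-text rendering of the OCR result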
doctr/models/modules/layers/tensorflow.py (deleted)
@@ -1,171 +0,0 @@
-# Copyright (C) 2021-2025, Mindee.
-
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-
-from typing import Any
-
-import numpy as np
-import tensorflow as tf
-from tensorflow.keras import layers
-
-from doctr.utils.repr import NestedObject
-
-__all__ = ["FASTConvLayer"]
-
-
-class FASTConvLayer(layers.Layer, NestedObject):
-    """Convolutional layer used in the TextNet and FAST architectures"""
-
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        kernel_size: int | tuple[int, int],
-        stride: int = 1,
-        dilation: int = 1,
-        groups: int = 1,
-        bias: bool = False,
-    ) -> None:
-        super().__init__()
-
-        self.groups = groups
-        self.in_channels = in_channels
-        self.converted_ks = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
-
-        self.hor_conv, self.hor_bn = None, None
-        self.ver_conv, self.ver_bn = None, None
-
-        padding = ((self.converted_ks[0] - 1) * dilation // 2, (self.converted_ks[1] - 1) * dilation // 2)
-
-        self.activation = layers.ReLU()
-        self.conv_pad = layers.ZeroPadding2D(padding=padding)
-
-        self.conv = layers.Conv2D(
-            filters=out_channels,
-            kernel_size=self.converted_ks,
-            strides=stride,
-            dilation_rate=dilation,
-            groups=groups,
-            use_bias=bias,
-        )
-
-        self.bn = layers.BatchNormalization()
-
-        if self.converted_ks[1] != 1:
-            self.ver_pad = layers.ZeroPadding2D(
-                padding=(int(((self.converted_ks[0] - 1) * dilation) / 2), 0),
-            )
-            self.ver_conv = layers.Conv2D(
-                filters=out_channels,
-                kernel_size=(self.converted_ks[0], 1),
-                strides=stride,
-                dilation_rate=dilation,
-                groups=groups,
-                use_bias=bias,
-            )
-            self.ver_bn = layers.BatchNormalization()
-
-        if self.converted_ks[0] != 1:
-            self.hor_pad = layers.ZeroPadding2D(
-                padding=(0, int(((self.converted_ks[1] - 1) * dilation) / 2)),
-            )
-            self.hor_conv = layers.Conv2D(
-                filters=out_channels,
-                kernel_size=(1, self.converted_ks[1]),
-                strides=stride,
-                dilation_rate=dilation,
-                groups=groups,
-                use_bias=bias,
-            )
-            self.hor_bn = layers.BatchNormalization()
-
-        self.rbr_identity = layers.BatchNormalization() if out_channels == in_channels and stride == 1 else None
-
-    def call(self, x: tf.Tensor, **kwargs: Any) -> tf.Tensor:
-        if hasattr(self, "fused_conv"):
-            return self.activation(self.fused_conv(self.conv_pad(x, **kwargs), **kwargs))
-
-        main_outputs = self.bn(self.conv(self.conv_pad(x, **kwargs), **kwargs), **kwargs)
-        vertical_outputs = (
-            self.ver_bn(self.ver_conv(self.ver_pad(x, **kwargs), **kwargs), **kwargs)
-            if self.ver_conv is not None and self.ver_bn is not None
-            else 0
-        )
-        horizontal_outputs = (
-            self.hor_bn(self.hor_conv(self.hor_pad(x, **kwargs), **kwargs), **kwargs)
-            if self.hor_bn is not None and self.hor_conv is not None
-            else 0
-        )
-        id_out = self.rbr_identity(x, **kwargs) if self.rbr_identity is not None else 0
-
-        return self.activation(main_outputs + vertical_outputs + horizontal_outputs + id_out)
-
-    # The following logic is used to reparametrize the layer
-    # Adapted from: https://github.com/mindee/doctr/blob/main/doctr/models/modules/layers/pytorch.py
-    def _identity_to_conv(self, identity: layers.BatchNormalization) -> tuple[tf.Tensor, tf.Tensor] | tuple[int, int]:
-        if identity is None or not hasattr(identity, "moving_mean") or not hasattr(identity, "moving_variance"):
-            return 0, 0
-        if not hasattr(self, "id_tensor"):
-            input_dim = self.in_channels // self.groups
-            kernel_value = np.zeros((1, 1, input_dim, self.in_channels), dtype=np.float32)
-            for i in range(self.in_channels):
-                kernel_value[0, 0, i % input_dim, i] = 1
-            id_tensor = tf.constant(kernel_value, dtype=tf.float32)
-            self.id_tensor = self._pad_to_mxn_tensor(id_tensor)
-        kernel = self.id_tensor
-        std = tf.sqrt(identity.moving_variance + identity.epsilon)
-        t = tf.reshape(identity.gamma / std, (1, 1, 1, -1))
-        return kernel * t, identity.beta - identity.moving_mean * identity.gamma / std
-
-    def _fuse_bn_tensor(self, conv: layers.Conv2D, bn: layers.BatchNormalization) -> tuple[tf.Tensor, tf.Tensor]:
-        kernel = conv.kernel
-        kernel = self._pad_to_mxn_tensor(kernel)
-        std = tf.sqrt(bn.moving_variance + bn.epsilon)
-        t = tf.reshape(bn.gamma / std, (1, 1, 1, -1))
-        return kernel * t, bn.beta - bn.moving_mean * bn.gamma / std
-
-    def _get_equivalent_kernel_bias(self):
-        kernel_mxn, bias_mxn = self._fuse_bn_tensor(self.conv, self.bn)
-        if self.ver_conv is not None:
-            kernel_mx1, bias_mx1 = self._fuse_bn_tensor(self.ver_conv, self.ver_bn)
-        else:
-            kernel_mx1, bias_mx1 = 0, 0
-        if self.hor_conv is not None:
-            kernel_1xn, bias_1xn = self._fuse_bn_tensor(self.hor_conv, self.hor_bn)
-        else:
-            kernel_1xn, bias_1xn = 0, 0
-        kernel_id, bias_id = self._identity_to_conv(self.rbr_identity)
-        kernel_mxn = kernel_mxn + kernel_mx1 + kernel_1xn + kernel_id
-        bias_mxn = bias_mxn + bias_mx1 + bias_1xn + bias_id
-        return kernel_mxn, bias_mxn
-
-    def _pad_to_mxn_tensor(self, kernel: tf.Tensor) -> tf.Tensor:
-        kernel_height, kernel_width = self.converted_ks
-        height, width = kernel.shape[:2]
-        pad_left_right = tf.maximum(0, (kernel_width - width) // 2)
-        pad_top_down = tf.maximum(0, (kernel_height - height) // 2)
-        return tf.pad(kernel, [[pad_top_down, pad_top_down], [pad_left_right, pad_left_right], [0, 0], [0, 0]])
-
-    def reparameterize_layer(self):
-        kernel, bias = self._get_equivalent_kernel_bias()
-        self.fused_conv = layers.Conv2D(
-            filters=self.conv.filters,
-            kernel_size=self.conv.kernel_size,
-            strides=self.conv.strides,
-            padding=self.conv.padding,
-            dilation_rate=self.conv.dilation_rate,
-            groups=self.conv.groups,
-            use_bias=True,
-        )
-        # build layer to initialize weights and biases
-        self.fused_conv.build(input_shape=(None, None, None, kernel.shape[-2]))
-        self.fused_conv.set_weights([kernel.numpy(), bias.numpy()])
-        for para in self.trainable_variables:
-            para._trainable = False
-        for attr in ["conv", "bn", "ver_conv", "ver_bn", "hor_conv", "hor_bn"]:
-            if hasattr(self, attr):
-                delattr(self, attr)
-
-        if hasattr(self, "rbr_identity"):
-            delattr(self, "rbr_identity")
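The reparameterization block above (`_fuse_bn_tensor`, `_identity_to_conv`, `_get_equivalent_kernel_bias`) folds each conv + BatchNorm branch into a single kernel/bias pair, then sums the main, vertical, horizontal and identity branches after zero-padding every kernel to the full m x n size. The underlying identity, restated here for reference (standard BN folding, in our notation; the code's moving_mean, moving_variance, gamma, beta and epsilon play the roles of mu, Var[x], gamma, beta, epsilon):

\[
\sigma = \sqrt{\mathrm{Var}[x] + \epsilon}, \qquad
\hat{W} = W \cdot \frac{\gamma}{\sigma}, \qquad
\hat{b} = \beta - \frac{\mu\,\gamma}{\sigma}
\]

so that \(\mathrm{BN}(W * x) = \hat{W} * x + \hat{b}\), which is exactly the `kernel * t` and `bn.beta - bn.moving_mean * bn.gamma / std` computed in `_fuse_bn_tensor`.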
doctr/models/modules/transformer/tensorflow.py (deleted)
@@ -1,235 +0,0 @@
-# Copyright (C) 2021-2025, Mindee.
-
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-
-import math
-from collections.abc import Callable
-from typing import Any
-
-import tensorflow as tf
-from tensorflow.keras import layers
-
-from doctr.utils.repr import NestedObject
-
-__all__ = ["Decoder", "PositionalEncoding", "EncoderBlock", "PositionwiseFeedForward", "MultiHeadAttention"]
-
-
-class PositionalEncoding(layers.Layer, NestedObject):
-    """Compute positional encoding"""
-
-    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000) -> None:
-        super(PositionalEncoding, self).__init__()
-        self.dropout = layers.Dropout(rate=dropout)
-
-        # Compute the positional encodings once in log space.
-        pe = tf.Variable(tf.zeros((max_len, d_model)))
-        position = tf.cast(
-            tf.expand_dims(tf.experimental.numpy.arange(start=0, stop=max_len), axis=1), dtype=tf.float32
-        )
-        div_term = tf.math.exp(
-            tf.cast(tf.experimental.numpy.arange(start=0, stop=d_model, step=2), dtype=tf.float32)
-            * -(math.log(10000.0) / d_model)
-        )
-        pe = pe.numpy()
-        pe[:, 0::2] = tf.math.sin(position * div_term)
-        pe[:, 1::2] = tf.math.cos(position * div_term)
-        self.pe = tf.expand_dims(tf.convert_to_tensor(pe), axis=0)
-
-    def call(
-        self,
-        x: tf.Tensor,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-        """Forward pass
-
-        Args:
-            x: embeddings (batch, max_len, d_model)
-            **kwargs: additional arguments
-
-        Returns:
-            positional embeddings (batch, max_len, d_model)
-        """
-        if x.dtype == tf.float16:  # amp fix: cast to half
-            x = x + tf.cast(self.pe[:, : x.shape[1]], dtype=tf.half)
-        else:
-            x = x + self.pe[:, : x.shape[1]]
-        return self.dropout(x, **kwargs)
-
-
-@tf.function
-def scaled_dot_product_attention(
-    query: tf.Tensor, key: tf.Tensor, value: tf.Tensor, mask: tf.Tensor | None = None
-) -> tuple[tf.Tensor, tf.Tensor]:
-    """Scaled Dot-Product Attention"""
-    scores = tf.matmul(query, tf.transpose(key, perm=[0, 1, 3, 2])) / math.sqrt(query.shape[-1])
-    if mask is not None:
-        # NOTE: to ensure the ONNX compatibility, tf.where works only with bool type condition
-        scores = tf.where(mask == False, float("-inf"), scores)  # noqa: E712
-    p_attn = tf.nn.softmax(scores, axis=-1)
-    return tf.matmul(p_attn, value), p_attn
-
-
-class PositionwiseFeedForward(layers.Layer, NestedObject):
-    """Position-wise Feed-Forward Network"""
-
-    def __init__(
-        self, d_model: int, ffd: int, dropout=0.1, activation_fct: Callable[[Any], Any] = layers.ReLU()
-    ) -> None:
-        super(PositionwiseFeedForward, self).__init__()
-        self.activation_fct = activation_fct
-
-        self.first_linear = layers.Dense(ffd, kernel_initializer=tf.initializers.he_uniform())
-        self.sec_linear = layers.Dense(d_model, kernel_initializer=tf.initializers.he_uniform())
-        self.dropout = layers.Dropout(rate=dropout)
-
-    def call(self, x: tf.Tensor, **kwargs: Any) -> tf.Tensor:
-        x = self.first_linear(x, **kwargs)
-        x = self.activation_fct(x)
-        x = self.dropout(x, **kwargs)
-        x = self.sec_linear(x, **kwargs)
-        x = self.dropout(x, **kwargs)
-        return x
-
-
-class MultiHeadAttention(layers.Layer, NestedObject):
-    """Multi-Head Attention"""
-
-    def __init__(self, num_heads: int, d_model: int, dropout: float = 0.1) -> None:
-        super().__init__()
-        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
-
-        self.d_k = d_model // num_heads
-        self.num_heads = num_heads
-
-        self.linear_layers = [layers.Dense(d_model, kernel_initializer=tf.initializers.he_uniform()) for _ in range(3)]
-        self.output_linear = layers.Dense(d_model, kernel_initializer=tf.initializers.he_uniform())
-
-    def call(
-        self,
-        query: tf.Tensor,
-        key: tf.Tensor,
-        value: tf.Tensor,
-        mask: tf.Tensor = None,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-        batch_size = query.shape[0]
-
-        # linear projections of Q, K, V
-        query, key, value = [
-            tf.transpose(
-                tf.reshape(linear(x, **kwargs), shape=[batch_size, -1, self.num_heads, self.d_k]), perm=[0, 2, 1, 3]
-            )
-            for linear, x in zip(self.linear_layers, (query, key, value))
-        ]
-
-        # apply attention on all the projected vectors in batch
-        x, attn = scaled_dot_product_attention(query, key, value, mask=mask)
-
-        # Concat attention heads
-        x = tf.transpose(x, perm=[0, 2, 1, 3])
-        x = tf.reshape(x, shape=[batch_size, -1, self.num_heads * self.d_k])
-
-        return self.output_linear(x, **kwargs)
-
-
-class EncoderBlock(layers.Layer, NestedObject):
-    """Transformer Encoder Block"""
-
-    def __init__(
-        self,
-        num_layers: int,
-        num_heads: int,
-        d_model: int,
-        dff: int,  # hidden dimension of the feedforward network
-        dropout: float,
-        activation_fct: Callable[[Any], Any] = layers.ReLU(),
-    ) -> None:
-        super().__init__()
-
-        self.num_layers = num_layers
-
-        self.layer_norm_input = layers.LayerNormalization(epsilon=1e-5)
-        self.layer_norm_attention = layers.LayerNormalization(epsilon=1e-5)
-        self.layer_norm_output = layers.LayerNormalization(epsilon=1e-5)
-        self.dropout = layers.Dropout(rate=dropout)
-
-        self.attention = [MultiHeadAttention(num_heads, d_model, dropout) for _ in range(self.num_layers)]
-        self.position_feed_forward = [
-            PositionwiseFeedForward(d_model, dff, dropout, activation_fct) for _ in range(self.num_layers)
-        ]
-
-    def call(self, x: tf.Tensor, mask: tf.Tensor | None = None, **kwargs: Any) -> tf.Tensor:
-        output = x
-
-        for i in range(self.num_layers):
-            normed_output = self.layer_norm_input(output, **kwargs)
-            output = output + self.dropout(
-                self.attention[i](normed_output, normed_output, normed_output, mask, **kwargs),
-                **kwargs,
-            )
-            normed_output = self.layer_norm_attention(output, **kwargs)
-            output = output + self.dropout(self.position_feed_forward[i](normed_output, **kwargs), **kwargs)
-
-        # (batch_size, seq_len, d_model)
-        return self.layer_norm_output(output, **kwargs)
-
-
-class Decoder(layers.Layer, NestedObject):
-    """Transformer Decoder"""
-
-    def __init__(
-        self,
-        num_layers: int,
-        num_heads: int,
-        d_model: int,
-        vocab_size: int,
-        dropout: float = 0.2,
-        dff: int = 2048,  # hidden dimension of the feedforward network
-        maximum_position_encoding: int = 50,
-    ) -> None:
-        super(Decoder, self).__init__()
-        self.num_layers = num_layers
-        self.d_model = d_model
-
-        self.layer_norm_input = layers.LayerNormalization(epsilon=1e-5)
-        self.layer_norm_masked_attention = layers.LayerNormalization(epsilon=1e-5)
-        self.layer_norm_attention = layers.LayerNormalization(epsilon=1e-5)
-        self.layer_norm_output = layers.LayerNormalization(epsilon=1e-5)
-
-        self.dropout = layers.Dropout(rate=dropout)
-        self.embed = layers.Embedding(vocab_size, d_model)
-        self.positional_encoding = PositionalEncoding(d_model, dropout, maximum_position_encoding)
-
-        self.attention = [MultiHeadAttention(num_heads, d_model, dropout) for _ in range(self.num_layers)]
-        self.source_attention = [MultiHeadAttention(num_heads, d_model, dropout) for _ in range(self.num_layers)]
-        self.position_feed_forward = [PositionwiseFeedForward(d_model, dff, dropout) for _ in range(self.num_layers)]
-
-    def call(
-        self,
-        tgt: tf.Tensor,
-        memory: tf.Tensor,
-        source_mask: tf.Tensor | None = None,
-        target_mask: tf.Tensor | None = None,
-        **kwargs: Any,
-    ) -> tf.Tensor:
-        tgt = self.embed(tgt, **kwargs) * math.sqrt(self.d_model)
-        pos_enc_tgt = self.positional_encoding(tgt, **kwargs)
-        output = pos_enc_tgt
-
-        for i in range(self.num_layers):
-            normed_output = self.layer_norm_input(output, **kwargs)
-            output = output + self.dropout(
-                self.attention[i](normed_output, normed_output, normed_output, target_mask, **kwargs),
-                **kwargs,
-            )
-            normed_output = self.layer_norm_masked_attention(output, **kwargs)
-            output = output + self.dropout(
-                self.source_attention[i](normed_output, memory, memory, source_mask, **kwargs),
-                **kwargs,
-            )
-            normed_output = self.layer_norm_attention(output, **kwargs)
-            output = output + self.dropout(self.position_feed_forward[i](normed_output, **kwargs), **kwargs)
-
-        # (batch_size, seq_len, d_model)
-        return self.layer_norm_output(output, **kwargs)
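For reference, `scaled_dot_product_attention` above implements the standard formulation, and `PositionalEncoding` precomputes the usual sinusoidal table (the textbook transformer equations, restated here rather than taken from the diff):

\[
\mathrm{Attention}(Q, K, V) = \operatorname{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right) V
\]

\[
PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{model}}}\right), \qquad
PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{model}}}\right)
\]

with the mask applied by setting masked scores to \(-\infty\) before the softmax, as the `tf.where` call does, and `div_term` corresponding to \(10000^{-2i/d_{model}}\).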
doctr/models/modules/vision_transformer/tensorflow.py (deleted)
@@ -1,100 +0,0 @@
-# Copyright (C) 2021-2025, Mindee.
-
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-
-import math
-from typing import Any
-
-import tensorflow as tf
-from tensorflow.keras import layers
-
-from doctr.utils.repr import NestedObject
-
-__all__ = ["PatchEmbedding"]
-
-
-class PatchEmbedding(layers.Layer, NestedObject):
-    """Compute 2D patch embeddings with cls token and positional encoding"""
-
-    def __init__(self, input_shape: tuple[int, int, int], embed_dim: int, patch_size: tuple[int, int]) -> None:
-        super().__init__()
-        height, width, _ = input_shape
-        self.patch_size = patch_size
-        self.interpolate = True if patch_size[0] == patch_size[1] else False
-        self.grid_size = tuple(s // p for s, p in zip((height, width), self.patch_size))
-        self.num_patches = self.grid_size[0] * self.grid_size[1]
-
-        self.cls_token = self.add_weight(shape=(1, 1, embed_dim), initializer="zeros", trainable=True, name="cls_token")
-        self.positions = self.add_weight(
-            shape=(1, self.num_patches + 1, embed_dim),
-            initializer="zeros",
-            trainable=True,
-            name="positions",
-        )
-        self.projection = layers.Conv2D(
-            filters=embed_dim,
-            kernel_size=self.patch_size,
-            strides=self.patch_size,
-            padding="valid",
-            data_format="channels_last",
-            use_bias=True,
-            kernel_initializer="glorot_uniform",
-            bias_initializer="zeros",
-            name="projection",
-        )
-
-    def interpolate_pos_encoding(self, embeddings: tf.Tensor, height: int, width: int) -> tf.Tensor:
-        """100 % borrowed from:
-        https://github.com/huggingface/transformers/blob/main/src/transformers/models/vit/modeling_tf_vit.py
-
-        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
-        resolution images.
-
-        Source:
-        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py
-        """
-        seq_len, dim = embeddings.shape[1:]
-        num_patches = seq_len - 1
-
-        num_positions = self.positions.shape[1] - 1
-
-        if num_patches == num_positions and height == width:
-            return self.positions
-        class_pos_embed = self.positions[:, :1]
-        patch_pos_embed = self.positions[:, 1:]
-        h0 = height // self.patch_size[0]
-        w0 = width // self.patch_size[1]
-        patch_pos_embed = tf.image.resize(
-            images=tf.reshape(
-                patch_pos_embed, shape=(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim)
-            ),
-            size=(h0, w0),
-            method="bilinear",
-        )
-
-        shape = patch_pos_embed.shape
-        assert h0 == shape[-3], "height of interpolated patch embedding doesn't match"
-        assert w0 == shape[-2], "width of interpolated patch embedding doesn't match"
-
-        patch_pos_embed = tf.reshape(tensor=patch_pos_embed, shape=(1, -1, dim))
-        return tf.concat(values=(class_pos_embed, patch_pos_embed), axis=1)
-
-    def call(self, x: tf.Tensor, **kwargs: Any) -> tf.Tensor:
-        B, H, W, C = x.shape
-        assert H % self.patch_size[0] == 0, "Image height must be divisible by patch height"
-        assert W % self.patch_size[1] == 0, "Image width must be divisible by patch width"
-        # patchify image
-        patches = self.projection(x, **kwargs)  # (batch_size, num_patches, d_model)
-        patches = tf.reshape(patches, (B, self.num_patches, -1))  # (batch_size, num_patches, d_model)
-
-        cls_tokens = tf.repeat(self.cls_token, B, axis=0)  # (batch_size, 1, d_model)
-        # concate cls_tokens to patches
-        embeddings = tf.concat([cls_tokens, patches], axis=1)  # (batch_size, num_patches + 1, d_model)
-        # add positions to embeddings
-        if self.interpolate:
-            embeddings += self.interpolate_pos_encoding(embeddings, H, W)
-        else:
-            embeddings += self.positions
-
-        return embeddings  # (batch_size, num_patches + 1, d_model)
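To make the shape bookkeeping in `PatchEmbedding.call` concrete, a worked example with hypothetical dimensions (illustrative values, not taken from the diff):

# A 32x128 input with 4x8 patches (illustrative values only):
height, width, patch_size = 32, 128, (4, 8)
grid_size = (height // patch_size[0], width // patch_size[1])  # (8, 16)
num_patches = grid_size[0] * grid_size[1]                      # 128 patch tokens
seq_len = num_patches + 1                                      # 129 after prepending the cls token
# positions therefore has shape (1, 129, embed_dim); the asserts in call()
# enforce that H and W are divisible by the patch dimensions.

Since patch_size[0] != patch_size[1] in this example, `self.interpolate` is False and the learned `positions` are added directly, without the resolution interpolation path.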
doctr/models/predictor/tensorflow.py (deleted)
@@ -1,155 +0,0 @@
-# Copyright (C) 2021-2025, Mindee.
-
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-
-from typing import Any
-
-import numpy as np
-import tensorflow as tf
-
-from doctr.io.elements import Document
-from doctr.models._utils import get_language
-from doctr.models.detection.predictor import DetectionPredictor
-from doctr.models.recognition.predictor import RecognitionPredictor
-from doctr.utils.geometry import detach_scores
-from doctr.utils.repr import NestedObject
-
-from .base import _OCRPredictor
-
-__all__ = ["OCRPredictor"]
-
-
-class OCRPredictor(NestedObject, _OCRPredictor):
-    """Implements an object able to localize and identify text elements in a set of documents
-
-    Args:
-        det_predictor: detection module
-        reco_predictor: recognition module
-        assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages
-            without rotated textual elements.
-        straighten_pages: if True, estimates the page general orientation based on the median line orientation.
-            Then, rotates page before passing it to the deep learning modules. The final predictions will be remapped
-            accordingly. Doing so will improve performances for documents with page-uniform rotations.
-        detect_orientation: if True, the estimated general page orientation will be added to the predictions for each
-            page. Doing so will slightly deteriorate the overall latency.
-        detect_language: if True, the language prediction will be added to the predictions for each
-            page. Doing so will slightly deteriorate the overall latency.
-        **kwargs: keyword args of `DocumentBuilder`
-    """
-
-    _children_names = ["det_predictor", "reco_predictor", "doc_builder"]
-
-    def __init__(
-        self,
-        det_predictor: DetectionPredictor,
-        reco_predictor: RecognitionPredictor,
-        assume_straight_pages: bool = True,
-        straighten_pages: bool = False,
-        preserve_aspect_ratio: bool = True,
-        symmetric_pad: bool = True,
-        detect_orientation: bool = False,
-        detect_language: bool = False,
-        **kwargs: Any,
-    ) -> None:
-        self.det_predictor = det_predictor
-        self.reco_predictor = reco_predictor
-        _OCRPredictor.__init__(
-            self,
-            assume_straight_pages,
-            straighten_pages,
-            preserve_aspect_ratio,
-            symmetric_pad,
-            detect_orientation,
-            **kwargs,
-        )
-        self.detect_orientation = detect_orientation
-        self.detect_language = detect_language
-
-    def __call__(
-        self,
-        pages: list[np.ndarray | tf.Tensor],
-        **kwargs: Any,
-    ) -> Document:
-        # Dimension check
-        if any(page.ndim != 3 for page in pages):
-            raise ValueError("incorrect input shape: all pages are expected to be multi-channel 2D images.")
-
-        origin_page_shapes = [page.shape[:2] for page in pages]
-
-        # Localize text elements
-        loc_preds_dict, out_maps = self.det_predictor(pages, return_maps=True, **kwargs)
-
-        # Detect document rotation and rotate pages
-        seg_maps = [
-            np.where(out_map > getattr(self.det_predictor.model.postprocessor, "bin_thresh"), 255, 0).astype(np.uint8)
-            for out_map in out_maps
-        ]
-        if self.detect_orientation:
-            general_pages_orientations, origin_pages_orientations = self._get_orientations(pages, seg_maps)
-            orientations = [
-                {"value": orientation_page, "confidence": None} for orientation_page in origin_pages_orientations
-            ]
-        else:
-            orientations = None
-            general_pages_orientations = None
-            origin_pages_orientations = None
-        if self.straighten_pages:
-            pages = self._straighten_pages(pages, seg_maps, general_pages_orientations, origin_pages_orientations)
-            # update page shapes after straightening
-            origin_page_shapes = [page.shape[:2] for page in pages]
-
-            # forward again to get predictions on straight pages
-            loc_preds_dict = self.det_predictor(pages, **kwargs)
-
-        assert all(len(loc_pred) == 1 for loc_pred in loc_preds_dict), (
-            "Detection Model in ocr_predictor should output only one class"
-        )
-        loc_preds: list[np.ndarray] = [list(loc_pred.values())[0] for loc_pred in loc_preds_dict]
-        # Detach objectness scores from loc_preds
-        loc_preds, objectness_scores = detach_scores(loc_preds)
-
-        # Apply hooks to loc_preds if any
-        for hook in self.hooks:
-            loc_preds = hook(loc_preds)
-
-        # Crop images
-        crops, loc_preds = self._prepare_crops(
-            pages,
-            loc_preds,
-            channels_last=True,
-            assume_straight_pages=self.assume_straight_pages,
-            assume_horizontal=self._page_orientation_disabled,
-        )
-        # Rectify crop orientation and get crop orientation predictions
-        crop_orientations: Any = []
-        if not self.assume_straight_pages:
-            crops, loc_preds, _crop_orientations = self._rectify_crops(crops, loc_preds)
-            crop_orientations = [
-                {"value": orientation[0], "confidence": orientation[1]} for orientation in _crop_orientations
-            ]
-
-        # Identify character sequences
-        word_preds = self.reco_predictor([crop for page_crops in crops for crop in page_crops], **kwargs)
-        if not crop_orientations:
-            crop_orientations = [{"value": 0, "confidence": None} for _ in word_preds]
-
-        boxes, text_preds, crop_orientations = self._process_predictions(loc_preds, word_preds, crop_orientations)
-
-        if self.detect_language:
-            languages = [get_language(" ".join([item[0] for item in text_pred])) for text_pred in text_preds]
-            languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages]
-        else:
-            languages_dict = None
-
-        out = self.doc_builder(
-            pages,
-            boxes,
-            objectness_scores,
-            text_preds,
-            origin_page_shapes,
-            crop_orientations,
-            orientations,
-            languages_dict,
-        )
-        return out
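The `__call__` above runs the full pipeline: detection, optional orientation estimation and page straightening, cropping, recognition, then document building. The PyTorch `OCRPredictor` (doctr/models/predictor/pytorch.py, modified rather than deleted in this release, entry 50 above) exposes the same contract. A usage sketch under that assumption, with a dummy page standing in for real input:

import numpy as np
from doctr.models import ocr_predictor

# db_resnet50 / crnn_vgg16_bn are doctr's long-standing default architectures.
predictor = ocr_predictor("db_resnet50", "crnn_vgg16_bn", pretrained=True)
page = np.zeros((1024, 768, 3), dtype=np.uint8)  # one multi-channel 2D page, (H, W, C)
document = predictor([page])  # list input with ndim == 3 per page, as checked above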