python-doctr 0.11.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. doctr/__init__.py +0 -1
  2. doctr/datasets/__init__.py +1 -5
  3. doctr/datasets/coco_text.py +139 -0
  4. doctr/datasets/cord.py +2 -1
  5. doctr/datasets/datasets/__init__.py +1 -6
  6. doctr/datasets/datasets/pytorch.py +2 -2
  7. doctr/datasets/funsd.py +2 -2
  8. doctr/datasets/generator/__init__.py +1 -6
  9. doctr/datasets/ic03.py +1 -1
  10. doctr/datasets/ic13.py +2 -1
  11. doctr/datasets/iiit5k.py +4 -1
  12. doctr/datasets/imgur5k.py +9 -2
  13. doctr/datasets/ocr.py +1 -1
  14. doctr/datasets/recognition.py +1 -1
  15. doctr/datasets/svhn.py +1 -1
  16. doctr/datasets/svt.py +2 -2
  17. doctr/datasets/synthtext.py +15 -2
  18. doctr/datasets/utils.py +7 -6
  19. doctr/datasets/vocabs.py +1100 -54
  20. doctr/file_utils.py +2 -92
  21. doctr/io/elements.py +37 -3
  22. doctr/io/image/__init__.py +1 -7
  23. doctr/io/image/pytorch.py +1 -1
  24. doctr/models/_utils.py +4 -4
  25. doctr/models/classification/__init__.py +1 -0
  26. doctr/models/classification/magc_resnet/__init__.py +1 -6
  27. doctr/models/classification/magc_resnet/pytorch.py +3 -4
  28. doctr/models/classification/mobilenet/__init__.py +1 -6
  29. doctr/models/classification/mobilenet/pytorch.py +15 -1
  30. doctr/models/classification/predictor/__init__.py +1 -6
  31. doctr/models/classification/predictor/pytorch.py +2 -2
  32. doctr/models/classification/resnet/__init__.py +1 -6
  33. doctr/models/classification/resnet/pytorch.py +26 -3
  34. doctr/models/classification/textnet/__init__.py +1 -6
  35. doctr/models/classification/textnet/pytorch.py +11 -2
  36. doctr/models/classification/vgg/__init__.py +1 -6
  37. doctr/models/classification/vgg/pytorch.py +16 -1
  38. doctr/models/classification/vip/__init__.py +1 -0
  39. doctr/models/classification/vip/layers/__init__.py +1 -0
  40. doctr/models/classification/vip/layers/pytorch.py +615 -0
  41. doctr/models/classification/vip/pytorch.py +505 -0
  42. doctr/models/classification/vit/__init__.py +1 -6
  43. doctr/models/classification/vit/pytorch.py +12 -3
  44. doctr/models/classification/zoo.py +7 -8
  45. doctr/models/detection/_utils/__init__.py +1 -6
  46. doctr/models/detection/core.py +1 -1
  47. doctr/models/detection/differentiable_binarization/__init__.py +1 -6
  48. doctr/models/detection/differentiable_binarization/base.py +7 -16
  49. doctr/models/detection/differentiable_binarization/pytorch.py +13 -4
  50. doctr/models/detection/fast/__init__.py +1 -6
  51. doctr/models/detection/fast/base.py +6 -17
  52. doctr/models/detection/fast/pytorch.py +17 -8
  53. doctr/models/detection/linknet/__init__.py +1 -6
  54. doctr/models/detection/linknet/base.py +5 -15
  55. doctr/models/detection/linknet/pytorch.py +12 -3
  56. doctr/models/detection/predictor/__init__.py +1 -6
  57. doctr/models/detection/predictor/pytorch.py +1 -1
  58. doctr/models/detection/zoo.py +15 -32
  59. doctr/models/factory/hub.py +9 -22
  60. doctr/models/kie_predictor/__init__.py +1 -6
  61. doctr/models/kie_predictor/pytorch.py +3 -7
  62. doctr/models/modules/layers/__init__.py +1 -6
  63. doctr/models/modules/layers/pytorch.py +52 -4
  64. doctr/models/modules/transformer/__init__.py +1 -6
  65. doctr/models/modules/transformer/pytorch.py +2 -2
  66. doctr/models/modules/vision_transformer/__init__.py +1 -6
  67. doctr/models/predictor/__init__.py +1 -6
  68. doctr/models/predictor/base.py +3 -8
  69. doctr/models/predictor/pytorch.py +3 -6
  70. doctr/models/preprocessor/__init__.py +1 -6
  71. doctr/models/preprocessor/pytorch.py +27 -32
  72. doctr/models/recognition/__init__.py +1 -0
  73. doctr/models/recognition/crnn/__init__.py +1 -6
  74. doctr/models/recognition/crnn/pytorch.py +16 -7
  75. doctr/models/recognition/master/__init__.py +1 -6
  76. doctr/models/recognition/master/pytorch.py +15 -6
  77. doctr/models/recognition/parseq/__init__.py +1 -6
  78. doctr/models/recognition/parseq/pytorch.py +26 -8
  79. doctr/models/recognition/predictor/__init__.py +1 -6
  80. doctr/models/recognition/predictor/_utils.py +100 -47
  81. doctr/models/recognition/predictor/pytorch.py +4 -5
  82. doctr/models/recognition/sar/__init__.py +1 -6
  83. doctr/models/recognition/sar/pytorch.py +13 -4
  84. doctr/models/recognition/utils.py +56 -47
  85. doctr/models/recognition/viptr/__init__.py +1 -0
  86. doctr/models/recognition/viptr/pytorch.py +277 -0
  87. doctr/models/recognition/vitstr/__init__.py +1 -6
  88. doctr/models/recognition/vitstr/pytorch.py +13 -4
  89. doctr/models/recognition/zoo.py +13 -8
  90. doctr/models/utils/__init__.py +1 -6
  91. doctr/models/utils/pytorch.py +29 -19
  92. doctr/transforms/functional/__init__.py +1 -6
  93. doctr/transforms/functional/pytorch.py +4 -4
  94. doctr/transforms/modules/__init__.py +1 -7
  95. doctr/transforms/modules/base.py +26 -92
  96. doctr/transforms/modules/pytorch.py +28 -26
  97. doctr/utils/data.py +1 -1
  98. doctr/utils/geometry.py +7 -11
  99. doctr/utils/visualization.py +1 -1
  100. doctr/version.py +1 -1
  101. {python_doctr-0.11.0.dist-info → python_doctr-1.0.0.dist-info}/METADATA +22 -63
  102. python_doctr-1.0.0.dist-info/RECORD +149 -0
  103. {python_doctr-0.11.0.dist-info → python_doctr-1.0.0.dist-info}/WHEEL +1 -1
  104. doctr/datasets/datasets/tensorflow.py +0 -59
  105. doctr/datasets/generator/tensorflow.py +0 -58
  106. doctr/datasets/loader.py +0 -94
  107. doctr/io/image/tensorflow.py +0 -101
  108. doctr/models/classification/magc_resnet/tensorflow.py +0 -196
  109. doctr/models/classification/mobilenet/tensorflow.py +0 -433
  110. doctr/models/classification/predictor/tensorflow.py +0 -60
  111. doctr/models/classification/resnet/tensorflow.py +0 -397
  112. doctr/models/classification/textnet/tensorflow.py +0 -266
  113. doctr/models/classification/vgg/tensorflow.py +0 -116
  114. doctr/models/classification/vit/tensorflow.py +0 -192
  115. doctr/models/detection/_utils/tensorflow.py +0 -34
  116. doctr/models/detection/differentiable_binarization/tensorflow.py +0 -414
  117. doctr/models/detection/fast/tensorflow.py +0 -419
  118. doctr/models/detection/linknet/tensorflow.py +0 -369
  119. doctr/models/detection/predictor/tensorflow.py +0 -70
  120. doctr/models/kie_predictor/tensorflow.py +0 -187
  121. doctr/models/modules/layers/tensorflow.py +0 -171
  122. doctr/models/modules/transformer/tensorflow.py +0 -235
  123. doctr/models/modules/vision_transformer/tensorflow.py +0 -100
  124. doctr/models/predictor/tensorflow.py +0 -155
  125. doctr/models/preprocessor/tensorflow.py +0 -122
  126. doctr/models/recognition/crnn/tensorflow.py +0 -308
  127. doctr/models/recognition/master/tensorflow.py +0 -313
  128. doctr/models/recognition/parseq/tensorflow.py +0 -508
  129. doctr/models/recognition/predictor/tensorflow.py +0 -79
  130. doctr/models/recognition/sar/tensorflow.py +0 -416
  131. doctr/models/recognition/vitstr/tensorflow.py +0 -278
  132. doctr/models/utils/tensorflow.py +0 -182
  133. doctr/transforms/functional/tensorflow.py +0 -254
  134. doctr/transforms/modules/tensorflow.py +0 -562
  135. python_doctr-0.11.0.dist-info/RECORD +0 -173
  136. {python_doctr-0.11.0.dist-info → python_doctr-1.0.0.dist-info/licenses}/LICENSE +0 -0
  137. {python_doctr-0.11.0.dist-info → python_doctr-1.0.0.dist-info}/top_level.txt +0 -0
  138. {python_doctr-0.11.0.dist-info → python_doctr-1.0.0.dist-info}/zip-safe +0 -0
@@ -1,313 +0,0 @@
1
- # Copyright (C) 2021-2025, Mindee.
2
-
3
- # This program is licensed under the Apache License 2.0.
4
- # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
-
6
- from copy import deepcopy
7
- from typing import Any
8
-
9
- import tensorflow as tf
10
- from tensorflow.keras import Model, layers
11
-
12
- from doctr.datasets import VOCABS
13
- from doctr.models.classification import magc_resnet31
14
- from doctr.models.modules.transformer import Decoder, PositionalEncoding
15
-
16
- from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
17
- from .base import _MASTER, _MASTERPostProcessor
18
-
19
- __all__ = ["MASTER", "master"]
20
-
21
-
22
- default_cfgs: dict[str, dict[str, Any]] = {
23
- "master": {
24
- "mean": (0.694, 0.695, 0.693),
25
- "std": (0.299, 0.296, 0.301),
26
- "input_shape": (32, 128, 3),
27
- "vocab": VOCABS["french"],
28
- "url": "https://doctr-static.mindee.com/models?id=v0.9.0/master-d7fdaeff.weights.h5&src=0",
29
- },
30
- }
31
-
32
-
33
- class MASTER(_MASTER, Model):
34
- """Implements MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_.
35
- Implementation based on the official TF implementation: <https://github.com/jiangxiluning/MASTER-TF>`_.
36
-
37
- Args:
38
- feature_extractor: the backbone serving as feature extractor
39
- vocab: vocabulary, (without EOS, SOS, PAD)
40
- d_model: d parameter for the transformer decoder
41
- dff: depth of the pointwise feed-forward layer
42
- num_heads: number of heads for the mutli-head attention module
43
- num_layers: number of decoder layers to stack
44
- max_length: maximum length of character sequence handled by the model
45
- dropout: dropout probability of the decoder
46
- input_shape: size of the image inputs
47
- exportable: onnx exportable returns only logits
48
- cfg: dictionary containing information about the model
49
- """
50
-
51
- def __init__(
52
- self,
53
- feature_extractor: Model,
54
- vocab: str,
55
- d_model: int = 512,
56
- dff: int = 2048,
57
- num_heads: int = 8, # number of heads in the transformer decoder
58
- num_layers: int = 3,
59
- max_length: int = 50,
60
- dropout: float = 0.2,
61
- input_shape: tuple[int, int, int] = (32, 128, 3), # different from the paper
62
- exportable: bool = False,
63
- cfg: dict[str, Any] | None = None,
64
- ) -> None:
65
- super().__init__()
66
-
67
- self.exportable = exportable
68
- self.max_length = max_length
69
- self.d_model = d_model
70
- self.vocab = vocab
71
- self.cfg = cfg
72
- self.vocab_size = len(vocab)
73
-
74
- self.feat_extractor = feature_extractor
75
- self.positional_encoding = PositionalEncoding(self.d_model, dropout, max_len=input_shape[0] * input_shape[1])
76
-
77
- self.decoder = Decoder(
78
- num_layers=num_layers,
79
- d_model=self.d_model,
80
- num_heads=num_heads,
81
- vocab_size=self.vocab_size + 3, # EOS, SOS, PAD
82
- dff=dff,
83
- dropout=dropout,
84
- maximum_position_encoding=self.max_length,
85
- )
86
-
87
- self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())
88
- self.postprocessor = MASTERPostProcessor(vocab=self.vocab)
89
-
90
- @tf.function
91
- def make_source_and_target_mask(self, source: tf.Tensor, target: tf.Tensor) -> tuple[tf.Tensor, tf.Tensor]:
92
- # [1, 1, 1, ..., 0, 0, 0] -> 0 is masked
93
- # (N, 1, 1, max_length)
94
- target_pad_mask = tf.cast(tf.math.not_equal(target, self.vocab_size + 2), dtype=tf.uint8)
95
- target_pad_mask = target_pad_mask[:, tf.newaxis, tf.newaxis, :]
96
- target_length = target.shape[1]
97
- # sub mask filled diagonal with 1 = see 0 = masked (max_length, max_length)
98
- target_sub_mask = tf.linalg.band_part(tf.ones((target_length, target_length)), -1, 0)
99
- # source mask filled with ones (max_length, positional_encoded_seq_len)
100
- source_mask = tf.ones((target_length, source.shape[1]))
101
- # combine the two masks into one boolean mask where False is masked (N, 1, max_length, max_length)
102
- target_mask = tf.math.logical_and(
103
- tf.cast(target_sub_mask, dtype=tf.bool), tf.cast(target_pad_mask, dtype=tf.bool)
104
- )
105
- return source_mask, target_mask
106
-
107
- @staticmethod
108
- def compute_loss(
109
- model_output: tf.Tensor,
110
- gt: tf.Tensor,
111
- seq_len: list[int],
112
- ) -> tf.Tensor:
113
- """Compute categorical cross-entropy loss for the model.
114
- Sequences are masked after the EOS character.
115
-
116
- Args:
117
- gt: the encoded tensor with gt labels
118
- model_output: predicted logits of the model
119
- seq_len: lengths of each gt word inside the batch
120
-
121
- Returns:
122
- The loss of the model on the batch
123
- """
124
- # Input length : number of timesteps
125
- input_len = tf.shape(model_output)[1]
126
- # Add one for additional <eos> token (sos disappear in shift!)
127
- seq_len = tf.cast(seq_len, tf.int32) + 1
128
- # One-hot gt labels
129
- oh_gt = tf.one_hot(gt, depth=model_output.shape[2])
130
- # Compute loss: don't forget to shift gt! Otherwise the model learns to output the gt[t-1]!
131
- # The "masked" first gt char is <sos>. Delete last logit of the model output.
132
- cce = tf.nn.softmax_cross_entropy_with_logits(oh_gt[:, 1:, :], model_output[:, :-1, :])
133
- # Compute mask
134
- mask_values = tf.zeros_like(cce)
135
- mask_2d = tf.sequence_mask(seq_len, input_len - 1) # delete the last mask timestep as well
136
- masked_loss = tf.where(mask_2d, cce, mask_values)
137
- ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype))
138
-
139
- return tf.expand_dims(ce_loss, axis=1)
140
-
141
- def call(
142
- self,
143
- x: tf.Tensor,
144
- target: list[str] | None = None,
145
- return_model_output: bool = False,
146
- return_preds: bool = False,
147
- **kwargs: Any,
148
- ) -> dict[str, Any]:
149
- """Call function for training
150
-
151
- Args:
152
- x: images
153
- target: list of str labels
154
- return_model_output: if True, return logits
155
- return_preds: if True, decode logits
156
- **kwargs: keyword arguments passed to the decoder
157
-
158
- Returns:
159
- A dictionnary containing eventually loss, logits and predictions.
160
- """
161
- # Encode
162
- feature = self.feat_extractor(x, **kwargs)
163
- b, h, w, c = feature.get_shape()
164
- # (N, H, W, C) --> (N, H * W, C)
165
- feature = tf.reshape(feature, shape=(b, h * w, c))
166
- # add positional encoding to features
167
- encoded = self.positional_encoding(feature, **kwargs)
168
-
169
- out: dict[str, tf.Tensor] = {}
170
-
171
- if kwargs.get("training", False) and target is None:
172
- raise ValueError("Need to provide labels during training")
173
-
174
- if target is not None:
175
- # Compute target: tensor of gts and sequence lengths
176
- gt, seq_len = self.build_target(target)
177
- # Compute decoder masks
178
- source_mask, target_mask = self.make_source_and_target_mask(encoded, gt)
179
- # Compute logits
180
- output = self.decoder(gt, encoded, source_mask, target_mask, **kwargs)
181
- logits = self.linear(output, **kwargs)
182
- else:
183
- logits = self.decode(encoded, **kwargs)
184
-
185
- logits = _bf16_to_float32(logits)
186
-
187
- if self.exportable:
188
- out["logits"] = logits
189
- return out
190
-
191
- if target is not None:
192
- out["loss"] = self.compute_loss(logits, gt, seq_len)
193
-
194
- if return_model_output:
195
- out["out_map"] = logits
196
-
197
- if return_preds:
198
- out["preds"] = self.postprocessor(logits)
199
-
200
- return out
201
-
202
- @tf.function
203
- def decode(self, encoded: tf.Tensor, **kwargs: Any) -> tf.Tensor:
204
- """Decode function for prediction
205
-
206
- Args:
207
- encoded: encoded features
208
- **kwargs: keyword arguments passed to the decoder
209
-
210
- Returns:
211
- A tuple of tf.Tensor: predictions, logits
212
- """
213
- b = encoded.shape[0]
214
-
215
- start_symbol = tf.constant(self.vocab_size + 1, dtype=tf.int32) # SOS
216
- padding_symbol = tf.constant(self.vocab_size + 2, dtype=tf.int32) # PAD
217
-
218
- ys = tf.fill(dims=(b, self.max_length - 1), value=padding_symbol)
219
- start_vector = tf.fill(dims=(b, 1), value=start_symbol)
220
- ys = tf.concat([start_vector, ys], axis=-1)
221
-
222
- # Final dimension include EOS/SOS/PAD
223
- for i in range(self.max_length - 1):
224
- source_mask, target_mask = self.make_source_and_target_mask(encoded, ys)
225
- output = self.decoder(ys, encoded, source_mask, target_mask, **kwargs)
226
- logits = self.linear(output, **kwargs)
227
- prob = tf.nn.softmax(logits, axis=-1)
228
- next_token = tf.argmax(prob, axis=-1, output_type=ys.dtype)
229
- # update ys with the next token and ignore the first token (SOS)
230
- i_mesh, j_mesh = tf.meshgrid(tf.range(b), tf.range(self.max_length), indexing="ij")
231
- indices = tf.stack([i_mesh[:, i + 1], j_mesh[:, i + 1]], axis=1)
232
-
233
- ys = tf.tensor_scatter_nd_update(ys, indices, next_token[:, i])
234
-
235
- # Shape (N, max_length, vocab_size + 1)
236
- return logits
237
-
238
-
239
- class MASTERPostProcessor(_MASTERPostProcessor):
240
- """Post processor for MASTER architectures
241
-
242
- Args:
243
- vocab: string containing the ordered sequence of supported characters
244
- """
245
-
246
- def __call__(
247
- self,
248
- logits: tf.Tensor,
249
- ) -> list[tuple[str, float]]:
250
- # compute pred with argmax for attention models
251
- out_idxs = tf.math.argmax(logits, axis=2)
252
- # N x L
253
- probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2)
254
- # Take the minimum confidence of the sequence
255
- probs = tf.math.reduce_min(probs, axis=1)
256
-
257
- # decode raw output of the model with tf_label_to_idx
258
- out_idxs = tf.cast(out_idxs, dtype="int32")
259
- embedding = tf.constant(self._embedding, dtype=tf.string)
260
- decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(embedding, out_idxs), axis=-1)
261
- decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>")
262
- decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0]
263
- word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
264
-
265
- return list(zip(word_values, probs.numpy().clip(0, 1).tolist()))
266
-
267
-
268
- def _master(arch: str, pretrained: bool, backbone_fn, pretrained_backbone: bool = True, **kwargs: Any) -> MASTER:
269
- pretrained_backbone = pretrained_backbone and not pretrained
270
-
271
- # Patch the config
272
- _cfg = deepcopy(default_cfgs[arch])
273
- _cfg["input_shape"] = kwargs.get("input_shape", _cfg["input_shape"])
274
- _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"])
275
-
276
- kwargs["vocab"] = _cfg["vocab"]
277
- kwargs["input_shape"] = _cfg["input_shape"]
278
-
279
- # Build the model
280
- model = MASTER(
281
- backbone_fn(pretrained=pretrained_backbone, input_shape=_cfg["input_shape"], include_top=False),
282
- cfg=_cfg,
283
- **kwargs,
284
- )
285
- _build_model(model)
286
-
287
- # Load pretrained parameters
288
- if pretrained:
289
- # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning
290
- load_pretrained_params(
291
- model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"]
292
- )
293
-
294
- return model
295
-
296
-
297
- def master(pretrained: bool = False, **kwargs: Any) -> MASTER:
298
- """MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_.
299
-
300
- >>> import tensorflow as tf
301
- >>> from doctr.models import master
302
- >>> model = master(pretrained=False)
303
- >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
304
- >>> out = model(input_tensor)
305
-
306
- Args:
307
- pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
308
- **kwargs: keywoard arguments passed to the MASTER architecture
309
-
310
- Returns:
311
- text recognition architecture
312
- """
313
- return _master("master", pretrained, magc_resnet31, **kwargs)