python-doctr 0.12.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. doctr/__init__.py +0 -1
  2. doctr/datasets/__init__.py +0 -5
  3. doctr/datasets/datasets/__init__.py +1 -6
  4. doctr/datasets/datasets/pytorch.py +2 -2
  5. doctr/datasets/generator/__init__.py +1 -6
  6. doctr/datasets/vocabs.py +0 -2
  7. doctr/file_utils.py +2 -101
  8. doctr/io/image/__init__.py +1 -7
  9. doctr/io/image/pytorch.py +1 -1
  10. doctr/models/_utils.py +3 -3
  11. doctr/models/classification/magc_resnet/__init__.py +1 -6
  12. doctr/models/classification/magc_resnet/pytorch.py +2 -2
  13. doctr/models/classification/mobilenet/__init__.py +1 -6
  14. doctr/models/classification/predictor/__init__.py +1 -6
  15. doctr/models/classification/predictor/pytorch.py +1 -1
  16. doctr/models/classification/resnet/__init__.py +1 -6
  17. doctr/models/classification/textnet/__init__.py +1 -6
  18. doctr/models/classification/textnet/pytorch.py +1 -1
  19. doctr/models/classification/vgg/__init__.py +1 -6
  20. doctr/models/classification/vip/__init__.py +1 -4
  21. doctr/models/classification/vip/layers/__init__.py +1 -4
  22. doctr/models/classification/vip/layers/pytorch.py +1 -1
  23. doctr/models/classification/vit/__init__.py +1 -6
  24. doctr/models/classification/vit/pytorch.py +2 -2
  25. doctr/models/classification/zoo.py +6 -11
  26. doctr/models/detection/_utils/__init__.py +1 -6
  27. doctr/models/detection/core.py +1 -1
  28. doctr/models/detection/differentiable_binarization/__init__.py +1 -6
  29. doctr/models/detection/differentiable_binarization/base.py +4 -12
  30. doctr/models/detection/differentiable_binarization/pytorch.py +3 -3
  31. doctr/models/detection/fast/__init__.py +1 -6
  32. doctr/models/detection/fast/base.py +4 -14
  33. doctr/models/detection/fast/pytorch.py +4 -4
  34. doctr/models/detection/linknet/__init__.py +1 -6
  35. doctr/models/detection/linknet/base.py +3 -12
  36. doctr/models/detection/linknet/pytorch.py +2 -2
  37. doctr/models/detection/predictor/__init__.py +1 -6
  38. doctr/models/detection/predictor/pytorch.py +1 -1
  39. doctr/models/detection/zoo.py +15 -32
  40. doctr/models/factory/hub.py +8 -21
  41. doctr/models/kie_predictor/__init__.py +1 -6
  42. doctr/models/kie_predictor/pytorch.py +2 -6
  43. doctr/models/modules/layers/__init__.py +1 -6
  44. doctr/models/modules/layers/pytorch.py +3 -3
  45. doctr/models/modules/transformer/__init__.py +1 -6
  46. doctr/models/modules/transformer/pytorch.py +2 -2
  47. doctr/models/modules/vision_transformer/__init__.py +1 -6
  48. doctr/models/predictor/__init__.py +1 -6
  49. doctr/models/predictor/base.py +3 -8
  50. doctr/models/predictor/pytorch.py +2 -5
  51. doctr/models/preprocessor/__init__.py +1 -6
  52. doctr/models/preprocessor/pytorch.py +27 -32
  53. doctr/models/recognition/crnn/__init__.py +1 -6
  54. doctr/models/recognition/crnn/pytorch.py +6 -6
  55. doctr/models/recognition/master/__init__.py +1 -6
  56. doctr/models/recognition/master/pytorch.py +5 -5
  57. doctr/models/recognition/parseq/__init__.py +1 -6
  58. doctr/models/recognition/parseq/pytorch.py +5 -5
  59. doctr/models/recognition/predictor/__init__.py +1 -6
  60. doctr/models/recognition/predictor/_utils.py +7 -16
  61. doctr/models/recognition/predictor/pytorch.py +1 -2
  62. doctr/models/recognition/sar/__init__.py +1 -6
  63. doctr/models/recognition/sar/pytorch.py +3 -3
  64. doctr/models/recognition/viptr/__init__.py +1 -4
  65. doctr/models/recognition/viptr/pytorch.py +3 -3
  66. doctr/models/recognition/vitstr/__init__.py +1 -6
  67. doctr/models/recognition/vitstr/pytorch.py +3 -3
  68. doctr/models/recognition/zoo.py +13 -13
  69. doctr/models/utils/__init__.py +1 -6
  70. doctr/models/utils/pytorch.py +1 -1
  71. doctr/transforms/functional/__init__.py +1 -6
  72. doctr/transforms/functional/pytorch.py +4 -4
  73. doctr/transforms/modules/__init__.py +1 -7
  74. doctr/transforms/modules/base.py +26 -92
  75. doctr/transforms/modules/pytorch.py +28 -26
  76. doctr/utils/geometry.py +6 -10
  77. doctr/utils/visualization.py +1 -1
  78. doctr/version.py +1 -1
  79. {python_doctr-0.12.0.dist-info → python_doctr-1.0.0.dist-info}/METADATA +18 -75
  80. python_doctr-1.0.0.dist-info/RECORD +149 -0
  81. doctr/datasets/datasets/tensorflow.py +0 -59
  82. doctr/datasets/generator/tensorflow.py +0 -58
  83. doctr/datasets/loader.py +0 -94
  84. doctr/io/image/tensorflow.py +0 -101
  85. doctr/models/classification/magc_resnet/tensorflow.py +0 -196
  86. doctr/models/classification/mobilenet/tensorflow.py +0 -442
  87. doctr/models/classification/predictor/tensorflow.py +0 -60
  88. doctr/models/classification/resnet/tensorflow.py +0 -418
  89. doctr/models/classification/textnet/tensorflow.py +0 -275
  90. doctr/models/classification/vgg/tensorflow.py +0 -125
  91. doctr/models/classification/vit/tensorflow.py +0 -201
  92. doctr/models/detection/_utils/tensorflow.py +0 -34
  93. doctr/models/detection/differentiable_binarization/tensorflow.py +0 -421
  94. doctr/models/detection/fast/tensorflow.py +0 -427
  95. doctr/models/detection/linknet/tensorflow.py +0 -377
  96. doctr/models/detection/predictor/tensorflow.py +0 -70
  97. doctr/models/kie_predictor/tensorflow.py +0 -187
  98. doctr/models/modules/layers/tensorflow.py +0 -171
  99. doctr/models/modules/transformer/tensorflow.py +0 -235
  100. doctr/models/modules/vision_transformer/tensorflow.py +0 -100
  101. doctr/models/predictor/tensorflow.py +0 -155
  102. doctr/models/preprocessor/tensorflow.py +0 -122
  103. doctr/models/recognition/crnn/tensorflow.py +0 -317
  104. doctr/models/recognition/master/tensorflow.py +0 -320
  105. doctr/models/recognition/parseq/tensorflow.py +0 -516
  106. doctr/models/recognition/predictor/tensorflow.py +0 -79
  107. doctr/models/recognition/sar/tensorflow.py +0 -423
  108. doctr/models/recognition/vitstr/tensorflow.py +0 -285
  109. doctr/models/utils/tensorflow.py +0 -189
  110. doctr/transforms/functional/tensorflow.py +0 -254
  111. doctr/transforms/modules/tensorflow.py +0 -562
  112. python_doctr-0.12.0.dist-info/RECORD +0 -180
  113. {python_doctr-0.12.0.dist-info → python_doctr-1.0.0.dist-info}/WHEEL +0 -0
  114. {python_doctr-0.12.0.dist-info → python_doctr-1.0.0.dist-info}/licenses/LICENSE +0 -0
  115. {python_doctr-0.12.0.dist-info → python_doctr-1.0.0.dist-info}/top_level.txt +0 -0
  116. {python_doctr-0.12.0.dist-info → python_doctr-1.0.0.dist-info}/zip-safe +0 -0
@@ -1,320 +0,0 @@
1
- # Copyright (C) 2021-2025, Mindee.
2
-
3
- # This program is licensed under the Apache License 2.0.
4
- # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
-
6
- from copy import deepcopy
7
- from typing import Any
8
-
9
- import tensorflow as tf
10
- from tensorflow.keras import Model, layers
11
-
12
- from doctr.datasets import VOCABS
13
- from doctr.models.classification import magc_resnet31
14
- from doctr.models.modules.transformer import Decoder, PositionalEncoding
15
-
16
- from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
17
- from .base import _MASTER, _MASTERPostProcessor
18
-
19
# Names exported by this module
__all__ = ["MASTER", "master"]


# Per-architecture default configuration, keyed by architecture name.
# `_master` deep-copies the entry before patching it, so these values are never mutated.
default_cfgs: dict[str, dict[str, Any]] = {
    "master": {
        # NOTE(review): presumably per-channel normalization statistics — confirm against the preprocessor
        "mean": (0.694, 0.695, 0.693),
        "std": (0.299, 0.296, 0.301),
        # Default image input shape, presumably (H, W, C) — can be overridden through `input_shape`
        "input_shape": (32, 128, 3),
        # Default vocabulary; `_master` compares the requested vocab against it to decide on `skip_mismatch`
        "vocab": VOCABS["french"],
        # Checkpoint handed to `from_pretrained` when `pretrained=True`
        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/master-d7fdaeff.weights.h5&src=0",
    },
}
31
-
32
-
33
class MASTER(_MASTER, Model):
    """Implements MASTER as described in paper: `<https://arxiv.org/pdf/1910.02562.pdf>`_.
    Implementation based on the official TF implementation: `<https://github.com/jiangxiluning/MASTER-TF>`_.

    Args:
        feature_extractor: the backbone serving as feature extractor
        vocab: vocabulary, (without EOS, SOS, PAD)
        d_model: d parameter for the transformer decoder
        dff: depth of the pointwise feed-forward layer
        num_heads: number of heads for the multi-head attention module
        num_layers: number of decoder layers to stack
        max_length: maximum length of character sequence handled by the model
        dropout: dropout probability of the decoder
        input_shape: size of the image inputs
        exportable: onnx exportable returns only logits
        cfg: dictionary containing information about the model
    """

    def __init__(
        self,
        feature_extractor: Model,
        vocab: str,
        d_model: int = 512,
        dff: int = 2048,
        num_heads: int = 8,  # number of heads in the transformer decoder
        num_layers: int = 3,
        max_length: int = 50,
        dropout: float = 0.2,
        input_shape: tuple[int, int, int] = (32, 128, 3),  # different from the paper
        exportable: bool = False,
        cfg: dict[str, Any] | None = None,
    ) -> None:
        super().__init__()

        self.exportable = exportable
        self.max_length = max_length
        self.d_model = d_model
        self.vocab = vocab
        self.cfg = cfg
        self.vocab_size = len(vocab)

        self.feat_extractor = feature_extractor
        # The positional encoding is sized from the first two entries of `input_shape`
        self.positional_encoding = PositionalEncoding(self.d_model, dropout, max_len=input_shape[0] * input_shape[1])

        self.decoder = Decoder(
            num_layers=num_layers,
            d_model=self.d_model,
            num_heads=num_heads,
            vocab_size=self.vocab_size + 3,  # EOS, SOS, PAD
            dff=dff,
            dropout=dropout,
            maximum_position_encoding=self.max_length,
        )

        # Classification head over the vocabulary plus the 3 special tokens (EOS, SOS, PAD)
        self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())
        self.postprocessor = MASTERPostProcessor(vocab=self.vocab)

    def from_pretrained(self, path_or_url: str, **kwargs: Any) -> None:
        """Load pretrained parameters onto the model

        Args:
            path_or_url: the path or URL to the model parameters (checkpoint)
            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
        """
        load_pretrained_params(self, path_or_url, **kwargs)

    @tf.function
    def make_source_and_target_mask(self, source: tf.Tensor, target: tf.Tensor) -> tuple[tf.Tensor, tf.Tensor]:
        """Build the attention masks handed to the decoder

        Args:
            source: encoded features; only its second dimension (sequence length) is read
            target: token indices of shape (N, target_length), padded with the PAD index (`vocab_size + 2`)

        Returns:
            A tuple (source_mask, target_mask): `source_mask` is filled with ones and has shape
            (target_length, source_length); `target_mask` is a boolean tensor of shape
            (N, 1, target_length, target_length) where False marks a masked position.
        """
        # [1, 1, 1, ..., 0, 0, 0] -> 0 is masked
        # (N, 1, 1, max_length)
        target_pad_mask = tf.cast(tf.math.not_equal(target, self.vocab_size + 2), dtype=tf.uint8)
        target_pad_mask = target_pad_mask[:, tf.newaxis, tf.newaxis, :]
        target_length = target.shape[1]
        # lower-triangular sub mask: 1 = visible, 0 = masked (max_length, max_length)
        target_sub_mask = tf.linalg.band_part(tf.ones((target_length, target_length)), -1, 0)
        # source mask filled with ones (max_length, positional_encoded_seq_len)
        source_mask = tf.ones((target_length, source.shape[1]))
        # combine the two masks into one boolean mask where False is masked (N, 1, max_length, max_length)
        target_mask = tf.math.logical_and(
            tf.cast(target_sub_mask, dtype=tf.bool), tf.cast(target_pad_mask, dtype=tf.bool)
        )
        return source_mask, target_mask

    @staticmethod
    def compute_loss(
        model_output: tf.Tensor,
        gt: tf.Tensor,
        seq_len: list[int],
    ) -> tf.Tensor:
        """Compute categorical cross-entropy loss for the model.
        Sequences are masked after the EOS character.

        Args:
            model_output: predicted logits of the model
            gt: the encoded tensor with gt labels
            seq_len: lengths of each gt word inside the batch

        Returns:
            The loss of the model on the batch, one value per sample (shape (N, 1))
        """
        # Input length : number of timesteps
        input_len = tf.shape(model_output)[1]
        # Add one for additional <eos> token (sos disappear in shift!)
        seq_len = tf.cast(seq_len, tf.int32) + 1
        # One-hot gt labels
        oh_gt = tf.one_hot(gt, depth=model_output.shape[2])
        # Compute loss: don't forget to shift gt! Otherwise the model learns to output the gt[t-1]!
        # The "masked" first gt char is <sos>. Delete last logit of the model output.
        cce = tf.nn.softmax_cross_entropy_with_logits(oh_gt[:, 1:, :], model_output[:, :-1, :])
        # Compute mask
        mask_values = tf.zeros_like(cce)
        mask_2d = tf.sequence_mask(seq_len, input_len - 1)  # delete the last mask timestep as well
        masked_loss = tf.where(mask_2d, cce, mask_values)
        # Average the per-timestep loss over the (EOS-inclusive) length of each word
        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype))

        return tf.expand_dims(ce_loss, axis=1)

    def call(
        self,
        x: tf.Tensor,
        target: list[str] | None = None,
        return_model_output: bool = False,
        return_preds: bool = False,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """Forward pass, used for both training and inference

        Args:
            x: images
            target: list of str labels
            return_model_output: if True, return logits
            return_preds: if True, decode logits
            **kwargs: keyword arguments passed to the sub-layers (e.g. `training`)

        Returns:
            A dictionary. When the model is exportable it only holds "logits"; otherwise it holds
            "loss" (if `target` is given), "out_map" (if `return_model_output`) and
            "preds" (if `return_preds`).

        Raises:
            ValueError: if `training` is truthy and no `target` is provided
        """
        # Fail fast: do not run the backbone when the training labels are missing
        if kwargs.get("training", False) and target is None:
            raise ValueError("Need to provide labels during training")

        # Encode
        feature = self.feat_extractor(x, **kwargs)
        b, h, w, c = feature.get_shape()
        # (N, H, W, C) --> (N, H * W, C)
        feature = tf.reshape(feature, shape=(b, h * w, c))
        # add positional encoding to features
        encoded = self.positional_encoding(feature, **kwargs)

        out: dict[str, Any] = {}

        if target is not None:
            # Compute target: tensor of gts and sequence lengths
            # (`build_target` is not defined in this class — presumably inherited from `_MASTER`)
            gt, seq_len = self.build_target(target)
            # Compute decoder masks
            source_mask, target_mask = self.make_source_and_target_mask(encoded, gt)
            # Compute logits (teacher forcing: the ground truth is fed to the decoder)
            output = self.decoder(gt, encoded, source_mask, target_mask, **kwargs)
            logits = self.linear(output, **kwargs)
        else:
            # No labels: decode greedily, one token at a time
            logits = self.decode(encoded, **kwargs)

        logits = _bf16_to_float32(logits)

        if self.exportable:
            out["logits"] = logits
            return out

        if target is not None:
            out["loss"] = self.compute_loss(logits, gt, seq_len)

        if return_model_output:
            out["out_map"] = logits

        if return_preds:
            out["preds"] = self.postprocessor(logits)

        return out

    @tf.function
    def decode(self, encoded: tf.Tensor, **kwargs: Any) -> tf.Tensor:
        """Greedy decoding function for prediction

        The target sequence starts as [SOS, PAD, ..., PAD] and is filled one position per step
        with the most probable token.

        Args:
            encoded: encoded features
            **kwargs: keyword arguments passed to the decoder

        Returns:
            The logits computed at the last decoding step; their last dimension is
            `vocab_size + 3` (EOS, SOS, PAD).
        """
        # NOTE: `max_length` must be at least 2, otherwise the loop never runs and `logits` is unbound
        b = encoded.shape[0]

        start_symbol = tf.constant(self.vocab_size + 1, dtype=tf.int32)  # SOS
        padding_symbol = tf.constant(self.vocab_size + 2, dtype=tf.int32)  # PAD

        ys = tf.fill(dims=(b, self.max_length - 1), value=padding_symbol)
        start_vector = tf.fill(dims=(b, 1), value=start_symbol)
        ys = tf.concat([start_vector, ys], axis=-1)

        # The (batch, position) index grids do not depend on the step: build them once
        i_mesh, j_mesh = tf.meshgrid(tf.range(b), tf.range(self.max_length), indexing="ij")

        # Final dimension include EOS/SOS/PAD
        for i in range(self.max_length - 1):
            source_mask, target_mask = self.make_source_and_target_mask(encoded, ys)
            output = self.decoder(ys, encoded, source_mask, target_mask, **kwargs)
            logits = self.linear(output, **kwargs)
            prob = tf.nn.softmax(logits, axis=-1)
            next_token = tf.argmax(prob, axis=-1, output_type=ys.dtype)
            # update ys with the next token and ignore the first token (SOS):
            # the prediction made at position i is written to position i + 1 of every sample
            indices = tf.stack([i_mesh[:, i + 1], j_mesh[:, i + 1]], axis=1)

            ys = tf.tensor_scatter_nd_update(ys, indices, next_token[:, i])

        # Shape (N, max_length, vocab_size + 3)
        return logits
246
-
247
-
248
class MASTERPostProcessor(_MASTERPostProcessor):
    """Post processor for MASTER architectures

    Args:
        vocab: string containing the ordered sequence of supported characters
    """

    def __call__(
        self,
        logits: tf.Tensor,
    ) -> list[tuple[str, float]]:
        """Turn raw logits into (word, confidence) pairs

        Args:
            logits: raw output of the model; the class dimension is the last (third) axis

        Returns:
            One (decoded word, confidence) tuple per sample, with the confidence clipped to [0, 1]
        """
        # Most likely class at every timestep (attention models are decoded with an argmax)
        best_idx = tf.math.argmax(logits, axis=2)

        # Probability of the selected class at every timestep: N x L
        class_probs = tf.nn.softmax(logits, axis=-1)
        picked_probs = tf.gather(class_probs, best_idx, axis=-1, batch_dims=2)
        # The word confidence is the minimum confidence of the sequence
        confidences = tf.math.reduce_min(picked_probs, axis=1)

        # Map every index to its string and glue the characters of each sample together
        lookup_table = tf.constant(self._embedding, dtype=tf.string)
        chars = tf.nn.embedding_lookup(lookup_table, tf.cast(best_idx, dtype="int32"))
        joined = tf.strings.reduce_join(inputs=chars, axis=-1)

        # Keep only what precedes the first "<eos>" marker
        pieces = tf.strings.split(joined, "<eos>")
        first_piece = tf.sparse.to_dense(pieces.to_sparse(), default_value="not valid")[:, 0]

        words = [raw.decode() for raw in first_piece.numpy().tolist()]
        scores = confidences.numpy().clip(0, 1).tolist()
        return list(zip(words, scores))
275
-
276
-
277
def _master(arch: str, pretrained: bool, backbone_fn, pretrained_backbone: bool = True, **kwargs: Any) -> MASTER:
    """Build a MASTER model for the given architecture name

    Args:
        arch: key of the architecture in `default_cfgs`
        pretrained: if True, load the checkpoint referenced by the default config
        backbone_fn: callable returning the feature extractor; it is called with
            `pretrained`, `input_shape` and `include_top=False`
        pretrained_backbone: whether to request pretrained backbone weights (ignored when `pretrained` is True)
        **kwargs: keyword arguments forwarded to the `MASTER` constructor

    Returns:
        The built model, with pretrained parameters loaded if requested
    """
    # Pretrained backbone weights are only requested when the full checkpoint is not loaded
    pretrained_backbone = pretrained_backbone and not pretrained

    # Patch a private copy of the config: caller-provided values win, defaults fill the gaps,
    # and the config and the constructor arguments end up in sync
    reference_cfg = default_cfgs[arch]
    _cfg = deepcopy(reference_cfg)
    for key in ("input_shape", "vocab"):
        kwargs.setdefault(key, _cfg[key])
        _cfg[key] = kwargs[key]

    # Build the model
    backbone = backbone_fn(pretrained=pretrained_backbone, input_shape=_cfg["input_shape"], include_top=False)
    model = MASTER(backbone, cfg=_cfg, **kwargs)
    _build_model(model)

    # Load pretrained parameters
    if pretrained:
        # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning
        vocab_changed = kwargs["vocab"] != reference_cfg["vocab"]
        model.from_pretrained(reference_cfg["url"], skip_mismatch=vocab_changed)

    return model
302
-
303
-
304
def master(pretrained: bool = False, **kwargs: Any) -> MASTER:
    """MASTER as described in paper: `<https://arxiv.org/pdf/1910.02562.pdf>`_.

    >>> import tensorflow as tf
    >>> from doctr.models import master
    >>> model = master(pretrained=False)
    >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
    >>> out = model(input_tensor)

    Args:
        pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
        **kwargs: keyword arguments passed to the MASTER architecture

    Returns:
        text recognition architecture
    """
    # The "master" architecture always uses the magc_resnet31 backbone
    arch_name = "master"
    return _master(arch_name, pretrained, magc_resnet31, **kwargs)