python-doctr 0.10.0__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172) hide show
  1. doctr/contrib/__init__.py +1 -0
  2. doctr/contrib/artefacts.py +7 -9
  3. doctr/contrib/base.py +8 -17
  4. doctr/datasets/__init__.py +1 -0
  5. doctr/datasets/coco_text.py +139 -0
  6. doctr/datasets/cord.py +10 -8
  7. doctr/datasets/datasets/__init__.py +4 -4
  8. doctr/datasets/datasets/base.py +16 -16
  9. doctr/datasets/datasets/pytorch.py +12 -12
  10. doctr/datasets/datasets/tensorflow.py +10 -10
  11. doctr/datasets/detection.py +6 -9
  12. doctr/datasets/doc_artefacts.py +3 -4
  13. doctr/datasets/funsd.py +9 -8
  14. doctr/datasets/generator/__init__.py +4 -4
  15. doctr/datasets/generator/base.py +16 -17
  16. doctr/datasets/generator/pytorch.py +1 -3
  17. doctr/datasets/generator/tensorflow.py +1 -3
  18. doctr/datasets/ic03.py +5 -6
  19. doctr/datasets/ic13.py +6 -6
  20. doctr/datasets/iiit5k.py +10 -6
  21. doctr/datasets/iiithws.py +4 -5
  22. doctr/datasets/imgur5k.py +15 -7
  23. doctr/datasets/loader.py +4 -7
  24. doctr/datasets/mjsynth.py +6 -5
  25. doctr/datasets/ocr.py +3 -4
  26. doctr/datasets/orientation.py +3 -4
  27. doctr/datasets/recognition.py +4 -5
  28. doctr/datasets/sroie.py +6 -5
  29. doctr/datasets/svhn.py +7 -6
  30. doctr/datasets/svt.py +6 -7
  31. doctr/datasets/synthtext.py +19 -7
  32. doctr/datasets/utils.py +41 -35
  33. doctr/datasets/vocabs.py +1107 -49
  34. doctr/datasets/wildreceipt.py +14 -10
  35. doctr/file_utils.py +11 -7
  36. doctr/io/elements.py +96 -82
  37. doctr/io/html.py +1 -3
  38. doctr/io/image/__init__.py +3 -3
  39. doctr/io/image/base.py +2 -5
  40. doctr/io/image/pytorch.py +3 -12
  41. doctr/io/image/tensorflow.py +2 -11
  42. doctr/io/pdf.py +5 -7
  43. doctr/io/reader.py +5 -11
  44. doctr/models/_utils.py +15 -23
  45. doctr/models/builder.py +30 -48
  46. doctr/models/classification/__init__.py +1 -0
  47. doctr/models/classification/magc_resnet/__init__.py +3 -3
  48. doctr/models/classification/magc_resnet/pytorch.py +11 -15
  49. doctr/models/classification/magc_resnet/tensorflow.py +11 -14
  50. doctr/models/classification/mobilenet/__init__.py +3 -3
  51. doctr/models/classification/mobilenet/pytorch.py +20 -18
  52. doctr/models/classification/mobilenet/tensorflow.py +19 -23
  53. doctr/models/classification/predictor/__init__.py +4 -4
  54. doctr/models/classification/predictor/pytorch.py +7 -9
  55. doctr/models/classification/predictor/tensorflow.py +6 -8
  56. doctr/models/classification/resnet/__init__.py +4 -4
  57. doctr/models/classification/resnet/pytorch.py +47 -34
  58. doctr/models/classification/resnet/tensorflow.py +45 -35
  59. doctr/models/classification/textnet/__init__.py +3 -3
  60. doctr/models/classification/textnet/pytorch.py +20 -18
  61. doctr/models/classification/textnet/tensorflow.py +19 -17
  62. doctr/models/classification/vgg/__init__.py +3 -3
  63. doctr/models/classification/vgg/pytorch.py +21 -8
  64. doctr/models/classification/vgg/tensorflow.py +20 -14
  65. doctr/models/classification/vip/__init__.py +4 -0
  66. doctr/models/classification/vip/layers/__init__.py +4 -0
  67. doctr/models/classification/vip/layers/pytorch.py +615 -0
  68. doctr/models/classification/vip/pytorch.py +505 -0
  69. doctr/models/classification/vit/__init__.py +3 -3
  70. doctr/models/classification/vit/pytorch.py +18 -15
  71. doctr/models/classification/vit/tensorflow.py +15 -12
  72. doctr/models/classification/zoo.py +23 -14
  73. doctr/models/core.py +3 -3
  74. doctr/models/detection/_utils/__init__.py +4 -4
  75. doctr/models/detection/_utils/base.py +4 -7
  76. doctr/models/detection/_utils/pytorch.py +1 -5
  77. doctr/models/detection/_utils/tensorflow.py +1 -5
  78. doctr/models/detection/core.py +2 -8
  79. doctr/models/detection/differentiable_binarization/__init__.py +4 -4
  80. doctr/models/detection/differentiable_binarization/base.py +10 -21
  81. doctr/models/detection/differentiable_binarization/pytorch.py +37 -31
  82. doctr/models/detection/differentiable_binarization/tensorflow.py +26 -29
  83. doctr/models/detection/fast/__init__.py +4 -4
  84. doctr/models/detection/fast/base.py +8 -17
  85. doctr/models/detection/fast/pytorch.py +37 -35
  86. doctr/models/detection/fast/tensorflow.py +24 -28
  87. doctr/models/detection/linknet/__init__.py +4 -4
  88. doctr/models/detection/linknet/base.py +8 -18
  89. doctr/models/detection/linknet/pytorch.py +34 -28
  90. doctr/models/detection/linknet/tensorflow.py +24 -25
  91. doctr/models/detection/predictor/__init__.py +5 -5
  92. doctr/models/detection/predictor/pytorch.py +6 -7
  93. doctr/models/detection/predictor/tensorflow.py +5 -6
  94. doctr/models/detection/zoo.py +27 -7
  95. doctr/models/factory/hub.py +6 -10
  96. doctr/models/kie_predictor/__init__.py +5 -5
  97. doctr/models/kie_predictor/base.py +4 -5
  98. doctr/models/kie_predictor/pytorch.py +19 -20
  99. doctr/models/kie_predictor/tensorflow.py +14 -15
  100. doctr/models/modules/layers/__init__.py +3 -3
  101. doctr/models/modules/layers/pytorch.py +55 -10
  102. doctr/models/modules/layers/tensorflow.py +5 -7
  103. doctr/models/modules/transformer/__init__.py +3 -3
  104. doctr/models/modules/transformer/pytorch.py +12 -13
  105. doctr/models/modules/transformer/tensorflow.py +9 -10
  106. doctr/models/modules/vision_transformer/__init__.py +3 -3
  107. doctr/models/modules/vision_transformer/pytorch.py +2 -3
  108. doctr/models/modules/vision_transformer/tensorflow.py +3 -3
  109. doctr/models/predictor/__init__.py +5 -5
  110. doctr/models/predictor/base.py +28 -29
  111. doctr/models/predictor/pytorch.py +13 -14
  112. doctr/models/predictor/tensorflow.py +9 -10
  113. doctr/models/preprocessor/__init__.py +4 -4
  114. doctr/models/preprocessor/pytorch.py +13 -17
  115. doctr/models/preprocessor/tensorflow.py +10 -14
  116. doctr/models/recognition/__init__.py +1 -0
  117. doctr/models/recognition/core.py +3 -7
  118. doctr/models/recognition/crnn/__init__.py +4 -4
  119. doctr/models/recognition/crnn/pytorch.py +30 -29
  120. doctr/models/recognition/crnn/tensorflow.py +21 -24
  121. doctr/models/recognition/master/__init__.py +3 -3
  122. doctr/models/recognition/master/base.py +3 -7
  123. doctr/models/recognition/master/pytorch.py +32 -25
  124. doctr/models/recognition/master/tensorflow.py +22 -25
  125. doctr/models/recognition/parseq/__init__.py +3 -3
  126. doctr/models/recognition/parseq/base.py +3 -7
  127. doctr/models/recognition/parseq/pytorch.py +47 -29
  128. doctr/models/recognition/parseq/tensorflow.py +29 -27
  129. doctr/models/recognition/predictor/__init__.py +5 -5
  130. doctr/models/recognition/predictor/_utils.py +111 -52
  131. doctr/models/recognition/predictor/pytorch.py +9 -9
  132. doctr/models/recognition/predictor/tensorflow.py +8 -9
  133. doctr/models/recognition/sar/__init__.py +4 -4
  134. doctr/models/recognition/sar/pytorch.py +30 -22
  135. doctr/models/recognition/sar/tensorflow.py +22 -24
  136. doctr/models/recognition/utils.py +57 -53
  137. doctr/models/recognition/viptr/__init__.py +4 -0
  138. doctr/models/recognition/viptr/pytorch.py +277 -0
  139. doctr/models/recognition/vitstr/__init__.py +4 -4
  140. doctr/models/recognition/vitstr/base.py +3 -7
  141. doctr/models/recognition/vitstr/pytorch.py +28 -21
  142. doctr/models/recognition/vitstr/tensorflow.py +22 -23
  143. doctr/models/recognition/zoo.py +27 -11
  144. doctr/models/utils/__init__.py +4 -4
  145. doctr/models/utils/pytorch.py +41 -34
  146. doctr/models/utils/tensorflow.py +31 -23
  147. doctr/models/zoo.py +1 -5
  148. doctr/transforms/functional/__init__.py +3 -3
  149. doctr/transforms/functional/base.py +4 -11
  150. doctr/transforms/functional/pytorch.py +20 -28
  151. doctr/transforms/functional/tensorflow.py +10 -22
  152. doctr/transforms/modules/__init__.py +4 -4
  153. doctr/transforms/modules/base.py +48 -55
  154. doctr/transforms/modules/pytorch.py +58 -22
  155. doctr/transforms/modules/tensorflow.py +18 -32
  156. doctr/utils/common_types.py +8 -9
  157. doctr/utils/data.py +9 -13
  158. doctr/utils/fonts.py +2 -7
  159. doctr/utils/geometry.py +17 -48
  160. doctr/utils/metrics.py +17 -37
  161. doctr/utils/multithreading.py +4 -6
  162. doctr/utils/reconstitution.py +9 -13
  163. doctr/utils/repr.py +2 -3
  164. doctr/utils/visualization.py +16 -29
  165. doctr/version.py +1 -1
  166. {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/METADATA +70 -52
  167. python_doctr-0.12.0.dist-info/RECORD +180 -0
  168. {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/WHEEL +1 -1
  169. python_doctr-0.10.0.dist-info/RECORD +0 -173
  170. {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info/licenses}/LICENSE +0 -0
  171. {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/top_level.txt +0 -0
  172. {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/zip-safe +0 -0
doctr/io/image/pytorch.py CHANGED
@@ -1,10 +1,9 @@
1
- # Copyright (C) 2021-2024, Mindee.
1
+ # Copyright (C) 2021-2025, Mindee.
2
2
 
3
3
  # This program is licensed under the Apache License 2.0.
4
4
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
5
 
6
6
  from io import BytesIO
7
- from typing import Tuple
8
7
 
9
8
  import numpy as np
10
9
  import torch
@@ -20,12 +19,10 @@ def tensor_from_pil(pil_img: Image.Image, dtype: torch.dtype = torch.float32) ->
20
19
  """Convert a PIL Image to a PyTorch tensor
21
20
 
22
21
  Args:
23
- ----
24
22
  pil_img: a PIL image
25
23
  dtype: the output tensor data type
26
24
 
27
25
  Returns:
28
- -------
29
26
  decoded image as tensor
30
27
  """
31
28
  if dtype == torch.float32:
@@ -40,12 +37,10 @@ def read_img_as_tensor(img_path: AbstractPath, dtype: torch.dtype = torch.float3
40
37
  """Read an image file as a PyTorch tensor
41
38
 
42
39
  Args:
43
- ----
44
40
  img_path: location of the image file
45
41
  dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
46
42
 
47
43
  Returns:
48
- -------
49
44
  decoded image as a tensor
50
45
  """
51
46
  if dtype not in (torch.uint8, torch.float16, torch.float32):
@@ -59,12 +54,10 @@ def decode_img_as_tensor(img_content: bytes, dtype: torch.dtype = torch.float32)
59
54
  """Read a byte stream as a PyTorch tensor
60
55
 
61
56
  Args:
62
- ----
63
57
  img_content: bytes of a decoded image
64
58
  dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
65
59
 
66
60
  Returns:
67
- -------
68
61
  decoded image as a tensor
69
62
  """
70
63
  if dtype not in (torch.uint8, torch.float16, torch.float32):
@@ -78,12 +71,10 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: torch.dtype = torch.float32) -
78
71
  """Read an image file as a PyTorch tensor
79
72
 
80
73
  Args:
81
- ----
82
74
  npy_img: image encoded as a numpy array of shape (H, W, C) in np.uint8
83
75
  dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
84
76
 
85
77
  Returns:
86
- -------
87
78
  same image as a tensor of shape (C, H, W)
88
79
  """
89
80
  if dtype not in (torch.uint8, torch.float16, torch.float32):
@@ -102,6 +93,6 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: torch.dtype = torch.float32) -
102
93
  return img
103
94
 
104
95
 
105
- def get_img_shape(img: torch.Tensor) -> Tuple[int, int]:
96
+ def get_img_shape(img: torch.Tensor) -> tuple[int, int]:
106
97
  """Get the shape of an image"""
107
- return img.shape[-2:] # type: ignore[return-value]
98
+ return img.shape[-2:]
doctr/io/image/tensorflow.py CHANGED
@@ -1,9 +1,8 @@
1
- # Copyright (C) 2021-2024, Mindee.
1
+ # Copyright (C) 2021-2025, Mindee.
2
2
 
3
3
  # This program is licensed under the Apache License 2.0.
4
4
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
5
 
6
- from typing import Tuple
7
6
 
8
7
  import numpy as np
9
8
  import tensorflow as tf
@@ -19,12 +18,10 @@ def tensor_from_pil(pil_img: Image.Image, dtype: tf.dtypes.DType = tf.float32) -
19
18
  """Convert a PIL Image to a TensorFlow tensor
20
19
 
21
20
  Args:
22
- ----
23
21
  pil_img: a PIL image
24
22
  dtype: the output tensor data type
25
23
 
26
24
  Returns:
27
- -------
28
25
  decoded image as tensor
29
26
  """
30
27
  npy_img = img_to_array(pil_img)
@@ -36,12 +33,10 @@ def read_img_as_tensor(img_path: AbstractPath, dtype: tf.dtypes.DType = tf.float
36
33
  """Read an image file as a TensorFlow tensor
37
34
 
38
35
  Args:
39
- ----
40
36
  img_path: location of the image file
41
37
  dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
42
38
 
43
39
  Returns:
44
- -------
45
40
  decoded image as a tensor
46
41
  """
47
42
  if dtype not in (tf.uint8, tf.float16, tf.float32):
@@ -61,12 +56,10 @@ def decode_img_as_tensor(img_content: bytes, dtype: tf.dtypes.DType = tf.float32
61
56
  """Read a byte stream as a TensorFlow tensor
62
57
 
63
58
  Args:
64
- ----
65
59
  img_content: bytes of a decoded image
66
60
  dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
67
61
 
68
62
  Returns:
69
- -------
70
63
  decoded image as a tensor
71
64
  """
72
65
  if dtype not in (tf.uint8, tf.float16, tf.float32):
@@ -85,12 +78,10 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: tf.dtypes.DType = tf.float32)
85
78
  """Read an image file as a TensorFlow tensor
86
79
 
87
80
  Args:
88
- ----
89
81
  npy_img: image encoded as a numpy array of shape (H, W, C) in np.uint8
90
82
  dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
91
83
 
92
84
  Returns:
93
- -------
94
85
  same image as a tensor of shape (H, W, C)
95
86
  """
96
87
  if dtype not in (tf.uint8, tf.float16, tf.float32):
@@ -105,6 +96,6 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: tf.dtypes.DType = tf.float32)
105
96
  return img
106
97
 
107
98
 
108
- def get_img_shape(img: tf.Tensor) -> Tuple[int, int]:
99
+ def get_img_shape(img: tf.Tensor) -> tuple[int, int]:
109
100
  """Get the shape of an image"""
110
101
  return img.shape[:2]
doctr/io/pdf.py CHANGED
@@ -1,9 +1,9 @@
1
- # Copyright (C) 2021-2024, Mindee.
1
+ # Copyright (C) 2021-2025, Mindee.
2
2
 
3
3
  # This program is licensed under the Apache License 2.0.
4
4
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
5
 
6
- from typing import Any, List, Optional
6
+ from typing import Any
7
7
 
8
8
  import numpy as np
9
9
  import pypdfium2 as pdfium
@@ -15,18 +15,17 @@ __all__ = ["read_pdf"]
15
15
 
16
16
  def read_pdf(
17
17
  file: AbstractFile,
18
- scale: float = 2,
18
+ scale: int = 2,
19
19
  rgb_mode: bool = True,
20
- password: Optional[str] = None,
20
+ password: str | None = None,
21
21
  **kwargs: Any,
22
- ) -> List[np.ndarray]:
22
+ ) -> list[np.ndarray]:
23
23
  """Read a PDF file and convert it into an image in numpy format
24
24
 
25
25
  >>> from doctr.io import read_pdf
26
26
  >>> doc = read_pdf("path/to/your/doc.pdf")
27
27
 
28
28
  Args:
29
- ----
30
29
  file: the path to the PDF file
31
30
  scale: rendering scale (1 corresponds to 72dpi)
32
31
  rgb_mode: if True, the output will be RGB, otherwise BGR
@@ -34,7 +33,6 @@ def read_pdf(
34
33
  **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`
35
34
 
36
35
  Returns:
37
- -------
38
36
  the list of pages decoded as numpy ndarray of shape H x W x C
39
37
  """
40
38
  # Rasterise pages to numpy ndarrays with pypdfium2
doctr/io/reader.py CHANGED
@@ -1,10 +1,10 @@
1
- # Copyright (C) 2021-2024, Mindee.
1
+ # Copyright (C) 2021-2025, Mindee.
2
2
 
3
3
  # This program is licensed under the Apache License 2.0.
4
4
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
5
 
6
+ from collections.abc import Sequence
6
7
  from pathlib import Path
7
- from typing import List, Sequence, Union
8
8
 
9
9
  import numpy as np
10
10
 
@@ -22,37 +22,33 @@ class DocumentFile:
22
22
  """Read a document from multiple extensions"""
23
23
 
24
24
  @classmethod
25
- def from_pdf(cls, file: AbstractFile, **kwargs) -> List[np.ndarray]:
25
+ def from_pdf(cls, file: AbstractFile, **kwargs) -> list[np.ndarray]:
26
26
  """Read a PDF file
27
27
 
28
28
  >>> from doctr.io import DocumentFile
29
29
  >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
30
30
 
31
31
  Args:
32
- ----
33
32
  file: the path to the PDF file or a binary stream
34
33
  **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`
35
34
 
36
35
  Returns:
37
- -------
38
36
  the list of pages decoded as numpy ndarray of shape H x W x 3
39
37
  """
40
38
  return read_pdf(file, **kwargs)
41
39
 
42
40
  @classmethod
43
- def from_url(cls, url: str, **kwargs) -> List[np.ndarray]:
41
+ def from_url(cls, url: str, **kwargs) -> list[np.ndarray]:
44
42
  """Interpret a web page as a PDF document
45
43
 
46
44
  >>> from doctr.io import DocumentFile
47
45
  >>> doc = DocumentFile.from_url("https://www.yoursite.com")
48
46
 
49
47
  Args:
50
- ----
51
48
  url: the URL of the target web page
52
49
  **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`
53
50
 
54
51
  Returns:
55
- -------
56
52
  the list of pages decoded as numpy ndarray of shape H x W x 3
57
53
  """
58
54
  requires_package(
@@ -64,19 +60,17 @@ class DocumentFile:
64
60
  return cls.from_pdf(pdf_stream, **kwargs)
65
61
 
66
62
  @classmethod
67
- def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]:
63
+ def from_images(cls, files: Sequence[AbstractFile] | AbstractFile, **kwargs) -> list[np.ndarray]:
68
64
  """Read an image file (or a collection of image files) and convert it into an image in numpy format
69
65
 
70
66
  >>> from doctr.io import DocumentFile
71
67
  >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
72
68
 
73
69
  Args:
74
- ----
75
70
  files: the path to the image file or a binary stream, or a collection of those
76
71
  **kwargs: additional parameters to :meth:`doctr.io.image.read_img_as_numpy`
77
72
 
78
73
  Returns:
79
- -------
80
74
  the list of pages decoded as numpy ndarray of shape H x W x 3
81
75
  """
82
76
  if isinstance(files, (str, Path, bytes)):
doctr/models/_utils.py CHANGED
@@ -1,11 +1,11 @@
1
- # Copyright (C) 2021-2024, Mindee.
1
+ # Copyright (C) 2021-2025, Mindee.
2
2
 
3
3
  # This program is licensed under the Apache License 2.0.
4
4
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
5
 
6
6
  from math import floor
7
7
  from statistics import median_low
8
- from typing import Any, Dict, List, Optional, Tuple, Union
8
+ from typing import Any
9
9
 
10
10
  import cv2
11
11
  import numpy as np
@@ -20,11 +20,9 @@ def get_max_width_length_ratio(contour: np.ndarray) -> float:
20
20
  """Get the maximum shape ratio of a contour.
21
21
 
22
22
  Args:
23
- ----
24
23
  contour: the contour from cv2.findContour
25
24
 
26
25
  Returns:
27
- -------
28
26
  the maximum shape ratio
29
27
  """
30
28
  _, (w, h), _ = cv2.minAreaRect(contour)
@@ -33,7 +31,7 @@ def get_max_width_length_ratio(contour: np.ndarray) -> float:
33
31
 
34
32
  def estimate_orientation(
35
33
  img: np.ndarray,
36
- general_page_orientation: Optional[Tuple[int, float]] = None,
34
+ general_page_orientation: tuple[int, float] | None = None,
37
35
  n_ct: int = 70,
38
36
  ratio_threshold_for_lines: float = 3,
39
37
  min_confidence: float = 0.2,
@@ -43,7 +41,6 @@ def estimate_orientation(
43
41
  lines of the document and the assumption that they should be horizontal.
44
42
 
45
43
  Args:
46
- ----
47
44
  img: the img or bitmap to analyze (H, W, C)
48
45
  general_page_orientation: the general orientation of the page (angle [0, 90, 180, 270 (-90)], confidence)
49
46
  estimated by a model
@@ -53,7 +50,6 @@ def estimate_orientation(
53
50
  lower_area: the minimum area of a contour to be considered
54
51
 
55
52
  Returns:
56
- -------
57
53
  the estimated angle of the page (clockwise, negative for left side rotation, positive for right side rotation)
58
54
  """
59
55
  assert len(img.shape) == 3 and img.shape[-1] in [1, 3], f"Image shape {img.shape} not supported"
@@ -64,13 +60,13 @@ def estimate_orientation(
64
60
  gray_img = cv2.medianBlur(gray_img, 5)
65
61
  thresh = cv2.threshold(gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
66
62
  else:
67
- thresh = img.astype(np.uint8) # type: ignore[assignment]
63
+ thresh = img.astype(np.uint8)
68
64
 
69
65
  page_orientation, orientation_confidence = general_page_orientation or (None, 0.0)
70
66
  if page_orientation and orientation_confidence >= min_confidence:
71
67
  # We rotate the image to the general orientation which improves the detection
72
68
  # No expand needed bitmap is already padded
73
- thresh = rotate_image(thresh, -page_orientation) # type: ignore
69
+ thresh = rotate_image(thresh, -page_orientation)
74
70
  else: # That's only required if we do not work on the detection models bin map
75
71
  # try to merge words in lines
76
72
  (h, w) = img.shape[:2]
@@ -91,7 +87,7 @@ def estimate_orientation(
91
87
 
92
88
  angles = []
93
89
  for contour in contours[:n_ct]:
94
- _, (w, h), angle = cv2.minAreaRect(contour) # type: ignore[assignment]
90
+ _, (w, h), angle = cv2.minAreaRect(contour)
95
91
  if w / h > ratio_threshold_for_lines: # select only contours with ratio like lines
96
92
  angles.append(angle)
97
93
  elif w / h < 1 / ratio_threshold_for_lines: # if lines are vertical, substract 90 degree
@@ -119,9 +115,9 @@ def estimate_orientation(
119
115
 
120
116
 
121
117
  def rectify_crops(
122
- crops: List[np.ndarray],
123
- orientations: List[int],
124
- ) -> List[np.ndarray]:
118
+ crops: list[np.ndarray],
119
+ orientations: list[int],
120
+ ) -> list[np.ndarray]:
125
121
  """Rotate each crop of the list according to the predicted orientation:
126
122
  0: already straight, no rotation
127
123
  1: 90 ccw, rotate 3 times ccw
@@ -139,8 +135,8 @@ def rectify_crops(
139
135
 
140
136
  def rectify_loc_preds(
141
137
  page_loc_preds: np.ndarray,
142
- orientations: List[int],
143
- ) -> Optional[np.ndarray]:
138
+ orientations: list[int],
139
+ ) -> np.ndarray | None:
144
140
  """Orient the quadrangle (Polygon4P) according to the predicted orientation,
145
141
  so that the points are in this order: top L, top R, bot R, bot L if the crop is readable
146
142
  """
@@ -157,16 +153,14 @@ def rectify_loc_preds(
157
153
  )
158
154
 
159
155
 
160
- def get_language(text: str) -> Tuple[str, float]:
156
+ def get_language(text: str) -> tuple[str, float]:
161
157
  """Get languages of a text using langdetect model.
162
158
  Get the language with the highest probability or no language if only a few words or a low probability
163
159
 
164
160
  Args:
165
- ----
166
161
  text (str): text
167
162
 
168
163
  Returns:
169
- -------
170
164
  The detected language in ISO 639 code and confidence score
171
165
  """
172
166
  try:
@@ -179,16 +173,14 @@ def get_language(text: str) -> Tuple[str, float]:
179
173
 
180
174
 
181
175
  def invert_data_structure(
182
- x: Union[List[Dict[str, Any]], Dict[str, List[Any]]],
183
- ) -> Union[List[Dict[str, Any]], Dict[str, List[Any]]]:
184
- """Invert a List of Dict of elements to a Dict of list of elements and the other way around
176
+ x: list[dict[str, Any]] | dict[str, list[Any]],
177
+ ) -> list[dict[str, Any]] | dict[str, list[Any]]:
178
+ """Invert a list of dict of elements to a dict of list of elements and the other way around
185
179
 
186
180
  Args:
187
- ----
188
181
  x: a list of dictionaries with the same keys or a dictionary of lists of the same length
189
182
 
190
183
  Returns:
191
- -------
192
184
  dictionary of list when x is a list of dictionaries or a list of dictionaries when x is dictionary of lists
193
185
  """
194
186
  if isinstance(x, dict):
doctr/models/builder.py CHANGED
@@ -1,10 +1,10 @@
1
- # Copyright (C) 2021-2024, Mindee.
1
+ # Copyright (C) 2021-2025, Mindee.
2
2
 
3
3
  # This program is licensed under the Apache License 2.0.
4
4
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
5
 
6
6
 
7
- from typing import Any, Dict, List, Optional, Tuple
7
+ from typing import Any
8
8
 
9
9
  import numpy as np
10
10
  from scipy.cluster.hierarchy import fclusterdata
@@ -20,7 +20,6 @@ class DocumentBuilder(NestedObject):
20
20
  """Implements a document builder
21
21
 
22
22
  Args:
23
- ----
24
23
  resolve_lines: whether words should be automatically grouped into lines
25
24
  resolve_blocks: whether lines should be automatically grouped into blocks
26
25
  paragraph_break: relative length of the minimum space separating paragraphs
@@ -41,15 +40,13 @@ class DocumentBuilder(NestedObject):
41
40
  self.export_as_straight_boxes = export_as_straight_boxes
42
41
 
43
42
  @staticmethod
44
- def _sort_boxes(boxes: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
43
+ def _sort_boxes(boxes: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
45
44
  """Sort bounding boxes from top to bottom, left to right
46
45
 
47
46
  Args:
48
- ----
49
47
  boxes: bounding boxes of shape (N, 4) or (N, 4, 2) (in case of rotated bbox)
50
48
 
51
49
  Returns:
52
- -------
53
50
  tuple: indices of ordered boxes of shape (N,), boxes
54
51
  If straight boxes are passed tpo the function, boxes are unchanged
55
52
  else: boxes returned are straight boxes fitted to the straightened rotated boxes
@@ -65,16 +62,14 @@ class DocumentBuilder(NestedObject):
65
62
  boxes = np.concatenate((boxes.min(1), boxes.max(1)), -1)
66
63
  return (boxes[:, 0] + 2 * boxes[:, 3] / np.median(boxes[:, 3] - boxes[:, 1])).argsort(), boxes
67
64
 
68
- def _resolve_sub_lines(self, boxes: np.ndarray, word_idcs: List[int]) -> List[List[int]]:
65
+ def _resolve_sub_lines(self, boxes: np.ndarray, word_idcs: list[int]) -> list[list[int]]:
69
66
  """Split a line in sub_lines
70
67
 
71
68
  Args:
72
- ----
73
69
  boxes: bounding boxes of shape (N, 4)
74
70
  word_idcs: list of indexes for the words of the line
75
71
 
76
72
  Returns:
77
- -------
78
73
  A list of (sub-)lines computed from the original line (words)
79
74
  """
80
75
  lines = []
@@ -105,15 +100,13 @@ class DocumentBuilder(NestedObject):
105
100
 
106
101
  return lines
107
102
 
108
- def _resolve_lines(self, boxes: np.ndarray) -> List[List[int]]:
103
+ def _resolve_lines(self, boxes: np.ndarray) -> list[list[int]]:
109
104
  """Order boxes to group them in lines
110
105
 
111
106
  Args:
112
- ----
113
107
  boxes: bounding boxes of shape (N, 4) or (N, 4, 2) in case of rotated bbox
114
108
 
115
109
  Returns:
116
- -------
117
110
  nested list of box indices
118
111
  """
119
112
  # Sort boxes, and straighten the boxes if they are rotated
@@ -153,16 +146,14 @@ class DocumentBuilder(NestedObject):
153
146
  return lines
154
147
 
155
148
  @staticmethod
156
- def _resolve_blocks(boxes: np.ndarray, lines: List[List[int]]) -> List[List[List[int]]]:
149
+ def _resolve_blocks(boxes: np.ndarray, lines: list[list[int]]) -> list[list[list[int]]]:
157
150
  """Order lines to group them in blocks
158
151
 
159
152
  Args:
160
- ----
161
153
  boxes: bounding boxes of shape (N, 4) or (N, 4, 2)
162
154
  lines: list of lines, each line is a list of idx
163
155
 
164
156
  Returns:
165
- -------
166
157
  nested list of box indices
167
158
  """
168
159
  # Resolve enclosing boxes of lines
@@ -207,7 +198,7 @@ class DocumentBuilder(NestedObject):
207
198
  # Compute clusters
208
199
  clusters = fclusterdata(box_features, t=0.1, depth=4, criterion="distance", metric="euclidean")
209
200
 
210
- _blocks: Dict[int, List[int]] = {}
201
+ _blocks: dict[int, list[int]] = {}
211
202
  # Form clusters
212
203
  for line_idx, cluster_idx in enumerate(clusters):
213
204
  if cluster_idx in _blocks.keys():
@@ -224,13 +215,12 @@ class DocumentBuilder(NestedObject):
224
215
  self,
225
216
  boxes: np.ndarray,
226
217
  objectness_scores: np.ndarray,
227
- word_preds: List[Tuple[str, float]],
228
- crop_orientations: List[Dict[str, Any]],
229
- ) -> List[Block]:
218
+ word_preds: list[tuple[str, float]],
219
+ crop_orientations: list[dict[str, Any]],
220
+ ) -> list[Block]:
230
221
  """Gather independent words in structured blocks
231
222
 
232
223
  Args:
233
- ----
234
224
  boxes: bounding boxes of all detected words of the page, of shape (N, 4) or (N, 4, 2)
235
225
  objectness_scores: objectness scores of all detected words of the page, of shape N
236
226
  word_preds: list of all detected words of the page, of shape N
@@ -238,7 +228,6 @@ class DocumentBuilder(NestedObject):
238
228
  the general orientation (orientations + confidences) of the crops
239
229
 
240
230
  Returns:
241
- -------
242
231
  list of block elements
243
232
  """
244
233
  if boxes.shape[0] != len(word_preds):
@@ -295,19 +284,18 @@ class DocumentBuilder(NestedObject):
295
284
 
296
285
  def __call__(
297
286
  self,
298
- pages: List[np.ndarray],
299
- boxes: List[np.ndarray],
300
- objectness_scores: List[np.ndarray],
301
- text_preds: List[List[Tuple[str, float]]],
302
- page_shapes: List[Tuple[int, int]],
303
- crop_orientations: List[Dict[str, Any]],
304
- orientations: Optional[List[Dict[str, Any]]] = None,
305
- languages: Optional[List[Dict[str, Any]]] = None,
287
+ pages: list[np.ndarray],
288
+ boxes: list[np.ndarray],
289
+ objectness_scores: list[np.ndarray],
290
+ text_preds: list[list[tuple[str, float]]],
291
+ page_shapes: list[tuple[int, int]],
292
+ crop_orientations: list[dict[str, Any]],
293
+ orientations: list[dict[str, Any]] | None = None,
294
+ languages: list[dict[str, Any]] | None = None,
306
295
  ) -> Document:
307
296
  """Re-arrange detected words into structured blocks
308
297
 
309
298
  Args:
310
- ----
311
299
  pages: list of N elements, where each element represents the page image
312
300
  boxes: list of N elements, where each element represents the localization predictions, of shape (*, 4)
313
301
  or (*, 4, 2) for all words for a given page
@@ -322,7 +310,6 @@ class DocumentBuilder(NestedObject):
322
310
  where each element is a dictionary containing the language (language + confidence)
323
311
 
324
312
  Returns:
325
- -------
326
313
  document object
327
314
  """
328
315
  if len(boxes) != len(text_preds) != len(crop_orientations) != len(objectness_scores) or len(boxes) != len(
@@ -374,7 +361,6 @@ class KIEDocumentBuilder(DocumentBuilder):
374
361
  """Implements a KIE document builder
375
362
 
376
363
  Args:
377
- ----
378
364
  resolve_lines: whether words should be automatically grouped into lines
379
365
  resolve_blocks: whether lines should be automatically grouped into blocks
380
366
  paragraph_break: relative length of the minimum space separating paragraphs
@@ -384,19 +370,18 @@ class KIEDocumentBuilder(DocumentBuilder):
384
370
 
385
371
  def __call__( # type: ignore[override]
386
372
  self,
387
- pages: List[np.ndarray],
388
- boxes: List[Dict[str, np.ndarray]],
389
- objectness_scores: List[Dict[str, np.ndarray]],
390
- text_preds: List[Dict[str, List[Tuple[str, float]]]],
391
- page_shapes: List[Tuple[int, int]],
392
- crop_orientations: List[Dict[str, List[Dict[str, Any]]]],
393
- orientations: Optional[List[Dict[str, Any]]] = None,
394
- languages: Optional[List[Dict[str, Any]]] = None,
373
+ pages: list[np.ndarray],
374
+ boxes: list[dict[str, np.ndarray]],
375
+ objectness_scores: list[dict[str, np.ndarray]],
376
+ text_preds: list[dict[str, list[tuple[str, float]]]],
377
+ page_shapes: list[tuple[int, int]],
378
+ crop_orientations: list[dict[str, list[dict[str, Any]]]],
379
+ orientations: list[dict[str, Any]] | None = None,
380
+ languages: list[dict[str, Any]] | None = None,
395
381
  ) -> KIEDocument:
396
382
  """Re-arrange detected words into structured predictions
397
383
 
398
384
  Args:
399
- ----
400
385
  pages: list of N elements, where each element represents the page image
401
386
  boxes: list of N dictionaries, where each element represents the localization predictions for a class,
402
387
  of shape (*, 5) or (*, 6) for all predictions
@@ -411,7 +396,6 @@ class KIEDocumentBuilder(DocumentBuilder):
411
396
  where each element is a dictionary containing the language (language + confidence)
412
397
 
413
398
  Returns:
414
- -------
415
399
  document object
416
400
  """
417
401
  if len(boxes) != len(text_preds) != len(crop_orientations) != len(objectness_scores) or len(boxes) != len(
@@ -425,7 +409,7 @@ class KIEDocumentBuilder(DocumentBuilder):
425
409
  if self.export_as_straight_boxes and len(boxes) > 0:
426
410
  # If boxes are already straight OK, else fit a bounding rect
427
411
  if next(iter(boxes[0].values())).ndim == 3:
428
- straight_boxes: List[Dict[str, np.ndarray]] = []
412
+ straight_boxes: list[dict[str, np.ndarray]] = []
429
413
  # Iterate over pages
430
414
  for p_boxes in boxes:
431
415
  # Iterate over boxes of the pages
@@ -471,20 +455,18 @@ class KIEDocumentBuilder(DocumentBuilder):
471
455
  self,
472
456
  boxes: np.ndarray,
473
457
  objectness_scores: np.ndarray,
474
- word_preds: List[Tuple[str, float]],
475
- crop_orientations: List[Dict[str, Any]],
476
- ) -> List[Prediction]:
458
+ word_preds: list[tuple[str, float]],
459
+ crop_orientations: list[dict[str, Any]],
460
+ ) -> list[Prediction]:
477
461
  """Gather independent words in structured blocks
478
462
 
479
463
  Args:
480
- ----
481
464
  boxes: bounding boxes of all detected words of the page, of shape (N, 4) or (N, 4, 2)
482
465
  objectness_scores: objectness scores of all detected words of the page
483
466
  word_preds: list of all detected words of the page, of shape N
484
467
  crop_orientations: list of orientations for each word crop
485
468
 
486
469
  Returns:
487
- -------
488
470
  list of block elements
489
471
  """
490
472
  if boxes.shape[0] != len(word_preds):
doctr/models/classification/__init__.py CHANGED
@@ -4,4 +4,5 @@ from .vgg import *
4
4
  from .magc_resnet import *
5
5
  from .vit import *
6
6
  from .textnet import *
7
+ from .vip import *
7
8
  from .zoo import *
doctr/models/classification/magc_resnet/__init__.py CHANGED
@@ -1,6 +1,6 @@
1
1
  from doctr.file_utils import is_tf_available, is_torch_available
2
2
 
3
- if is_tf_available():
3
+ if is_torch_available():
4
+ from .pytorch import *
5
+ elif is_tf_available():
4
6
  from .tensorflow import *
5
- elif is_torch_available():
6
- from .pytorch import * # type: ignore[assignment]