python-doctr 0.10.0__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. doctr/contrib/__init__.py +1 -0
  2. doctr/contrib/artefacts.py +7 -9
  3. doctr/contrib/base.py +8 -17
  4. doctr/datasets/__init__.py +1 -0
  5. doctr/datasets/coco_text.py +139 -0
  6. doctr/datasets/cord.py +10 -8
  7. doctr/datasets/datasets/__init__.py +4 -4
  8. doctr/datasets/datasets/base.py +16 -16
  9. doctr/datasets/datasets/pytorch.py +12 -12
  10. doctr/datasets/datasets/tensorflow.py +10 -10
  11. doctr/datasets/detection.py +6 -9
  12. doctr/datasets/doc_artefacts.py +3 -4
  13. doctr/datasets/funsd.py +9 -8
  14. doctr/datasets/generator/__init__.py +4 -4
  15. doctr/datasets/generator/base.py +16 -17
  16. doctr/datasets/generator/pytorch.py +1 -3
  17. doctr/datasets/generator/tensorflow.py +1 -3
  18. doctr/datasets/ic03.py +5 -6
  19. doctr/datasets/ic13.py +6 -6
  20. doctr/datasets/iiit5k.py +10 -6
  21. doctr/datasets/iiithws.py +4 -5
  22. doctr/datasets/imgur5k.py +15 -7
  23. doctr/datasets/loader.py +4 -7
  24. doctr/datasets/mjsynth.py +6 -5
  25. doctr/datasets/ocr.py +3 -4
  26. doctr/datasets/orientation.py +3 -4
  27. doctr/datasets/recognition.py +4 -5
  28. doctr/datasets/sroie.py +6 -5
  29. doctr/datasets/svhn.py +7 -6
  30. doctr/datasets/svt.py +6 -7
  31. doctr/datasets/synthtext.py +19 -7
  32. doctr/datasets/utils.py +41 -35
  33. doctr/datasets/vocabs.py +1107 -49
  34. doctr/datasets/wildreceipt.py +14 -10
  35. doctr/file_utils.py +11 -7
  36. doctr/io/elements.py +96 -82
  37. doctr/io/html.py +1 -3
  38. doctr/io/image/__init__.py +3 -3
  39. doctr/io/image/base.py +2 -5
  40. doctr/io/image/pytorch.py +3 -12
  41. doctr/io/image/tensorflow.py +2 -11
  42. doctr/io/pdf.py +5 -7
  43. doctr/io/reader.py +5 -11
  44. doctr/models/_utils.py +15 -23
  45. doctr/models/builder.py +30 -48
  46. doctr/models/classification/__init__.py +1 -0
  47. doctr/models/classification/magc_resnet/__init__.py +3 -3
  48. doctr/models/classification/magc_resnet/pytorch.py +11 -15
  49. doctr/models/classification/magc_resnet/tensorflow.py +11 -14
  50. doctr/models/classification/mobilenet/__init__.py +3 -3
  51. doctr/models/classification/mobilenet/pytorch.py +20 -18
  52. doctr/models/classification/mobilenet/tensorflow.py +19 -23
  53. doctr/models/classification/predictor/__init__.py +4 -4
  54. doctr/models/classification/predictor/pytorch.py +7 -9
  55. doctr/models/classification/predictor/tensorflow.py +6 -8
  56. doctr/models/classification/resnet/__init__.py +4 -4
  57. doctr/models/classification/resnet/pytorch.py +47 -34
  58. doctr/models/classification/resnet/tensorflow.py +45 -35
  59. doctr/models/classification/textnet/__init__.py +3 -3
  60. doctr/models/classification/textnet/pytorch.py +20 -18
  61. doctr/models/classification/textnet/tensorflow.py +19 -17
  62. doctr/models/classification/vgg/__init__.py +3 -3
  63. doctr/models/classification/vgg/pytorch.py +21 -8
  64. doctr/models/classification/vgg/tensorflow.py +20 -14
  65. doctr/models/classification/vip/__init__.py +4 -0
  66. doctr/models/classification/vip/layers/__init__.py +4 -0
  67. doctr/models/classification/vip/layers/pytorch.py +615 -0
  68. doctr/models/classification/vip/pytorch.py +505 -0
  69. doctr/models/classification/vit/__init__.py +3 -3
  70. doctr/models/classification/vit/pytorch.py +18 -15
  71. doctr/models/classification/vit/tensorflow.py +15 -12
  72. doctr/models/classification/zoo.py +23 -14
  73. doctr/models/core.py +3 -3
  74. doctr/models/detection/_utils/__init__.py +4 -4
  75. doctr/models/detection/_utils/base.py +4 -7
  76. doctr/models/detection/_utils/pytorch.py +1 -5
  77. doctr/models/detection/_utils/tensorflow.py +1 -5
  78. doctr/models/detection/core.py +2 -8
  79. doctr/models/detection/differentiable_binarization/__init__.py +4 -4
  80. doctr/models/detection/differentiable_binarization/base.py +10 -21
  81. doctr/models/detection/differentiable_binarization/pytorch.py +37 -31
  82. doctr/models/detection/differentiable_binarization/tensorflow.py +26 -29
  83. doctr/models/detection/fast/__init__.py +4 -4
  84. doctr/models/detection/fast/base.py +8 -17
  85. doctr/models/detection/fast/pytorch.py +37 -35
  86. doctr/models/detection/fast/tensorflow.py +24 -28
  87. doctr/models/detection/linknet/__init__.py +4 -4
  88. doctr/models/detection/linknet/base.py +8 -18
  89. doctr/models/detection/linknet/pytorch.py +34 -28
  90. doctr/models/detection/linknet/tensorflow.py +24 -25
  91. doctr/models/detection/predictor/__init__.py +5 -5
  92. doctr/models/detection/predictor/pytorch.py +6 -7
  93. doctr/models/detection/predictor/tensorflow.py +5 -6
  94. doctr/models/detection/zoo.py +27 -7
  95. doctr/models/factory/hub.py +6 -10
  96. doctr/models/kie_predictor/__init__.py +5 -5
  97. doctr/models/kie_predictor/base.py +4 -5
  98. doctr/models/kie_predictor/pytorch.py +19 -20
  99. doctr/models/kie_predictor/tensorflow.py +14 -15
  100. doctr/models/modules/layers/__init__.py +3 -3
  101. doctr/models/modules/layers/pytorch.py +55 -10
  102. doctr/models/modules/layers/tensorflow.py +5 -7
  103. doctr/models/modules/transformer/__init__.py +3 -3
  104. doctr/models/modules/transformer/pytorch.py +12 -13
  105. doctr/models/modules/transformer/tensorflow.py +9 -10
  106. doctr/models/modules/vision_transformer/__init__.py +3 -3
  107. doctr/models/modules/vision_transformer/pytorch.py +2 -3
  108. doctr/models/modules/vision_transformer/tensorflow.py +3 -3
  109. doctr/models/predictor/__init__.py +5 -5
  110. doctr/models/predictor/base.py +28 -29
  111. doctr/models/predictor/pytorch.py +13 -14
  112. doctr/models/predictor/tensorflow.py +9 -10
  113. doctr/models/preprocessor/__init__.py +4 -4
  114. doctr/models/preprocessor/pytorch.py +13 -17
  115. doctr/models/preprocessor/tensorflow.py +10 -14
  116. doctr/models/recognition/__init__.py +1 -0
  117. doctr/models/recognition/core.py +3 -7
  118. doctr/models/recognition/crnn/__init__.py +4 -4
  119. doctr/models/recognition/crnn/pytorch.py +30 -29
  120. doctr/models/recognition/crnn/tensorflow.py +21 -24
  121. doctr/models/recognition/master/__init__.py +3 -3
  122. doctr/models/recognition/master/base.py +3 -7
  123. doctr/models/recognition/master/pytorch.py +32 -25
  124. doctr/models/recognition/master/tensorflow.py +22 -25
  125. doctr/models/recognition/parseq/__init__.py +3 -3
  126. doctr/models/recognition/parseq/base.py +3 -7
  127. doctr/models/recognition/parseq/pytorch.py +47 -29
  128. doctr/models/recognition/parseq/tensorflow.py +29 -27
  129. doctr/models/recognition/predictor/__init__.py +5 -5
  130. doctr/models/recognition/predictor/_utils.py +111 -52
  131. doctr/models/recognition/predictor/pytorch.py +9 -9
  132. doctr/models/recognition/predictor/tensorflow.py +8 -9
  133. doctr/models/recognition/sar/__init__.py +4 -4
  134. doctr/models/recognition/sar/pytorch.py +30 -22
  135. doctr/models/recognition/sar/tensorflow.py +22 -24
  136. doctr/models/recognition/utils.py +57 -53
  137. doctr/models/recognition/viptr/__init__.py +4 -0
  138. doctr/models/recognition/viptr/pytorch.py +277 -0
  139. doctr/models/recognition/vitstr/__init__.py +4 -4
  140. doctr/models/recognition/vitstr/base.py +3 -7
  141. doctr/models/recognition/vitstr/pytorch.py +28 -21
  142. doctr/models/recognition/vitstr/tensorflow.py +22 -23
  143. doctr/models/recognition/zoo.py +27 -11
  144. doctr/models/utils/__init__.py +4 -4
  145. doctr/models/utils/pytorch.py +41 -34
  146. doctr/models/utils/tensorflow.py +31 -23
  147. doctr/models/zoo.py +1 -5
  148. doctr/transforms/functional/__init__.py +3 -3
  149. doctr/transforms/functional/base.py +4 -11
  150. doctr/transforms/functional/pytorch.py +20 -28
  151. doctr/transforms/functional/tensorflow.py +10 -22
  152. doctr/transforms/modules/__init__.py +4 -4
  153. doctr/transforms/modules/base.py +48 -55
  154. doctr/transforms/modules/pytorch.py +58 -22
  155. doctr/transforms/modules/tensorflow.py +18 -32
  156. doctr/utils/common_types.py +8 -9
  157. doctr/utils/data.py +9 -13
  158. doctr/utils/fonts.py +2 -7
  159. doctr/utils/geometry.py +17 -48
  160. doctr/utils/metrics.py +17 -37
  161. doctr/utils/multithreading.py +4 -6
  162. doctr/utils/reconstitution.py +9 -13
  163. doctr/utils/repr.py +2 -3
  164. doctr/utils/visualization.py +16 -29
  165. doctr/version.py +1 -1
  166. {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/METADATA +70 -52
  167. python_doctr-0.12.0.dist-info/RECORD +180 -0
  168. {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/WHEEL +1 -1
  169. python_doctr-0.10.0.dist-info/RECORD +0 -173
  170. {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info/licenses}/LICENSE +0 -0
  171. {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/top_level.txt +0 -0
  172. {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/zip-safe +0 -0
doctr/datasets/funsd.py CHANGED
@@ -1,4 +1,4 @@
- # Copyright (C) 2021-2024, Mindee.
+ # Copyright (C) 2021-2025, Mindee.

  # This program is licensed under the Apache License 2.0.
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -6,7 +6,7 @@
  import json
  import os
  from pathlib import Path
- from typing import Any, Dict, List, Tuple, Union
+ from typing import Any

  import numpy as np
  from tqdm import tqdm
@@ -29,7 +29,6 @@ class FUNSD(VisionDataset):
  >>> img, target = train_set[0]

  Args:
- ----
  train: whether the subset should be the training one
  use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
  recognition_task: whether the dataset should be used for recognition task
@@ -69,10 +68,12 @@ class FUNSD(VisionDataset):
  # Use the subset
  subfolder = os.path.join("dataset", "training_data" if train else "testing_data")

- # # List images
+ # # list images
  tmp_root = os.path.join(self.root, subfolder, "images")
- self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
- for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking FUNSD", total=len(os.listdir(tmp_root))):
+ self.data: list[tuple[str | np.ndarray, str | dict[str, Any] | np.ndarray]] = []
+ for img_path in tqdm(
+ iterable=os.listdir(tmp_root), desc="Preparing and Loading FUNSD", total=len(os.listdir(tmp_root))
+ ):
  # File existence check
  if not os.path.exists(os.path.join(tmp_root, img_path)):
  raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
@@ -106,8 +107,8 @@ class FUNSD(VisionDataset):
  )
  for crop, label in zip(crops, list(text_targets)):
  # filter labels with unknown characters
- if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]):
- self.data.append((crop, label))
+ if not any(char in label for char in ["☑", "☐", "\u03bf", "\uf703", "\uf702", " "]):
+ self.data.append((crop, label.replace("–", "-")))
  elif detection_task:
  self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype)))
  else:
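For orientation, a minimal usage sketch consistent with the class docstring above (the `download` flag follows the usual `VisionDataset` pattern and is an assumption here): in 0.12.0, recognition-mode samples skip labels containing spaces or the listed unknown glyphs, and en-dashes in labels are normalized to hyphens.

    from doctr.datasets import FUNSD

    # Recognition mode yields (crop, label) pairs after the stricter 0.12.0 filtering
    train_set = FUNSD(train=True, download=True, recognition_task=True)  # download=True is assumed
    crop, label = train_set[0]  # crop: np.ndarray patch; label: str with "–" replaced by "-"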
doctr/datasets/generator/__init__.py CHANGED
@@ -1,6 +1,6 @@
  from doctr.file_utils import is_tf_available, is_torch_available

- if is_tf_available():
- from .tensorflow import *
- elif is_torch_available():
- from .pytorch import * # type: ignore[assignment]
+ if is_torch_available():
+ from .pytorch import *
+ elif is_tf_available():
+ from .tensorflow import * # type: ignore[assignment]
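This flips the backend resolution order so PyTorch now takes precedence when both frameworks are installed. A rough sketch of the effect (illustrative only, not doctr's actual code):

    from doctr.file_utils import is_tf_available, is_torch_available

    if is_torch_available():      # checked first in 0.12.0
        backend = "pytorch"
    elif is_tf_available():       # was checked first in 0.10.0
        backend = "tensorflow"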
doctr/datasets/generator/base.py CHANGED
@@ -1,10 +1,11 @@
- # Copyright (C) 2021-2024, Mindee.
+ # Copyright (C) 2021-2025, Mindee.

  # This program is licensed under the Apache License 2.0.
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

  import random
- from typing import Any, Callable, List, Optional, Tuple, Union
+ from collections.abc import Callable
+ from typing import Any

  from PIL import Image, ImageDraw

@@ -17,14 +18,13 @@ from ..datasets import AbstractDataset
  def synthesize_text_img(
  text: str,
  font_size: int = 32,
- font_family: Optional[str] = None,
- background_color: Optional[Tuple[int, int, int]] = None,
- text_color: Optional[Tuple[int, int, int]] = None,
+ font_family: str | None = None,
+ background_color: tuple[int, int, int] | None = None,
+ text_color: tuple[int, int, int] | None = None,
  ) -> Image.Image:
  """Generate a synthetic text image

  Args:
- ----
  text: the text to render as an image
  font_size: the size of the font
  font_family: the font family (has to be installed on your system)
@@ -32,7 +32,6 @@ def synthesize_text_img(
  text_color: text color on the final image

  Returns:
- -------
  PIL image of the text
  """
  background_color = (0, 0, 0) if background_color is None else background_color
@@ -61,9 +60,9 @@ class _CharacterGenerator(AbstractDataset):
  vocab: str,
  num_samples: int,
  cache_samples: bool = False,
- font_family: Optional[Union[str, List[str]]] = None,
- img_transforms: Optional[Callable[[Any], Any]] = None,
- sample_transforms: Optional[Callable[[Any, Any], Tuple[Any, Any]]] = None,
+ font_family: str | list[str] | None = None,
+ img_transforms: Callable[[Any], Any] | None = None,
+ sample_transforms: Callable[[Any, Any], tuple[Any, Any]] | None = None,
  ) -> None:
  self.vocab = vocab
  self._num_samples = num_samples
@@ -78,7 +77,7 @@ class _CharacterGenerator(AbstractDataset):
  self.img_transforms = img_transforms
  self.sample_transforms = sample_transforms

- self._data: List[Image.Image] = []
+ self._data: list[Image.Image] = []
  if cache_samples:
  self._data = [
  (synthesize_text_img(char, font_family=font), idx) # type: ignore[misc]
@@ -89,7 +88,7 @@ class _CharacterGenerator(AbstractDataset):
  def __len__(self) -> int:
  return self._num_samples

- def _read_sample(self, index: int) -> Tuple[Any, int]:
+ def _read_sample(self, index: int) -> tuple[Any, int]:
  # Samples are already cached
  if len(self._data) > 0:
  idx = index % len(self._data)
@@ -110,9 +109,9 @@ class _WordGenerator(AbstractDataset):
  max_chars: int,
  num_samples: int,
  cache_samples: bool = False,
- font_family: Optional[Union[str, List[str]]] = None,
- img_transforms: Optional[Callable[[Any], Any]] = None,
- sample_transforms: Optional[Callable[[Any, Any], Tuple[Any, Any]]] = None,
+ font_family: str | list[str] | None = None,
+ img_transforms: Callable[[Any], Any] | None = None,
+ sample_transforms: Callable[[Any, Any], tuple[Any, Any]] | None = None,
  ) -> None:
  self.vocab = vocab
  self.wordlen_range = (min_chars, max_chars)
@@ -128,7 +127,7 @@ class _WordGenerator(AbstractDataset):
  self.img_transforms = img_transforms
  self.sample_transforms = sample_transforms

- self._data: List[Image.Image] = []
+ self._data: list[Image.Image] = []
  if cache_samples:
  _words = [self._generate_string(*self.wordlen_range) for _ in range(num_samples)]
  self._data = [
@@ -143,7 +142,7 @@ class _WordGenerator(AbstractDataset):
  def __len__(self) -> int:
  return self._num_samples

- def _read_sample(self, index: int) -> Tuple[Any, str]:
+ def _read_sample(self, index: int) -> tuple[Any, str]:
  # Samples are already cached
  if len(self._data) > 0:
  pil_img, target = self._data[index] # type: ignore[misc]
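Since `synthesize_text_img` drives both generators, a hedged example of calling it with the modernized signature (the import path follows this file's location; what happens when `font_family` is None is an assumption):

    from doctr.datasets.generator.base import synthesize_text_img

    # Black background is the documented default; font_family must name an installed font
    img = synthesize_text_img("sample", font_size=32, font_family=None)  # None -> fallback font (assumed)
    print(img.size)  # a PIL.Image.Image sized to the rendered text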
doctr/datasets/generator/pytorch.py CHANGED
@@ -1,4 +1,4 @@
- # Copyright (C) 2021-2024, Mindee.
+ # Copyright (C) 2021-2025, Mindee.

  # This program is licensed under the Apache License 2.0.
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -18,7 +18,6 @@ class CharacterGenerator(_CharacterGenerator):
  >>> img, target = ds[0]

  Args:
- ----
  vocab: vocabulary to take the character from
  num_samples: number of samples that will be generated iterating over the dataset
  cache_samples: whether generated images should be cached firsthand
@@ -40,7 +39,6 @@ class WordGenerator(_WordGenerator):
  >>> img, target = ds[0]

  Args:
- ----
  vocab: vocabulary to take the character from
  min_chars: minimum number of characters in a word
  max_chars: maximum number of characters in a word
doctr/datasets/generator/tensorflow.py CHANGED
@@ -1,4 +1,4 @@
- # Copyright (C) 2021-2024, Mindee.
+ # Copyright (C) 2021-2025, Mindee.

  # This program is licensed under the Apache License 2.0.
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -18,7 +18,6 @@ class CharacterGenerator(_CharacterGenerator):
  >>> img, target = ds[0]

  Args:
- ----
  vocab: vocabulary to take the character from
  num_samples: number of samples that will be generated iterating over the dataset
  cache_samples: whether generated images should be cached firsthand
@@ -46,7 +45,6 @@ class WordGenerator(_WordGenerator):
  >>> img, target = ds[0]

  Args:
- ----
  vocab: vocabulary to take the character from
  min_chars: minimum number of characters in a word
  max_chars: maximum number of characters in a word
doctr/datasets/ic03.py CHANGED
@@ -1,10 +1,10 @@
- # Copyright (C) 2021-2024, Mindee.
+ # Copyright (C) 2021-2025, Mindee.

  # This program is licensed under the Apache License 2.0.
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

  import os
- from typing import Any, Dict, List, Tuple, Union
+ from typing import Any

  import defusedxml.ElementTree as ET
  import numpy as np
@@ -28,7 +28,6 @@ class IC03(VisionDataset):
  >>> img, target = train_set[0]

  Args:
- ----
  train: whether the subset should be the training one
  use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
  recognition_task: whether the dataset should be used for recognition task
@@ -71,7 +70,7 @@ class IC03(VisionDataset):
  )

  self.train = train
- self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
+ self.data: list[tuple[str | np.ndarray, str | dict[str, Any] | np.ndarray]] = []
  np_dtype = np.float32

  # Load xml data
@@ -81,7 +80,7 @@ class IC03(VisionDataset):
  xml_tree = ET.parse(os.path.join(tmp_root, "words.xml"))
  xml_root = xml_tree.getroot()

- for image in tqdm(iterable=xml_root, desc="Unpacking IC03", total=len(xml_root)):
+ for image in tqdm(iterable=xml_root, desc="Preparing and Loading IC03", total=len(xml_root)):
  name, _resolution, rectangles = image

  # File existence check
@@ -123,7 +122,7 @@ class IC03(VisionDataset):
  if recognition_task:
  crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, name.text), geoms=boxes)
  for crop, label in zip(crops, labels):
- if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
+ if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0 and " " not in label:
  self.data.append((crop, label))
  elif detection_task:
  self.data.append((name.text, boxes))
doctr/datasets/ic13.py CHANGED
@@ -1,4 +1,4 @@
- # Copyright (C) 2021-2024, Mindee.
+ # Copyright (C) 2021-2025, Mindee.

  # This program is licensed under the Apache License 2.0.
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -6,7 +6,7 @@
  import csv
  import os
  from pathlib import Path
- from typing import Any, Dict, List, Tuple, Union
+ from typing import Any

  import numpy as np
  from tqdm import tqdm
@@ -33,7 +33,6 @@ class IC13(AbstractDataset):
  >>> img, target = test_set[0]

  Args:
- ----
  img_folder: folder with all the images of the dataset
  label_folder: folder with all annotation files for the images
  use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
@@ -66,12 +65,12 @@ class IC13(AbstractDataset):
  f"unable to locate {label_folder if not os.path.exists(label_folder) else img_folder}"
  )

- self.data: List[Tuple[Union[Path, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
+ self.data: list[tuple[Path | np.ndarray, str | dict[str, Any] | np.ndarray]] = []
  np_dtype = np.float32

  img_names = os.listdir(img_folder)

- for img_name in tqdm(iterable=img_names, desc="Unpacking IC13", total=len(img_names)):
+ for img_name in tqdm(iterable=img_names, desc="Preparing and Loading IC13", total=len(img_names)):
  img_path = Path(img_folder, img_name)
  label_path = Path(label_folder, "gt_" + Path(img_name).stem + ".txt")

@@ -101,7 +100,8 @@ class IC13(AbstractDataset):
  if recognition_task:
  crops = crop_bboxes_from_image(img_path=img_path, geoms=box_targets)
  for crop, label in zip(crops, labels):
- self.data.append((crop, label))
+ if " " not in label:
+ self.data.append((crop, label))
  elif detection_task:
  self.data.append((img_path, box_targets))
  else:
doctr/datasets/iiit5k.py CHANGED
@@ -1,13 +1,14 @@
- # Copyright (C) 2021-2024, Mindee.
+ # Copyright (C) 2021-2025, Mindee.

  # This program is licensed under the Apache License 2.0.
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

  import os
- from typing import Any, Dict, List, Tuple, Union
+ from typing import Any

  import numpy as np
  import scipy.io as sio
+ from PIL import Image
  from tqdm import tqdm

  from .datasets import VisionDataset
@@ -30,7 +31,6 @@ class IIIT5K(VisionDataset):
  >>> img, target = train_set[0]

  Args:
- ----
  train: whether the subset should be the training one
  use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
  recognition_task: whether the dataset should be used for recognition task
@@ -70,10 +70,12 @@ class IIIT5K(VisionDataset):
  mat_file = "trainCharBound" if self.train else "testCharBound"
  mat_data = sio.loadmat(os.path.join(tmp_root, f"{mat_file}.mat"))[mat_file][0]

- self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
+ self.data: list[tuple[str | np.ndarray, str | dict[str, Any] | np.ndarray]] = []
  np_dtype = np.float32

- for img_path, label, box_targets in tqdm(iterable=mat_data, desc="Unpacking IIIT5K", total=len(mat_data)):
+ for img_path, label, box_targets in tqdm(
+ iterable=mat_data, desc="Preparing and Loading IIIT5K", total=len(mat_data)
+ ):
  _raw_path = img_path[0]
  _raw_label = label[0]

@@ -97,7 +99,9 @@ class IIIT5K(VisionDataset):
  box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]

  if recognition_task:
- self.data.append((_raw_path, _raw_label))
+ if " " not in _raw_label:
+ with Image.open(os.path.join(tmp_root, _raw_path)) as pil_img:
+ self.data.append((np.array(pil_img.convert("RGB")), _raw_label))
  elif detection_task:
  self.data.append((_raw_path, np.asarray(box_targets, dtype=np_dtype)))
  else:
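Note the behavior change here: in recognition mode, IIIT5K now stores decoded RGB arrays instead of file paths, roughly equivalent to this sketch (the sample path and label are hypothetical):

    import numpy as np
    from PIL import Image

    # 0.10.0 appended (path, label); 0.12.0 decodes the crop once at init:
    with Image.open("IIIT5K/test/1002_1.png") as pil_img:  # hypothetical path
        sample = (np.array(pil_img.convert("RGB")), "EXAMPLE")  # (H, W, 3) uint8 array + label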
doctr/datasets/iiithws.py CHANGED
@@ -1,11 +1,11 @@
- # Copyright (C) 2021-2024, Mindee.
+ # Copyright (C) 2021-2025, Mindee.

  # This program is licensed under the Apache License 2.0.
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

  import os
  from random import sample
- from typing import Any, List, Tuple
+ from typing import Any

  from tqdm import tqdm

@@ -32,7 +32,6 @@ class IIITHWS(AbstractDataset):
  >>> img, target = test_set[0]

  Args:
- ----
  img_folder: folder with all the images of the dataset
  label_path: path to the file with the labels
  train: whether the subset should be the training one
@@ -52,7 +51,7 @@ class IIITHWS(AbstractDataset):
  if not os.path.exists(label_path) or not os.path.exists(img_folder):
  raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")

- self.data: List[Tuple[str, str]] = []
+ self.data: list[tuple[str, str]] = []
  self.train = train

  with open(label_path) as f:
@@ -64,7 +63,7 @@ class IIITHWS(AbstractDataset):
  set_slice = slice(train_samples) if self.train else slice(train_samples, None)

  for annotation in tqdm(
- iterable=annotations[set_slice], desc="Unpacking IIITHWS", total=len(annotations[set_slice])
+ iterable=annotations[set_slice], desc="Preparing and Loading IIITHWS", total=len(annotations[set_slice])
  ):
  img_path, label = annotation.split()[0:2]
  img_path = os.path.join(img_folder, img_path)
doctr/datasets/imgur5k.py CHANGED
@@ -1,4 +1,4 @@
- # Copyright (C) 2021-2024, Mindee.
+ # Copyright (C) 2021-2025, Mindee.

  # This program is licensed under the Apache License 2.0.
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -7,7 +7,7 @@ import glob
  import json
  import os
  from pathlib import Path
- from typing import Any, Dict, List, Tuple, Union
+ from typing import Any

  import cv2
  import numpy as np
@@ -40,7 +40,6 @@ class IMGUR5K(AbstractDataset):
  >>> img, target = test_set[0]

  Args:
- ----
  img_folder: folder with all the images of the dataset
  label_path: path to the annotations file of the dataset
  train: whether the subset should be the training one
@@ -73,7 +72,7 @@ class IMGUR5K(AbstractDataset):
  if not os.path.exists(label_path) or not os.path.exists(img_folder):
  raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")

- self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
+ self.data: list[tuple[str | Path | np.ndarray, str | dict[str, Any] | np.ndarray]] = []
  self.train = train
  np_dtype = np.float32

@@ -96,7 +95,9 @@ class IMGUR5K(AbstractDataset):
  with open(label_path) as f:
  annotation_file = json.load(f)

- for img_name in tqdm(iterable=img_names[set_slice], desc="Unpacking IMGUR5K", total=len(img_names[set_slice])):
+ for img_name in tqdm(
+ iterable=img_names[set_slice], desc="Preparing and Loading IMGUR5K", total=len(img_names[set_slice])
+ ):
  img_path = Path(img_folder, img_name)
  img_id = img_name.split(".")[0]

@@ -132,7 +133,13 @@ class IMGUR5K(AbstractDataset):
  img_path=os.path.join(self.root, img_name), geoms=np.asarray(box_targets, dtype=np_dtype)
  )
  for crop, label in zip(crops, labels):
- if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
+ if (
+ crop.shape[0] > 0
+ and crop.shape[1] > 0
+ and len(label) > 0
+ and len(label) < 30
+ and " " not in label
+ ):
  # write data to disk
  with open(os.path.join(reco_folder_path, f"{reco_images_counter}.txt"), "w") as f:
  f.write(label)
@@ -151,6 +158,7 @@ class IMGUR5K(AbstractDataset):
  return f"train={self.train}"

  def _read_from_folder(self, path: str) -> None:
- for img_path in glob.glob(os.path.join(path, "*.png")):
+ img_paths = glob.glob(os.path.join(path, "*.png"))
+ for img_path in tqdm(iterable=img_paths, desc="Preparing and Loading IMGUR5K", total=len(img_paths)):
  with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f:
  self.data.append((img_path, f.read()))
doctr/datasets/loader.py CHANGED
@@ -1,10 +1,10 @@
- # Copyright (C) 2021-2024, Mindee.
+ # Copyright (C) 2021-2025, Mindee.

  # This program is licensed under the Apache License 2.0.
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

  import math
- from typing import Callable, Optional
+ from collections.abc import Callable

  import numpy as np
  import tensorflow as tf
@@ -16,12 +16,10 @@ def default_collate(samples):
  """Collate multiple elements into batches

  Args:
- ----
  samples: list of N tuples containing M elements

  Returns:
- -------
- Tuple of M sequences contianing N elements each
+ tuple of M sequences containing N elements each
  """
  batch_data = zip(*samples)

@@ -40,7 +38,6 @@ class DataLoader:
  >>> images, targets = next(train_iter)

  Args:
- ----
  dataset: the dataset
  shuffle: whether the samples should be shuffled before passing it to the iterator
  batch_size: number of elements in each batch
@@ -54,7 +51,7 @@ class DataLoader:
  shuffle: bool = True,
  batch_size: int = 1,
  drop_last: bool = False,
- collate_fn: Optional[Callable] = None,
+ collate_fn: Callable | None = None,
  ) -> None:
  self.dataset = dataset
  self.shuffle = shuffle
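The corrected docstring describes a transpose-style collation; concretely, the `zip(*samples)` in the function body regroups N samples of M elements into M sequences of N elements:

    # Three (image, target) samples -> one images sequence and one targets sequence
    samples = [("img0", "tgt0"), ("img1", "tgt1"), ("img2", "tgt2")]
    images, targets = zip(*samples)
    # images == ("img0", "img1", "img2"); targets == ("tgt0", "tgt1", "tgt2")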
doctr/datasets/mjsynth.py CHANGED
@@ -1,10 +1,10 @@
- # Copyright (C) 2021-2024, Mindee.
+ # Copyright (C) 2021-2025, Mindee.

  # This program is licensed under the Apache License 2.0.
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

  import os
- from typing import Any, List, Tuple
+ from typing import Any

  from tqdm import tqdm

@@ -30,7 +30,6 @@ class MJSynth(AbstractDataset):
  >>> img, target = test_set[0]

  Args:
- ----
  img_folder: folder with all the images of the dataset
  label_path: path to the file with the labels
  train: whether the subset should be the training one
@@ -86,7 +85,7 @@ class MJSynth(AbstractDataset):
  if not os.path.exists(label_path) or not os.path.exists(img_folder):
  raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")

- self.data: List[Tuple[str, str]] = []
+ self.data: list[tuple[str, str]] = []
  self.train = train

  with open(label_path) as f:
@@ -95,7 +94,9 @@ class MJSynth(AbstractDataset):
  train_samples = int(len(img_paths) * 0.9)
  set_slice = slice(train_samples) if self.train else slice(train_samples, None)

- for path in tqdm(iterable=img_paths[set_slice], desc="Unpacking MJSynth", total=len(img_paths[set_slice])):
+ for path in tqdm(
+ iterable=img_paths[set_slice], desc="Preparing and Loading MJSynth", total=len(img_paths[set_slice])
+ ):
  if path not in self.BLACKLIST:
  label = path.split("_")[1]
  img_path = os.path.join(img_folder, path[2:]).strip()
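For context on `path.split("_")[1]`: MJSynth encodes the ground-truth word in each filename, so the label is parsed straight from the path (the sample entry below is illustrative):

    # Hypothetical line from the MJSynth annotation file
    path = "./2194/1/103_Actinolite_904.jpg"
    label = path.split("_")[1]   # -> "Actinolite"
    rel = path[2:].strip()       # -> "2194/1/103_Actinolite_904.jpg", joined onto img_folder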
doctr/datasets/ocr.py CHANGED
@@ -1,4 +1,4 @@
- # Copyright (C) 2021-2024, Mindee.
+ # Copyright (C) 2021-2025, Mindee.

  # This program is licensed under the Apache License 2.0.
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -6,7 +6,7 @@
  import json
  import os
  from pathlib import Path
- from typing import Any, Dict, List, Tuple
+ from typing import Any

  import numpy as np

@@ -24,7 +24,6 @@ class OCRDataset(AbstractDataset):
  >>> img, target = train_set[0]

  Args:
- ----
  img_folder: local path to image folder (all jpg at the root)
  label_file: local path to the label file
  use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
@@ -41,7 +40,7 @@ class OCRDataset(AbstractDataset):
  super().__init__(img_folder, **kwargs)

  # List images
- self.data: List[Tuple[str, Dict[str, Any]]] = []
+ self.data: list[tuple[Path, dict[str, Any]]] = []
  np_dtype = np.float32
  with open(label_file, "rb") as f:
  data = json.load(f)
doctr/datasets/orientation.py CHANGED
@@ -1,10 +1,10 @@
- # Copyright (C) 2021-2024, Mindee.
+ # Copyright (C) 2021-2025, Mindee.

  # This program is licensed under the Apache License 2.0.
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

  import os
- from typing import Any, List, Tuple
+ from typing import Any

  import numpy as np

@@ -21,7 +21,6 @@ class OrientationDataset(AbstractDataset):
  >>> img, target = train_set[0]

  Args:
- ----
  img_folder: folder with all the images of the dataset
  **kwargs: keyword arguments from `AbstractDataset`.
  """
@@ -37,4 +36,4 @@ class OrientationDataset(AbstractDataset):
  )

  # initialize dataset with 0 degree rotation targets
- self.data: List[Tuple[str, np.ndarray]] = [(img_name, np.array([0])) for img_name in os.listdir(self.root)]
+ self.data: list[tuple[str, np.ndarray]] = [(img_name, np.array([0])) for img_name in os.listdir(self.root)]
doctr/datasets/recognition.py CHANGED
@@ -1,4 +1,4 @@
- # Copyright (C) 2021-2024, Mindee.
+ # Copyright (C) 2021-2025, Mindee.

  # This program is licensed under the Apache License 2.0.
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -6,7 +6,7 @@
  import json
  import os
  from pathlib import Path
- from typing import Any, List, Tuple
+ from typing import Any

  from .datasets import AbstractDataset

@@ -22,9 +22,8 @@ class RecognitionDataset(AbstractDataset):
  >>> img, target = train_set[0]

  Args:
- ----
  img_folder: path to the images folder
- labels_path: pathe to the json file containing all labels (character sequences)
+ labels_path: path to the json file containing all labels (character sequences)
  **kwargs: keyword arguments from `AbstractDataset`.
  """

@@ -36,7 +35,7 @@
  ) -> None:
  super().__init__(img_folder, **kwargs)

- self.data: List[Tuple[str, str]] = []
+ self.data: list[tuple[str, str]] = []
  with open(labels_path, encoding="utf-8") as f:
  labels = json.load(f)