python-doctr 0.9.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. doctr/contrib/__init__.py +1 -0
  2. doctr/contrib/artefacts.py +7 -9
  3. doctr/contrib/base.py +8 -17
  4. doctr/datasets/cord.py +17 -7
  5. doctr/datasets/datasets/__init__.py +4 -4
  6. doctr/datasets/datasets/base.py +16 -16
  7. doctr/datasets/datasets/pytorch.py +12 -12
  8. doctr/datasets/datasets/tensorflow.py +10 -10
  9. doctr/datasets/detection.py +6 -9
  10. doctr/datasets/doc_artefacts.py +3 -4
  11. doctr/datasets/funsd.py +17 -6
  12. doctr/datasets/generator/__init__.py +4 -4
  13. doctr/datasets/generator/base.py +16 -17
  14. doctr/datasets/generator/pytorch.py +1 -3
  15. doctr/datasets/generator/tensorflow.py +1 -3
  16. doctr/datasets/ic03.py +14 -5
  17. doctr/datasets/ic13.py +13 -5
  18. doctr/datasets/iiit5k.py +31 -20
  19. doctr/datasets/iiithws.py +4 -5
  20. doctr/datasets/imgur5k.py +15 -5
  21. doctr/datasets/loader.py +4 -7
  22. doctr/datasets/mjsynth.py +6 -5
  23. doctr/datasets/ocr.py +3 -4
  24. doctr/datasets/orientation.py +3 -4
  25. doctr/datasets/recognition.py +3 -4
  26. doctr/datasets/sroie.py +16 -5
  27. doctr/datasets/svhn.py +16 -5
  28. doctr/datasets/svt.py +14 -5
  29. doctr/datasets/synthtext.py +14 -5
  30. doctr/datasets/utils.py +37 -27
  31. doctr/datasets/vocabs.py +21 -7
  32. doctr/datasets/wildreceipt.py +25 -10
  33. doctr/file_utils.py +18 -4
  34. doctr/io/elements.py +69 -81
  35. doctr/io/html.py +1 -3
  36. doctr/io/image/__init__.py +3 -3
  37. doctr/io/image/base.py +2 -5
  38. doctr/io/image/pytorch.py +3 -12
  39. doctr/io/image/tensorflow.py +2 -11
  40. doctr/io/pdf.py +5 -7
  41. doctr/io/reader.py +5 -11
  42. doctr/models/_utils.py +14 -22
  43. doctr/models/builder.py +32 -50
  44. doctr/models/classification/magc_resnet/__init__.py +3 -3
  45. doctr/models/classification/magc_resnet/pytorch.py +10 -13
  46. doctr/models/classification/magc_resnet/tensorflow.py +21 -17
  47. doctr/models/classification/mobilenet/__init__.py +3 -3
  48. doctr/models/classification/mobilenet/pytorch.py +7 -17
  49. doctr/models/classification/mobilenet/tensorflow.py +22 -29
  50. doctr/models/classification/predictor/__init__.py +4 -4
  51. doctr/models/classification/predictor/pytorch.py +13 -11
  52. doctr/models/classification/predictor/tensorflow.py +13 -11
  53. doctr/models/classification/resnet/__init__.py +4 -4
  54. doctr/models/classification/resnet/pytorch.py +21 -31
  55. doctr/models/classification/resnet/tensorflow.py +41 -39
  56. doctr/models/classification/textnet/__init__.py +3 -3
  57. doctr/models/classification/textnet/pytorch.py +10 -17
  58. doctr/models/classification/textnet/tensorflow.py +19 -20
  59. doctr/models/classification/vgg/__init__.py +3 -3
  60. doctr/models/classification/vgg/pytorch.py +5 -7
  61. doctr/models/classification/vgg/tensorflow.py +18 -15
  62. doctr/models/classification/vit/__init__.py +3 -3
  63. doctr/models/classification/vit/pytorch.py +8 -14
  64. doctr/models/classification/vit/tensorflow.py +16 -16
  65. doctr/models/classification/zoo.py +36 -19
  66. doctr/models/core.py +3 -3
  67. doctr/models/detection/_utils/__init__.py +4 -4
  68. doctr/models/detection/_utils/base.py +4 -7
  69. doctr/models/detection/_utils/pytorch.py +1 -5
  70. doctr/models/detection/_utils/tensorflow.py +1 -5
  71. doctr/models/detection/core.py +2 -8
  72. doctr/models/detection/differentiable_binarization/__init__.py +4 -4
  73. doctr/models/detection/differentiable_binarization/base.py +7 -17
  74. doctr/models/detection/differentiable_binarization/pytorch.py +27 -30
  75. doctr/models/detection/differentiable_binarization/tensorflow.py +49 -37
  76. doctr/models/detection/fast/__init__.py +4 -4
  77. doctr/models/detection/fast/base.py +6 -14
  78. doctr/models/detection/fast/pytorch.py +24 -31
  79. doctr/models/detection/fast/tensorflow.py +28 -37
  80. doctr/models/detection/linknet/__init__.py +4 -4
  81. doctr/models/detection/linknet/base.py +6 -15
  82. doctr/models/detection/linknet/pytorch.py +24 -27
  83. doctr/models/detection/linknet/tensorflow.py +36 -33
  84. doctr/models/detection/predictor/__init__.py +5 -5
  85. doctr/models/detection/predictor/pytorch.py +6 -7
  86. doctr/models/detection/predictor/tensorflow.py +7 -8
  87. doctr/models/detection/zoo.py +27 -7
  88. doctr/models/factory/hub.py +8 -13
  89. doctr/models/kie_predictor/__init__.py +5 -5
  90. doctr/models/kie_predictor/base.py +8 -5
  91. doctr/models/kie_predictor/pytorch.py +22 -19
  92. doctr/models/kie_predictor/tensorflow.py +21 -15
  93. doctr/models/modules/layers/__init__.py +3 -3
  94. doctr/models/modules/layers/pytorch.py +6 -9
  95. doctr/models/modules/layers/tensorflow.py +5 -7
  96. doctr/models/modules/transformer/__init__.py +3 -3
  97. doctr/models/modules/transformer/pytorch.py +12 -13
  98. doctr/models/modules/transformer/tensorflow.py +9 -12
  99. doctr/models/modules/vision_transformer/__init__.py +3 -3
  100. doctr/models/modules/vision_transformer/pytorch.py +3 -4
  101. doctr/models/modules/vision_transformer/tensorflow.py +4 -4
  102. doctr/models/predictor/__init__.py +5 -5
  103. doctr/models/predictor/base.py +52 -41
  104. doctr/models/predictor/pytorch.py +16 -13
  105. doctr/models/predictor/tensorflow.py +16 -10
  106. doctr/models/preprocessor/__init__.py +4 -4
  107. doctr/models/preprocessor/pytorch.py +13 -17
  108. doctr/models/preprocessor/tensorflow.py +11 -15
  109. doctr/models/recognition/core.py +3 -7
  110. doctr/models/recognition/crnn/__init__.py +4 -4
  111. doctr/models/recognition/crnn/pytorch.py +20 -28
  112. doctr/models/recognition/crnn/tensorflow.py +19 -29
  113. doctr/models/recognition/master/__init__.py +3 -3
  114. doctr/models/recognition/master/base.py +3 -7
  115. doctr/models/recognition/master/pytorch.py +22 -24
  116. doctr/models/recognition/master/tensorflow.py +21 -26
  117. doctr/models/recognition/parseq/__init__.py +3 -3
  118. doctr/models/recognition/parseq/base.py +3 -7
  119. doctr/models/recognition/parseq/pytorch.py +26 -26
  120. doctr/models/recognition/parseq/tensorflow.py +26 -30
  121. doctr/models/recognition/predictor/__init__.py +5 -5
  122. doctr/models/recognition/predictor/_utils.py +7 -10
  123. doctr/models/recognition/predictor/pytorch.py +6 -6
  124. doctr/models/recognition/predictor/tensorflow.py +5 -6
  125. doctr/models/recognition/sar/__init__.py +4 -4
  126. doctr/models/recognition/sar/pytorch.py +20 -21
  127. doctr/models/recognition/sar/tensorflow.py +19 -24
  128. doctr/models/recognition/utils.py +5 -10
  129. doctr/models/recognition/vitstr/__init__.py +4 -4
  130. doctr/models/recognition/vitstr/base.py +3 -7
  131. doctr/models/recognition/vitstr/pytorch.py +18 -20
  132. doctr/models/recognition/vitstr/tensorflow.py +21 -24
  133. doctr/models/recognition/zoo.py +22 -11
  134. doctr/models/utils/__init__.py +4 -4
  135. doctr/models/utils/pytorch.py +13 -16
  136. doctr/models/utils/tensorflow.py +31 -30
  137. doctr/models/zoo.py +1 -5
  138. doctr/transforms/functional/__init__.py +3 -3
  139. doctr/transforms/functional/base.py +4 -11
  140. doctr/transforms/functional/pytorch.py +21 -29
  141. doctr/transforms/functional/tensorflow.py +10 -22
  142. doctr/transforms/modules/__init__.py +4 -4
  143. doctr/transforms/modules/base.py +48 -55
  144. doctr/transforms/modules/pytorch.py +65 -28
  145. doctr/transforms/modules/tensorflow.py +33 -44
  146. doctr/utils/common_types.py +8 -9
  147. doctr/utils/data.py +8 -12
  148. doctr/utils/fonts.py +2 -7
  149. doctr/utils/geometry.py +120 -64
  150. doctr/utils/metrics.py +18 -38
  151. doctr/utils/multithreading.py +4 -6
  152. doctr/utils/reconstitution.py +157 -75
  153. doctr/utils/repr.py +2 -3
  154. doctr/utils/visualization.py +16 -29
  155. doctr/version.py +1 -1
  156. {python_doctr-0.9.0.dist-info → python_doctr-0.11.0.dist-info}/METADATA +59 -57
  157. python_doctr-0.11.0.dist-info/RECORD +173 -0
  158. {python_doctr-0.9.0.dist-info → python_doctr-0.11.0.dist-info}/WHEEL +1 -1
  159. python_doctr-0.9.0.dist-info/RECORD +0 -173
  160. {python_doctr-0.9.0.dist-info → python_doctr-0.11.0.dist-info}/LICENSE +0 -0
  161. {python_doctr-0.9.0.dist-info → python_doctr-0.11.0.dist-info}/top_level.txt +0 -0
  162. {python_doctr-0.9.0.dist-info → python_doctr-0.11.0.dist-info}/zip-safe +0 -0
doctr/io/elements.py CHANGED
@@ -1,9 +1,9 @@
1
- # Copyright (C) 2021-2024, Mindee.
1
+ # Copyright (C) 2021-2025, Mindee.
2
2
 
3
3
  # This program is licensed under the Apache License 2.0.
4
4
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
5
 
6
- from typing import Any, Dict, List, Optional, Tuple, Union
6
+ from typing import Any
7
7
 
8
8
  from defusedxml import defuse_stdlib
9
9
 
@@ -32,8 +32,8 @@ __all__ = ["Element", "Word", "Artefact", "Line", "Prediction", "Block", "Page",
32
32
  class Element(NestedObject):
33
33
  """Implements an abstract document element with exporting and text rendering capabilities"""
34
34
 
35
- _children_names: List[str] = []
36
- _exported_keys: List[str] = []
35
+ _children_names: list[str] = []
36
+ _exported_keys: list[str] = []
37
37
 
38
38
  def __init__(self, **kwargs: Any) -> None:
39
39
  for k, v in kwargs.items():
@@ -42,7 +42,7 @@ class Element(NestedObject):
42
42
  else:
43
43
  raise KeyError(f"{self.__class__.__name__} object does not have any attribute named '{k}'")
44
44
 
45
- def export(self) -> Dict[str, Any]:
45
+ def export(self) -> dict[str, Any]:
46
46
  """Exports the object into a nested dict format"""
47
47
  export_dict = {k: getattr(self, k) for k in self._exported_keys}
48
48
  for children_name in self._children_names:
@@ -56,7 +56,7 @@ class Element(NestedObject):
56
56
  return export_dict
57
57
 
58
58
  @classmethod
59
- def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
59
+ def from_dict(cls, save_dict: dict[str, Any], **kwargs):
60
60
  raise NotImplementedError
61
61
 
62
62
  def render(self) -> str:
@@ -67,7 +67,6 @@ class Word(Element):
67
67
  """Implements a word element
68
68
 
69
69
  Args:
70
- ----
71
70
  value: the text string of the word
72
71
  confidence: the confidence associated with the text prediction
73
72
  geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
@@ -76,16 +75,16 @@ class Word(Element):
76
75
  crop_orientation: the general orientation of the crop in degrees and its confidence
77
76
  """
78
77
 
79
- _exported_keys: List[str] = ["value", "confidence", "geometry", "objectness_score", "crop_orientation"]
80
- _children_names: List[str] = []
78
+ _exported_keys: list[str] = ["value", "confidence", "geometry", "objectness_score", "crop_orientation"]
79
+ _children_names: list[str] = []
81
80
 
82
81
  def __init__(
83
82
  self,
84
83
  value: str,
85
84
  confidence: float,
86
- geometry: Union[BoundingBox, np.ndarray],
85
+ geometry: BoundingBox | np.ndarray,
87
86
  objectness_score: float,
88
- crop_orientation: Dict[str, Any],
87
+ crop_orientation: dict[str, Any],
89
88
  ) -> None:
90
89
  super().__init__()
91
90
  self.value = value
@@ -102,7 +101,7 @@ class Word(Element):
102
101
  return f"value='{self.value}', confidence={self.confidence:.2}"
103
102
 
104
103
  @classmethod
105
- def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
104
+ def from_dict(cls, save_dict: dict[str, Any], **kwargs):
106
105
  kwargs = {k: save_dict[k] for k in cls._exported_keys}
107
106
  return cls(**kwargs)
108
107
 
@@ -111,15 +110,14 @@ class Artefact(Element):
111
110
  """Implements a non-textual element
112
111
 
113
112
  Args:
114
- ----
115
113
  artefact_type: the type of artefact
116
114
  confidence: the confidence of the type prediction
117
115
  geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
118
116
  the page's size.
119
117
  """
120
118
 
121
- _exported_keys: List[str] = ["geometry", "type", "confidence"]
122
- _children_names: List[str] = []
119
+ _exported_keys: list[str] = ["geometry", "type", "confidence"]
120
+ _children_names: list[str] = []
123
121
 
124
122
  def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None:
125
123
  super().__init__()
@@ -135,7 +133,7 @@ class Artefact(Element):
135
133
  return f"type='{self.type}', confidence={self.confidence:.2}"
136
134
 
137
135
  @classmethod
138
- def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
136
+ def from_dict(cls, save_dict: dict[str, Any], **kwargs):
139
137
  kwargs = {k: save_dict[k] for k in cls._exported_keys}
140
138
  return cls(**kwargs)
141
139
 
@@ -144,22 +142,21 @@ class Line(Element):
144
142
  """Implements a line element as a collection of words
145
143
 
146
144
  Args:
147
- ----
148
145
  words: list of word elements
149
146
  geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
150
147
  the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing
151
148
  all words in it.
152
149
  """
153
150
 
154
- _exported_keys: List[str] = ["geometry", "objectness_score"]
155
- _children_names: List[str] = ["words"]
156
- words: List[Word] = []
151
+ _exported_keys: list[str] = ["geometry", "objectness_score"]
152
+ _children_names: list[str] = ["words"]
153
+ words: list[Word] = []
157
154
 
158
155
  def __init__(
159
156
  self,
160
- words: List[Word],
161
- geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
162
- objectness_score: Optional[float] = None,
157
+ words: list[Word],
158
+ geometry: BoundingBox | np.ndarray | None = None,
159
+ objectness_score: float | None = None,
163
160
  ) -> None:
164
161
  # Compute the objectness score of the line
165
162
  if objectness_score is None:
@@ -168,7 +165,7 @@ class Line(Element):
168
165
  if geometry is None:
169
166
  # Check whether this is a rotated or straight box
170
167
  box_resolution_fn = resolve_enclosing_rbbox if len(words[0].geometry) == 4 else resolve_enclosing_bbox
171
- geometry = box_resolution_fn([w.geometry for w in words]) # type: ignore[operator]
168
+ geometry = box_resolution_fn([w.geometry for w in words]) # type: ignore[misc]
172
169
 
173
170
  super().__init__(words=words)
174
171
  self.geometry = geometry
@@ -179,7 +176,7 @@ class Line(Element):
179
176
  return " ".join(w.render() for w in self.words)
180
177
 
181
178
  @classmethod
182
- def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
179
+ def from_dict(cls, save_dict: dict[str, Any], **kwargs):
183
180
  kwargs = {k: save_dict[k] for k in cls._exported_keys}
184
181
  kwargs.update({
185
182
  "words": [Word.from_dict(_dict) for _dict in save_dict["words"]],
@@ -202,7 +199,6 @@ class Block(Element):
202
199
  """Implements a block element as a collection of lines and artefacts
203
200
 
204
201
  Args:
205
- ----
206
202
  lines: list of line elements
207
203
  artefacts: list of artefacts
208
204
  geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
@@ -210,17 +206,17 @@ class Block(Element):
210
206
  all lines and artefacts in it.
211
207
  """
212
208
 
213
- _exported_keys: List[str] = ["geometry", "objectness_score"]
214
- _children_names: List[str] = ["lines", "artefacts"]
215
- lines: List[Line] = []
216
- artefacts: List[Artefact] = []
209
+ _exported_keys: list[str] = ["geometry", "objectness_score"]
210
+ _children_names: list[str] = ["lines", "artefacts"]
211
+ lines: list[Line] = []
212
+ artefacts: list[Artefact] = []
217
213
 
218
214
  def __init__(
219
215
  self,
220
- lines: List[Line] = [],
221
- artefacts: List[Artefact] = [],
222
- geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
223
- objectness_score: Optional[float] = None,
216
+ lines: list[Line] = [],
217
+ artefacts: list[Artefact] = [],
218
+ geometry: BoundingBox | np.ndarray | None = None,
219
+ objectness_score: float | None = None,
224
220
  ) -> None:
225
221
  # Compute the objectness score of the line
226
222
  if objectness_score is None:
@@ -232,7 +228,7 @@ class Block(Element):
232
228
  box_resolution_fn = (
233
229
  resolve_enclosing_rbbox if isinstance(lines[0].geometry, np.ndarray) else resolve_enclosing_bbox
234
230
  )
235
- geometry = box_resolution_fn(line_boxes + artefact_boxes) # type: ignore[operator]
231
+ geometry = box_resolution_fn(line_boxes + artefact_boxes) # type: ignore
236
232
 
237
233
  super().__init__(lines=lines, artefacts=artefacts)
238
234
  self.geometry = geometry
@@ -243,7 +239,7 @@ class Block(Element):
243
239
  return line_break.join(line.render() for line in self.lines)
244
240
 
245
241
  @classmethod
246
- def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
242
+ def from_dict(cls, save_dict: dict[str, Any], **kwargs):
247
243
  kwargs = {k: save_dict[k] for k in cls._exported_keys}
248
244
  kwargs.update({
249
245
  "lines": [Line.from_dict(_dict) for _dict in save_dict["lines"]],
@@ -256,7 +252,6 @@ class Page(Element):
256
252
  """Implements a page element as a collection of blocks
257
253
 
258
254
  Args:
259
- ----
260
255
  page: image encoded as a numpy array in uint8
261
256
  blocks: list of block elements
262
257
  page_idx: the index of the page in the input raw document
@@ -265,18 +260,18 @@ class Page(Element):
265
260
  language: a dictionary with the language value and confidence of the prediction
266
261
  """
267
262
 
268
- _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"]
269
- _children_names: List[str] = ["blocks"]
270
- blocks: List[Block] = []
263
+ _exported_keys: list[str] = ["page_idx", "dimensions", "orientation", "language"]
264
+ _children_names: list[str] = ["blocks"]
265
+ blocks: list[Block] = []
271
266
 
272
267
  def __init__(
273
268
  self,
274
269
  page: np.ndarray,
275
- blocks: List[Block],
270
+ blocks: list[Block],
276
271
  page_idx: int,
277
- dimensions: Tuple[int, int],
278
- orientation: Optional[Dict[str, Any]] = None,
279
- language: Optional[Dict[str, Any]] = None,
272
+ dimensions: tuple[int, int],
273
+ orientation: dict[str, Any] | None = None,
274
+ language: dict[str, Any] | None = None,
280
275
  ) -> None:
281
276
  super().__init__(blocks=blocks)
282
277
  self.page = page
@@ -310,22 +305,22 @@ class Page(Element):
310
305
  def synthesize(self, **kwargs) -> np.ndarray:
311
306
  """Synthesize the page from the predictions
312
307
 
313
- Returns
314
- -------
308
+ Args:
309
+ **kwargs: keyword arguments passed to the `synthesize_page` method
310
+
311
+ Returns:
315
312
  synthesized page
316
313
  """
317
314
  return synthesize_page(self.export(), **kwargs)
318
315
 
319
- def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> Tuple[bytes, ET.ElementTree]:
316
+ def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> tuple[bytes, ET.ElementTree]:
320
317
  """Export the page as XML (hOCR-format)
321
318
  convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md
322
319
 
323
320
  Args:
324
- ----
325
321
  file_title: the title of the XML file
326
322
 
327
323
  Returns:
328
- -------
329
324
  a tuple of the XML byte string, and its ElementTree
330
325
  """
331
326
  p_idx = self.page_idx
@@ -423,7 +418,7 @@ class Page(Element):
423
418
  return (ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr))
424
419
 
425
420
  @classmethod
426
- def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
421
+ def from_dict(cls, save_dict: dict[str, Any], **kwargs):
427
422
  kwargs = {k: save_dict[k] for k in cls._exported_keys}
428
423
  kwargs.update({"blocks": [Block.from_dict(block_dict) for block_dict in save_dict["blocks"]]})
429
424
  return cls(**kwargs)
@@ -433,7 +428,6 @@ class KIEPage(Element):
433
428
  """Implements a KIE page element as a collection of predictions
434
429
 
435
430
  Args:
436
- ----
437
431
  predictions: Dictionary with list of block elements for each detection class
438
432
  page: image encoded as a numpy array in uint8
439
433
  page_idx: the index of the page in the input raw document
@@ -442,18 +436,18 @@ class KIEPage(Element):
442
436
  language: a dictionary with the language value and confidence of the prediction
443
437
  """
444
438
 
445
- _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"]
446
- _children_names: List[str] = ["predictions"]
447
- predictions: Dict[str, List[Prediction]] = {}
439
+ _exported_keys: list[str] = ["page_idx", "dimensions", "orientation", "language"]
440
+ _children_names: list[str] = ["predictions"]
441
+ predictions: dict[str, list[Prediction]] = {}
448
442
 
449
443
  def __init__(
450
444
  self,
451
445
  page: np.ndarray,
452
- predictions: Dict[str, List[Prediction]],
446
+ predictions: dict[str, list[Prediction]],
453
447
  page_idx: int,
454
- dimensions: Tuple[int, int],
455
- orientation: Optional[Dict[str, Any]] = None,
456
- language: Optional[Dict[str, Any]] = None,
448
+ dimensions: tuple[int, int],
449
+ orientation: dict[str, Any] | None = None,
450
+ language: dict[str, Any] | None = None,
457
451
  ) -> None:
458
452
  super().__init__(predictions=predictions)
459
453
  self.page = page
@@ -492,25 +486,21 @@ class KIEPage(Element):
492
486
  """Synthesize the page from the predictions
493
487
 
494
488
  Args:
495
- ----
496
- **kwargs: keyword arguments passed to the matplotlib.pyplot.show method
489
+ **kwargs: keyword arguments passed to the `synthesize_kie_page` method
497
490
 
498
491
  Returns:
499
- -------
500
492
  synthesized page
501
493
  """
502
494
  return synthesize_kie_page(self.export(), **kwargs)
503
495
 
504
- def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> Tuple[bytes, ET.ElementTree]:
496
+ def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> tuple[bytes, ET.ElementTree]:
505
497
  """Export the page as XML (hOCR-format)
506
498
  convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md
507
499
 
508
500
  Args:
509
- ----
510
501
  file_title: the title of the XML file
511
502
 
512
503
  Returns:
513
- -------
514
504
  a tuple of the XML byte string, and its ElementTree
515
505
  """
516
506
  p_idx = self.page_idx
@@ -566,7 +556,7 @@ class KIEPage(Element):
566
556
  return ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr)
567
557
 
568
558
  @classmethod
569
- def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
559
+ def from_dict(cls, save_dict: dict[str, Any], **kwargs):
570
560
  kwargs = {k: save_dict[k] for k in cls._exported_keys}
571
561
  kwargs.update({
572
562
  "predictions": [Prediction.from_dict(predictions_dict) for predictions_dict in save_dict["predictions"]]
@@ -578,16 +568,15 @@ class Document(Element):
578
568
  """Implements a document element as a collection of pages
579
569
 
580
570
  Args:
581
- ----
582
571
  pages: list of page elements
583
572
  """
584
573
 
585
- _children_names: List[str] = ["pages"]
586
- pages: List[Page] = []
574
+ _children_names: list[str] = ["pages"]
575
+ pages: list[Page] = []
587
576
 
588
577
  def __init__(
589
578
  self,
590
- pages: List[Page],
579
+ pages: list[Page],
591
580
  ) -> None:
592
581
  super().__init__(pages=pages)
593
582
 
@@ -600,30 +589,30 @@ class Document(Element):
600
589
  for result in self.pages:
601
590
  result.show(**kwargs)
602
591
 
603
- def synthesize(self, **kwargs) -> List[np.ndarray]:
592
+ def synthesize(self, **kwargs) -> list[np.ndarray]:
604
593
  """Synthesize all pages from their predictions
605
594
 
606
- Returns
607
- -------
595
+ Args:
596
+ **kwargs: keyword arguments passed to the `Page.synthesize` method
597
+
598
+ Returns:
608
599
  list of synthesized pages
609
600
  """
610
- return [page.synthesize() for page in self.pages]
601
+ return [page.synthesize(**kwargs) for page in self.pages]
611
602
 
612
- def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]:
603
+ def export_as_xml(self, **kwargs) -> list[tuple[bytes, ET.ElementTree]]:
613
604
  """Export the document as XML (hOCR-format)
614
605
 
615
606
  Args:
616
- ----
617
607
  **kwargs: additional keyword arguments passed to the Page.export_as_xml method
618
608
 
619
609
  Returns:
620
- -------
621
610
  list of tuple of (bytes, ElementTree)
622
611
  """
623
612
  return [page.export_as_xml(**kwargs) for page in self.pages]
624
613
 
625
614
  @classmethod
626
- def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
615
+ def from_dict(cls, save_dict: dict[str, Any], **kwargs):
627
616
  kwargs = {k: save_dict[k] for k in cls._exported_keys}
628
617
  kwargs.update({"pages": [Page.from_dict(page_dict) for page_dict in save_dict["pages"]]})
629
618
  return cls(**kwargs)
@@ -633,15 +622,14 @@ class KIEDocument(Document):
633
622
  """Implements a document element as a collection of pages
634
623
 
635
624
  Args:
636
- ----
637
625
  pages: list of page elements
638
626
  """
639
627
 
640
- _children_names: List[str] = ["pages"]
641
- pages: List[KIEPage] = [] # type: ignore[assignment]
628
+ _children_names: list[str] = ["pages"]
629
+ pages: list[KIEPage] = [] # type: ignore[assignment]
642
630
 
643
631
  def __init__(
644
632
  self,
645
- pages: List[KIEPage],
633
+ pages: list[KIEPage],
646
634
  ) -> None:
647
635
  super().__init__(pages=pages) # type: ignore[arg-type]
doctr/io/html.py CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2021-2024, Mindee.
1
+ # Copyright (C) 2021-2025, Mindee.
2
2
 
3
3
  # This program is licensed under the Apache License 2.0.
4
4
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -15,12 +15,10 @@ def read_html(url: str, **kwargs: Any) -> bytes:
15
15
  >>> doc = read_html("https://www.yoursite.com")
16
16
 
17
17
  Args:
18
- ----
19
18
  url: URL of the target web page
20
19
  **kwargs: keyword arguments from `weasyprint.HTML`
21
20
 
22
21
  Returns:
23
- -------
24
22
  decoded PDF file as a bytes stream
25
23
  """
26
24
  from weasyprint import HTML
@@ -2,7 +2,7 @@ from doctr.file_utils import is_tf_available, is_torch_available
2
2
 
3
3
  from .base import *
4
4
 
5
- if is_tf_available():
6
- from .tensorflow import *
7
- elif is_torch_available():
5
+ if is_torch_available():
8
6
  from .pytorch import *
7
+ elif is_tf_available():
8
+ from .tensorflow import *
doctr/io/image/base.py CHANGED
@@ -1,10 +1,9 @@
1
- # Copyright (C) 2021-2024, Mindee.
1
+ # Copyright (C) 2021-2025, Mindee.
2
2
 
3
3
  # This program is licensed under the Apache License 2.0.
4
4
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
5
 
6
6
  from pathlib import Path
7
- from typing import Optional, Tuple
8
7
 
9
8
  import cv2
10
9
  import numpy as np
@@ -16,7 +15,7 @@ __all__ = ["read_img_as_numpy"]
16
15
 
17
16
  def read_img_as_numpy(
18
17
  file: AbstractFile,
19
- output_size: Optional[Tuple[int, int]] = None,
18
+ output_size: tuple[int, int] | None = None,
20
19
  rgb_output: bool = True,
21
20
  ) -> np.ndarray:
22
21
  """Read an image file into numpy format
@@ -25,13 +24,11 @@ def read_img_as_numpy(
25
24
  >>> page = read_img_as_numpy("path/to/your/doc.jpg")
26
25
 
27
26
  Args:
28
- ----
29
27
  file: the path to the image file
30
28
  output_size: the expected output size of each page in format H x W
31
29
  rgb_output: whether the output ndarray channel order should be RGB instead of BGR.
32
30
 
33
31
  Returns:
34
- -------
35
32
  the page decoded as numpy ndarray of shape H x W x 3
36
33
  """
37
34
  if isinstance(file, (str, Path)):
doctr/io/image/pytorch.py CHANGED
@@ -1,10 +1,9 @@
1
- # Copyright (C) 2021-2024, Mindee.
1
+ # Copyright (C) 2021-2025, Mindee.
2
2
 
3
3
  # This program is licensed under the Apache License 2.0.
4
4
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
5
 
6
6
  from io import BytesIO
7
- from typing import Tuple
8
7
 
9
8
  import numpy as np
10
9
  import torch
@@ -20,12 +19,10 @@ def tensor_from_pil(pil_img: Image.Image, dtype: torch.dtype = torch.float32) ->
20
19
  """Convert a PIL Image to a PyTorch tensor
21
20
 
22
21
  Args:
23
- ----
24
22
  pil_img: a PIL image
25
23
  dtype: the output tensor data type
26
24
 
27
25
  Returns:
28
- -------
29
26
  decoded image as tensor
30
27
  """
31
28
  if dtype == torch.float32:
@@ -40,12 +37,10 @@ def read_img_as_tensor(img_path: AbstractPath, dtype: torch.dtype = torch.float3
40
37
  """Read an image file as a PyTorch tensor
41
38
 
42
39
  Args:
43
- ----
44
40
  img_path: location of the image file
45
41
  dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
46
42
 
47
43
  Returns:
48
- -------
49
44
  decoded image as a tensor
50
45
  """
51
46
  if dtype not in (torch.uint8, torch.float16, torch.float32):
@@ -59,12 +54,10 @@ def decode_img_as_tensor(img_content: bytes, dtype: torch.dtype = torch.float32)
59
54
  """Read a byte stream as a PyTorch tensor
60
55
 
61
56
  Args:
62
- ----
63
57
  img_content: bytes of a decoded image
64
58
  dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
65
59
 
66
60
  Returns:
67
- -------
68
61
  decoded image as a tensor
69
62
  """
70
63
  if dtype not in (torch.uint8, torch.float16, torch.float32):
@@ -78,12 +71,10 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: torch.dtype = torch.float32) -
78
71
  """Read an image file as a PyTorch tensor
79
72
 
80
73
  Args:
81
- ----
82
74
  npy_img: image encoded as a numpy array of shape (H, W, C) in np.uint8
83
75
  dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
84
76
 
85
77
  Returns:
86
- -------
87
78
  same image as a tensor of shape (C, H, W)
88
79
  """
89
80
  if dtype not in (torch.uint8, torch.float16, torch.float32):
@@ -102,6 +93,6 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: torch.dtype = torch.float32) -
102
93
  return img
103
94
 
104
95
 
105
- def get_img_shape(img: torch.Tensor) -> Tuple[int, int]:
96
+ def get_img_shape(img: torch.Tensor) -> tuple[int, int]:
106
97
  """Get the shape of an image"""
107
- return img.shape[-2:] # type: ignore[return-value]
98
+ return img.shape[-2:]
@@ -1,9 +1,8 @@
1
- # Copyright (C) 2021-2024, Mindee.
1
+ # Copyright (C) 2021-2025, Mindee.
2
2
 
3
3
  # This program is licensed under the Apache License 2.0.
4
4
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
5
 
6
- from typing import Tuple
7
6
 
8
7
  import numpy as np
9
8
  import tensorflow as tf
@@ -19,12 +18,10 @@ def tensor_from_pil(pil_img: Image.Image, dtype: tf.dtypes.DType = tf.float32) -
19
18
  """Convert a PIL Image to a TensorFlow tensor
20
19
 
21
20
  Args:
22
- ----
23
21
  pil_img: a PIL image
24
22
  dtype: the output tensor data type
25
23
 
26
24
  Returns:
27
- -------
28
25
  decoded image as tensor
29
26
  """
30
27
  npy_img = img_to_array(pil_img)
@@ -36,12 +33,10 @@ def read_img_as_tensor(img_path: AbstractPath, dtype: tf.dtypes.DType = tf.float
36
33
  """Read an image file as a TensorFlow tensor
37
34
 
38
35
  Args:
39
- ----
40
36
  img_path: location of the image file
41
37
  dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
42
38
 
43
39
  Returns:
44
- -------
45
40
  decoded image as a tensor
46
41
  """
47
42
  if dtype not in (tf.uint8, tf.float16, tf.float32):
@@ -61,12 +56,10 @@ def decode_img_as_tensor(img_content: bytes, dtype: tf.dtypes.DType = tf.float32
61
56
  """Read a byte stream as a TensorFlow tensor
62
57
 
63
58
  Args:
64
- ----
65
59
  img_content: bytes of a decoded image
66
60
  dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
67
61
 
68
62
  Returns:
69
- -------
70
63
  decoded image as a tensor
71
64
  """
72
65
  if dtype not in (tf.uint8, tf.float16, tf.float32):
@@ -85,12 +78,10 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: tf.dtypes.DType = tf.float32)
85
78
  """Read an image file as a TensorFlow tensor
86
79
 
87
80
  Args:
88
- ----
89
81
  npy_img: image encoded as a numpy array of shape (H, W, C) in np.uint8
90
82
  dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
91
83
 
92
84
  Returns:
93
- -------
94
85
  same image as a tensor of shape (H, W, C)
95
86
  """
96
87
  if dtype not in (tf.uint8, tf.float16, tf.float32):
@@ -105,6 +96,6 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: tf.dtypes.DType = tf.float32)
105
96
  return img
106
97
 
107
98
 
108
- def get_img_shape(img: tf.Tensor) -> Tuple[int, int]:
99
+ def get_img_shape(img: tf.Tensor) -> tuple[int, int]:
109
100
  """Get the shape of an image"""
110
101
  return img.shape[:2]
doctr/io/pdf.py CHANGED
@@ -1,9 +1,9 @@
1
- # Copyright (C) 2021-2024, Mindee.
1
+ # Copyright (C) 2021-2025, Mindee.
2
2
 
3
3
  # This program is licensed under the Apache License 2.0.
4
4
  # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
5
 
6
- from typing import Any, List, Optional
6
+ from typing import Any
7
7
 
8
8
  import numpy as np
9
9
  import pypdfium2 as pdfium
@@ -15,18 +15,17 @@ __all__ = ["read_pdf"]
15
15
 
16
16
  def read_pdf(
17
17
  file: AbstractFile,
18
- scale: float = 2,
18
+ scale: int = 2,
19
19
  rgb_mode: bool = True,
20
- password: Optional[str] = None,
20
+ password: str | None = None,
21
21
  **kwargs: Any,
22
- ) -> List[np.ndarray]:
22
+ ) -> list[np.ndarray]:
23
23
  """Read a PDF file and convert it into an image in numpy format
24
24
 
25
25
  >>> from doctr.io import read_pdf
26
26
  >>> doc = read_pdf("path/to/your/doc.pdf")
27
27
 
28
28
  Args:
29
- ----
30
29
  file: the path to the PDF file
31
30
  scale: rendering scale (1 corresponds to 72dpi)
32
31
  rgb_mode: if True, the output will be RGB, otherwise BGR
@@ -34,7 +33,6 @@ def read_pdf(
34
33
  **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`
35
34
 
36
35
  Returns:
37
- -------
38
36
  the list of pages decoded as numpy ndarray of shape H x W x C
39
37
  """
40
38
  # Rasterise pages to numpy ndarrays with pypdfium2