deepdoctection 0.42.1__py3-none-any.whl → 0.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (124)
  1. deepdoctection/__init__.py +2 -1
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +904 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +157 -106
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +196 -113
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +25 -17
  104. deepdoctection/utils/env_info.py +85 -36
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -62
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.dist-info/RECORD +149 -0
  119. deepdoctection/analyzer/_config.py +0 -146
  120. deepdoctection-0.42.1.dist-info/METADATA +0 -431
  121. deepdoctection-0.42.1.dist-info/RECORD +0 -148
  122. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/WHEEL +0 -0
  123. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
@@ -16,7 +16,8 @@
16
16
  # limitations under the License.
17
17
 
18
18
  """
19
- Class AttrDict for maintaining configs and some functions for generating and saving AttrDict instances to .yaml files
19
+ Class `AttrDict` for maintaining configs and some functions for generating and saving `AttrDict` instances to
20
+ `.yaml` files
20
21
  """
21
22
  from __future__ import annotations
22
23
 
@@ -32,7 +33,12 @@ from .types import PathLikeOrStr
32
33
  # Licensed under the Apache License, Version 2.0 (the "License")
33
34
  class AttrDict:
34
35
  """
35
- Class for storing key,values as instance with attributes and values.
36
+ Class `AttrDict` for maintaining configs and some functions for generating and saving `AttrDict` instances to
37
+ `.yaml` files.
38
+
39
+ Info:
40
+ This module provides a class for storing key-value pairs as attributes and functions for serializing and
41
+ deserializing configurations.
36
42
  """
37
43
 
38
44
  _freezed = False
@@ -41,7 +47,17 @@ class AttrDict:
41
47
 
42
48
  def __getattr__(self, name: str) -> Any:
43
49
  """
44
- __getattr__
50
+ Returns the attribute value for `name`. If the attribute does not exist and the instance is not frozen, a new
51
+ `AttrDict` is created and assigned.
52
+
53
+ Args:
54
+ name: The name of the attribute.
55
+
56
+ Returns:
57
+ The value of the attribute.
58
+
59
+ Raises:
60
+ AttributeError: If the instance is frozen or the attribute name starts with `_`.
45
61
  """
46
62
  if self._freezed:
47
63
  raise AttributeError(name)
@@ -54,29 +70,47 @@ class AttrDict:
54
70
 
55
71
  def __setattr__(self, name: str, value: Any) -> None:
56
72
  """
57
- __setattr__
73
+ Sets the attribute `name` to `value`.
74
+
75
+ Args:
76
+ name: The name of the attribute.
77
+ value: The value to set.
78
+
79
+ Raises:
80
+ AttributeError: If the instance is frozen and `name` is not `_freezed`.
58
81
  """
59
- if self._freezed and name not in self.__dict__:
82
+ if self._freezed and name != "_freezed":
60
83
  raise AttributeError(f"Config was freezed! Unknown config: {name}")
61
84
  super().__setattr__(name, value)
62
85
 
63
86
  def __str__(self) -> str:
64
87
  """
65
- __str__
88
+ Returns a pretty-printed string representation of the configuration.
89
+
90
+ Returns:
91
+ A string representation of the configuration.
66
92
  """
67
93
  return pprint.pformat(self.to_dict(), width=100, compact=True)
68
94
 
69
95
  __repr__ = __str__
70
96
 
71
97
  def to_dict(self) -> dict[str, Any]:
72
- """Convert to a nested dict."""
98
+ """
99
+ Convert to a nested dict.
100
+
101
+ Returns:
102
+ A dictionary representation of the configuration.
103
+ """
73
104
  return {
74
105
  k: v.to_dict() if isinstance(v, AttrDict) else v for k, v in self.__dict__.items() if not k.startswith("_")
75
106
  }
76
107
 
77
108
  def from_dict(self, d: dict[str, Any]) -> None: # pylint: disable=C0103
78
109
  """
79
- Generate an instance from a dict
110
+ Generate an instance from a dict.
111
+
112
+ Args:
113
+ d: The dictionary to load values from.
80
114
  """
81
115
  if isinstance(d, dict):
82
116
  self.freeze(False)
@@ -90,6 +124,9 @@ class AttrDict:
90
124
  def update_args(self, args: list[str]) -> None:
91
125
  """
92
126
  Update from command line args.
127
+
128
+ Args:
129
+ args: A list of command line arguments in the form `key1.key2=val`.
93
130
  """
94
131
  for cfg in args:
95
132
  keys, v = cfg.split("=", maxsplit=1) # pylint: disable=C0103
@@ -110,8 +147,11 @@ class AttrDict:
110
147
  """
111
148
  Overwrite the current config with values from another config.
112
149
 
113
- :param other_config: The other AttrDict instance to copy values from.
114
- :raises AttributeError: If a key from other_config is not an attribute of self.
150
+ Args:
151
+ other_config: The other `AttrDict` instance to copy values from.
152
+
153
+ Raises:
154
+ AttributeError: If the config is frozen.
115
155
  """
116
156
  if self._freezed:
117
157
  raise AttributeError("Config was freezed! Cannot overwrite config.")
@@ -119,7 +159,10 @@ class AttrDict:
119
159
 
120
160
  def freeze(self, freezed: bool = True) -> None:
121
161
  """
122
- :param freezed: freeze the instance, so that no attributes can be added or changed
162
+ Freeze or unfreeze the instance, so that no attributes can be added or changed.
163
+
164
+ Args:
165
+ freezed: Whether to freeze the instance.
123
166
  """
124
167
  self._freezed = freezed
125
168
  for v in self.__dict__.values(): # pylint: disable=C0103
@@ -136,9 +179,13 @@ class AttrDict:
136
179
 
137
180
  def set_config_by_yaml(path_yaml: PathLikeOrStr) -> AttrDict:
138
181
  """
139
- Use to initialize the config class for tensorpack faster rcnn
182
+ Initialize the config class from a YAML file.
183
+
184
+ Args:
185
+ path_yaml: The path to the YAML file.
140
186
 
141
- :param path_yaml: The path to the file
187
+ Returns:
188
+ An `AttrDict` instance initialized from the YAML file.
142
189
  """
143
190
  config = AttrDict()
144
191
  _C = config # pylint: disable=C0103
@@ -153,9 +200,17 @@ def set_config_by_yaml(path_yaml: PathLikeOrStr) -> AttrDict:
153
200
 
154
201
  def save_config_to_yaml(config: AttrDict, path_yaml: PathLikeOrStr) -> None:
155
202
  """
156
- :param config: The configuration instance as an AttrDict
157
- :param path_yaml: Save the config class for tensorpack faster rcnn
158
- :return: yaml_path: The path to save the file to
203
+ Save the configuration instance as a YAML file.
204
+
205
+ Example:
206
+ ```python
207
+ save_config_to_yaml(config, "config.yaml")
208
+ ```
209
+
210
+ Args:
211
+ config: The configuration instance as an `AttrDict`.
212
+ path_yaml: The path to save the YAML file to.
213
+
159
214
  """
160
215
 
161
216
  with open(path_yaml, "w") as file: # pylint: disable=W1514
@@ -164,12 +219,19 @@ def save_config_to_yaml(config: AttrDict, path_yaml: PathLikeOrStr) -> None:
164
219
 
165
220
  def config_to_cli_str(config: AttrDict, *exclude: str) -> str:
166
221
  """
167
- Transform an AttrDict to a string that can be passed to a cli. Add optionally keys of the config that should not be
168
- added to the string.
222
+ Transform an `AttrDict` to a string that can be passed to a CLI. Optionally exclude keys from the string.
223
+
224
+ Example:
225
+ ```python
226
+ config_to_cli_str(config, "key1", "key2")
227
+ ```
228
+
229
+ Args:
230
+ config: An `AttrDict`.
231
+ *exclude: Keys of the `AttrDict` to exclude.
169
232
 
170
- :param config: An `AttrDict`
171
- :param exclude: keys of the AttrDict
172
- :return: A string that can be passed to a cli
233
+ Returns:
234
+ A string that can be passed to a CLI.
173
235
  """
174
236
 
175
237
  config_dict = config.to_dict()
@@ -16,8 +16,9 @@
16
16
  # limitations under the License.
17
17
 
18
18
  """
19
- Module with pdf processing tools
19
+ Pdf processing tools
20
20
  """
21
+
21
22
  import os
22
23
  import platform
23
24
  import subprocess
@@ -57,17 +58,20 @@ __all__ = [
57
58
 
58
59
  def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
59
60
  """
60
- Decrypting a pdf. As copying a pdf document removes the password that protects pdf, this method
61
- generates a copy and decrypts the copy using qpdf. The result is saved as the original
62
- document.
61
+ Decrypt a PDF file.
62
+
63
+ As copying a PDF document removes the password that protects the PDF, this method generates a copy and decrypts the
64
+ copy using `qpdf`. The result is saved as the original document.
63
65
 
64
- qpdf: <http://qpdf.sourceforge.net/>
66
+ Note:
67
+ This decryption does not work if the PDF has a readable protection, in which case no solution is provided.
68
+ `qpdf`: <http://qpdf.sourceforge.net/>
65
69
 
66
- Note, that this is decryption does not work, if the pdf has a readable protection, in which case we do not
67
- provide any solution.
70
+ Args:
71
+ path: A path to the PDF file.
68
72
 
69
- :param path: A path to the pdf file
70
- :return: True if document has been successfully decrypted
73
+ Returns:
74
+ True if the document has been successfully decrypted.
71
75
  """
72
76
  if qpdf_available():
73
77
  path_base, file_name = os.path.split(path)
@@ -88,12 +92,20 @@ def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
88
92
 
89
93
  def decrypt_pdf_document_from_bytes(input_bytes: bytes) -> bytes:
90
94
  """
91
- Decrypting a pdf given as bytes. Under the hood, it saves the bytes to a temporary file and then calls
95
+ Decrypt a PDF given as bytes.
96
+
97
+ Under the hood, it saves the bytes to a temporary file and then calls `decrypt_pdf_document`.
98
+
99
+ Note:
100
+ `qpdf`: <http://qpdf.sourceforge.net/>
101
+
102
+ Args:
103
+ input_bytes: A bytes object representing the PDF file.
104
+
105
+ Returns:
106
+ The decrypted bytes object.
92
107
 
93
- qpdf: <http://qpdf.sourceforge.net/>
94
108
 
95
- :param input_bytes: A bytes object representing the pdf file
96
- :return: The decrypted bytes object
97
109
  """
98
110
  with save_tmp_file(input_bytes, "pdf_") as (_, input_file_name):
99
111
  is_decrypted = decrypt_pdf_document(input_file_name)
@@ -107,11 +119,16 @@ def decrypt_pdf_document_from_bytes(input_bytes: bytes) -> bytes:
107
119
 
108
120
  def get_pdf_file_reader(path_or_bytes: Union[PathLikeOrStr, bytes]) -> PdfReader:
109
121
  """
110
- Creates a file reader object from a pdf document. Will try to decrypt the document if it is
111
- encrypted. (See `decrypt_pdf_document` to understand what is meant with "decrypt").
122
+ Create a file reader object from a PDF document.
123
+
124
+ Will try to decrypt the document if it is encrypted. (See `decrypt_pdf_document` to understand what is meant by
125
+ "decrypt").
112
126
 
113
- :param path_or_bytes: A path to a pdf document
114
- :return: A file reader object from which you can iterate through the document.
127
+ Args:
128
+ path_or_bytes: A path to a PDF document or bytes.
129
+
130
+ Returns:
131
+ A file reader object from which you can iterate through the document.
115
132
  """
116
133
 
117
134
  if isinstance(path_or_bytes, bytes):
@@ -153,39 +170,47 @@ def get_pdf_file_reader(path_or_bytes: Union[PathLikeOrStr, bytes]) -> PdfReader
153
170
 
154
171
  def get_pdf_file_writer() -> PdfWriter:
155
172
  """
156
- `PdfWriter` instance
173
+ `PdfWriter` instance.
174
+
175
+ Returns:
176
+ A new `PdfWriter` instance.
157
177
  """
158
178
  return PdfWriter()
159
179
 
160
180
 
161
181
  class PDFStreamer:
162
182
  """
163
- A class for streaming pdf documents as bytes objects. Build as a generator, it is possible to load the document
164
- iteratively into memory. Uses py2pdf FileReader and FileWriter.
165
-
166
- **Example:**
167
-
168
- # Building a Dataflow with a PDFStreamer
169
- df = dataflow.DataFromIterable(PDFStreamer(path=path))
170
- df.reset_state()
171
-
172
- for page in df:
173
- ... # do whatever you like
174
-
175
- # Something else you can do:
176
- streamer = PDFStreamer(path=path)
177
- pages = len(streamer) # get the number of pages
178
- random_int = random.sample(range(0, pages), 2) # select some pages
179
- for ran in random_int:
180
- pdf_bytes = streamer[ran] # get the page bytes directly
181
-
182
- streamer.close() # Do not forget to close the streamer, otherwise the file will never be closed and might
183
- # cause memory leaks if you open many files.
183
+ A class for streaming PDF documents as bytes objects.
184
+
185
+ Built as a generator, it is possible to load the document iteratively into memory. Uses `pypdf` `PdfReader` and
186
+ `PdfWriter`.
187
+
188
+ Example:
189
+ ```python
190
+ df = dataflow.DataFromIterable(PDFStreamer(path=path))
191
+ df.reset_state()
192
+ for page in df:
193
+ ...
194
+ streamer = PDFStreamer(path=path)
195
+ pages = len(streamer)
196
+ random_int = random.sample(range(0, pages), 2)
197
+ for ran in random_int:
198
+ pdf_bytes = streamer[ran]
199
+ streamer.close()
200
+ ```
201
+
202
+ Note:
203
+ Do not forget to close the streamer, otherwise the file will never be closed and might cause memory leaks if
204
+ you open many files.
184
205
  """
185
206
 
186
207
  def __init__(self, path_or_bytes: Union[PathLikeOrStr, bytes]) -> None:
187
208
  """
188
- :param path_or_bytes: to a pdf.
209
+ Args:
210
+ path_or_bytes: Path to a PDF.
211
+
212
+ Returns:
213
+ None.
189
214
  """
190
215
  self.file_reader = get_pdf_file_reader(path_or_bytes)
191
216
  self.file_writer = PdfWriter()
@@ -256,10 +281,15 @@ def _input_to_cli_str(
256
281
 
257
282
  class PopplerError(RuntimeError):
258
283
  """
259
- Poppler Error
284
+ Poppler Error.
260
285
  """
261
286
 
262
287
  def __init__(self, status: int, message: str) -> None:
288
+ """
289
+ Args:
290
+ status: Error status code.
291
+ message: Error message.
292
+ """
263
293
  super().__init__()
264
294
  self.status = status
265
295
  self.message = message
@@ -283,13 +313,20 @@ def pdf_to_np_array_poppler(
283
313
  pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: Optional[int] = None
284
314
  ) -> PixelValues:
285
315
  """
286
- Convert a single pdf page from its byte representation to a numpy array. This function will save the pdf as to a tmp
287
- file and then call poppler via `pdftoppm` resp. `pdftocairo` if the former is not available.
316
+ Convert a single PDF page from its byte representation to a numpy array using Poppler.
317
+
318
+ This function will save the PDF as a temporary file and then call Poppler via `pdftoppm` or `pdftocairo`.
288
319
 
289
- :param pdf_bytes: Bytes representing the PDF file
290
- :param size: Size of the resulting image(s), uses (width, height) standard
291
- :param dpi: Image quality in DPI/dots-per-inch (default 200)
292
- :return: numpy array
320
+ Raises:
321
+ ValueError: If neither `dpi` nor `size` is provided.
322
+
323
+ Args:
324
+ pdf_bytes: Bytes representing the PDF file.
325
+ size: Size of the resulting image(s), as (width, height).
326
+ dpi: Image quality in DPI/dots-per-inch.
327
+
328
+ Returns:
329
+ `np.array`.
293
330
  """
294
331
  if dpi is None and size is None:
295
332
  raise ValueError("Either dpi or size must be provided.")
@@ -302,11 +339,17 @@ def pdf_to_np_array_poppler(
302
339
 
303
340
  def pdf_to_np_array_pdfmium(pdf_bytes: bytes, dpi: Optional[int] = None) -> PixelValues:
304
341
  """
305
- Convert a single pdf page from its byte representation to a numpy array using pdfium.
342
+ Convert a single PDF page from its byte representation to a numpy array using pdfium.
343
+
344
+ Args:
345
+ pdf_bytes: Bytes representing the PDF file.
346
+ dpi: Image quality in DPI/dots-per-inch.
306
347
 
307
- :param pdf_bytes: Bytes representing the PDF file
308
- :param dpi: Image quality in DPI/dots-per-inch (default 200)
309
- :return: numpy array
348
+ Returns:
349
+ `np.array`.
350
+
351
+ Raises:
352
+ ValueError: If `dpi` is not provided.
310
353
  """
311
354
  if dpi is None:
312
355
  raise ValueError("dpi must be provided.")
@@ -316,13 +359,21 @@ def pdf_to_np_array_pdfmium(pdf_bytes: bytes, dpi: Optional[int] = None) -> Pixe
316
359
 
317
360
  def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: Optional[int] = None) -> PixelValues:
318
361
  """
319
- Convert a single pdf page from its byte representation to a numpy array. This function will either use Poppler or
320
- pdfium to render the pdf.
362
+ Convert a single PDF page from its byte representation to a `np.array`.
363
+
364
+ This function will either use Poppler or pdfium to render the PDF.
365
+
366
+ Args:
367
+ pdf_bytes: Bytes representing the PDF file.
368
+ size: Size of the resulting image(s), as (width, height).
369
+ dpi: Image quality in DPI/dots-per-inch.
321
370
 
322
- :param pdf_bytes: Bytes representing the PDF file
323
- :param size: Size of the resulting image(s), uses (width, height) standard
324
- :param dpi: Image quality in DPI/dots-per-inch (default 200)
325
- :return: numpy array
371
+ Returns:
372
+ `np.array`.
373
+
374
+ Note:
375
+ If `USE_DD_PDFIUM` is set, `pdf_to_np_array_pdfmium` does not support the `size` parameter and will use
376
+ `dpi` instead.
326
377
  """
327
378
  if os.environ.get("USE_DD_PDFIUM", "False") in ENV_VARS_TRUE:
328
379
  if size is not None:
@@ -339,12 +390,18 @@ def split_pdf(
339
390
  pdf_path: PathLikeOrStr, output_dir: PathLikeOrStr, file_type: Literal["image", "pdf"], dpi: int = 200
340
391
  ) -> None:
341
392
  """
342
- Split a pdf into single pages. The pages are saved as single pdf/png files in a subfolder of the output directory.
393
+ Split a PDF into single pages.
394
+
395
+ The pages are saved as single PDF or PNG files in a subfolder of the output directory.
396
+
397
+ Args:
398
+ pdf_path: Path to the PDF file.
399
+ output_dir: Path to the output directory.
400
+ file_type: Type of the output file. Either "image" or "pdf".
401
+ dpi: Image quality in DPI/dots-per-inch.
343
402
 
344
- :param pdf_path: Path to the pdf file
345
- :param output_dir: Path to the output directory
346
- :param file_type: Type of the output file. Either "image" or "pdf"
347
- :param dpi: Image quality in DPI/dots-per-inch (default
403
+ Returns:
404
+ None.
348
405
  """
349
406
  pdf_path = Path(pdf_path)
350
407
  filename = pdf_path.stem
@@ -79,6 +79,7 @@ class SummaryType(ObjectTypes):
79
79
  """Summary type member"""
80
80
 
81
81
  SUMMARY = "summary"
82
+ DOCUMENT_SUMMARY = "document_summary"
82
83
 
83
84
 
84
85
  @object_types_registry.register("DocumentType")
@@ -329,12 +330,18 @@ _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG = {
329
330
 
330
331
  def token_class_tag_to_token_class_with_tag(token: ObjectTypes, tag: ObjectTypes) -> ObjectTypes:
331
332
  """
332
- Mapping TokenClassWithTag enum member from token class and tag, e.g. `TokenClasses.header` and `BioTag.inside`
333
- maps to TokenClassWithTag.i_header.
333
+ Maps a `TokenClassWithTag` enum member from a token class and tag, e.g. `TokenClasses.header` and `BioTag.inside`
334
+ maps to `TokenClassWithTag.i_header`.
334
335
 
335
- :param token: TokenClasses member
336
- :param tag: BioTag member
337
- :return: TokenClassWithTag member
336
+ Args:
337
+ token: TokenClasses member.
338
+ tag: BioTag member.
339
+
340
+ Returns:
341
+ TokenClassWithTag member.
342
+
343
+ Raises:
344
+ TypeError: If token is not of type TokenClasses or tag is not of type BioTag.
338
345
  """
339
346
  if isinstance(token, TokenClasses) and isinstance(tag, BioTag):
340
347
  return _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG[(token, tag)]
@@ -349,8 +356,11 @@ def token_class_with_tag_to_token_class_and_tag(
349
356
  """
350
357
  This is the reverse mapping from TokenClassWithTag members to TokenClasses and BioTag
351
358
 
352
- :param token_class_with_tag: TokenClassWithTag member
353
- :return: Tuple of TokenClasses member and BioTag member
359
+ Args:
360
+ token_class_with_tag: `TokenClassWithTag` member
361
+
362
+ Returns:
363
+ Tuple of `TokenClasses` member and `BioTag` member
354
364
  """
355
365
  return {val: key for key, val in _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG.items()}.get(token_class_with_tag)
356
366
 
@@ -405,10 +415,13 @@ def update_black_list(item: str) -> None:
405
415
 
406
416
 
407
417
  def get_type(obj_type: Union[str, ObjectTypes]) -> ObjectTypes:
408
- """Get an object type property from a given string. Does nothing if an ObjectType is passed
418
+ """
419
+ Get an object type property from a given string. Does nothing if an `ObjectTypes` member is passed
409
420
 
410
- :param obj_type: String or ObjectTypes
411
- :return: ObjectType
421
+ Args:
422
+ obj_type: String or ObjectTypes
423
+ Returns:
424
+ `ObjectTypes`
412
425
  """
413
426
  if isinstance(obj_type, ObjectTypes):
414
427
  return obj_type
@@ -435,6 +448,7 @@ if os.environ.get("DEEPDOCTECTION_CACHE"):
435
448
  else:
436
449
  dd_cache_home = Path(os.getenv("XDG_CACHE_HOME", Path.home() / ".cache")) / "deepdoctection"
437
450
 
451
+ CACHE_DIR = dd_cache_home
438
452
  MODEL_DIR = dd_cache_home / "weights"
439
453
 
440
454
  # configs cache directory
@@ -32,8 +32,10 @@ def get_tqdm_default_kwargs(
32
32
  **kwargs: Optional[Union[str, int, float]]
33
33
  ) -> Dict[str, Union[str, float, bool, int, None]]:
34
34
  """
35
- Return default arguments to be used with tqdm.
36
- :param kwargs: extra arguments to be used.
35
+ Return default arguments to be used with `tqdm`.
36
+
37
+ Args:
38
+ kwargs: extra arguments to be used.
37
39
  """
38
40
 
39
41
  return {
@@ -49,10 +51,13 @@ def get_tqdm_default_kwargs(
49
51
 
50
52
  def get_tqdm(total: Optional[Union[int, float]] = None, **kwargs: Union[str, int, float]) -> TqdmType:
51
53
  """
52
- Get tqdm progress bar with some default options to have consistent style.
54
+ Get `tqdm` progress bar with some default options to have consistent style.
55
+
56
+ Args:
57
+ total: The number of expected iterations.
53
58
 
54
- :param total: The number of expected iterations.
55
- :return: A tqdm instance
59
+ Returns:
60
+ A `tqdm` instance
56
61
  """
57
62
 
58
63
  default_tqdm_setting = get_tqdm_default_kwargs(total=total)