deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic.

Files changed (124)
  1. deepdoctection/__init__.py +4 -2
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +919 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +162 -108
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +205 -119
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +26 -17
  104. deepdoctection/utils/env_info.py +86 -37
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -71
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.1.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.1.dist-info/RECORD +149 -0
  119. deepdoctection/analyzer/_config.py +0 -146
  120. deepdoctection-0.42.1.dist-info/METADATA +0 -431
  121. deepdoctection-0.42.1.dist-info/RECORD +0 -148
  122. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/WHEEL +0 -0
  123. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/top_level.txt +0 -0
@@ -41,53 +41,64 @@ from .registry import pipeline_component_registry
 @pipeline_component_registry.register("ImageCroppingService")
 class MultiThreadPipelineComponent(PipelineComponent):
     """
-    Running a pipeline component in multiple thread to increase through put. Datapoints will be queued
-    and processed once calling the `start`.
+    This module provides functionality for running pipeline components in multiple threads to increase throughput.
+    Datapoints are queued and processed once calling the `start` method.
 
-    The number of threads is derived from the list of pipeline components. It makes no sense to create the various
-    components.
+    Note:
+        The number of threads is derived from the list of `pipeline_components`. It makes no sense to create various
+        components.
 
-    Think of the pipeline component as an asynchronous process. Because the entire data flow is loaded into memory
-    before the process is started, storage capacity must be guaranteed.
+        Think of the pipeline component as an asynchronous process. Because the entire data flow is loaded into memory
+        before the process is started, storage capacity must be guaranteed.
 
-    If pre- and post-processing are to be carried out before the task within the wrapped pipeline component, this can
-    also be transferred as a function. These tasks are also assigned to the threads.
+        If pre- and post-processing are to be carried out before the task within the wrapped pipeline component, this
+        can also be transferred as a function. These tasks are also assigned to the threads.
 
-    Note that the order in the dataflow and when returning lists is generally is no longer retained.
+        The order in the dataflow and when returning lists is generally no longer retained.
 
+    Example:
+        ```python
         some_component = SubImageLayoutService(some_predictor, some_category)
-        some_component:clone = some_component.clone()
+        some_component_clone = some_component.clone()
 
-        multi_thread_comp = MultiThreadPipelineComponent(pipeline_components=[some_component,some_component_clone],
-                                                         pre_proc_func=maybe_load_image,
-                                                         post_proc_func=maybe_remove_image)
+        multi_thread_comp = MultiThreadPipelineComponent(
+            pipeline_components=[some_component, some_component_clone],
+            pre_proc_func=maybe_load_image,
+            post_proc_func=maybe_remove_image
+        )
 
         multi_thread_comp.put_task(some_dataflow)
         output_list = multi_thread_comp.start()
+        ```
 
-    You cannot run `MultiThreadPipelineComponent` in `DoctectionPipe` as this requires batching datapoints and neither
-    can you run `MultiThreadPipelineComponent` in combination with a humble 'PipelineComponent` unless you take care
-    of batching/unbatching between each component by yourself. The easiest way to build a pipeline with
-    `MultiThreadPipelineComponent` can be accomplished as follows:
+    Info:
+        You cannot run `MultiThreadPipelineComponent` in `DoctectionPipe` as this requires batching datapoints and
+        neither can you run `MultiThreadPipelineComponent` in combination with a humble `PipelineComponent` unless you
+        take care of batching/unbatching between each component by yourself. The easiest way to build a pipeline with
+        `MultiThreadPipelineComponent` can be accomplished as follows:
 
+    Example:
+        ```python
         # define the pipeline component
-        ome_component = SubImageLayoutService(some_predictor, some_category)
-        some_component:clone = some_component.clone()
+        some_component = SubImageLayoutService(some_predictor, some_category)
+        some_component_clone = some_component.clone()
 
         # creating two threads, one for each component
-        multi_thread_comp = MultiThreadPipelineComponent(pipeline_components=[some_component,some_component_clone],
-                                                         pre_proc_func=maybe_load_image,
-                                                         post_proc_func=maybe_remove_image)
+        multi_thread_comp = MultiThreadPipelineComponent(
+            pipeline_components=[some_component, some_component_clone],
+            pre_proc_func=maybe_load_image,
+            post_proc_func=maybe_remove_image
+        )
 
         # currying `to_image`, so that you can call it in `MapData`.
         @curry
-        def _to_image(dp,dpi):
-            return to_image(dp,dpi)
+        def _to_image(dp, dpi):
+            return to_image(dp, dpi)
 
         # set-up the dataflow/stream, e.g.
         df = SerializerPdfDoc.load(path, max_datapoints=max_datapoints)
         df = MapData(df, to_image(dpi=300))
-        df = BatchData(df, batch_size=32,remainder=True)
+        df = BatchData(df, batch_size=32, remainder=True)
         df = multi_thread_comp.predict_dataflow(df)
         df = FlattenData(df)
         df = MapData(df, lambda x: x[0])
@@ -95,7 +106,8 @@ class MultiThreadPipelineComponent(PipelineComponent):
         df.reset_state()
 
         for dp in df:
-            ...
+            ...
+        ```
     """
 
     def __init__(
@@ -106,13 +118,12 @@ class MultiThreadPipelineComponent(PipelineComponent):
         max_datapoints: Optional[int] = None,
     ) -> None:
         """
-        :param pipeline_components: list of identical pipeline component. Number of threads created is determined by
-                                    `len`
-        :param pre_proc_func: pass a function, that reads and returns an image. Will execute before entering the pipe
-                              component
-        :param post_proc_func: pass a function, that reads and returns an image. Will execute after entering the pipe
-                               component
-        :param max_datapoints: max datapoints to process
+        Args:
+            pipeline_components: List of identical `PipelineComponent`. Number of threads created is determined by
+                `len`.
+            pre_proc_func: Function that reads and returns an image. Will execute before entering the pipe component.
+            post_proc_func: Function that reads and returns an image. Will execute after entering the pipe component.
+            max_datapoints: Maximum datapoints to process.
         """
 
         self.pipe_components = pipeline_components
@@ -125,20 +136,29 @@ class MultiThreadPipelineComponent(PipelineComponent):
 
     def put_task(self, df: Union[DataFlow, list[Image]]) -> None:
         """
-        Put a dataflow or a list of datapoints to the queue. Note, that the process will not start before `start`
-        is called. If you do not know how many datapoints will be cached, use max_datapoint to ensure no oom.
+        Put a `DataFlow` or a list of datapoints to the queue.
+
+        Note:
+            The process will not start before `start` is called. If you do not know how many datapoints will be
+            cached, use `max_datapoints` to ensure no OOM.
 
-        :param df: A list or a dataflow of Image
+        Args:
+            df: A list or a `DataFlow` of `Image`.
         """

         self._put_datapoints_to_queue(df)
 
     def start(self) -> list[Image]:
         """
-        Creates a worker for each component and starts processing the data points of the queue. A list of the results
-        is returned once all points in the queue have been processed.
+        Creates a worker for each component and starts processing the datapoints of the queue.
 
-        :return: A list of Images
+        Example:
+            ```python
+            output_list = multi_thread_comp.start()
+            ```
+
+        Returns:
+            A list of `Image` objects.
         """
         with ThreadPoolExecutor(
             max_workers=len(self.pipe_components), thread_name_prefix="EvalWorker"
@@ -195,11 +215,15 @@ class MultiThreadPipelineComponent(PipelineComponent):
 
     def pass_datapoints(self, dpts: list[Image]) -> list[Image]:
         """
-        Putting the list of datapoints into a thread-save queue and start for each pipeline
-        component a separate thread. It will return a list of datapoints where the order of appearance
-        of the output might be not the same as the input.
-        :param dpts:
-        :return:
+        Put the list of datapoints into a thread-safe queue and start a separate thread for each pipeline component.
+
+        The order of appearance of the output might not be the same as the input.
+
+        Args:
+            dpts: List of `Image` datapoints.
+
+        Returns:
+            List of processed `Image` datapoints.
         """
         for dp in dpts:
             self.input_queue.put(dp)
@@ -212,10 +236,13 @@ class MultiThreadPipelineComponent(PipelineComponent):
 
     def predict_dataflow(self, df: DataFlow) -> DataFlow:
         """
-        Mapping a datapoint via `pass_datapoint` within a dataflow pipeline
+        Map a datapoint via `pass_datapoints` within a dataflow pipeline.
+
+        Args:
+            df: An input `DataFlow`.
 
-        :param df: An input dataflow
-        :return: A output dataflow
+        Returns:
+            An output `DataFlow`.
         """
         return MapData(df, self.pass_datapoints)
 
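The hunks above move the `MultiThreadPipelineComponent` docstrings to `Args:`/`Returns:` sections. To make the documented workflow concrete, here is a minimal sketch of the batching pattern the class docstring describes: two cloned components, a batched dataflow, and a flattened result. It assumes deepdoctection re-exports these names in its flat `dd` namespace and uses placeholder arguments for the predictor and category; it is an illustration, not code taken from the package.

```python
import deepdoctection as dd  # flat-namespace re-exports assumed


def run_multithreaded(pdf_path, predictor, category, max_datapoints=None):
    """Sketch of the pattern from the MultiThreadPipelineComponent docstring above."""
    component = dd.SubImageLayoutService(predictor, category)
    component_clone = component.clone()

    multi_thread_comp = dd.MultiThreadPipelineComponent(
        pipeline_components=[component, component_clone],  # one thread per component
        pre_proc_func=dd.maybe_load_image,
        post_proc_func=dd.maybe_remove_image,
    )

    df = dd.SerializerPdfDoc.load(pdf_path, max_datapoints=max_datapoints)
    df = dd.MapData(df, lambda dp: dd.to_image(dp, dpi=300))  # page -> Image
    df = dd.BatchData(df, batch_size=8, remainder=True)       # the component consumes batches
    df = multi_thread_comp.predict_dataflow(df)
    df = dd.FlattenData(df)                                   # unbatch again
    df = dd.MapData(df, lambda x: x[0])
    df.reset_state()
    return list(df)
```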
@@ -16,7 +16,7 @@
 # limitations under the License.
 
 """
-Module for pipeline with Tensorpack predictors
+Module for document processing pipeline
 """
 
 import os
@@ -42,6 +42,21 @@ from .common import PageParsingService
 def _collect_from_kwargs(
     **kwargs: Union[Optional[str], bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
 ) -> Tuple[Optional[str], Union[str, Sequence[str]], bool, int, str, DataFlow, Optional[bytes]]:
+    """
+    Collects and validates keyword arguments for dataflow construction.
+
+    Args:
+        **kwargs: Keyword arguments that may include `path`, `bytes`, `dataset_dataflow`, `shuffle`, `file_type`, and
+            `max_datapoints`.
+
+    Returns:
+        Tuple containing `path`, `file_type`, `shuffle`, `max_datapoints`, `doc_path`, `dataset_dataflow`, and
+        `b_bytes`.
+
+    Raises:
+        ValueError: If neither `path` nor `dataset_dataflow` is provided, or if required arguments are missing.
+        TypeError: If argument types are incorrect.
+    """
     b_bytes = kwargs.get("bytes")
     dataset_dataflow = kwargs.get("dataset_dataflow")
     path = kwargs.get("path")
@@ -115,10 +130,35 @@ def _to_image(
     width: Optional[int] = None,
     height: Optional[int] = None,
 ) -> Optional[Image]:
+    """
+    Converts a data point to an `Image` object.
+
+    Args:
+        dp: The data point, which can be a string or a mapping.
+        dpi: Dots per inch for the image.
+        width: Width of the image.
+        height: Height of the image.
+
+    Returns:
+        An `Image` object or None.
+    """
     return to_image(dp, dpi, width, height)
 
 
 def _doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
+    """
+    Creates a dataflow from a PDF document.
+
+    Args:
+        path: Path to the PDF document.
+        max_datapoints: Maximum number of data points to consider.
+
+    Returns:
+        A `DataFlow` object.
+
+    Raises:
+        FileExistsError: If the file does not exist.
+    """
     if not os.path.isfile(path):
         raise FileExistsError(f"{path} not a file")
 
@@ -129,28 +169,24 @@ def _doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None)
 
 class DoctectionPipe(Pipeline):
     """
-    Prototype for a document layout pipeline. Contains implementation for loading document types (images in directory,
-    single PDF document, dataflow from datasets), conversions in dataflows and building a pipeline.
-
+    Prototype for a document layout pipeline.
 
+    Contains implementation for loading document types (images in directory, single PDF document, dataflow from
+    datasets), conversions in dataflows, and building a pipeline.
 
     See `deepdoctection.analyzer.dd` for a concrete implementation.
 
     See also the explanations in `base.Pipeline`.
 
-    By default, `DoctectionPipe` will instantiate a default `PageParsingService`
-
-        PageParsingService(text_container=LayoutType.word,
-                           text_block_categories=[LayoutType.title,
-                                                  LayoutType.text,
-                                                  LayoutType.list,
-                                                  LayoutType.table])
-
-    but you can overwrite the current setting:
+    By default, `DoctectionPipe` will instantiate a default `PageParsingService`:
 
-    **Example:**
+    Example:
+        ```python
+        pipe = DoctectionPipe([comp_1, com_2], PageParsingService(text_container= my_custom_setting))
+        ```
 
-        pipe = DoctectionPipe([comp_1, com_2], PageParsingService(text_container= my_custom_setting))
+    Note:
+        You can overwrite the current setting by providing a custom `PageParsingService`.
     """
 
     def __init__(
@@ -158,8 +194,17 @@ class DoctectionPipe(Pipeline):
         pipeline_component_list: List[PipelineComponent],
         page_parsing_service: Optional[PageParsingService] = None,
     ):
+        """
+        Initializes the `DoctectionPipe`.
+
+        Args:
+            pipeline_component_list: List of pipeline components.
+            page_parsing_service: Optional custom `PageParsingService`.
+        """
         self.page_parser = (
-            PageParsingService(text_container=IMAGE_DEFAULTS["text_container"])
+            PageParsingService(
+                text_container=IMAGE_DEFAULTS.TEXT_CONTAINER,
+            )
             if page_parsing_service is None
             else page_parsing_service
        )
@@ -216,13 +261,19 @@ class DoctectionPipe(Pipeline):
         shuffle: bool = False,
     ) -> DataFlow:
         """
-        Processing method for directories
+        Processing method for directories.
 
-        :param path: path to directory
-        :param file_type: file type to consider (single str or list of strings)
-        :param max_datapoints: max number of datapoints to consider
-        :param shuffle: Shuffle file names in order to stream them randomly
-        :return: dataflow
+        Args:
+            path: Path to directory.
+            file_type: File type to consider (single string or list of strings).
+            max_datapoints: Maximum number of data points to consider.
+            shuffle: Whether to shuffle file names for random streaming.
+
+        Returns:
+            A `DataFlow` object.
+
+        Raises:
+            NotADirectoryError: If the path is not a directory.
         """
         if not os.path.isdir(path):
             raise NotADirectoryError(f"{os.fspath(path)} not a directory")
@@ -232,11 +283,14 @@ class DoctectionPipe(Pipeline):
     @staticmethod
     def doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
         """
-        Processing method for documents
+        Processing method for documents.
 
-        :param path: path to directory
-        :param max_datapoints: max number of datapoints to consider
-        :return: dataflow
+        Args:
+            path: Path to the document.
+            max_datapoints: Maximum number of data points to consider.
+
+        Returns:
+            A `DataFlow` object.
         """
         return _doc_to_dataflow(path, max_datapoints)
 
@@ -245,13 +299,19 @@ class DoctectionPipe(Pipeline):
         path: str, b_bytes: bytes, file_type: Union[str, Sequence[str]], max_datapoints: Optional[int] = None
     ) -> DataFlow:
         """
-        Converts a bytes object to a dataflow
+        Converts a bytes object to a dataflow.
+
+        Args:
+            path: Path to directory or an image file.
+            b_bytes: Bytes object.
+            file_type: File type, e.g., `.pdf`, `.jpg`, or a list of image file types.
+            max_datapoints: Maximum number of data points to consider.
 
-        :param path: path to directory or an image file
-        :param b_bytes: bytes object
-        :param file_type: e.g. ".pdf", ".jpg" or [".jpg", ".png", ".jpeg", ".tif"]
-        :param max_datapoints: max number of datapoints to consider
-        :return: DataFlow
+        Returns:
+            A `DataFlow` object.
+
+        Raises:
+            ValueError: If the combination of arguments is not supported.
         """
 
         file_name = os.path.split(path)[1]
@@ -280,10 +340,13 @@ class DoctectionPipe(Pipeline):
 
     def dataflow_to_page(self, df: DataFlow) -> DataFlow:
         """
-        Converts a dataflow of images to a dataflow of pages
+        Converts a dataflow of images to a dataflow of pages.
+
+        Args:
+            df: Dataflow.
 
-        :param df: Dataflow
-        :return: Dataflow
+        Returns:
+            A dataflow of pages.
         """
         return self.page_parser.predict_dataflow(df)
 
@@ -291,18 +354,16 @@ class DoctectionPipe(Pipeline):
         self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
     ) -> DataFlow:
         """
-        `kwargs key dataset_dataflow:` Transfer a dataflow of a dataset via its dataflow builder
-
-        `kwargs key path:` A path to a directory in which either image documents or pdf files are located. It is
-                           assumed that the pdf documents consist of only one page. If there are multiple pages,
-                           only the first page is processed through the pipeline.
-                           Alternatively, a path to a pdf document with multiple pages.
-
-        `kwargs key bytes:` A bytes object of an image
-
-        `kwargs key file_type:` Selection of the file type, if: args:`file_type` is passed
-
-        `kwargs key max_datapoints:` Stops processing as soon as max_datapoints images have been processed
+        Args:
+            `kwargs:
+                dataset_dataflow (Dataflow):` Transfer a dataflow of a dataset via its dataflow builder
+                path (TypeOrStr):` A path to a directory in which either image documents or pdf files are located. It
+                    is assumed that the pdf documents consist of only one page. If there are multiple pages,
+                    only the first page is processed through the pipeline.
+                    Alternatively, a path to a pdf document with multiple pages.
+                bytes:` A bytes object of an image
+                file_type:` Selection of the file type, if: args:`file_type` is passed
+                max_datapoints:` Stops processing as soon as max_datapoints images have been processed
 
         :return: dataflow
         """
@@ -21,7 +21,7 @@ Module for language detection pipeline component
 from typing import Optional, Sequence
 
 from ..datapoint.image import Image
-from ..datapoint.view import Page
+from ..datapoint.view import ImageDefaults, Page
 from ..extern.base import LanguageDetector, ObjectDetector
 from ..utils.error import ImageError
 from ..utils.settings import PageType, TypeOrStr, get_type
@@ -36,20 +36,22 @@ class LanguageDetectionService(PipelineComponent):
 
     There are two ways to use this component:
 
-    - By analyzing the already extracted and ordered text. For this purpose, a page object is parsed internally and
-      the full text is passed to the language_detector. This approach provides the greatest precision.
-
-    - By previous text extraction with an object detector and subsequent transfer of concatenated word elements to the
-      language_detector. Only one OCR detector can be used here. This method can be used, for example, to select an OCR
-      detector that specializes in a language using. Although the word recognition is less accurate
-      when choosing any detector, the results are confident enough to rely on the results, especially when extracting
-      longer text passages. So, a TextExtractionService, for example, can be selected as the subsequent pipeline
-      component. The words determined by the OCR detector are not transferred to the image object.
-
-        lang_detector = FasttextLangDetector(path_weights,profile.categories)
-        component = LanguageDetectionService(lang_detector, text_container="word",
-                                             text_block_names=["text","title","table"])
-
+    1. By analyzing the already extracted and ordered text. For this purpose, a `Page` object is parsed internally and
+       the full text is passed to the `language_detector`. This approach provides the greatest precision.
+
+    2. By previous text extraction with an object detector and subsequent transfer of concatenated word elements to the
+       `language_detector`. Only one OCR detector can be used here. This method can be used, for example, to select an OCR
+       detector that specializes in a language. Although the word recognition is less accurate
+       when choosing any detector, the results are confident enough to rely on, especially when extracting
+       longer text passages. So, a `TextExtractionService`, for example, can be selected as the subsequent pipeline
+       component. The words determined by the OCR detector are not transferred to the image object.
+
+    Example:
+        ```python
+        lang_detector = FasttextLangDetector(path_weights, profile.categories)
+        component = LanguageDetectionService(lang_detector, text_container="word",
+                                             text_block_names=["text", "title", "table"])
+        ```
     """
 
     def __init__(
@@ -60,18 +62,20 @@ class LanguageDetectionService(PipelineComponent):
         floating_text_block_categories: Optional[Sequence[TypeOrStr]] = None,
     ):
         """
-        :param language_detector: Detector to determine text
-        :param text_container: text container, needed to generate the reading order. Not necessary when passing a
-                               text detector.
-        :param text_detector: Object detector to extract text. You cannot use a Pdfminer here.
-
-        :param floating_text_block_categories: text blocks, needed for generating the reading order. Not necessary
-                                               when passing a text detector.
+        Initializes a `LanguageDetectionService` instance.
+
+        Args:
+            language_detector: Detector to determine text.
+            text_container: Text container, needed to generate the reading order. Not necessary when passing a
+                `text_detector`.
+            text_detector: Object detector to extract text. You cannot use a Pdfminer here.
+            floating_text_block_categories: Text blocks, needed for generating the reading order. Not necessary
+                when passing a `text_detector`.
         """
 
         self.predictor = language_detector
         self.text_detector = text_detector
-        self.text_container = get_type(text_container) if text_container is not None else text_container
+        self.text_container = get_type(text_container) if text_container is not None else ImageDefaults.TEXT_CONTAINER
         self.floating_text_block_categories = (
             tuple(get_type(text_block) for text_block in floating_text_block_categories)
             if (floating_text_block_categories is not None)
@@ -81,8 +85,21 @@ class LanguageDetectionService(PipelineComponent):
         super().__init__(self._get_name(self.predictor.name))
 
     def serve(self, dp: Image) -> None:
+        """
+        Serves the language detection on the given `Image`.
+
+        Args:
+            dp: The `Image` datapoint to process.
+
+        Raises:
+            ImageError: If `dp.image` is `None` and a `text_detector` is used.
+        """
         if self.text_detector is None:
-            page = Page.from_image(dp, self.text_container, self.floating_text_block_categories)
+            page = Page.from_image(
+                image_orig=dp,
+                text_container=self.text_container,
+                floating_text_block_categories=self.floating_text_block_categories,
+            )
             text = page.text_no_line_break
         else:
             if dp.image is None:
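The `LanguageDetectionService` hunks renumber the two usage modes, default the text container to `ImageDefaults.TEXT_CONTAINER`, and call `Page.from_image` with keyword arguments. Below is a short sketch of wiring the component, following the docstring's own `FasttextLangDetector` example; the weights path, the category mapping, the keyword `floating_text_block_categories` (taken from the new signature rather than the docstring's `text_block_names`), and the flat `dd` imports are assumptions.

```python
import deepdoctection as dd  # flat-namespace re-exports assumed


def build_language_component(path_weights, categories):
    """Sketch following the LanguageDetectionService docstring example above."""
    lang_detector = dd.FasttextLangDetector(path_weights, categories)
    return dd.LanguageDetectionService(
        lang_detector,
        text_container="word",
        floating_text_block_categories=["text", "title", "table"],  # keyword name per the new signature
    )
```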
@@ -43,11 +43,20 @@ def skip_if_category_or_service_extracted(
     """
     Skip the processing of the pipeline component if the category or service is already extracted.
 
-    **Example**
-
+    Example:
+        ```python
         detector = # some detector
         item_component = ImageLayoutService(detector)
         item_component.set_inbound_filter(skip_if_category_or_service_extracted(detector.get_categories(as_dict=False)))
+        ```
+
+    Args:
+        dp: The `Image` datapoint to check.
+        category_names: Optional category names or sequence of `ObjectTypes` to check for.
+        service_ids: Optional service IDs or sequence of service IDs to check for.
+
+    Returns:
+        Whether to skip processing based on existing annotation.
     """
 
     if dp.get_annotation(category_names=category_names, service_ids=service_ids):
@@ -58,18 +67,21 @@
 @pipeline_component_registry.register("ImageLayoutService")
 class ImageLayoutService(PipelineComponent):
     """
-    Pipeline component for determining the layout. Which layout blocks are determined depends on the Detector and thus
-    usually on the data set on which the Detector was pre-trained. If the Detector has been trained on Publaynet, these
-    are layouts such as text, title, table, list and figure. If the Detector has been trained on DocBank, these are
-    rather Abstract, Author, Caption, Equation, Figure, Footer, List, Paragraph, Reference, Section, Table, Title.
+    Pipeline component for determining the layout.
 
-    The component is usually at the beginning of the pipeline. Cropping of the layout blocks can be selected to simplify
-    further processing.
+    Which layout blocks are determined depends on the `Detector` and thus usually on the data set on which the
+    `Detector` was pre-trained. If the `Detector` has been trained on Publaynet, these are layouts such as text, title
+    , table, list and figure. If the `Detector` has been trained on DocBank, these are rather Abstract, Author,
+    Caption, Equation, Figure, Footer, List, Paragraph, Reference, Section, Table, Title.
 
-    **Example**
+    The component is usually at the beginning of the pipeline. Cropping of the layout blocks can be selected to
+    simplify further processing.
 
-        d_items = TPFrcnnDetector(item_config_path, item_weights_path, {1: 'row', 2: 'column'})
-        item_component = ImageLayoutService(d_items)
+    Example:
+        ```python
+        d_items = TPFrcnnDetector(item_config_path, item_weights_path, {1: 'row', 2: 'column'})
+        item_component = ImageLayoutService(d_items)
+        ```
     """
 
     def __init__(
@@ -80,13 +92,19 @@ class ImageLayoutService(PipelineComponent):
         padder: Optional[PadTransform] = None,
     ):
         """
-        :param layout_detector: object detector
-        :param to_image: Generate an image for each detected block, e.g. populate `ImageAnnotation.image`. Useful,
-                         if you want to process only some blocks in a subsequent pipeline component.
-        :param crop_image: Do not only populate `ImageAnnotation.image` but also crop the detected block according
-                           to its bounding box and populate the resulting sub image to
-                           `ImageAnnotation.image.image`.
-        :param padder: If not `None`, will apply the padder to the image before prediction and inverse apply the padder
+        Initializes the `ImageLayoutService`.
+
+        Args:
+            layout_detector: The object detector.
+            to_image: Whether to generate an image for each detected block, e.g. populate `ImageAnnotation.image`.
+                Useful if you want to process only some blocks in a subsequent pipeline component.
+            crop_image: Whether to crop the detected block according to its bounding box and populate the resulting sub
+                image to `ImageAnnotation.image.image`.
+            padder: If not `None`, will apply the padder to the image before prediction and inverse apply the padder.
+
+        Note:
+            If `padder` is provided, it will be applied before prediction and inversely applied to the coordinates
+            after prediction.
         """
         self.to_image = to_image
         self.crop_image = crop_image
@@ -95,6 +113,15 @@ class ImageLayoutService(PipelineComponent):
         super().__init__(self._get_name(layout_detector.name), self.predictor.model_id)
 
     def serve(self, dp: Image) -> None:
+        """
+        Serve the pipeline component on the given `Image`.
+
+        Args:
+            dp: The `Image` datapoint to process.
+
+        Raises:
+            ImageError: If `dp.image` is `None`.
+        """
         if dp.image is None:
             raise ImageError("image cannot be None")
         np_image = dp.image
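Finally, the `ImageLayoutService` hunks convert the class and `serve` docstrings and document the `padder` round trip. The sketch below combines the docstring example with the `skip_if_category_or_service_extracted` inbound-filter example shown above; the config and weights paths are placeholders, the `TPFrcnnDetector` requires the TensorFlow backend, and the flat `dd` imports are assumptions rather than part of the diff.

```python
import deepdoctection as dd  # flat-namespace re-exports assumed


def build_layout_component(config_path, weights_path):
    """Sketch based on the ImageLayoutService docstring example above."""
    detector = dd.TPFrcnnDetector(config_path, weights_path, {1: "row", 2: "column"})
    component = dd.ImageLayoutService(detector, to_image=True, crop_image=True)
    # Skip datapoints whose layout categories were already extracted, mirroring the
    # inbound-filter example in the diff above.
    component.set_inbound_filter(
        dd.skip_if_category_or_service_extracted(detector.get_categories(as_dict=False))
    )
    return component
```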