deepdoctection 0.42.0__py3-none-any.whl → 0.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection might be problematic.

Files changed (124)
  1. deepdoctection/__init__.py +2 -1
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +904 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +157 -106
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +196 -113
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +25 -17
  104. deepdoctection/utils/env_info.py +85 -36
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -62
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.dist-info/RECORD +149 -0
  119. {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/WHEEL +1 -1
  120. deepdoctection/analyzer/_config.py +0 -146
  121. deepdoctection-0.42.0.dist-info/METADATA +0 -431
  122. deepdoctection-0.42.0.dist-info/RECORD +0 -148
  123. {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
@@ -39,8 +39,16 @@ from .anngen import DatapointManager
 
 @dataclass(frozen=True)
 class MetaAnnotation:
-    """A immutable dataclass that stores information about what `Image` are being
-    modified through a pipeline compoenent."""
+    """
+    An immutable dataclass that stores information about which parts of an `Image` are being
+    modified through a pipeline component.
+
+    Attributes:
+        image_annotations: Tuple of `ObjectTypes` representing image annotations.
+        sub_categories: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for sub-categories.
+        relationships: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for relationships.
+        summaries: Tuple of `ObjectTypes` representing summaries.
+    """
 
     image_annotations: tuple[ObjectTypes, ...] = field(default=())
     sub_categories: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
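
For illustration, a `MetaAnnotation` for a hypothetical table-detection component might be constructed as sketched below. This is not part of the diff: `LayoutType` and `Relationships` are assumed to be the `ObjectTypes` enums from `deepdoctection.utils.settings`, and the member names should be checked against the installed release.

```python
# Illustrative sketch only: a component that adds table annotations and
# child relationships could report its footprint like this.
from deepdoctection.pipe.base import MetaAnnotation
from deepdoctection.utils.settings import LayoutType, Relationships

meta = MetaAnnotation(
    image_annotations=(LayoutType.TABLE,),                    # categories the component adds
    sub_categories={},                                        # no sub-categories generated
    relationships={LayoutType.TABLE: {Relationships.CHILD}},  # relationships it creates
    summaries=(),
)
```
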
@@ -50,28 +58,38 @@ class MetaAnnotation:
 
 class PipelineComponent(ABC):
     """
-    Base class for pipeline components. Pipeline components are the parts that make up a pipeline. They contain the
-    abstract `serve`, in which the component steps are defined. Within pipelines, pipeline components take an
-    image, enrich these with annotations or transform existing annotation and transfer the image again. The pipeline
-    component should be implemented in such a way that the pythonic approach of passing arguments via assignment is used
-    well. To support the pipeline component, an intrinsic datapoint manager is provided, which can perform operations on
-    the image datapoint that are common for pipeline components. This includes the creation of an image, sub-category
-    and similar annotations.
-
-    Pipeline components do not necessarily have to contain predictors but can also contain rule-based transformation
-    steps. (For pipeline components with predictors see `PredictorPipelineComponent`.)
-
-    The sequential execution of pipeline components is carried out with dataflows. In the case of components with
-    predictors, this allows the predictor graph to be set up first and then to be streamed to the processed data points.
-
-    **Caution:** Currently, predictors can only process single images. Processing higher number of batches is not
-    planned.
+    Base class for pipeline components.
+
+    Pipeline components are the parts that make up a pipeline. They contain the
+    abstract `serve`, in which the component steps are defined. Within pipelines,
+    pipeline components take an image, enrich it with annotations or transform
+    existing annotations and pass the image on again. The pipeline component should
+    be implemented in such a way that the pythonic approach of passing arguments via
+    assignment is used well. To support the pipeline component, an intrinsic
+    datapoint manager is provided, which can perform operations on the image
+    datapoint that are common for pipeline components. This includes the creation of
+    an image, sub-category and similar annotations.
+
+    Pipeline components do not necessarily have to contain predictors but can also
+    contain rule-based transformation steps. (For pipeline components with
+    predictors see `PredictorPipelineComponent`.)
+
+    The sequential execution of pipeline components is carried out with dataflows.
+    In the case of components with predictors, this allows the predictor graph to be
+    set up first and then to be streamed to the processed data points.
+
+    Note:
+        Currently, predictors can only process single images. Processing images in batches is not planned.
     """
 
     def __init__(self, name: str, model_id: Optional[str] = None) -> None:
         """
-        :param name: The name of the pipeline component. The name will be used to identify a pipeline component in a
-            pipeline. Use something that describe the task of the pipeline.
+        Initializes a `PipelineComponent`.
+
+        Args:
+            name: The name of the pipeline component. The name will be used to identify a pipeline component in a
+                pipeline. Use something that describes the task of the pipeline.
+            model_id: Optional model identifier.
         """
         self.name = name
         self.service_id = self.get_service_id()
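
A minimal, hypothetical rule-based component that satisfies the contract documented above could look as follows. The class name and behavior are illustrative and not part of the package; only the signatures shown in this diff are relied on.

```python
from deepdoctection.datapoint.image import Image
from deepdoctection.pipe.base import MetaAnnotation, PipelineComponent


class NoOpComponent(PipelineComponent):
    """Illustrative component that passes every datapoint through unchanged."""

    def __init__(self) -> None:
        super().__init__(name="noop_component")

    def serve(self, dp: Image) -> None:
        # `dp` is already registered with `self.dp_manager`, so mutating `dp`
        # directly or going through the manager is equivalent.
        pass

    def clone(self) -> "NoOpComponent":
        return self.__class__()

    def get_meta_annotation(self) -> MetaAnnotation:
        # This component generates no annotations, so all fields stay empty.
        return MetaAnnotation()
```
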
@@ -81,39 +99,43 @@ class PipelineComponent(ABC):
 
     def set_inbound_filter(self, filter_func: Callable[[DP], bool]) -> None:
         """
-        Set a filter function to decide, if an image of the inbound dataflow should be passed to self.serve.
-        The filter function should return a boolean value. If the function returns True, the image will not be processed
-        by this pipeline component.
+        Set a filter function to decide if an image of the inbound dataflow should be passed to `self.serve`.
 
-        **Example:**
+        The filter function should return a boolean value. If the function returns True, the image will not be
+        processed by this pipeline component.
 
+        Example:
             ```python
             def do_not_process_tables(dp: Image) -> bool:
-            if "table" not in dp.get_categories_from_current_state():
+                if "table" not in dp.get_categories_from_current_state():
                     return True
-            return False
+                return False
 
             layout_component = ImageLayoutService(...)
             layout_component.set_inbound_filter(do_not_process_tables)
             ```
 
-
-        :param filter_func: A function that takes an image datapoint and returns a boolean value
+        Args:
+            filter_func: A function that takes an image datapoint and returns a boolean value.
         """
         self.filter_func = filter_func  # type: ignore
 
     @abstractmethod
     def serve(self, dp: Image) -> None:
         """
-        Processing an image through the whole pipeline component. Abstract method that contains all processing steps of
-        the component. Please note that dp is already available to the dp_manager and operations for this can be carried
-        out via it.
+        Processing an image through the whole pipeline component.
+
+        Abstract method that contains all processing steps of the component. Please note that `dp` is already available
+        to the `dp_manager` and operations for this can be carried out via it.
+
+        `dp` was transferred to the `dp_manager` via an assignment. This means that operations on `dp` directly or
+        operations via `dp_manager` are equivalent.
 
-        dp was transferred to the dp_manager via an assignment. This means that operations on dp directly or operations
-        via dp_manager are equivalent.
+        As a simplified interface `serve` does not have to return a `dp`. The data point is passed on within pipelines
+        internally (via `pass_datapoint`).
 
-        As a simplified interface `serve` does not have to return a dp. The data point is passed on within
-        pipelines internally (via `pass_datapoint`).
+        Args:
+            dp: The image datapoint to process.
         """
         raise NotImplementedError()
 
@@ -124,12 +146,15 @@ class PipelineComponent(ABC):
 
     def pass_datapoint(self, dp: Image) -> Image:
         """
-        Acceptance, handover to dp_manager, transformation and forwarding of dp. To measure the time, use
+        Acceptance, handover to `dp_manager`, transformation and forwarding of `dp`.
 
-        self.timer_on = True
+        To measure the time, use `self.timer_on = True`.
 
-        :param dp: datapoint
-        :return: datapoint
+        Args:
+            dp: Datapoint.
+
+        Returns:
+            Datapoint.
         """
         if self.timer_on:
             with timed_operation(self.__class__.__name__):
@@ -140,42 +165,60 @@ class PipelineComponent(ABC):
 
     def predict_dataflow(self, df: DataFlow) -> DataFlow:
         """
-        Mapping a datapoint via `pass_datapoint` within a dataflow pipeline
+        Mapping a datapoint via `pass_datapoint` within a dataflow pipeline.
+
+        Args:
+            df: An input dataflow.
 
-        :param df: An input dataflow
-        :return: A output dataflow
+        Returns:
+            An output dataflow.
         """
         return MapData(df, self.pass_datapoint)
 
     @abstractmethod
     def clone(self) -> PipelineComponent:
         """
-        Clone an instance
+        Clone an instance.
+
+        Returns:
+            A cloned instance of `PipelineComponent`.
         """
         raise NotImplementedError()
 
     @abstractmethod
     def get_meta_annotation(self) -> MetaAnnotation:
         """
-        Get a dict of list of annotation type. The dict must contain
+        Get a dict of list of annotation type.
 
-        `image_annotation` with values: a list of category names,
-        `sub_categories` with values: a dict with category names as keys and a list of the generated sub categories
-        `relationships` with values: a dict with category names as keys and a list of the generated relationships
-        `summaries` with values: A list of summary sub categories
-        :return: Dict with meta infos as just described
+        The dict must contain:
+        - `image_annotation` with values: a list of category names,
+        - `sub_categories` with values: a dict with category names as keys and a list of the generated sub categories,
+        - `relationships` with values: a dict with category names as keys and a list of the generated relationships,
+        - `summaries` with values: A list of summary sub categories.
+
+        Returns:
+            Dict with meta infos as just described.
         """
         raise NotImplementedError()
 
     def get_service_id(self) -> str:
         """
-        Get the generating model
+        Get the generated service id.
+
+        Returns:
+            The service id as a string.
         """
         return get_uuid_from_str(self.name)[:8]
 
     def clear_predictor(self) -> None:
         """
-        Clear the predictor of the pipeline component if it has one. Needed for model updates during training.
+        Clear the predictor of the pipeline component if it has one.
+
+        Needed for model updates during training.
+
+        Note:
+            Maybe you forgot to implement this method in your pipeline component. This might be the case when you run
+            evaluation during training and need to update the trained model in your pipeline component.
         """
         raise NotImplementedError(
             "Maybe you forgot to implement this method in your pipeline component. This might "
@@ -185,7 +228,10 @@ class PipelineComponent(ABC):
 
     def has_predictor(self) -> bool:
         """
-        Check if the pipeline component has a predictor
+        Check if the pipeline component has a predictor.
+
+        Returns:
+            `True` if the pipeline component has a predictor, otherwise `False`.
         """
         if hasattr(self, "predictor"):
             if self.predictor is not None:
@@ -194,8 +240,16 @@ class PipelineComponent(ABC):
 
     def _undo(self, dp: Image) -> Image:
         """
-        Undo the processing of the pipeline component. It will remove `ImageAnnotation`, `CategoryAnnotation` and
-        `ContainerAnnotation` with the service_id of the pipeline component.
+        Undo the processing of the pipeline component.
+
+        It will remove `ImageAnnotation`, `CategoryAnnotation` and `ContainerAnnotation` with the `service_id` of the
+        pipeline component.
+
+        Args:
+            dp: The image datapoint.
+
+        Returns:
+            The modified image datapoint.
         """
         if self.timer_on:
             with timed_operation(self.__class__.__name__):
@@ -208,64 +262,69 @@ class PipelineComponent(ABC):
 
     def undo(self, df: DataFlow) -> DataFlow:
         """
-        Mapping a datapoint via `_undo` within a dataflow pipeline
+        Mapping a datapoint via `_undo` within a dataflow pipeline.
+
+        Args:
+            df: An input dataflow of Images.
 
-        :param df: An input dataflow of Images
-        :return: A output dataflow of Images
+        Returns:
+            An output dataflow of Images.
         """
         return MapData(df, self._undo)
 
 
 class Pipeline(ABC):
     """
-    Abstract base class for creating pipelines. Pipelines represent the framework with which documents can be processed
-    by reading individual pages, processing the pages through the pipeline infrastructure and returning the extracted
-    information.
+    Abstract base class for creating pipelines.
+
+    Pipelines represent the framework with which documents can be processed by reading individual pages, processing the
+    pages through the pipeline infrastructure and returning the extracted information.
 
     The infrastructure, as the backbone of the pipeline, consists of a list of pipeline components in which images can
-    be passed through via dataflows. The order of the pipeline components in the list determines the processing order.
-    The components for the pipeline backbone are composed in `_build_pipe`.
+    be passed through via dataflows. The order of the pipeline components in the list determines the processing order.
+    The components for the pipeline backbone are composed in `_build_pipe`.
 
-    The pipeline is set up via: `analyze` for a directory with single pages or a document with multiple pages. A
-    data flow is returned that is triggered via a for loop and starts the actual processing.
+    The pipeline is set up via: `analyze` for a directory with single pages or a document with multiple pages. A data
+    flow is returned that is triggered via a for loop and starts the actual processing.
 
     This creates a pipeline using the following command arrangement:
 
-    **Example:**
+    Example:
+        ```python
+        layout = LayoutPipeComponent(layout_detector ...)
+        text = TextExtractPipeComponent(text_detector ...)
+        simple_pipe = MyPipeline(pipeline_component = [layout, text])
+        doc_dataflow = simple_pipe.analyze(input = path / to / dir)
 
-    ```python
-    layout = LayoutPipeComponent(layout_detector ...)
-    text = TextExtractPipeComponent(text_detector ...)
-    simple_pipe = MyPipeline(pipeline_component = [layout, text])
-    doc_dataflow = simple_pipe.analyze(input = path / to / dir)
+        for page in doc_dataflow:
+            print(page)
+        ```
 
-    for page in doc_dataflow:
-        print(page)
-    ```
-
-    In doing so, page contains all document structures determined via the pipeline (either directly from the Image core
-    model or already processed further).
+    In doing so, `page` contains all document structures determined via the pipeline (either directly from the `Image`
+    core model or already processed further).
 
     In addition to `analyze`, the internal `_entry` is used to bundle preprocessing steps.
 
     It is possible to set a session id for the pipeline. This is useful for logging purposes. The session id can be
-    either passed to the pipeline via the `analyze` method or generated automatically.
-
-    To generate a session_id automatically:
+    either passed to the pipeline via the `analyze` method or generated automatically.
 
-    **Example:**
+    To generate a `session_id` automatically:
 
-    ```python
-    pipe = MyPipeline(pipeline_component = [layout, text])
-    pipe.set_session_id = True
+    Example:
+        ```python
+        pipe = MyPipeline(pipeline_component = [layout, text])
+        pipe.set_session_id = True
 
-    df = pipe.analyze(input = "path/to/dir") # session_id is generated automatically
-    ```
+        df = pipe.analyze(input = "path/to/dir") # session_id is generated automatically
+        ```
     """
 
     def __init__(self, pipeline_component_list: list[PipelineComponent]) -> None:
         """
-        :param pipeline_component_list: A list of pipeline components.
+        Initializes a `Pipeline`.
+
+        Args:
+            pipeline_component_list: A list of pipeline components.
        """
        self.pipe_component_list = pipeline_component_list
        self.set_session_id = False
@@ -274,9 +333,13 @@ class Pipeline(ABC):
     def _entry(self, **kwargs: Any) -> DataFlow:
         """
         Use this method to bundle all preprocessing, such as loading one or more documents, so that a dataflow is
-        provided as a return value that can be passed on to the pipeline backbone.
+        provided as a return value that can be passed on to the pipeline backbone.
+
+        Args:
+            kwargs: Arguments for dynamic customizing of the processing or for the transfer of processing types.
 
-        :param kwargs: Arguments, for dynamic customizing of the processing or for the transfer of processing types
+        Returns:
+            A dataflow for further processing.
         """
         raise NotImplementedError()
 
@@ -284,38 +347,67 @@ class Pipeline(ABC):
 
     @curry
     def _undo(dp: Image, service_ids: Optional[list[str]] = None) -> Image:
         """
-        Remove annotations from a datapoint
+        Remove annotations from a datapoint.
+
+        Args:
+            dp: The image datapoint.
+            service_ids: Optional list of service ids to remove.
+
+        Returns:
+            The modified image datapoint.
         """
         dp.remove(service_ids=service_ids)
         return dp
 
     def undo(self, df: DataFlow, service_ids: Optional[set[str]] = None) -> DataFlow:
         """
-        Mapping a datapoint via `_undo` within a dataflow pipeline
+        Mapping a datapoint via `_undo` within a dataflow pipeline.
+
+        Args:
+            df: An input dataflow of Images.
+            service_ids: A set of service ids to remove.
 
-        :param df: An input dataflow of Images
-        :param service_ids: A set of service ids to remove
-        :return: A output dataflow of Images
+        Returns:
+            An output dataflow of Images.
         """
         return MapData(df, self._undo(service_ids=service_ids))
 
     @abstractmethod
     def analyze(self, **kwargs: Any) -> DataFlow:
         """
-        Try to keep this method as the only one necessary for the user. All processing steps, such as preprocessing,
-        setting up the backbone and post-processing are to be bundled. A dataflow generator df is returned, which is
-        generated via
+        Try to keep this method as the only one necessary for the user.
+
+        All processing steps, such as preprocessing, setting up the backbone and post-processing are to be bundled. A
+        dataflow generator `df` is returned, which is generated via
+
+        Example:
+            ```python
+            df = pipe.analyze(path="path/to/dir")
+            df.reset_state()
             doc = iter(df)
             page = next(doc)
+            ```
 
         can be triggered.
+
+        Args:
+            kwargs: Arguments for analysis.
+
+        Returns:
+            A dataflow generator.
         """
         raise NotImplementedError()
 
     def _build_pipe(self, df: DataFlow, session_id: Optional[str] = None) -> DataFlow:
         """
-        Composition of the backbone
+        Composition of the backbone.
+
+        Args:
+            df: The input dataflow.
+            session_id: Optional session id.
+
+        Returns:
+            The processed dataflow.
         """
         if session_id is None and self.set_session_id:
             session_id = self.get_session_id()
@@ -327,11 +419,12 @@ class Pipeline(ABC):
 
     def get_meta_annotation(self) -> MetaAnnotation:
         """
-        Collects meta annotations from all pipeline components and summarizes the returned results
+        Collects meta annotations from all pipeline components and summarizes the returned results.
 
-        :return: Meta annotations with information about image annotations (list), sub categories (dict with category
-            names and generated sub categories), relationships (dict with category names and generated
-            relationships) as well as summaries (list with sub categories)
+        Returns:
+            Meta annotations with information about image annotations (list), sub categories (dict with category
+            names and generated sub categories), relationships (dict with category names and generated relationships)
+            as well as summaries (list with sub categories).
         """
         image_annotations: list[ObjectTypes] = []
         sub_categories = defaultdict(set)
@@ -355,11 +448,18 @@ class Pipeline(ABC):
     def get_pipeline_info(
         self, service_id: Optional[str] = None, name: Optional[str] = None
     ) -> Union[str, Mapping[str, str]]:
-        """Get pipeline information: Returns a dictionary with a description of each pipeline component
-        :param service_id: service_id of the pipeline component to search for
-        :param name: name of the pipeline component to search for
-        :return: Either a full dictionary with position and name of all pipeline components or the name, if the position
-            has been passed or the position if the name has been passed.
+        """
+        Get pipeline information.
+
+        Returns a dictionary with a description of each pipeline component.
+
+        Args:
+            service_id: Service id of the pipeline component to search for.
+            name: Name of the pipeline component to search for.
+
+        Returns:
+            Either a full dictionary with position and name of all pipeline components or the name, if
+            the position has been passed or the position if the name has been passed.
         """
         comp_info = {comp.service_id: comp.name for comp in self.pipe_component_list}
         comp_info_name_as_key = {value: key for key, value in comp_info.items()}
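
Taken together, `get_pipeline_info` and the `undo` method documented above allow the results of a single component to be stripped from already processed datapoints. A hypothetical sketch, with `pipe` and the component name as placeholders:

```python
# Look up the service_id of a component by its name, then remove everything
# that component contributed. "layout_service" is a placeholder name; call
# pipe.get_pipeline_info() with no arguments to list real names.
service_id = pipe.get_pipeline_info(name="layout_service")  # name -> service_id
df = pipe.undo(df, service_ids={service_id})                # lazily removes annotations
df.reset_state()

for dp in df:  # iterating applies `_undo` to each datapoint
    ...
```
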
@@ -369,9 +469,28 @@ class Pipeline(ABC):
             return comp_info_name_as_key[name]
         return comp_info
 
+    def get_pipeline_component(self, service_id: Optional[str] = None, name: Optional[str] = None) -> PipelineComponent:
+        """
+        Get a pipeline component by `service_id` or `name`.
+
+        Args:
+            service_id: Service id of the pipeline component to search for.
+            name: Name of the pipeline component to search for.
+
+        Returns:
+            The pipeline component if found, otherwise raises ValueError.
+        """
+        for comp in self.pipe_component_list:
+            if comp.service_id == service_id or comp.name == name:
+                return comp
+        raise ValueError(f"Pipeline component not found with service_id={service_id} or name={name}")
+
     @staticmethod
     def get_session_id() -> str:
         """
-        Get the generating a session id
+        Generate a session id.
+
+        Returns:
+            The session id as a string.
         """
         return str(uuid1())[:8]
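
The newly added `get_pipeline_component` pairs naturally with `set_inbound_filter`. A hypothetical usage sketch, where the component name and filter logic are placeholders:

```python
from deepdoctection.datapoint.image import Image

# Fetch a component from a built pipeline and attach an inbound filter to it.
component = pipe.get_pipeline_component(name="layout_service")  # or service_id=...

def skip_forms(dp: Image) -> bool:
    # Returning True makes the component skip this datapoint
    # (file_name is assumed to be set on the Image).
    return dp.file_name.endswith("_form.png")

component.set_inbound_filter(skip_forms)
```
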