deepdoctection 0.42.1__py3-none-any.whl → 0.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +2 -1
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +904 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +157 -106
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +196 -113
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +25 -17
- deepdoctection/utils/env_info.py +85 -36
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -62
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.dist-info/METADATA +376 -0
- deepdoctection-0.43.dist-info/RECORD +149 -0
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.1.dist-info/METADATA +0 -431
- deepdoctection-0.42.1.dist-info/RECORD +0 -148
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/WHEEL +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
deepdoctection/pipe/base.py
CHANGED
|
@@ -39,8 +39,16 @@ from .anngen import DatapointManager
|
|
|
39
39
|
|
|
40
40
|
@dataclass(frozen=True)
|
|
41
41
|
class MetaAnnotation:
|
|
42
|
-
"""
|
|
43
|
-
|
|
42
|
+
"""
|
|
43
|
+
An immutable dataclass that stores information about which `Image` annotations are being
|
|
44
|
+
modified through a pipeline component.
|
|
45
|
+
|
|
46
|
+
Attributes:
|
|
47
|
+
image_annotations: Tuple of `ObjectTypes` representing image annotations.
|
|
48
|
+
sub_categories: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for sub-categories.
|
|
49
|
+
relationships: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for relationships.
|
|
50
|
+
summaries: Tuple of `ObjectTypes` representing summaries.
|
|
51
|
+
"""
|
|
44
52
|
|
|
45
53
|
image_annotations: tuple[ObjectTypes, ...] = field(default=())
|
|
46
54
|
sub_categories: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
|
|
@@ -50,28 +58,38 @@ class MetaAnnotation:
|
|
|
50
58
|
|
|
51
59
|
class PipelineComponent(ABC):
|
|
52
60
|
"""
|
|
53
|
-
Base class for pipeline components.
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
61
|
+
Base class for pipeline components.
|
|
62
|
+
|
|
63
|
+
Pipeline components are the parts that make up a pipeline. They contain the
|
|
64
|
+
abstract `serve`, in which the component steps are defined. Within pipelines,
|
|
65
|
+
pipeline components take an image, enrich these with annotations or transform
|
|
66
|
+
existing annotation and transfer the image again. The pipeline component should
|
|
67
|
+
be implemented in such a way that the pythonic approach of passing arguments via
|
|
68
|
+
assignment is used well. To support the pipeline component, an intrinsic
|
|
69
|
+
datapoint manager is provided, which can perform operations on the image
|
|
70
|
+
datapoint that are common for pipeline components. This includes the creation of
|
|
71
|
+
an image, sub-category and similar annotations.
|
|
72
|
+
|
|
73
|
+
Pipeline components do not necessarily have to contain predictors but can also
|
|
74
|
+
contain rule-based transformation steps. (For pipeline components with
|
|
75
|
+
predictors see `PredictorPipelineComponent`.)
|
|
76
|
+
|
|
77
|
+
The sequential execution of pipeline components is carried out with dataflows.
|
|
78
|
+
In the case of components with predictors, this allows the predictor graph to be
|
|
79
|
+
set up first and then to be streamed to the processed data points.
|
|
80
|
+
|
|
81
|
+
Note:
|
|
82
|
+
Currently, predictors can only process single images. Processing larger batches is not planned.
|
|
69
83
|
"""
|
|
70
84
|
|
|
71
85
|
def __init__(self, name: str, model_id: Optional[str] = None) -> None:
|
|
72
86
|
"""
|
|
73
|
-
|
|
74
|
-
|
|
87
|
+
Initializes a `PipelineComponent`.
|
|
88
|
+
|
|
89
|
+
Args:
|
|
90
|
+
name: The name of the pipeline component. The name will be used to identify a pipeline component in a
|
|
91
|
+
pipeline. Use something that describes the task of the pipeline.
|
|
92
|
+
model_id: Optional model identifier.
|
|
75
93
|
"""
|
|
76
94
|
self.name = name
|
|
77
95
|
self.service_id = self.get_service_id()
|
|
@@ -81,39 +99,43 @@ class PipelineComponent(ABC):
|
|
|
81
99
|
|
|
82
100
|
def set_inbound_filter(self, filter_func: Callable[[DP], bool]) -> None:
|
|
83
101
|
"""
|
|
84
|
-
Set a filter function to decide
|
|
85
|
-
The filter function should return a boolean value. If the function returns True, the image will not be processed
|
|
86
|
-
by this pipeline component.
|
|
102
|
+
Set a filter function to decide if an image of the inbound dataflow should be passed to `self.serve`.
|
|
87
103
|
|
|
88
|
-
|
|
104
|
+
The filter function should return a boolean value. If the function returns True, the image will not be
|
|
105
|
+
processed by this pipeline component.
|
|
89
106
|
|
|
107
|
+
Example:
|
|
90
108
|
```python
|
|
91
109
|
def do_not_process_tables(dp: Image) -> bool:
|
|
92
|
-
|
|
110
|
+
if "table" not in dp.get_categories_from_current_state():
|
|
93
111
|
return True
|
|
94
|
-
|
|
112
|
+
return False
|
|
95
113
|
|
|
96
114
|
layout_component = ImageLayoutService(...)
|
|
97
115
|
layout_component.set_inbound_filter(do_not_process_tables)
|
|
98
116
|
```
|
|
99
117
|
|
|
100
|
-
|
|
101
|
-
|
|
118
|
+
Args:
|
|
119
|
+
filter_func: A function that takes an image datapoint and returns a boolean value.
|
|
102
120
|
"""
|
|
103
121
|
self.filter_func = filter_func # type: ignore
|
|
104
122
|
|
|
105
123
|
@abstractmethod
|
|
106
124
|
def serve(self, dp: Image) -> None:
|
|
107
125
|
"""
|
|
108
|
-
Processing an image through the whole pipeline component.
|
|
109
|
-
|
|
110
|
-
|
|
126
|
+
Processing an image through the whole pipeline component.
|
|
127
|
+
|
|
128
|
+
Abstract method that contains all processing steps of the component. Please note that `dp` is already available
|
|
129
|
+
to the `dp_manager` and operations for this can be carried out via it.
|
|
130
|
+
|
|
131
|
+
`dp` was transferred to the `dp_manager` via an assignment. This means that operations on `dp` directly or
|
|
132
|
+
operations via `dp_manager` are equivalent.
|
|
111
133
|
|
|
112
|
-
|
|
113
|
-
via
|
|
134
|
+
As a simplified interface `serve` does not have to return a `dp`. The data point is passed on within pipelines
|
|
135
|
+
internally (via `pass_datapoint`).
|
|
114
136
|
|
|
115
|
-
|
|
116
|
-
|
|
137
|
+
Args:
|
|
138
|
+
dp: The image datapoint to process.
|
|
117
139
|
"""
|
|
118
140
|
raise NotImplementedError()
|
|
119
141
|
|
|
@@ -124,12 +146,15 @@ class PipelineComponent(ABC):
|
|
|
124
146
|
|
|
125
147
|
def pass_datapoint(self, dp: Image) -> Image:
|
|
126
148
|
"""
|
|
127
|
-
Acceptance, handover to dp_manager
|
|
149
|
+
Acceptance, handover to `dp_manager`, transformation and forwarding of `dp`.
|
|
128
150
|
|
|
129
|
-
|
|
151
|
+
To measure the time, use `self.timer_on = True`.
|
|
130
152
|
|
|
131
|
-
:
|
|
132
|
-
|
|
153
|
+
Args:
|
|
154
|
+
dp: Datapoint.
|
|
155
|
+
|
|
156
|
+
Returns:
|
|
157
|
+
Datapoint.
|
|
133
158
|
"""
|
|
134
159
|
if self.timer_on:
|
|
135
160
|
with timed_operation(self.__class__.__name__):
|
|
@@ -140,42 +165,60 @@ class PipelineComponent(ABC):
|
|
|
140
165
|
|
|
141
166
|
def predict_dataflow(self, df: DataFlow) -> DataFlow:
|
|
142
167
|
"""
|
|
143
|
-
Mapping a datapoint via `pass_datapoint` within a dataflow pipeline
|
|
168
|
+
Mapping a datapoint via `pass_datapoint` within a dataflow pipeline.
|
|
169
|
+
|
|
170
|
+
Args:
|
|
171
|
+
df: An input dataflow.
|
|
144
172
|
|
|
145
|
-
:
|
|
146
|
-
|
|
173
|
+
Returns:
|
|
174
|
+
An output dataflow.
|
|
147
175
|
"""
|
|
148
176
|
return MapData(df, self.pass_datapoint)
|
|
149
177
|
|
|
150
178
|
@abstractmethod
|
|
151
179
|
def clone(self) -> PipelineComponent:
|
|
152
180
|
"""
|
|
153
|
-
Clone an instance
|
|
181
|
+
Clone an instance.
|
|
182
|
+
|
|
183
|
+
Returns:
|
|
184
|
+
A cloned instance of `PipelineComponent`.
|
|
154
185
|
"""
|
|
155
186
|
raise NotImplementedError()
|
|
156
187
|
|
|
157
188
|
@abstractmethod
|
|
158
189
|
def get_meta_annotation(self) -> MetaAnnotation:
|
|
159
190
|
"""
|
|
160
|
-
Get a dict of list of annotation type.
|
|
191
|
+
Get the meta annotation (a dict of lists of annotation types).
|
|
161
192
|
|
|
162
|
-
|
|
163
|
-
`
|
|
164
|
-
`
|
|
165
|
-
`
|
|
166
|
-
|
|
193
|
+
The dict must contain:
|
|
194
|
+
- `image_annotation` with values: a list of category names,
|
|
195
|
+
- `sub_categories` with values: a dict with category names as keys and a list of the generated sub categories,
|
|
196
|
+
- `relationships` with values: a dict with category names as keys and a list of the generated relationships,
|
|
197
|
+
- `summaries` with values: A list of summary sub categories.
|
|
198
|
+
|
|
199
|
+
Returns:
|
|
200
|
+
Dict with meta infos as just described.
|
|
167
201
|
"""
|
|
168
202
|
raise NotImplementedError()
|
|
169
203
|
|
|
170
204
|
def get_service_id(self) -> str:
|
|
171
205
|
"""
|
|
172
|
-
Get the generating
|
|
206
|
+
Get the generating service id.
|
|
207
|
+
|
|
208
|
+
Returns:
|
|
209
|
+
The service id as a string.
|
|
173
210
|
"""
|
|
174
211
|
return get_uuid_from_str(self.name)[:8]
|
|
175
212
|
|
|
176
213
|
def clear_predictor(self) -> None:
|
|
177
214
|
"""
|
|
178
|
-
Clear the predictor of the pipeline component if it has one.
|
|
215
|
+
Clear the predictor of the pipeline component if it has one.
|
|
216
|
+
|
|
217
|
+
Needed for model updates during training.
|
|
218
|
+
|
|
219
|
+
Note:
|
|
220
|
+
Maybe you forgot to implement this method in your pipeline component. This might be the case when you run
|
|
221
|
+
evaluation during training and need to update the trained model in your pipeline component.
|
|
179
222
|
"""
|
|
180
223
|
raise NotImplementedError(
|
|
181
224
|
"Maybe you forgot to implement this method in your pipeline component. This might "
|
|
@@ -185,7 +228,10 @@ class PipelineComponent(ABC):
|
|
|
185
228
|
|
|
186
229
|
def has_predictor(self) -> bool:
|
|
187
230
|
"""
|
|
188
|
-
Check if the pipeline component has a predictor
|
|
231
|
+
Check if the pipeline component has a predictor.
|
|
232
|
+
|
|
233
|
+
Returns:
|
|
234
|
+
`True` if the pipeline component has a predictor, otherwise `False`.
|
|
189
235
|
"""
|
|
190
236
|
if hasattr(self, "predictor"):
|
|
191
237
|
if self.predictor is not None:
|
|
@@ -194,8 +240,16 @@ class PipelineComponent(ABC):
|
|
|
194
240
|
|
|
195
241
|
def _undo(self, dp: Image) -> Image:
|
|
196
242
|
"""
|
|
197
|
-
Undo the processing of the pipeline component.
|
|
198
|
-
|
|
243
|
+
Undo the processing of the pipeline component.
|
|
244
|
+
|
|
245
|
+
It will remove `ImageAnnotation`, `CategoryAnnotation` and `ContainerAnnotation` with the `service_id` of the
|
|
246
|
+
pipeline component.
|
|
247
|
+
|
|
248
|
+
Args:
|
|
249
|
+
dp: The image datapoint.
|
|
250
|
+
|
|
251
|
+
Returns:
|
|
252
|
+
The modified image datapoint.
|
|
199
253
|
"""
|
|
200
254
|
if self.timer_on:
|
|
201
255
|
with timed_operation(self.__class__.__name__):
|
|
@@ -208,64 +262,69 @@ class PipelineComponent(ABC):
|
|
|
208
262
|
|
|
209
263
|
def undo(self, df: DataFlow) -> DataFlow:
|
|
210
264
|
"""
|
|
211
|
-
Mapping a datapoint via `_undo` within a dataflow pipeline
|
|
265
|
+
Mapping a datapoint via `_undo` within a dataflow pipeline.
|
|
266
|
+
|
|
267
|
+
Args:
|
|
268
|
+
df: An input dataflow of Images.
|
|
212
269
|
|
|
213
|
-
:
|
|
214
|
-
|
|
270
|
+
Returns:
|
|
271
|
+
An output dataflow of Images.
|
|
215
272
|
"""
|
|
216
273
|
return MapData(df, self._undo)
|
|
217
274
|
|
|
218
275
|
|
|
219
276
|
class Pipeline(ABC):
|
|
220
277
|
"""
|
|
221
|
-
Abstract base class for creating pipelines.
|
|
222
|
-
|
|
223
|
-
|
|
278
|
+
Abstract base class for creating pipelines.
|
|
279
|
+
|
|
280
|
+
Pipelines represent the framework with which documents can be processed by reading individual pages, processing the
|
|
281
|
+
pages through the pipeline infrastructure and returning the extracted information.
|
|
224
282
|
|
|
225
283
|
The infrastructure, as the backbone of the pipeline, consists of a list of pipeline components in which images can
|
|
226
|
-
|
|
227
|
-
|
|
284
|
+
be passed through via dataflows. The order of the pipeline components in the list determines the processing order.
|
|
285
|
+
The components for the pipeline backbone are composed in `_build_pipe`.
|
|
228
286
|
|
|
229
|
-
The pipeline is set up via: `analyze` for a directory with single pages or a document with multiple pages. A
|
|
230
|
-
|
|
287
|
+
The pipeline is set up via: `analyze` for a directory with single pages or a document with multiple pages. A data
|
|
288
|
+
flow is returned that is triggered via a for loop and starts the actual processing.
|
|
231
289
|
|
|
232
290
|
This creates a pipeline using the following command arrangement:
|
|
233
291
|
|
|
234
|
-
|
|
292
|
+
Example:
|
|
293
|
+
```python
|
|
294
|
+
layout = LayoutPipeComponent(layout_detector ...)
|
|
295
|
+
text = TextExtractPipeComponent(text_detector ...)
|
|
296
|
+
simple_pipe = MyPipeline(pipeline_component = [layout, text])
|
|
297
|
+
doc_dataflow = simple_pipe.analyze(input = path / to / dir)
|
|
235
298
|
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
simple_pipe = MyPipeline(pipeline_component = [layout, text])
|
|
240
|
-
doc_dataflow = simple_pipe.analyze(input = path / to / dir)
|
|
299
|
+
for page in doc_dataflow:
|
|
300
|
+
print(page)
|
|
301
|
+
```
|
|
241
302
|
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
```
|
|
245
|
-
|
|
246
|
-
In doing so, page contains all document structures determined via the pipeline (either directly from the Image core
|
|
247
|
-
model or already processed further).
|
|
303
|
+
In doing so, `page` contains all document structures determined via the pipeline (either directly from the `Image`
|
|
304
|
+
core model or already processed further).
|
|
248
305
|
|
|
249
306
|
In addition to `analyze`, the internal `_entry` is used to bundle preprocessing steps.
|
|
250
307
|
|
|
251
308
|
It is possible to set a session id for the pipeline. This is useful for logging purposes. The session id can be
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
To generate a session_id automatically:
|
|
309
|
+
either passed to the pipeline via the `analyze` method or generated automatically.
|
|
255
310
|
|
|
256
|
-
|
|
311
|
+
To generate a `session_id` automatically:
|
|
257
312
|
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
313
|
+
Example:
|
|
314
|
+
```python
|
|
315
|
+
pipe = MyPipeline(pipeline_component = [layout, text])
|
|
316
|
+
pipe.set_session_id = True
|
|
261
317
|
|
|
262
|
-
|
|
263
|
-
|
|
318
|
+
df = pipe.analyze(input = "path/to/dir") # session_id is generated automatically
|
|
319
|
+
```
|
|
264
320
|
"""
|
|
265
321
|
|
|
266
322
|
def __init__(self, pipeline_component_list: list[PipelineComponent]) -> None:
|
|
267
323
|
"""
|
|
268
|
-
|
|
324
|
+
Initializes a `Pipeline`.
|
|
325
|
+
|
|
326
|
+
Args:
|
|
327
|
+
pipeline_component_list: A list of pipeline components.
|
|
269
328
|
"""
|
|
270
329
|
self.pipe_component_list = pipeline_component_list
|
|
271
330
|
self.set_session_id = False
|
|
@@ -274,9 +333,13 @@ class Pipeline(ABC):
|
|
|
274
333
|
def _entry(self, **kwargs: Any) -> DataFlow:
|
|
275
334
|
"""
|
|
276
335
|
Use this method to bundle all preprocessing, such as loading one or more documents, so that a dataflow is
|
|
277
|
-
|
|
336
|
+
provided as a return value that can be passed on to the pipeline backbone.
|
|
337
|
+
|
|
338
|
+
Args:
|
|
339
|
+
kwargs: Arguments, for dynamic customizing of the processing or for the transfer of processing types.
|
|
278
340
|
|
|
279
|
-
:
|
|
341
|
+
Returns:
|
|
342
|
+
A dataflow for further processing.
|
|
280
343
|
"""
|
|
281
344
|
raise NotImplementedError()
|
|
282
345
|
|
|
@@ -284,38 +347,67 @@ class Pipeline(ABC):
|
|
|
284
347
|
@curry
|
|
285
348
|
def _undo(dp: Image, service_ids: Optional[list[str]] = None) -> Image:
|
|
286
349
|
"""
|
|
287
|
-
Remove annotations from a datapoint
|
|
350
|
+
Remove annotations from a datapoint.
|
|
351
|
+
|
|
352
|
+
Args:
|
|
353
|
+
dp: The image datapoint.
|
|
354
|
+
service_ids: Optional list of service ids to remove.
|
|
355
|
+
|
|
356
|
+
Returns:
|
|
357
|
+
The modified image datapoint.
|
|
288
358
|
"""
|
|
289
359
|
dp.remove(service_ids=service_ids)
|
|
290
360
|
return dp
|
|
291
361
|
|
|
292
362
|
def undo(self, df: DataFlow, service_ids: Optional[set[str]] = None) -> DataFlow:
|
|
293
363
|
"""
|
|
294
|
-
Mapping a datapoint via `_undo` within a dataflow pipeline
|
|
364
|
+
Mapping a datapoint via `_undo` within a dataflow pipeline.
|
|
365
|
+
|
|
366
|
+
Args:
|
|
367
|
+
df: An input dataflow of Images.
|
|
368
|
+
service_ids: A set of service ids to remove.
|
|
295
369
|
|
|
296
|
-
:
|
|
297
|
-
|
|
298
|
-
:return: A output dataflow of Images
|
|
370
|
+
Returns:
|
|
371
|
+
An output dataflow of Images.
|
|
299
372
|
"""
|
|
300
373
|
return MapData(df, self._undo(service_ids=service_ids))
|
|
301
374
|
|
|
302
375
|
@abstractmethod
|
|
303
376
|
def analyze(self, **kwargs: Any) -> DataFlow:
|
|
304
377
|
"""
|
|
305
|
-
Try to keep this method as the only one necessary for the user.
|
|
306
|
-
|
|
307
|
-
|
|
378
|
+
Try to keep this method as the only one necessary for the user.
|
|
379
|
+
|
|
380
|
+
All processing steps, such as preprocessing, setting up the backbone and post-processing are to be bundled. A
|
|
381
|
+
dataflow generator `df` is returned, which is generated via
|
|
308
382
|
|
|
383
|
+
Example:
|
|
384
|
+
```python
|
|
385
|
+
df = pipe.analyze(path="path/to/dir")
|
|
386
|
+
df.reset_state()
|
|
309
387
|
doc = iter(df)
|
|
310
388
|
page = next(doc)
|
|
389
|
+
```
|
|
311
390
|
|
|
312
391
|
can be triggered.
|
|
392
|
+
|
|
393
|
+
Args:
|
|
394
|
+
kwargs: Arguments for analysis.
|
|
395
|
+
|
|
396
|
+
Returns:
|
|
397
|
+
A dataflow generator.
|
|
313
398
|
"""
|
|
314
399
|
raise NotImplementedError()
|
|
315
400
|
|
|
316
401
|
def _build_pipe(self, df: DataFlow, session_id: Optional[str] = None) -> DataFlow:
|
|
317
402
|
"""
|
|
318
|
-
Composition of the backbone
|
|
403
|
+
Composition of the backbone.
|
|
404
|
+
|
|
405
|
+
Args:
|
|
406
|
+
df: The input dataflow.
|
|
407
|
+
session_id: Optional session id.
|
|
408
|
+
|
|
409
|
+
Returns:
|
|
410
|
+
The processed dataflow.
|
|
319
411
|
"""
|
|
320
412
|
if session_id is None and self.set_session_id:
|
|
321
413
|
session_id = self.get_session_id()
|
|
@@ -327,11 +419,12 @@ class Pipeline(ABC):
|
|
|
327
419
|
|
|
328
420
|
def get_meta_annotation(self) -> MetaAnnotation:
|
|
329
421
|
"""
|
|
330
|
-
Collects meta annotations from all pipeline components and summarizes the returned results
|
|
422
|
+
Collects meta annotations from all pipeline components and summarizes the returned results.
|
|
331
423
|
|
|
332
|
-
:
|
|
333
|
-
|
|
334
|
-
|
|
424
|
+
Returns:
|
|
425
|
+
Meta annotations with information about image annotations (list), sub categories (dict with category
|
|
426
|
+
names and generated sub categories), relationships (dict with category names and generated relationships)
|
|
427
|
+
as well as summaries (list with sub categories).
|
|
335
428
|
"""
|
|
336
429
|
image_annotations: list[ObjectTypes] = []
|
|
337
430
|
sub_categories = defaultdict(set)
|
|
@@ -355,11 +448,18 @@ class Pipeline(ABC):
|
|
|
355
448
|
def get_pipeline_info(
|
|
356
449
|
self, service_id: Optional[str] = None, name: Optional[str] = None
|
|
357
450
|
) -> Union[str, Mapping[str, str]]:
|
|
358
|
-
"""
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
451
|
+
"""
|
|
452
|
+
Get pipeline information.
|
|
453
|
+
|
|
454
|
+
Returns a dictionary with a description of each pipeline component.
|
|
455
|
+
|
|
456
|
+
Args:
|
|
457
|
+
service_id: Service id of the pipeline component to search for.
|
|
458
|
+
name: Name of the pipeline component to search for.
|
|
459
|
+
|
|
460
|
+
Returns:
|
|
461
|
+
Either a full dictionary with service id and name of all pipeline components, or the name if
|
|
462
|
+
the service id has been passed, or the service id if the name has been passed.
|
|
363
463
|
"""
|
|
364
464
|
comp_info = {comp.service_id: comp.name for comp in self.pipe_component_list}
|
|
365
465
|
comp_info_name_as_key = {value: key for key, value in comp_info.items()}
|
|
@@ -369,9 +469,28 @@ class Pipeline(ABC):
|
|
|
369
469
|
return comp_info_name_as_key[name]
|
|
370
470
|
return comp_info
|
|
371
471
|
|
|
472
|
+
def get_pipeline_component(self, service_id: Optional[str] = None, name: Optional[str] = None) -> PipelineComponent:
|
|
473
|
+
"""
|
|
474
|
+
Get a pipeline component by `service_id` or `name`.
|
|
475
|
+
|
|
476
|
+
Args:
|
|
477
|
+
service_id: Service id of the pipeline component to search for.
|
|
478
|
+
name: Name of the pipeline component to search for.
|
|
479
|
+
|
|
480
|
+
Returns:
|
|
481
|
+
The pipeline component if found, otherwise raises ValueError.
|
|
482
|
+
"""
|
|
483
|
+
for comp in self.pipe_component_list:
|
|
484
|
+
if comp.service_id == service_id or comp.name == name:
|
|
485
|
+
return comp
|
|
486
|
+
raise ValueError(f"Pipeline component not found with service_id={service_id} or name={name}")
|
|
487
|
+
|
|
372
488
|
@staticmethod
|
|
373
489
|
def get_session_id() -> str:
|
|
374
490
|
"""
|
|
375
|
-
Get the generating a session id
|
|
491
|
+
Generate a session id.
|
|
492
|
+
|
|
493
|
+
Returns:
|
|
494
|
+
The session id as a string.
|
|
376
495
|
"""
|
|
377
496
|
return str(uuid1())[:8]
|