deepdoctection 0.31-py3-none-any.whl → 0.33-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (131)
  1. deepdoctection/__init__.py +16 -29
  2. deepdoctection/analyzer/dd.py +70 -59
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/common.py +9 -5
  5. deepdoctection/dataflow/custom.py +5 -5
  6. deepdoctection/dataflow/custom_serialize.py +75 -18
  7. deepdoctection/dataflow/parallel_map.py +3 -3
  8. deepdoctection/dataflow/serialize.py +4 -4
  9. deepdoctection/dataflow/stats.py +3 -3
  10. deepdoctection/datapoint/annotation.py +41 -56
  11. deepdoctection/datapoint/box.py +9 -8
  12. deepdoctection/datapoint/convert.py +6 -6
  13. deepdoctection/datapoint/image.py +56 -44
  14. deepdoctection/datapoint/view.py +245 -150
  15. deepdoctection/datasets/__init__.py +1 -4
  16. deepdoctection/datasets/adapter.py +35 -26
  17. deepdoctection/datasets/base.py +14 -12
  18. deepdoctection/datasets/dataflow_builder.py +3 -3
  19. deepdoctection/datasets/info.py +24 -26
  20. deepdoctection/datasets/instances/doclaynet.py +51 -51
  21. deepdoctection/datasets/instances/fintabnet.py +46 -46
  22. deepdoctection/datasets/instances/funsd.py +25 -24
  23. deepdoctection/datasets/instances/iiitar13k.py +13 -10
  24. deepdoctection/datasets/instances/layouttest.py +4 -3
  25. deepdoctection/datasets/instances/publaynet.py +5 -5
  26. deepdoctection/datasets/instances/pubtables1m.py +24 -21
  27. deepdoctection/datasets/instances/pubtabnet.py +32 -30
  28. deepdoctection/datasets/instances/rvlcdip.py +30 -30
  29. deepdoctection/datasets/instances/xfund.py +26 -26
  30. deepdoctection/datasets/save.py +6 -6
  31. deepdoctection/eval/__init__.py +1 -4
  32. deepdoctection/eval/accmetric.py +32 -33
  33. deepdoctection/eval/base.py +8 -9
  34. deepdoctection/eval/cocometric.py +15 -13
  35. deepdoctection/eval/eval.py +41 -37
  36. deepdoctection/eval/tedsmetric.py +30 -23
  37. deepdoctection/eval/tp_eval_callback.py +16 -19
  38. deepdoctection/extern/__init__.py +2 -7
  39. deepdoctection/extern/base.py +339 -134
  40. deepdoctection/extern/d2detect.py +85 -113
  41. deepdoctection/extern/deskew.py +14 -11
  42. deepdoctection/extern/doctrocr.py +141 -130
  43. deepdoctection/extern/fastlang.py +27 -18
  44. deepdoctection/extern/hfdetr.py +71 -62
  45. deepdoctection/extern/hflayoutlm.py +504 -211
  46. deepdoctection/extern/hflm.py +230 -0
  47. deepdoctection/extern/model.py +488 -302
  48. deepdoctection/extern/pdftext.py +23 -19
  49. deepdoctection/extern/pt/__init__.py +1 -3
  50. deepdoctection/extern/pt/nms.py +6 -2
  51. deepdoctection/extern/pt/ptutils.py +29 -19
  52. deepdoctection/extern/tessocr.py +39 -38
  53. deepdoctection/extern/texocr.py +18 -18
  54. deepdoctection/extern/tp/tfutils.py +57 -9
  55. deepdoctection/extern/tp/tpcompat.py +21 -14
  56. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  58. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
  60. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  61. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
  62. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
  67. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
  68. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  69. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  70. deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
  71. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  72. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  73. deepdoctection/extern/tpdetect.py +45 -53
  74. deepdoctection/mapper/__init__.py +3 -8
  75. deepdoctection/mapper/cats.py +27 -29
  76. deepdoctection/mapper/cocostruct.py +10 -10
  77. deepdoctection/mapper/d2struct.py +27 -26
  78. deepdoctection/mapper/hfstruct.py +13 -8
  79. deepdoctection/mapper/laylmstruct.py +178 -37
  80. deepdoctection/mapper/maputils.py +12 -11
  81. deepdoctection/mapper/match.py +2 -2
  82. deepdoctection/mapper/misc.py +11 -9
  83. deepdoctection/mapper/pascalstruct.py +4 -4
  84. deepdoctection/mapper/prodigystruct.py +5 -5
  85. deepdoctection/mapper/pubstruct.py +84 -92
  86. deepdoctection/mapper/tpstruct.py +5 -5
  87. deepdoctection/mapper/xfundstruct.py +33 -33
  88. deepdoctection/pipe/__init__.py +1 -1
  89. deepdoctection/pipe/anngen.py +12 -14
  90. deepdoctection/pipe/base.py +52 -106
  91. deepdoctection/pipe/common.py +72 -59
  92. deepdoctection/pipe/concurrency.py +16 -11
  93. deepdoctection/pipe/doctectionpipe.py +24 -21
  94. deepdoctection/pipe/language.py +20 -25
  95. deepdoctection/pipe/layout.py +20 -16
  96. deepdoctection/pipe/lm.py +75 -105
  97. deepdoctection/pipe/order.py +194 -89
  98. deepdoctection/pipe/refine.py +111 -124
  99. deepdoctection/pipe/segment.py +156 -161
  100. deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
  101. deepdoctection/pipe/text.py +37 -36
  102. deepdoctection/pipe/transform.py +19 -16
  103. deepdoctection/train/__init__.py +6 -12
  104. deepdoctection/train/d2_frcnn_train.py +48 -41
  105. deepdoctection/train/hf_detr_train.py +41 -30
  106. deepdoctection/train/hf_layoutlm_train.py +153 -135
  107. deepdoctection/train/tp_frcnn_train.py +32 -31
  108. deepdoctection/utils/concurrency.py +1 -1
  109. deepdoctection/utils/context.py +13 -6
  110. deepdoctection/utils/develop.py +4 -4
  111. deepdoctection/utils/env_info.py +87 -125
  112. deepdoctection/utils/file_utils.py +6 -11
  113. deepdoctection/utils/fs.py +22 -18
  114. deepdoctection/utils/identifier.py +2 -2
  115. deepdoctection/utils/logger.py +16 -15
  116. deepdoctection/utils/metacfg.py +7 -7
  117. deepdoctection/utils/mocks.py +93 -0
  118. deepdoctection/utils/pdf_utils.py +11 -11
  119. deepdoctection/utils/settings.py +185 -181
  120. deepdoctection/utils/tqdm.py +1 -1
  121. deepdoctection/utils/transform.py +14 -9
  122. deepdoctection/utils/types.py +104 -0
  123. deepdoctection/utils/utils.py +7 -7
  124. deepdoctection/utils/viz.py +74 -72
  125. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
  126. deepdoctection-0.33.dist-info/RECORD +146 -0
  127. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
  128. deepdoctection/utils/detection_types.py +0 -68
  129. deepdoctection-0.31.dist-info/RECORD +0 -144
  130. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
  131. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0
deepdoctection/pipe/common.py

@@ -18,8 +18,11 @@
 """
 Module for common pipeline components
 """
-from copy import copy, deepcopy
-from typing import List, Literal, Mapping, Optional, Sequence, Union
+from __future__ import annotations
+
+import os
+from copy import deepcopy
+from typing import Literal, Mapping, Optional, Sequence, Union
 
 import numpy as np
 
@@ -29,17 +32,14 @@ from ..datapoint.view import IMAGE_DEFAULTS, Page
 from ..mapper.maputils import MappingContextManager
 from ..mapper.match import match_anns_by_intersection
 from ..mapper.misc import to_image
-from ..utils.detection_types import JsonDict
-from ..utils.file_utils import detectron2_available, pytorch_available, tf_available
 from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
-from .base import PipelineComponent
+from .base import MetaAnnotation, PipelineComponent
 from .registry import pipeline_component_registry
 
-if tf_available():
-    from ..mapper.tpstruct import tf_nms_image_annotations as nms_image_annotations
-
-elif pytorch_available() and detectron2_available():
+if os.environ.get("DD_USE_TORCH"):
     from ..mapper.d2struct import pt_nms_image_annotations as nms_image_annotations
+elif os.environ.get("DD_USE_TF"):
+    from ..mapper.tpstruct import tf_nms_image_annotations as nms_image_annotations
 
 
 @pipeline_component_registry.register("ImageCroppingService")
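Note: the hunk above replaces import-time framework probing (tf_available / pytorch_available) with explicit DD_USE_TORCH / DD_USE_TF environment variables. A minimal usage sketch, assuming the variable only needs to be set to any truthy string (the value "1" is illustrative):

    import os

    # Select the PyTorch code path before deepdoctection modules are imported,
    # since the check runs at import time of deepdoctection.pipe.common.
    os.environ["DD_USE_TORCH"] = "1"

    import deepdoctection as dd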
@@ -55,20 +55,25 @@ class ImageCroppingService(PipelineComponent):
         :param category_names: A single name or a list of category names to crop
         """
 
-        if isinstance(category_names, str):
-            category_names = [category_names]
-        self.category_names = [get_type(category_name) for category_name in category_names]
+        self.category_names = (
+            (category_names,)
+            if isinstance(category_names, str)
+            else tuple(get_type(category_name) for category_name in category_names)
+        )
         super().__init__("image_crop")
 
     def serve(self, dp: Image) -> None:
         for ann in dp.get_annotation(category_names=self.category_names):
             dp.image_ann_to_image(ann.annotation_id, crop_image=True)
 
-    def clone(self) -> "PipelineComponent":
+    def clone(self) -> ImageCroppingService:
         return self.__class__(self.category_names)
 
-    def get_meta_annotation(self) -> JsonDict:
-        return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
+    def get_meta_annotation(self) -> MetaAnnotation:
+        return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
+
+    def clear_predictor(self) -> None:
+        pass
 
 
 @pipeline_component_registry.register("MatchingService")
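Note: `MetaAnnotation` is imported from `.base` but not defined in this diff. Judging from the call sites above, it is a small container with four fields; a hypothetical sketch consistent with the keywords used (the actual definition in deepdoctection/pipe/base.py may differ):

    from dataclasses import dataclass, field

    @dataclass(frozen=True)
    class MetaAnnotation:
        # field names taken from the call sites in this diff; the types are guesses
        image_annotations: tuple = ()
        sub_categories: dict = field(default_factory=dict)
        relationships: dict = field(default_factory=dict)
        summaries: tuple = ()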
@@ -113,16 +118,18 @@ class MatchingService(PipelineComponent):
         :param max_parent_only: Will assign to each child at most one parent with maximum ioa
         """
         self.parent_categories = (
-            [get_type(parent_categories)]  # type: ignore
-            if not isinstance(parent_categories, (list, set))
-            else [get_type(parent_category) for parent_category in parent_categories]
+            (get_type(parent_categories),)
+            if isinstance(parent_categories, str)
+            else tuple(get_type(category_name) for category_name in parent_categories)
         )
         self.child_categories = (
-            [get_type(child_categories)]  # type: ignore
-            if not isinstance(child_categories, (list, set))
-            else [get_type(child_category) for child_category in child_categories]
+            (get_type(child_categories),)
+            if isinstance(child_categories, str)
+            else (tuple(get_type(category_name) for category_name in child_categories))
         )
-        assert matching_rule in ["iou", "ioa"], "segment rule must be either iou or ioa"
+        if matching_rule not in ("iou", "ioa"):
+            raise ValueError("segment rule must be either iou or ioa")
+
         self.matching_rule = matching_rule
         self.threshold = threshold
         self.use_weighted_intersections = use_weighted_intersections
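Note: the assert becomes a proper ValueError and the category arguments are normalized to tuples. A hedged construction sketch (the category names "table" and "cell" are illustrative and must be resolvable by get_type):

    from deepdoctection.pipe.common import MatchingService

    # matching_rule must be "iou" or "ioa"; any other value now raises ValueError
    service = MatchingService(
        parent_categories="table",
        child_categories=["cell"],
        matching_rule="ioa",
        threshold=0.6,
    )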
@@ -150,24 +157,25 @@ class MatchingService(PipelineComponent):
         matched_child_anns = np.take(child_anns, child_index)  # type: ignore
         matched_parent_anns = np.take(parent_anns, parent_index)  # type: ignore
         for idx, parent in enumerate(matched_parent_anns):
-            parent.dump_relationship(Relationships.child, matched_child_anns[idx].annotation_id)
+            parent.dump_relationship(Relationships.CHILD, matched_child_anns[idx].annotation_id)
 
     def clone(self) -> PipelineComponent:
         return self.__class__(self.parent_categories, self.child_categories, self.matching_rule, self.threshold)
 
-    def get_meta_annotation(self) -> JsonDict:
-        return dict(
-            [
-                ("image_annotations", []),
-                ("sub_categories", {}),
-                ("relationships", {parent: {Relationships.child} for parent in self.parent_categories}),
-                ("summaries", []),
-            ]
+    def get_meta_annotation(self) -> MetaAnnotation:
+        return MetaAnnotation(
+            image_annotations=(),
+            sub_categories={},
+            relationships={parent: {Relationships.CHILD} for parent in self.parent_categories},
+            summaries=(),
         )
 
+    def clear_predictor(self) -> None:
+        pass
+
 
 @pipeline_component_registry.register("PageParsingService")
-class PageParsingService:
+class PageParsingService(PipelineComponent):
     """
     A "pseudo" pipeline component that can be added to a pipeline to convert `Image`s into `Page` formats. It allows a
     custom parsing depending on customizing options of other pipeline components.
@@ -186,14 +194,20 @@ class PageParsingService:
         """
         self.name = "page_parser"
         if isinstance(floating_text_block_categories, (str, ObjectTypes)):
-            floating_text_block_categories = [floating_text_block_categories]
+            floating_text_block_categories = (get_type(floating_text_block_categories),)
         if floating_text_block_categories is None:
-            floating_text_block_categories = copy(IMAGE_DEFAULTS["floating_text_block_categories"])
+            floating_text_block_categories = IMAGE_DEFAULTS["floating_text_block_categories"]
 
         self.text_container = get_type(text_container)
-        self.floating_text_block_categories = [get_type(text_block) for text_block in floating_text_block_categories]
+        self.floating_text_block_categories = tuple(
+            (get_type(text_block) for text_block in floating_text_block_categories)
+        )
         self.include_residual_text_container = include_residual_text_container
         self._init_sanity_checks()
+        super().__init__(self.name)
+
+    def serve(self, dp: Image) -> None:
+        raise NotImplementedError("PageParsingService is not meant to be used in serve method")
 
     def pass_datapoint(self, dp: Image) -> Page:
         """
@@ -203,29 +217,19 @@
         """
         return Page.from_image(dp, self.text_container, self.floating_text_block_categories)
 
-    def predict_dataflow(self, df: DataFlow) -> DataFlow:
-        """
-        Mapping a datapoint via `pass_datapoint` within a dataflow pipeline
-
-        :param df: An input dataflow
-        :return: A output dataflow
-        """
-        return MapData(df, self.pass_datapoint)
-
     def _init_sanity_checks(self) -> None:
         assert self.text_container in (
-            LayoutType.word,
-            LayoutType.line,
-        ), f"text_container must be either {LayoutType.word} or {LayoutType.line}"
+            LayoutType.WORD,
+            LayoutType.LINE,
+        ), f"text_container must be either {LayoutType.WORD} or {LayoutType.LINE}"
 
-    @staticmethod
-    def get_meta_annotation() -> JsonDict:
+    def get_meta_annotation(self) -> MetaAnnotation:
         """
         meta annotation. We do not generate any new annotations here
         """
-        return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
+        return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
 
-    def clone(self) -> "PageParsingService":
+    def clone(self) -> PageParsingService:
         """clone"""
         return self.__class__(
             deepcopy(self.text_container),
@@ -233,6 +237,9 @@ class PageParsingService:
             self.include_residual_text_container,
         )
 
+    def clear_predictor(self) -> None:
+        pass
+
 
 @pipeline_component_registry.register("AnnotationNmsService")
 class AnnotationNmsService(PipelineComponent):
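Note on the PageParsingService changes above: the class is now a PipelineComponent subclass, but serve deliberately raises NotImplementedError; datapoints still go through pass_datapoint. A sketch of the intended call path (the Image instance dp is assumed to exist):

    from deepdoctection.pipe.common import PageParsingService
    from deepdoctection.utils.settings import LayoutType

    page_parser = PageParsingService(text_container=LayoutType.WORD)
    page = page_parser.pass_datapoint(dp)  # Image -> Page; calling serve() raises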
@@ -257,8 +264,8 @@ class AnnotationNmsService(PipelineComponent):
     def __init__(
         self,
         nms_pairs: Sequence[Sequence[TypeOrStr]],
-        thresholds: Union[float, List[float]],
-        priority: Optional[List[Union[Optional[TypeOrStr]]]] = None,
+        thresholds: Union[float, list[float]],
+        priority: Optional[list[Union[Optional[TypeOrStr]]]] = None,
     ):
         """
         :param nms_pairs: Groups of categories, either as string or by `ObjectType`.
@@ -292,11 +299,14 @@ class AnnotationNmsService(PipelineComponent):
             if ann.annotation_id not in ann_ids_to_keep:
                 self.dp_manager.deactivate_annotation(ann.annotation_id)
 
-    def clone(self) -> "PipelineComponent":
+    def clone(self) -> PipelineComponent:
         return self.__class__(deepcopy(self.nms_pairs), self.threshold)
 
-    def get_meta_annotation(self) -> JsonDict:
-        return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
+    def get_meta_annotation(self) -> MetaAnnotation:
+        return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
+
+    def clear_predictor(self) -> None:
+        pass
 
 
 @pipeline_component_registry.register("ImageParsingService")
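Note on the AnnotationNmsService hunks above: thresholds and priority now use built-in generics (list[float] instead of List[float]). A hedged construction sketch (the pairs and threshold values are illustrative):

    from deepdoctection.pipe.common import AnnotationNmsService

    # one threshold per nms_pair; per the signature, a single float is also accepted
    nms_service = AnnotationNmsService(
        nms_pairs=[["table", "figure"], ["title", "text"]],
        thresholds=[0.05, 0.3],
    )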
@@ -326,13 +336,16 @@ class ImageParsingService:
         """
         return MapData(df, self.pass_datapoint)
 
-    def clone(self) -> "ImageParsingService":
+    def clone(self) -> ImageParsingService:
         """clone"""
         return self.__class__(self.dpi)
 
     @staticmethod
-    def get_meta_annotation() -> JsonDict:
+    def get_meta_annotation() -> MetaAnnotation:
         """
         meta annotation. We do not generate any new annotations here
         """
-        return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
+        return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
+
+    def clear_predictor(self) -> None:
+        """clear predictor. Will do nothing"""
deepdoctection/pipe/concurrency.py

@@ -18,21 +18,22 @@
 """
 Module for multithreading tasks
 """
+from __future__ import annotations
 
 import itertools
 import queue
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import ExitStack
-from typing import Callable, List, Optional, Sequence, Union
+from typing import Callable, Optional, Sequence, Union
 
 import tqdm
 
 from ..dataflow import DataFlow, MapData
 from ..datapoint.image import Image
 from ..utils.context import timed_operation
-from ..utils.detection_types import JsonDict, QueueType, TqdmType
 from ..utils.tqdm import get_tqdm
-from .base import PipelineComponent
+from ..utils.types import QueueType, TqdmType
+from .base import MetaAnnotation, PipelineComponent
 from .common import ImageParsingService, PageParsingService
 from .registry import pipeline_component_registry
 
@@ -99,7 +100,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
 
     def __init__(
         self,
-        pipeline_components: Sequence[Union[PipelineComponent, PageParsingService, ImageParsingService]],
+        pipeline_components: Sequence[Union[PipelineComponent, ImageParsingService]],
         pre_proc_func: Optional[Callable[[Image], Image]] = None,
         post_proc_func: Optional[Callable[[Image], Image]] = None,
         max_datapoints: Optional[int] = None,
@@ -122,7 +123,7 @@
         self.timer_on = False
         super().__init__(f"multi_thread_{self.pipe_components[0].name}")
 
-    def put_task(self, df: Union[DataFlow, List[Image]]) -> None:
+    def put_task(self, df: Union[DataFlow, list[Image]]) -> None:
         """
         Put a dataflow or a list of datapoints to the queue. Note, that the process will not start before `start`
         is called. If you do not know how many datapoints will be cached, use max_datapoint to ensure no oom.
@@ -132,7 +133,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
 
         self._put_datapoints_to_queue(df)
 
-    def start(self) -> List[Image]:
+    def start(self) -> list[Image]:
         """
         Creates a worker for each component and starts processing the data points of the queue. A list of the results
         is returned once all points in the queue have been processed.
@@ -164,7 +165,7 @@
         tqdm_bar: Optional[TqdmType] = None,
         pre_proc_func: Optional[Callable[[Image], Image]] = None,
         post_proc_func: Optional[Callable[[Image], Image]] = None,
-    ) -> List[Image]:
+    ) -> list[Image]:
         outputs = []
 
         with ExitStack() as stack:
@@ -183,7 +184,7 @@
                     tqdm_bar.update(1)
         return outputs
 
-    def _put_datapoints_to_queue(self, df: Union[DataFlow, List[Image]]) -> None:
+    def _put_datapoints_to_queue(self, df: Union[DataFlow, list[Image]]) -> None:
         if isinstance(df, DataFlow):
             df.reset_state()
             for idx, dp in enumerate(df):
@@ -192,7 +193,7 @@
                     break
                 self.input_queue.put(dp)
 
-    def pass_datapoints(self, dpts: List[Image]) -> List[Image]:
+    def pass_datapoints(self, dpts: list[Image]) -> list[Image]:
         """
         Putting the list of datapoints into a thread-save queue and start for each pipeline
         component a separate thread. It will return a list of datapoints where the order of appearance
@@ -221,8 +222,12 @@
     def serve(self, dp: Image) -> None:
         raise NotImplementedError("MultiThreadPipelineComponent does not follow the PipelineComponent implementation")
 
-    def clone(self) -> "MultiThreadPipelineComponent":
+    def clone(self) -> MultiThreadPipelineComponent:
         raise NotImplementedError("MultiThreadPipelineComponent does not allow cloning")
 
-    def get_meta_annotation(self) -> JsonDict:
+    def get_meta_annotation(self) -> MetaAnnotation:
         return self.pipe_components[0].get_meta_annotation()
+
+    def clear_predictor(self) -> None:
+        for pipe in self.pipe_components:
+            pipe.clear_predictor()
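Note: clear_predictor now fans out to every wrapped component. Based on the put_task/start docstrings above, a minimal usage sketch (component_a and component_b stand in for clones of the same pipeline component; df for a DataFlow):

    from deepdoctection.pipe.concurrency import MultiThreadPipelineComponent

    mt_component = MultiThreadPipelineComponent(
        pipeline_components=[component_a, component_b],
        max_datapoints=1000,  # optional cap so the queue does not grow unbounded
    )
    mt_component.put_task(df)        # a DataFlow or a list[Image]
    images = mt_component.start()    # blocks until the queue has been processed
    mt_component.clear_predictor()   # new: releases the predictor of every component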
deepdoctection/pipe/doctectionpipe.py

@@ -26,18 +26,18 @@ from typing import List, Mapping, Optional, Sequence, Tuple, Union
 from ..dataflow import DataFlow, MapData
 from ..dataflow.custom_serialize import SerializerFiles, SerializerPdfDoc
 from ..datapoint.image import Image
+from ..datapoint.view import IMAGE_DEFAULTS
 from ..mapper.maputils import curry
 from ..mapper.misc import to_image
-from ..utils.detection_types import Pathlike
 from ..utils.fs import maybe_path_or_pdf
 from ..utils.logger import LoggingRecord, logger
-from ..utils.settings import LayoutType
-from .base import Pipeline, PipelineComponent, PredictorPipelineComponent
+from ..utils.types import PathLikeOrStr
+from .base import Pipeline, PipelineComponent
 from .common import PageParsingService
 
 
 def _collect_from_kwargs(
-    **kwargs: Union[str, DataFlow, bool, int, Pathlike, Union[str, List[str]]]
+    **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
 ) -> Tuple[Optional[str], Optional[str], bool, int, str, DataFlow]:
     dataset_dataflow = kwargs.get("dataset_dataflow")
     path = kwargs.get("path")
@@ -69,7 +69,7 @@ def _collect_from_kwargs(
 
 @curry
 def _proto_process(
-    dp: Union[str, Mapping[str, str]], path: Optional[str], doc_path: Optional[str]
+    dp: Union[str, Mapping[str, str]], path: Optional[PathLikeOrStr], doc_path: Optional[PathLikeOrStr]
 ) -> Union[str, Mapping[str, str]]:
     if isinstance(dp, str):
         file_name = Path(dp).name
@@ -78,10 +78,14 @@ def _proto_process(
     else:
         file_name = dp["file_name"]
     if path is None:
-        path_tmp = doc_path
+        path_tmp = doc_path or ""
     else:
         path_tmp = path
-    logger.info(LoggingRecord(f"Processing {file_name}", {"path": path_tmp, "df": path_tmp, "file_name": file_name}))
+    logger.info(
+        LoggingRecord(
+            f"Processing {file_name}", {"path": os.fspath(path_tmp), "df": os.fspath(path_tmp), "file_name": file_name}
+        )
+    )
     return dp
 
 
@@ -90,7 +94,7 @@ def _to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int
     return to_image(dp, dpi)
 
 
-def _doc_to_dataflow(path: Pathlike, max_datapoints: Optional[int] = None) -> DataFlow:
+def _doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
     if not os.path.isfile(path):
         raise FileExistsError(f"{path} not a file")
 
@@ -127,19 +131,18 @@ class DoctectionPipe(Pipeline):
 
     def __init__(
         self,
-        pipeline_component_list: List[Union[PipelineComponent]],
+        pipeline_component_list: List[PipelineComponent],
         page_parsing_service: Optional[PageParsingService] = None,
     ):
-        if page_parsing_service is None:
-            self.page_parser = PageParsingService(text_container=LayoutType.word)
-        else:
-            self.page_parser = page_parsing_service
-        assert all(
-            isinstance(element, (PipelineComponent, PredictorPipelineComponent)) for element in pipeline_component_list
+        self.page_parser = (
+            PageParsingService(text_container=IMAGE_DEFAULTS["text_container"])
+            if page_parsing_service is None
+            else page_parsing_service
         )
+
         super().__init__(pipeline_component_list)
 
-    def _entry(self, **kwargs: Union[str, DataFlow, bool, int, Pathlike, Union[str, List[str]]]) -> DataFlow:
+    def _entry(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
         path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow = _collect_from_kwargs(**kwargs)
 
         df: DataFlow
@@ -147,7 +150,7 @@ class DoctectionPipe(Pipeline):
         if isinstance(path, (str, Path)):
             if not isinstance(file_type, (str, list)):
                 raise TypeError(f"file_type must be of type string or list, but is of type {type(file_type)}")
-            df = DoctectionPipe.path_to_dataflow(path, file_type, shuffle=shuffle)
+            df = DoctectionPipe.path_to_dataflow(path=path, file_type=file_type, shuffle=shuffle)
         elif isinstance(doc_path, (str, Path)):
             df = DoctectionPipe.doc_to_dataflow(
                 path=doc_path, max_datapoints=int(max_datapoints) if max_datapoints is not None else None
@@ -164,7 +167,7 @@ class DoctectionPipe(Pipeline):
 
     @staticmethod
     def path_to_dataflow(
-        path: Pathlike,
+        path: PathLikeOrStr,
         file_type: Union[str, Sequence[str]],
         max_datapoints: Optional[int] = None,
         shuffle: bool = False,
@@ -179,12 +182,12 @@ class DoctectionPipe(Pipeline):
         :return: dataflow
         """
         if not os.path.isdir(path):
-            raise NotADirectoryError(f"{path} not a directory")
+            raise NotADirectoryError(f"{os.fspath(path)} not a directory")
         df = SerializerFiles.load(path, file_type, max_datapoints, shuffle)
         return df
 
     @staticmethod
-    def doc_to_dataflow(path: Pathlike, max_datapoints: Optional[int] = None) -> DataFlow:
+    def doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
         """
         Processing method for documents
 
@@ -203,7 +206,7 @@
         """
         return self.page_parser.predict_dataflow(df)
 
-    def analyze(self, **kwargs: Union[str, DataFlow, bool, int, Pathlike, Union[str, List[str]]]) -> DataFlow:
+    def analyze(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
         """
         `kwargs key dataset_dataflow:` Transfer a dataflow of a dataset via its dataflow builder
 
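Note: all path-like parameters in DoctectionPipe now use PathLikeOrStr, so os.PathLike objects are accepted alongside strings. A hedged end-to-end sketch based on the kwargs read in _collect_from_kwargs (layout_service stands in for any PipelineComponent):

    from deepdoctection.pipe.doctectionpipe import DoctectionPipe

    pipe = DoctectionPipe(pipeline_component_list=[layout_service])
    df = pipe.analyze(path="/path/to/image_dir", file_type=".png")
    # or: df = pipe.analyze(doc_path="/path/to/document.pdf")
    df.reset_state()
    for page in df:
        ...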
deepdoctection/pipe/language.py

@@ -18,16 +18,14 @@
 """
 Module for language detection pipeline component
 """
-from copy import copy, deepcopy
 from typing import Optional, Sequence
 
 from ..datapoint.image import Image
 from ..datapoint.view import Page
 from ..extern.base import LanguageDetector, ObjectDetector
-from ..utils.detection_types import JsonDict
 from ..utils.error import ImageError
 from ..utils.settings import PageType, TypeOrStr, get_type
-from .base import PipelineComponent
+from .base import MetaAnnotation, PipelineComponent
 from .registry import pipeline_component_registry
 
 
@@ -74,26 +72,27 @@ class LanguageDetectionService(PipelineComponent):
         self.predictor = language_detector
         self.text_detector = text_detector
         self.text_container = get_type(text_container) if text_container is not None else text_container
-        if floating_text_block_categories:
-            floating_text_block_categories = [get_type(text_block) for text_block in floating_text_block_categories]
-        self.floating_text_block_categories = floating_text_block_categories if floating_text_block_categories else []
-        super().__init__(
-            self._get_name(self.predictor.name)
-        )  # cannot use PredictorPipelineComponent class because of return type of predict meth
+        self.floating_text_block_categories = (
+            tuple(get_type(text_block) for text_block in floating_text_block_categories)
+            if (floating_text_block_categories is not None)
+            else ()
+        )
+
+        super().__init__(self._get_name(self.predictor.name))
 
     def serve(self, dp: Image) -> None:
         if self.text_detector is None:
-            page = Page.from_image(dp, self.text_container, self.floating_text_block_categories)  # type: ignore
+            page = Page.from_image(dp, self.text_container, self.floating_text_block_categories)
             text = page.text_no_line_break
         else:
             if dp.image is None:
                 raise ImageError("image cannot be None")
             detect_result_list = self.text_detector.predict(dp.image)
             # this is a concatenation of all detection result. No reading order
-            text = " ".join([result.text for result in detect_result_list if result.text is not None])
+            text = " ".join((result.text for result in detect_result_list if result.text is not None))
         predict_result = self.predictor.predict(text)
         self.dp_manager.set_summary_annotation(
-            PageType.language, PageType.language, 1, predict_result.text, predict_result.score
+            PageType.LANGUAGE, PageType.LANGUAGE, 1, predict_result.text, predict_result.score
         )
 
     def clone(self) -> PipelineComponent:
@@ -101,22 +100,18 @@ class LanguageDetectionService(PipelineComponent):
         if not isinstance(predictor, LanguageDetector):
             raise TypeError(f"Predictor must be of type LanguageDetector, but is of type {type(predictor)}")
         return self.__class__(
-            predictor,
-            copy(self.text_container),
-            deepcopy(self.text_detector),
-            deepcopy(self.floating_text_block_categories),
+            language_detector=predictor,
+            text_container=self.text_container,
+            text_detector=self.text_detector.clone() if self.text_detector is not None else None,
+            floating_text_block_categories=self.floating_text_block_categories,
         )
 
-    def get_meta_annotation(self) -> JsonDict:
-        return dict(
-            [
-                ("image_annotations", []),
-                ("sub_categories", {}),
-                ("relationships", {}),
-                ("summaries", [PageType.language]),
-            ]
-        )
+    def get_meta_annotation(self) -> MetaAnnotation:
+        return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=(PageType.LANGUAGE,))
 
     @staticmethod
     def _get_name(predictor_name: str) -> str:
         return f"language_detection_{predictor_name}"
+
+    def clear_predictor(self) -> None:
+        self.predictor.clear_model()
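Note: clone now rebuilds the service with keyword arguments and clones the text detector instead of deep-copying it, and the detected language lands in the page summary under PageType.LANGUAGE. A hedged usage sketch (fasttext_detector stands in for any LanguageDetector; pass_datapoint is assumed to be inherited from PipelineComponent):

    from deepdoctection.pipe.language import LanguageDetectionService
    from deepdoctection.utils.settings import LayoutType

    lang_service = LanguageDetectionService(
        language_detector=fasttext_detector,
        text_container=LayoutType.WORD,
    )
    lang_service.pass_datapoint(dp)   # serve() stores the result under PageType.LANGUAGE
    lang_service.clear_predictor()    # new: delegates to predictor.clear_model()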
deepdoctection/pipe/layout.py

@@ -18,21 +18,22 @@
 """
 Module for layout pipeline component
 """
+from __future__ import annotations
+
 from typing import Optional
 
 import numpy as np
 
 from ..datapoint.image import Image
 from ..extern.base import ObjectDetector, PdfMiner
-from ..utils.detection_types import JsonDict
 from ..utils.error import ImageError
 from ..utils.transform import PadTransform
-from .base import PredictorPipelineComponent
+from .base import MetaAnnotation, PipelineComponent
 from .registry import pipeline_component_registry
 
 
 @pipeline_component_registry.register("ImageLayoutService")
-class ImageLayoutService(PredictorPipelineComponent):
+class ImageLayoutService(PipelineComponent):
     """
     Pipeline component for determining the layout. Which layout blocks are determined depends on the Detector and thus
     usually on the data set on which the Detector was pre-trained. If the Detector has been trained on Publaynet, these
@@ -63,6 +64,7 @@ class ImageLayoutService(PredictorPipelineComponent):
         :param crop_image: Do not only populate `ImageAnnotation.image` but also crop the detected block according
                            to its bounding box and populate the resulting sub image to
                            `ImageAnnotation.image.image`.
+        :param padder: If not `None`, will apply the padder to the image before prediction and inverse apply the padder
         :param skip_if_layout_extracted: When `True` will check, if there are already `ImageAnnotation` of a category
                                          available that will be predicted by the `layout_detector`. If yes, will skip
                                          the prediction process.
@@ -71,11 +73,12 @@
         self.crop_image = crop_image
         self.padder = padder
         self.skip_if_layout_extracted = skip_if_layout_extracted
-        super().__init__(self._get_name(layout_detector.name), layout_detector)
+        self.predictor = layout_detector
+        super().__init__(self._get_name(layout_detector.name), self.predictor.model_id)
 
     def serve(self, dp: Image) -> None:
         if self.skip_if_layout_extracted:
-            categories = self.predictor.possible_categories()  # type: ignore
+            categories = self.predictor.get_category_names()
             anns = dp.get_annotation(category_names=categories)
             if anns:
                 return
@@ -84,7 +87,7 @@
         np_image = dp.image
         if self.padder:
             np_image = self.padder.apply_image(np_image)
-        detect_result_list = self.predictor.predict(np_image)  # type: ignore
+        detect_result_list = self.predictor.predict(np_image)
         if self.padder and detect_result_list:
             boxes = np.array([detect_result.box for detect_result in detect_result_list])
             boxes_orig = self.padder.inverse_apply_coords(boxes)
@@ -94,22 +97,20 @@
         for detect_result in detect_result_list:
             self.dp_manager.set_image_annotation(detect_result, to_image=self.to_image, crop_image=self.crop_image)
 
-    def get_meta_annotation(self) -> JsonDict:
-        assert isinstance(self.predictor, (ObjectDetector, PdfMiner))
-        return dict(
-            [
-                ("image_annotations", self.predictor.possible_categories()),
-                ("sub_categories", {}),
-                ("relationships", {}),
-                ("summaries", []),
-            ]
+    def get_meta_annotation(self) -> MetaAnnotation:
+        if not isinstance(self.predictor, (ObjectDetector, PdfMiner)):
+            raise TypeError(
+                f"self.predictor must be of type ObjectDetector or PdfMiner but is of type " f"{type(self.predictor)}"
+            )
+        return MetaAnnotation(
+            image_annotations=self.predictor.get_category_names(), sub_categories={}, relationships={}, summaries=()
         )
 
     @staticmethod
     def _get_name(predictor_name: str) -> str:
         return f"image_{predictor_name}"
 
-    def clone(self) -> "PredictorPipelineComponent":
+    def clone(self) -> ImageLayoutService:
         predictor = self.predictor.clone()
         padder_clone = None
         if self.padder:
@@ -117,3 +118,6 @@ class ImageLayoutService(PredictorPipelineComponent):
         if not isinstance(predictor, ObjectDetector):
             raise TypeError(f"predictor must be of type ObjectDetector, but is of type {type(predictor)}")
         return self.__class__(predictor, self.to_image, self.crop_image, padder_clone, self.skip_if_layout_extracted)
+
+    def clear_predictor(self) -> None:
+        self.predictor.clear_model()
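Note: per the clone method above, the positional constructor order is (layout_detector, to_image, crop_image, padder, skip_if_layout_extracted). A hedged construction sketch (detector stands in for any ObjectDetector; the PadTransform argument names are an assumption, not shown in this diff):

    from deepdoctection.pipe.layout import ImageLayoutService
    from deepdoctection.utils.transform import PadTransform

    layout_service = ImageLayoutService(
        layout_detector=detector,
        to_image=True,    # populate ImageAnnotation.image
        crop_image=False,
        padder=PadTransform(top=60, right=60, bottom=60, left=60),  # hypothetical args
        skip_if_layout_extracted=True,  # skip prediction if annotations already exist
    )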