deepdoctection 0.32__py3-none-any.whl → 0.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (111) hide show
  1. deepdoctection/__init__.py +8 -25
  2. deepdoctection/analyzer/dd.py +84 -71
  3. deepdoctection/dataflow/common.py +9 -5
  4. deepdoctection/dataflow/custom.py +5 -5
  5. deepdoctection/dataflow/custom_serialize.py +75 -18
  6. deepdoctection/dataflow/parallel_map.py +3 -3
  7. deepdoctection/dataflow/serialize.py +4 -4
  8. deepdoctection/dataflow/stats.py +3 -3
  9. deepdoctection/datapoint/annotation.py +78 -56
  10. deepdoctection/datapoint/box.py +7 -7
  11. deepdoctection/datapoint/convert.py +6 -6
  12. deepdoctection/datapoint/image.py +157 -75
  13. deepdoctection/datapoint/view.py +175 -151
  14. deepdoctection/datasets/adapter.py +30 -24
  15. deepdoctection/datasets/base.py +10 -10
  16. deepdoctection/datasets/dataflow_builder.py +3 -3
  17. deepdoctection/datasets/info.py +23 -25
  18. deepdoctection/datasets/instances/doclaynet.py +48 -49
  19. deepdoctection/datasets/instances/fintabnet.py +44 -45
  20. deepdoctection/datasets/instances/funsd.py +23 -23
  21. deepdoctection/datasets/instances/iiitar13k.py +8 -8
  22. deepdoctection/datasets/instances/layouttest.py +2 -2
  23. deepdoctection/datasets/instances/publaynet.py +3 -3
  24. deepdoctection/datasets/instances/pubtables1m.py +18 -18
  25. deepdoctection/datasets/instances/pubtabnet.py +30 -29
  26. deepdoctection/datasets/instances/rvlcdip.py +28 -29
  27. deepdoctection/datasets/instances/xfund.py +51 -30
  28. deepdoctection/datasets/save.py +6 -6
  29. deepdoctection/eval/accmetric.py +32 -33
  30. deepdoctection/eval/base.py +8 -9
  31. deepdoctection/eval/cocometric.py +13 -12
  32. deepdoctection/eval/eval.py +32 -26
  33. deepdoctection/eval/tedsmetric.py +16 -12
  34. deepdoctection/eval/tp_eval_callback.py +7 -16
  35. deepdoctection/extern/base.py +339 -134
  36. deepdoctection/extern/d2detect.py +69 -89
  37. deepdoctection/extern/deskew.py +11 -10
  38. deepdoctection/extern/doctrocr.py +81 -64
  39. deepdoctection/extern/fastlang.py +23 -16
  40. deepdoctection/extern/hfdetr.py +53 -38
  41. deepdoctection/extern/hflayoutlm.py +216 -155
  42. deepdoctection/extern/hflm.py +35 -30
  43. deepdoctection/extern/model.py +433 -255
  44. deepdoctection/extern/pdftext.py +15 -15
  45. deepdoctection/extern/pt/ptutils.py +4 -2
  46. deepdoctection/extern/tessocr.py +39 -38
  47. deepdoctection/extern/texocr.py +14 -16
  48. deepdoctection/extern/tp/tfutils.py +16 -2
  49. deepdoctection/extern/tp/tpcompat.py +11 -7
  50. deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
  51. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
  52. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
  53. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
  54. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
  55. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
  56. deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
  57. deepdoctection/extern/tpdetect.py +40 -45
  58. deepdoctection/mapper/cats.py +36 -40
  59. deepdoctection/mapper/cocostruct.py +16 -12
  60. deepdoctection/mapper/d2struct.py +22 -22
  61. deepdoctection/mapper/hfstruct.py +7 -7
  62. deepdoctection/mapper/laylmstruct.py +22 -24
  63. deepdoctection/mapper/maputils.py +9 -10
  64. deepdoctection/mapper/match.py +33 -2
  65. deepdoctection/mapper/misc.py +6 -7
  66. deepdoctection/mapper/pascalstruct.py +4 -4
  67. deepdoctection/mapper/prodigystruct.py +6 -6
  68. deepdoctection/mapper/pubstruct.py +84 -92
  69. deepdoctection/mapper/tpstruct.py +3 -3
  70. deepdoctection/mapper/xfundstruct.py +33 -33
  71. deepdoctection/pipe/anngen.py +39 -14
  72. deepdoctection/pipe/base.py +68 -99
  73. deepdoctection/pipe/common.py +181 -85
  74. deepdoctection/pipe/concurrency.py +14 -10
  75. deepdoctection/pipe/doctectionpipe.py +24 -21
  76. deepdoctection/pipe/language.py +20 -25
  77. deepdoctection/pipe/layout.py +18 -16
  78. deepdoctection/pipe/lm.py +49 -47
  79. deepdoctection/pipe/order.py +63 -65
  80. deepdoctection/pipe/refine.py +102 -109
  81. deepdoctection/pipe/segment.py +157 -162
  82. deepdoctection/pipe/sub_layout.py +50 -40
  83. deepdoctection/pipe/text.py +37 -36
  84. deepdoctection/pipe/transform.py +19 -16
  85. deepdoctection/train/d2_frcnn_train.py +27 -25
  86. deepdoctection/train/hf_detr_train.py +22 -18
  87. deepdoctection/train/hf_layoutlm_train.py +49 -48
  88. deepdoctection/train/tp_frcnn_train.py +10 -11
  89. deepdoctection/utils/concurrency.py +1 -1
  90. deepdoctection/utils/context.py +13 -6
  91. deepdoctection/utils/develop.py +4 -4
  92. deepdoctection/utils/env_info.py +52 -14
  93. deepdoctection/utils/file_utils.py +6 -11
  94. deepdoctection/utils/fs.py +41 -14
  95. deepdoctection/utils/identifier.py +2 -2
  96. deepdoctection/utils/logger.py +15 -15
  97. deepdoctection/utils/metacfg.py +7 -7
  98. deepdoctection/utils/pdf_utils.py +39 -14
  99. deepdoctection/utils/settings.py +188 -182
  100. deepdoctection/utils/tqdm.py +1 -1
  101. deepdoctection/utils/transform.py +14 -9
  102. deepdoctection/utils/types.py +104 -0
  103. deepdoctection/utils/utils.py +7 -7
  104. deepdoctection/utils/viz.py +70 -69
  105. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/METADATA +7 -4
  106. deepdoctection-0.34.dist-info/RECORD +146 -0
  107. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
  108. deepdoctection/utils/detection_types.py +0 -68
  109. deepdoctection-0.32.dist-info/RECORD +0 -146
  110. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
  111. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0
@@ -21,21 +21,18 @@ Module for common pipeline components
21
21
  from __future__ import annotations
22
22
 
23
23
  import os
24
-
25
- from copy import copy, deepcopy
26
- from typing import List, Literal, Mapping, Optional, Sequence, Union
24
+ from copy import deepcopy
25
+ from typing import Literal, Mapping, Optional, Sequence, Union
27
26
 
28
27
  import numpy as np
29
28
 
30
29
  from ..dataflow import DataFlow, MapData
31
30
  from ..datapoint.image import Image
32
31
  from ..datapoint.view import IMAGE_DEFAULTS, Page
33
- from ..mapper.maputils import MappingContextManager
34
- from ..mapper.match import match_anns_by_intersection
32
+ from ..mapper.match import match_anns_by_distance, match_anns_by_intersection
35
33
  from ..mapper.misc import to_image
36
- from ..utils.detection_types import JsonDict
37
34
  from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
38
- from .base import PipelineComponent
35
+ from .base import MetaAnnotation, PipelineComponent
39
36
  from .registry import pipeline_component_registry
40
37
 
41
38
  if os.environ.get("DD_USE_TORCH"):
@@ -57,37 +54,44 @@ class ImageCroppingService(PipelineComponent):
57
54
  :param category_names: A single name or a list of category names to crop
58
55
  """
59
56
 
60
- if isinstance(category_names, str):
61
- category_names = [category_names]
62
- self.category_names = [get_type(category_name) for category_name in category_names]
57
+ self.category_names = (
58
+ (category_names,)
59
+ if isinstance(category_names, str)
60
+ else tuple(get_type(category_name) for category_name in category_names)
61
+ )
63
62
  super().__init__("image_crop")
64
63
 
65
64
  def serve(self, dp: Image) -> None:
66
65
  for ann in dp.get_annotation(category_names=self.category_names):
67
66
  dp.image_ann_to_image(ann.annotation_id, crop_image=True)
68
67
 
69
- def clone(self) -> PipelineComponent:
68
+ def clone(self) -> ImageCroppingService:
70
69
  return self.__class__(self.category_names)
71
70
 
72
- def get_meta_annotation(self) -> JsonDict:
73
- return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
71
+ def get_meta_annotation(self) -> MetaAnnotation:
72
+ return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
74
73
 
74
+ def clear_predictor(self) -> None:
75
+ pass
75
76
 
76
- @pipeline_component_registry.register("MatchingService")
77
- class MatchingService(PipelineComponent):
77
+
78
+ class IntersectionMatcher:
78
79
  """
79
- Objects of two object classes can be assigned to one another by determining their pairwise average. If this is above
80
- a limit, a relation is created between them.
81
- The parent object class (based on its category) and the child object class are defined for the service. A child
82
- relation is created in the parent class if the conditions are met.
80
+ Objects of two object classes can be assigned to one another by determining their pairwise intersection. If this is
81
+ above a limit, a relation is created between them.
82
+ The parent object class (based on its category) and the child object class are defined for the service.
83
83
 
84
84
  Either `iou` (intersection-over-union) or `ioa` (intersection-over-area) can be selected as the matching rule.
85
85
 
86
86
  # the following will assign word annotations to text and title annotation, provided that their ioa-threshold
87
87
  # is above 0.7. words below that threshold will not be assigned.
88
88
 
89
- match = MatchingService(parent_categories=["TEXT","TITLE"],child_categories="WORD",matching_rule="ioa",
90
- threshold=0.7)
89
+ matcher = IntersectionMatcher(matching_rule="ioa", threshold=0.7)
90
+
91
+ match_service = MatchingService(parent_categories=["text","title"],
92
+ child_categories="word",
93
+ matcher=matcher,
94
+ relationship_key=Relationships.CHILD)
91
95
 
92
96
  # Assigning means that text and title annotation will receive a relationship called "CHILD" which is a list
93
97
  of annotation ids of mapped words.
@@ -95,16 +99,12 @@ class MatchingService(PipelineComponent):
95
99
 
96
100
  def __init__(
97
101
  self,
98
- parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
99
- child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
100
102
  matching_rule: Literal["iou", "ioa"],
101
103
  threshold: float,
102
104
  use_weighted_intersections: bool = False,
103
105
  max_parent_only: bool = False,
104
106
  ) -> None:
105
107
  """
106
- :param parent_categories: list of categories to be used a for parent class. Will generate a child-relationship
107
- :param child_categories: list of categories to be used for a child class.
108
108
  :param matching_rule: "iou" or "ioa"
109
109
  :param threshold: iou/ioa threshold. Value between [0,1]
110
110
  :param use_weighted_intersections: This is currently only implemented for matching_rule 'ioa'. Instead of using
@@ -112,64 +112,150 @@ class MatchingService(PipelineComponent):
112
112
  that intersections with more cells will likely decrease the ioa value. By
113
113
  multiplying the ioa with the number of all intersection for each child this
114
114
  value calibrate the ioa.
115
- :param max_parent_only: Will assign to each child at most one parent with maximum ioa
116
- """
117
- self.parent_categories = (
118
- [get_type(parent_categories)] # type: ignore
119
- if not isinstance(parent_categories, (list, set))
120
- else [get_type(parent_category) for parent_category in parent_categories]
121
- )
122
- self.child_categories = (
123
- [get_type(child_categories)] # type: ignore
124
- if not isinstance(child_categories, (list, set))
125
- else [get_type(child_category) for child_category in child_categories]
126
- )
127
- assert matching_rule in ["iou", "ioa"], "segment rule must be either iou or ioa"
115
+ :param max_parent_only: Will assign to each child at most one parent with maximum ioa"""
116
+
117
+ if matching_rule not in ("iou", "ioa"):
118
+ raise ValueError("segment rule must be either iou or ioa")
128
119
  self.matching_rule = matching_rule
129
120
  self.threshold = threshold
130
121
  self.use_weighted_intersections = use_weighted_intersections
131
122
  self.max_parent_only = max_parent_only
132
- super().__init__("matching")
133
123
 
134
- def serve(self, dp: Image) -> None:
124
+ def match(
125
+ self,
126
+ dp: Image,
127
+ parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
128
+ child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
129
+ ) -> list[tuple[str, str]]:
135
130
  """
136
- - generates pairwise match-score by intersection
137
- - generates child relationship at parent level
131
+ The matching algorithm
138
132
 
139
133
  :param dp: datapoint image
134
+ :param parent_categories: list of categories to be used as a parent class. Will generate a child-relationship
135
+ :param child_categories: list of categories to be used for a child class.
136
+
137
+ :return: A list of tuples with parent and child annotation ids
140
138
  """
141
139
  child_index, parent_index, child_anns, parent_anns = match_anns_by_intersection(
142
140
  dp,
143
- parent_ann_category_names=self.parent_categories,
144
- child_ann_category_names=self.child_categories,
141
+ parent_ann_category_names=parent_categories,
142
+ child_ann_category_names=child_categories,
145
143
  matching_rule=self.matching_rule,
146
144
  threshold=self.threshold,
147
145
  use_weighted_intersections=self.use_weighted_intersections,
148
146
  max_parent_only=self.max_parent_only,
149
147
  )
150
148
 
151
- with MappingContextManager(dp_name=dp.file_name):
152
- matched_child_anns = np.take(child_anns, child_index) # type: ignore
153
- matched_parent_anns = np.take(parent_anns, parent_index) # type: ignore
154
- for idx, parent in enumerate(matched_parent_anns):
155
- parent.dump_relationship(Relationships.child, matched_child_anns[idx].annotation_id)
149
+ matched_child_anns = np.take(child_anns, child_index) # type: ignore
150
+ matched_parent_anns = np.take(parent_anns, parent_index) # type: ignore
151
+
152
+ all_parent_child_relations = []
153
+ for idx, parent in enumerate(matched_parent_anns):
154
+ all_parent_child_relations.append((parent.annotation_id, matched_child_anns[idx].annotation_id))
155
+
156
+ return all_parent_child_relations
157
+
158
+
159
+ class NeighbourMatcher:
160
+ """
161
+ Objects of two object classes can be assigned to one another by determining their pairwise distance.
162
+
163
+ # the following will assign caption annotations to figure annotations
164
+
165
+ matcher = NeighbourMatcher()
166
+
167
+ match_service = MatchingService(parent_categories=["figure"],
168
+ child_categories="caption",
169
+ matcher=matcher,
170
+ relationship_key=Relationships.LAYOUT_LINK)
171
+
172
+ """
173
+
174
+ def match(
175
+ self,
176
+ dp: Image,
177
+ parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
178
+ child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
179
+ ) -> list[tuple[str, str]]:
180
+ """
181
+ The matching algorithm
182
+
183
+ :param dp: datapoint image
184
+ :param parent_categories: list of categories to be used as a parent class. Will generate a child-relationship
185
+ :param child_categories: list of categories to be used for a child class.
186
+
187
+ :return: A list of tuples with parent and child annotation ids
188
+ """
189
+
190
+ return [
191
+ (pair[0].annotation_id, pair[1].annotation_id)
192
+ for pair in match_anns_by_distance(dp, parent_categories, child_categories)
193
+ ]
194
+
195
+
196
+ @pipeline_component_registry.register("MatchingService")
197
+ class MatchingService(PipelineComponent):
198
+ """
199
+ A service to match annotations of two categories by intersection or distance. The matched annotations will be
200
+ assigned a relationship. The parent category will receive a relationship to the child category.
201
+ """
202
+
203
+ def __init__(
204
+ self,
205
+ parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
206
+ child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
207
+ matcher: Union[IntersectionMatcher, NeighbourMatcher],
208
+ relationship_key: Relationships,
209
+ ) -> None:
210
+ """
211
+ :param parent_categories: list of categories to be used as a parent class. Will generate a child-relationship
212
+ :param child_categories: list of categories to be used for a child class.
213
+
214
+ """
215
+ self.parent_categories = (
216
+ (get_type(parent_categories),)
217
+ if isinstance(parent_categories, str)
218
+ else tuple(get_type(category_name) for category_name in parent_categories)
219
+ )
220
+ self.child_categories = (
221
+ (get_type(child_categories),)
222
+ if isinstance(child_categories, str)
223
+ else (tuple(get_type(category_name) for category_name in child_categories))
224
+ )
225
+ self.matcher = matcher
226
+ self.relationship_key = relationship_key
227
+ super().__init__("matching")
228
+
229
+ def serve(self, dp: Image) -> None:
230
+ """
231
+ - generates pairwise match-score by intersection
232
+ - generates child relationship at parent level
233
+
234
+ :param dp: datapoint image
235
+ """
236
+
237
+ matched_pairs = self.matcher.match(dp, self.parent_categories, self.child_categories)
238
+
239
+ for pair in matched_pairs:
240
+ self.dp_manager.set_relationship_annotation(self.relationship_key, pair[0], pair[1])
156
241
 
157
242
  def clone(self) -> PipelineComponent:
158
- return self.__class__(self.parent_categories, self.child_categories, self.matching_rule, self.threshold)
159
-
160
- def get_meta_annotation(self) -> JsonDict:
161
- return dict(
162
- [
163
- ("image_annotations", []),
164
- ("sub_categories", {}),
165
- ("relationships", {parent: {Relationships.child} for parent in self.parent_categories}),
166
- ("summaries", []),
167
- ]
243
+ return self.__class__(self.parent_categories, self.child_categories, self.matcher, self.relationship_key)
244
+
245
+ def get_meta_annotation(self) -> MetaAnnotation:
246
+ return MetaAnnotation(
247
+ image_annotations=(),
248
+ sub_categories={},
249
+ relationships={parent: {Relationships.CHILD} for parent in self.parent_categories},
250
+ summaries=(),
168
251
  )
169
252
 
253
+ def clear_predictor(self) -> None:
254
+ pass
255
+
170
256
 
171
257
  @pipeline_component_registry.register("PageParsingService")
172
- class PageParsingService:
258
+ class PageParsingService(PipelineComponent):
173
259
  """
174
260
  A "pseudo" pipeline component that can be added to a pipeline to convert `Image`s into `Page` formats. It allows a
175
261
  custom parsing depending on customizing options of other pipeline components.
@@ -188,14 +274,20 @@ class PageParsingService:
188
274
  """
189
275
  self.name = "page_parser"
190
276
  if isinstance(floating_text_block_categories, (str, ObjectTypes)):
191
- floating_text_block_categories = [floating_text_block_categories]
277
+ floating_text_block_categories = (get_type(floating_text_block_categories),)
192
278
  if floating_text_block_categories is None:
193
- floating_text_block_categories = copy(IMAGE_DEFAULTS["floating_text_block_categories"])
279
+ floating_text_block_categories = IMAGE_DEFAULTS["floating_text_block_categories"]
194
280
 
195
281
  self.text_container = get_type(text_container)
196
- self.floating_text_block_categories = [get_type(text_block) for text_block in floating_text_block_categories]
282
+ self.floating_text_block_categories = tuple(
283
+ (get_type(text_block) for text_block in floating_text_block_categories)
284
+ )
197
285
  self.include_residual_text_container = include_residual_text_container
198
286
  self._init_sanity_checks()
287
+ super().__init__(self.name)
288
+
289
+ def serve(self, dp: Image) -> None:
290
+ raise NotImplementedError("PageParsingService is not meant to be used in serve method")
199
291
 
200
292
  def pass_datapoint(self, dp: Image) -> Page:
201
293
  """
@@ -203,29 +295,24 @@ class PageParsingService:
203
295
  :param dp: Image
204
296
  :return: Page
205
297
  """
206
- return Page.from_image(dp, self.text_container, self.floating_text_block_categories)
207
-
208
- def predict_dataflow(self, df: DataFlow) -> DataFlow:
209
- """
210
- Mapping a datapoint via `pass_datapoint` within a dataflow pipeline
211
-
212
- :param df: An input dataflow
213
- :return: A output dataflow
214
- """
215
- return MapData(df, self.pass_datapoint)
298
+ return Page.from_image(
299
+ dp,
300
+ text_container=self.text_container,
301
+ floating_text_block_categories=self.floating_text_block_categories,
302
+ include_residual_text_container=self.include_residual_text_container,
303
+ )
216
304
 
217
305
  def _init_sanity_checks(self) -> None:
218
306
  assert self.text_container in (
219
- LayoutType.word,
220
- LayoutType.line,
221
- ), f"text_container must be either {LayoutType.word} or {LayoutType.line}"
307
+ LayoutType.WORD,
308
+ LayoutType.LINE,
309
+ ), f"text_container must be either {LayoutType.WORD} or {LayoutType.LINE}"
222
310
 
223
- @staticmethod
224
- def get_meta_annotation() -> JsonDict:
311
+ def get_meta_annotation(self) -> MetaAnnotation:
225
312
  """
226
313
  meta annotation. We do not generate any new annotations here
227
314
  """
228
- return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
315
+ return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
229
316
 
230
317
  def clone(self) -> PageParsingService:
231
318
  """clone"""
@@ -235,6 +322,9 @@ class PageParsingService:
235
322
  self.include_residual_text_container,
236
323
  )
237
324
 
325
+ def clear_predictor(self) -> None:
326
+ pass
327
+
238
328
 
239
329
  @pipeline_component_registry.register("AnnotationNmsService")
240
330
  class AnnotationNmsService(PipelineComponent):
@@ -259,8 +349,8 @@ class AnnotationNmsService(PipelineComponent):
259
349
  def __init__(
260
350
  self,
261
351
  nms_pairs: Sequence[Sequence[TypeOrStr]],
262
- thresholds: Union[float, List[float]],
263
- priority: Optional[List[Union[Optional[TypeOrStr]]]] = None,
352
+ thresholds: Union[float, list[float]],
353
+ priority: Optional[list[Union[Optional[TypeOrStr]]]] = None,
264
354
  ):
265
355
  """
266
356
  :param nms_pairs: Groups of categories, either as string or by `ObjectType`.
@@ -297,8 +387,11 @@ class AnnotationNmsService(PipelineComponent):
297
387
  def clone(self) -> PipelineComponent:
298
388
  return self.__class__(deepcopy(self.nms_pairs), self.threshold)
299
389
 
300
- def get_meta_annotation(self) -> JsonDict:
301
- return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
390
+ def get_meta_annotation(self) -> MetaAnnotation:
391
+ return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
392
+
393
+ def clear_predictor(self) -> None:
394
+ pass
302
395
 
303
396
 
304
397
  @pipeline_component_registry.register("ImageParsingService")
@@ -333,8 +426,11 @@ class ImageParsingService:
333
426
  return self.__class__(self.dpi)
334
427
 
335
428
  @staticmethod
336
- def get_meta_annotation() -> JsonDict:
429
+ def get_meta_annotation() -> MetaAnnotation:
337
430
  """
338
431
  meta annotation. We do not generate any new annotations here
339
432
  """
340
- return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
433
+ return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
434
+
435
+ def clear_predictor(self) -> None:
436
+ """clear predictor. Will do nothing"""
@@ -24,16 +24,16 @@ import itertools
24
24
  import queue
25
25
  from concurrent.futures import ThreadPoolExecutor
26
26
  from contextlib import ExitStack
27
- from typing import Callable, List, Optional, Sequence, Union
27
+ from typing import Callable, Optional, Sequence, Union
28
28
 
29
29
  import tqdm
30
30
 
31
31
  from ..dataflow import DataFlow, MapData
32
32
  from ..datapoint.image import Image
33
33
  from ..utils.context import timed_operation
34
- from ..utils.detection_types import JsonDict, QueueType, TqdmType
35
34
  from ..utils.tqdm import get_tqdm
36
- from .base import PipelineComponent
35
+ from ..utils.types import QueueType, TqdmType
36
+ from .base import MetaAnnotation, PipelineComponent
37
37
  from .common import ImageParsingService, PageParsingService
38
38
  from .registry import pipeline_component_registry
39
39
 
@@ -100,7 +100,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
100
100
 
101
101
  def __init__(
102
102
  self,
103
- pipeline_components: Sequence[Union[PipelineComponent, PageParsingService, ImageParsingService]],
103
+ pipeline_components: Sequence[Union[PipelineComponent, ImageParsingService]],
104
104
  pre_proc_func: Optional[Callable[[Image], Image]] = None,
105
105
  post_proc_func: Optional[Callable[[Image], Image]] = None,
106
106
  max_datapoints: Optional[int] = None,
@@ -123,7 +123,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
123
123
  self.timer_on = False
124
124
  super().__init__(f"multi_thread_{self.pipe_components[0].name}")
125
125
 
126
- def put_task(self, df: Union[DataFlow, List[Image]]) -> None:
126
+ def put_task(self, df: Union[DataFlow, list[Image]]) -> None:
127
127
  """
128
128
  Put a dataflow or a list of datapoints to the queue. Note, that the process will not start before `start`
129
129
  is called. If you do not know how many datapoints will be cached, use max_datapoint to ensure no oom.
@@ -133,7 +133,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
133
133
 
134
134
  self._put_datapoints_to_queue(df)
135
135
 
136
- def start(self) -> List[Image]:
136
+ def start(self) -> list[Image]:
137
137
  """
138
138
  Creates a worker for each component and starts processing the data points of the queue. A list of the results
139
139
  is returned once all points in the queue have been processed.
@@ -165,7 +165,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
165
165
  tqdm_bar: Optional[TqdmType] = None,
166
166
  pre_proc_func: Optional[Callable[[Image], Image]] = None,
167
167
  post_proc_func: Optional[Callable[[Image], Image]] = None,
168
- ) -> List[Image]:
168
+ ) -> list[Image]:
169
169
  outputs = []
170
170
 
171
171
  with ExitStack() as stack:
@@ -184,7 +184,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
184
184
  tqdm_bar.update(1)
185
185
  return outputs
186
186
 
187
- def _put_datapoints_to_queue(self, df: Union[DataFlow, List[Image]]) -> None:
187
+ def _put_datapoints_to_queue(self, df: Union[DataFlow, list[Image]]) -> None:
188
188
  if isinstance(df, DataFlow):
189
189
  df.reset_state()
190
190
  for idx, dp in enumerate(df):
@@ -193,7 +193,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
193
193
  break
194
194
  self.input_queue.put(dp)
195
195
 
196
- def pass_datapoints(self, dpts: List[Image]) -> List[Image]:
196
+ def pass_datapoints(self, dpts: list[Image]) -> list[Image]:
197
197
  """
198
198
  Putting the list of datapoints into a thread-save queue and start for each pipeline
199
199
  component a separate thread. It will return a list of datapoints where the order of appearance
@@ -225,5 +225,9 @@ class MultiThreadPipelineComponent(PipelineComponent):
225
225
  def clone(self) -> MultiThreadPipelineComponent:
226
226
  raise NotImplementedError("MultiThreadPipelineComponent does not allow cloning")
227
227
 
228
- def get_meta_annotation(self) -> JsonDict:
228
+ def get_meta_annotation(self) -> MetaAnnotation:
229
229
  return self.pipe_components[0].get_meta_annotation()
230
+
231
+ def clear_predictor(self) -> None:
232
+ for pipe in self.pipe_components:
233
+ pipe.clear_predictor()
@@ -26,18 +26,18 @@ from typing import List, Mapping, Optional, Sequence, Tuple, Union
26
26
  from ..dataflow import DataFlow, MapData
27
27
  from ..dataflow.custom_serialize import SerializerFiles, SerializerPdfDoc
28
28
  from ..datapoint.image import Image
29
+ from ..datapoint.view import IMAGE_DEFAULTS
29
30
  from ..mapper.maputils import curry
30
31
  from ..mapper.misc import to_image
31
- from ..utils.detection_types import Pathlike
32
32
  from ..utils.fs import maybe_path_or_pdf
33
33
  from ..utils.logger import LoggingRecord, logger
34
- from ..utils.settings import LayoutType
35
- from .base import Pipeline, PipelineComponent, PredictorPipelineComponent
34
+ from ..utils.types import PathLikeOrStr
35
+ from .base import Pipeline, PipelineComponent
36
36
  from .common import PageParsingService
37
37
 
38
38
 
39
39
  def _collect_from_kwargs(
40
- **kwargs: Union[str, DataFlow, bool, int, Pathlike, Union[str, List[str]]]
40
+ **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
41
41
  ) -> Tuple[Optional[str], Optional[str], bool, int, str, DataFlow]:
42
42
  dataset_dataflow = kwargs.get("dataset_dataflow")
43
43
  path = kwargs.get("path")
@@ -69,7 +69,7 @@ def _collect_from_kwargs(
69
69
 
70
70
  @curry
71
71
  def _proto_process(
72
- dp: Union[str, Mapping[str, str]], path: Optional[str], doc_path: Optional[str]
72
+ dp: Union[str, Mapping[str, str]], path: Optional[PathLikeOrStr], doc_path: Optional[PathLikeOrStr]
73
73
  ) -> Union[str, Mapping[str, str]]:
74
74
  if isinstance(dp, str):
75
75
  file_name = Path(dp).name
@@ -78,10 +78,14 @@ def _proto_process(
78
78
  else:
79
79
  file_name = dp["file_name"]
80
80
  if path is None:
81
- path_tmp = doc_path
81
+ path_tmp = doc_path or ""
82
82
  else:
83
83
  path_tmp = path
84
- logger.info(LoggingRecord(f"Processing {file_name}", {"path": path_tmp, "df": path_tmp, "file_name": file_name}))
84
+ logger.info(
85
+ LoggingRecord(
86
+ f"Processing {file_name}", {"path": os.fspath(path_tmp), "df": os.fspath(path_tmp), "file_name": file_name}
87
+ )
88
+ )
85
89
  return dp
86
90
 
87
91
 
@@ -90,7 +94,7 @@ def _to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int
90
94
  return to_image(dp, dpi)
91
95
 
92
96
 
93
- def _doc_to_dataflow(path: Pathlike, max_datapoints: Optional[int] = None) -> DataFlow:
97
+ def _doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
94
98
  if not os.path.isfile(path):
95
99
  raise FileExistsError(f"{path} not a file")
96
100
 
@@ -127,19 +131,18 @@ class DoctectionPipe(Pipeline):
127
131
 
128
132
  def __init__(
129
133
  self,
130
- pipeline_component_list: List[Union[PipelineComponent]],
134
+ pipeline_component_list: List[PipelineComponent],
131
135
  page_parsing_service: Optional[PageParsingService] = None,
132
136
  ):
133
- if page_parsing_service is None:
134
- self.page_parser = PageParsingService(text_container=LayoutType.word)
135
- else:
136
- self.page_parser = page_parsing_service
137
- assert all(
138
- isinstance(element, (PipelineComponent, PredictorPipelineComponent)) for element in pipeline_component_list
137
+ self.page_parser = (
138
+ PageParsingService(text_container=IMAGE_DEFAULTS["text_container"])
139
+ if page_parsing_service is None
140
+ else page_parsing_service
139
141
  )
142
+
140
143
  super().__init__(pipeline_component_list)
141
144
 
142
- def _entry(self, **kwargs: Union[str, DataFlow, bool, int, Pathlike, Union[str, List[str]]]) -> DataFlow:
145
+ def _entry(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
143
146
  path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow = _collect_from_kwargs(**kwargs)
144
147
 
145
148
  df: DataFlow
@@ -147,7 +150,7 @@ class DoctectionPipe(Pipeline):
147
150
  if isinstance(path, (str, Path)):
148
151
  if not isinstance(file_type, (str, list)):
149
152
  raise TypeError(f"file_type must be of type string or list, but is of type {type(file_type)}")
150
- df = DoctectionPipe.path_to_dataflow(path, file_type, shuffle=shuffle)
153
+ df = DoctectionPipe.path_to_dataflow(path=path, file_type=file_type, shuffle=shuffle)
151
154
  elif isinstance(doc_path, (str, Path)):
152
155
  df = DoctectionPipe.doc_to_dataflow(
153
156
  path=doc_path, max_datapoints=int(max_datapoints) if max_datapoints is not None else None
@@ -164,7 +167,7 @@ class DoctectionPipe(Pipeline):
164
167
 
165
168
  @staticmethod
166
169
  def path_to_dataflow(
167
- path: Pathlike,
170
+ path: PathLikeOrStr,
168
171
  file_type: Union[str, Sequence[str]],
169
172
  max_datapoints: Optional[int] = None,
170
173
  shuffle: bool = False,
@@ -179,12 +182,12 @@ class DoctectionPipe(Pipeline):
179
182
  :return: dataflow
180
183
  """
181
184
  if not os.path.isdir(path):
182
- raise NotADirectoryError(f"{path} not a directory")
185
+ raise NotADirectoryError(f"{os.fspath(path)} not a directory")
183
186
  df = SerializerFiles.load(path, file_type, max_datapoints, shuffle)
184
187
  return df
185
188
 
186
189
  @staticmethod
187
- def doc_to_dataflow(path: Pathlike, max_datapoints: Optional[int] = None) -> DataFlow:
190
+ def doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
188
191
  """
189
192
  Processing method for documents
190
193
 
@@ -203,7 +206,7 @@ class DoctectionPipe(Pipeline):
203
206
  """
204
207
  return self.page_parser.predict_dataflow(df)
205
208
 
206
- def analyze(self, **kwargs: Union[str, DataFlow, bool, int, Pathlike, Union[str, List[str]]]) -> DataFlow:
209
+ def analyze(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
207
210
  """
208
211
  `kwargs key dataset_dataflow:` Transfer a dataflow of a dataset via its dataflow builder
209
212