deepdoctection 0.30__py3-none-any.whl → 0.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (74) hide show
  1. deepdoctection/__init__.py +4 -2
  2. deepdoctection/analyzer/dd.py +6 -5
  3. deepdoctection/dataflow/base.py +0 -19
  4. deepdoctection/dataflow/custom.py +4 -3
  5. deepdoctection/dataflow/custom_serialize.py +14 -5
  6. deepdoctection/dataflow/parallel_map.py +12 -11
  7. deepdoctection/dataflow/serialize.py +5 -4
  8. deepdoctection/datapoint/annotation.py +33 -12
  9. deepdoctection/datapoint/box.py +1 -4
  10. deepdoctection/datapoint/convert.py +3 -1
  11. deepdoctection/datapoint/image.py +66 -29
  12. deepdoctection/datapoint/view.py +57 -25
  13. deepdoctection/datasets/adapter.py +1 -1
  14. deepdoctection/datasets/base.py +83 -10
  15. deepdoctection/datasets/dataflow_builder.py +1 -1
  16. deepdoctection/datasets/info.py +2 -2
  17. deepdoctection/datasets/instances/layouttest.py +2 -7
  18. deepdoctection/eval/accmetric.py +1 -1
  19. deepdoctection/eval/base.py +5 -4
  20. deepdoctection/eval/eval.py +2 -2
  21. deepdoctection/eval/tp_eval_callback.py +5 -4
  22. deepdoctection/extern/base.py +39 -13
  23. deepdoctection/extern/d2detect.py +164 -64
  24. deepdoctection/extern/deskew.py +32 -7
  25. deepdoctection/extern/doctrocr.py +227 -39
  26. deepdoctection/extern/fastlang.py +45 -7
  27. deepdoctection/extern/hfdetr.py +90 -33
  28. deepdoctection/extern/hflayoutlm.py +109 -22
  29. deepdoctection/extern/pdftext.py +2 -1
  30. deepdoctection/extern/pt/ptutils.py +3 -2
  31. deepdoctection/extern/tessocr.py +134 -22
  32. deepdoctection/extern/texocr.py +2 -0
  33. deepdoctection/extern/tp/tpcompat.py +4 -4
  34. deepdoctection/extern/tp/tpfrcnn/preproc.py +2 -7
  35. deepdoctection/extern/tpdetect.py +50 -23
  36. deepdoctection/mapper/d2struct.py +1 -1
  37. deepdoctection/mapper/hfstruct.py +1 -1
  38. deepdoctection/mapper/laylmstruct.py +1 -1
  39. deepdoctection/mapper/maputils.py +13 -2
  40. deepdoctection/mapper/prodigystruct.py +1 -1
  41. deepdoctection/mapper/pubstruct.py +10 -10
  42. deepdoctection/mapper/tpstruct.py +1 -1
  43. deepdoctection/pipe/anngen.py +35 -8
  44. deepdoctection/pipe/base.py +53 -19
  45. deepdoctection/pipe/cell.py +29 -8
  46. deepdoctection/pipe/common.py +12 -4
  47. deepdoctection/pipe/doctectionpipe.py +2 -2
  48. deepdoctection/pipe/language.py +3 -2
  49. deepdoctection/pipe/layout.py +3 -2
  50. deepdoctection/pipe/lm.py +2 -2
  51. deepdoctection/pipe/refine.py +18 -10
  52. deepdoctection/pipe/segment.py +21 -16
  53. deepdoctection/pipe/text.py +14 -8
  54. deepdoctection/pipe/transform.py +16 -9
  55. deepdoctection/train/d2_frcnn_train.py +15 -12
  56. deepdoctection/train/hf_detr_train.py +8 -6
  57. deepdoctection/train/hf_layoutlm_train.py +16 -11
  58. deepdoctection/utils/__init__.py +3 -0
  59. deepdoctection/utils/concurrency.py +1 -1
  60. deepdoctection/utils/context.py +2 -2
  61. deepdoctection/utils/env_info.py +55 -22
  62. deepdoctection/utils/error.py +84 -0
  63. deepdoctection/utils/file_utils.py +4 -15
  64. deepdoctection/utils/fs.py +7 -7
  65. deepdoctection/utils/pdf_utils.py +5 -4
  66. deepdoctection/utils/settings.py +5 -1
  67. deepdoctection/utils/transform.py +1 -1
  68. deepdoctection/utils/utils.py +0 -6
  69. deepdoctection/utils/viz.py +44 -2
  70. {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/METADATA +33 -58
  71. {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/RECORD +74 -73
  72. {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/WHEEL +1 -1
  73. {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/LICENSE +0 -0
  74. {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/top_level.txt +0 -0
@@ -19,9 +19,10 @@
19
19
  TP Faster RCNN model as predictor for deepdoctection pipeline
20
20
  """
21
21
 
22
+ from abc import ABC
22
23
  from copy import copy
23
24
  from pathlib import Path
24
- from typing import List, Mapping, Optional, Sequence, Union
25
+ from typing import Dict, List, Mapping, Optional, Sequence, Union
25
26
 
26
27
  from ..utils.detection_types import ImageType, Requirement
27
28
  from ..utils.file_utils import get_tensorflow_requirement, get_tensorpack_requirement, tensorpack_available
@@ -36,7 +37,46 @@ if tensorpack_available():
36
37
  from .tp.tpfrcnn.predict import tp_predict_image
37
38
 
38
39
 
39
- class TPFrcnnDetector(TensorpackPredictor, ObjectDetector):
40
+ class TPFrcnnDetectorMixin(ObjectDetector, ABC):
41
+ """Base class for TP FRCNN detector. This class only implements the basic wrapper functions"""
42
+
43
+ def __init__(self, categories: Mapping[str, TypeOrStr], filter_categories: Optional[Sequence[TypeOrStr]] = None):
44
+ self.categories = copy(categories) # type: ignore
45
+ if filter_categories:
46
+ filter_categories = [get_type(cat) for cat in filter_categories]
47
+ self.filter_categories = filter_categories
48
+ self._tp_categories = self._map_to_tp_categories(categories)
49
+
50
+ def _map_category_names(self, detection_results: List[DetectionResult]) -> List[DetectionResult]:
51
+ """
52
+ Populating category names to detection results
53
+
54
+ :param detection_results: list of detection results
55
+ :return: List of detection results with attribute class_name populated
56
+ """
57
+ filtered_detection_result: List[DetectionResult] = []
58
+ for result in detection_results:
59
+ result.class_name = self._tp_categories[str(result.class_id)]
60
+ if self.filter_categories:
61
+ if result.class_name not in self.filter_categories:
62
+ filtered_detection_result.append(result)
63
+ else:
64
+ filtered_detection_result.append(result)
65
+ return filtered_detection_result
66
+
67
+ @staticmethod
68
+ def _map_to_tp_categories(categories: Mapping[str, TypeOrStr]) -> Dict[str, ObjectTypes]:
69
+ categories = {str(key): get_type(categories[val]) for key, val in enumerate(categories, 1)}
70
+ categories["0"] = get_type("background")
71
+ return categories # type: ignore
72
+
73
+ @staticmethod
74
+ def get_name(path_weights: str, architecture: str) -> str:
75
+ """Returns the name of the model"""
76
+ return f"Tensorpack_{architecture}" + "_".join(Path(path_weights).parts[-2:])
77
+
78
+
79
+ class TPFrcnnDetector(TensorpackPredictor, TPFrcnnDetectorMixin):
40
80
  """
41
81
  Tensorpack Faster-RCNN implementation with FPN and optional Cascade-RCNN. The backbones Resnet-50, Resnet-101 and
42
82
  their Resnext counterparts are also available. Normalization options (group normalization, synchronized batch
@@ -87,19 +127,23 @@ class TPFrcnnDetector(TensorpackPredictor, ObjectDetector):
87
127
  :param filter_categories: The model might return objects that are not supposed to be predicted and that should
88
128
  be filtered. Pass a list of category names that must not be returned
89
129
  """
90
- self.name = "_".join(Path(path_weights).parts[-3:])
91
130
  self.path_yaml = path_yaml
131
+
92
132
  self.categories = copy(categories) # type: ignore
93
133
  self.config_overwrite = config_overwrite
94
134
  if filter_categories:
95
135
  filter_categories = [get_type(cat) for cat in filter_categories]
96
136
  self.filter_categories = filter_categories
97
- model = TPFrcnnDetector.set_model(path_yaml, self.categories, config_overwrite)
98
- super().__init__(model, path_weights, ignore_mismatch)
137
+ model = TPFrcnnDetector.get_wrapped_model(path_yaml, self.categories, config_overwrite)
138
+ TensorpackPredictor.__init__(self, model, path_weights, ignore_mismatch)
139
+ TPFrcnnDetectorMixin.__init__(self, categories, filter_categories)
140
+
141
+ self.name = self.get_name(path_weights, self._model.cfg.TAG)
142
+ self.model_id = self.get_model_id()
99
143
  assert self._number_gpus > 0, "Model only support inference with GPU"
100
144
 
101
145
  @staticmethod
102
- def set_model(
146
+ def get_wrapped_model(
103
147
  path_yaml: str, categories: Mapping[str, ObjectTypes], config_overwrite: Union[List[str], None]
104
148
  ) -> ResNetFPNModel:
105
149
  """
@@ -138,23 +182,6 @@ class TPFrcnnDetector(TensorpackPredictor, ObjectDetector):
138
182
  )
139
183
  return self._map_category_names(detection_results)
140
184
 
141
- def _map_category_names(self, detection_results: List[DetectionResult]) -> List[DetectionResult]:
142
- """
143
- Populating category names to detection results
144
-
145
- :param detection_results: list of detection results
146
- :return: List of detection results with attribute class_name populated
147
- """
148
- filtered_detection_result: List[DetectionResult] = []
149
- for result in detection_results:
150
- result.class_name = self._model.cfg.DATA.CLASS_DICT[str(result.class_id)]
151
- if self.filter_categories:
152
- if result.class_name not in self.filter_categories:
153
- filtered_detection_result.append(result)
154
- else:
155
- filtered_detection_result.append(result)
156
- return filtered_detection_result
157
-
158
185
  @classmethod
159
186
  def get_requirements(cls) -> List[Requirement]:
160
187
  return [get_tensorflow_requirement(), get_tensorpack_requirement()]
@@ -93,7 +93,7 @@ def image_to_d2_frcnn_training(
93
93
  annotations.append(mapped_ann)
94
94
 
95
95
  if add_mask:
96
- raise NotImplementedError
96
+ raise NotImplementedError("Segmentation in deepdoctection is not supported")
97
97
 
98
98
  output["annotations"] = annotations
99
99
 
@@ -81,7 +81,7 @@ def image_to_hf_detr_training(
81
81
  annotations.append(mapped_ann)
82
82
 
83
83
  if add_mask:
84
- raise NotImplementedError
84
+ raise NotImplementedError("Segmentation in deepdoctection is not supported")
85
85
 
86
86
  output["annotations"] = annotations
87
87
 
@@ -146,7 +146,7 @@ def image_to_raw_layoutlm_features(
146
146
  raise TypeError(f"char_cat must be of type ContainerAnnotation but is of type {type(char_cat)}")
147
147
  word = char_cat.value
148
148
  if not isinstance(word, str):
149
- raise ValueError(f"word must be of type str but is of type {type(word)}")
149
+ raise TypeError(f"word must be of type str but is of type {type(word)}")
150
150
  all_words.append(word)
151
151
 
152
152
  box = ann.get_bounding_box(dp.image_id)
@@ -28,8 +28,8 @@ import numpy as np
28
28
  from tabulate import tabulate
29
29
  from termcolor import colored
30
30
 
31
- from ..datapoint.box import BoundingBoxError
32
31
  from ..utils.detection_types import DP, BaseExceptionType, S, T
32
+ from ..utils.error import AnnotationError, BoundingBoxError, ImageError, UUIDError
33
33
  from ..utils.logger import LoggingRecord, logger
34
34
  from ..utils.settings import ObjectTypes
35
35
 
@@ -72,7 +72,18 @@ class MappingContextManager:
72
72
  """
73
73
  if (
74
74
  exc_type
75
- in (KeyError, ValueError, IndexError, AssertionError, TypeError, BoundingBoxError, FileNotFoundError)
75
+ in (
76
+ KeyError,
77
+ ValueError,
78
+ IndexError,
79
+ AssertionError,
80
+ TypeError,
81
+ FileNotFoundError,
82
+ BoundingBoxError,
83
+ AnnotationError,
84
+ ImageError,
85
+ UUIDError,
86
+ )
76
87
  and exc_tb is not None
77
88
  ):
78
89
  frame_summary = traceback.extract_tb(exc_tb)[0]
@@ -128,7 +128,7 @@ def prodigy_to_image(
128
128
  else:
129
129
  label = span["label"]
130
130
  if not isinstance(label, str):
131
- raise ValueError("label could not assigned to be a string")
131
+ raise TypeError("label must be a string")
132
132
 
133
133
  annotation = ImageAnnotation(
134
134
  category_name=label,
@@ -75,12 +75,14 @@ def _cell_token(html: Sequence[str]) -> List[List[int]]:
75
75
  def _item_spans(html: Sequence[str], index_cells: Sequence[Sequence[int]], item: str) -> List[List[int]]:
76
76
  item_spans = [
77
77
  [
78
- int(html[index_cell - 1].replace(item + "=", "").replace('"', ""))
79
- if (item in html[index_cell - 1] and html[index_cell] == ">")
80
- else (
81
- int(html[index_cell - 2].replace(item + "=", "").replace('"', ""))
82
- if (item in html[index_cell - 2] and html[index_cell] == ">")
83
- else 1
78
+ (
79
+ int(html[index_cell - 1].replace(item + "=", "").replace('"', ""))
80
+ if (item in html[index_cell - 1] and html[index_cell] == ">")
81
+ else (
82
+ int(html[index_cell - 2].replace(item + "=", "").replace('"', ""))
83
+ if (item in html[index_cell - 2] and html[index_cell] == ">")
84
+ else 1
85
+ )
84
86
  )
85
87
  for index_cell in index_cell_per_row
86
88
  ]
@@ -210,9 +212,7 @@ def _add_items(image: Image, item_type: str, categories_name_as_key: Dict[str, s
210
212
  items = image.get_annotation(category_names=TableType.item)
211
213
  item_type_anns = [ann for ann in items if ann.get_sub_category(TableType.item).category_name == item_type]
212
214
  item_type_anns.sort(
213
- key=lambda x: x.bounding_box.cx # type: ignore
214
- if item_type == LayoutType.column
215
- else x.bounding_box.cy # type: ignore
215
+ key=lambda x: (x.bounding_box.cx if item_type == LayoutType.column else x.bounding_box.cy) # type: ignore
216
216
  )
217
217
  if table.bounding_box:
218
218
  tmp_item_xy = table.bounding_box.uly + 1.0 if item_type == LayoutType.row else table.bounding_box.ulx + 1.0
@@ -389,7 +389,7 @@ def pub_to_image_uncur( # pylint: disable=R0914
389
389
  with MappingContextManager(str(idx)) as mapping_context:
390
390
  max_rs, max_cs = 0, 0
391
391
  if idx is None:
392
- raise ValueError("No valid datapoint external id")
392
+ raise TypeError("imgid is None but must be a string")
393
393
 
394
394
  image = Image(file_name=os.path.split(dp["filename"])[1], location=dp["filename"], external_id=idx)
395
395
 
@@ -67,7 +67,7 @@ def image_to_tp_frcnn_training(
67
67
  all_categories.append(ann.category_id)
68
68
 
69
69
  if add_mask:
70
- raise NotImplementedError
70
+ raise NotImplementedError()
71
71
 
72
72
  output["gt_boxes"] = np.asarray(all_boxes, dtype="float32")
73
73
  output["gt_labels"] = np.asarray(all_categories, dtype="int32")
@@ -42,11 +42,14 @@ class DatapointManager:
42
42
  The manager is part of each `PipelineComponent`.
43
43
  """
44
44
 
45
- def __init__(self) -> None:
45
+ def __init__(self, service_id: str, model_id: Optional[str] = None) -> None:
46
46
  self._datapoint: Optional[Image] = None
47
47
  self._cache_anns: Dict[str, ImageAnnotation] = {}
48
48
  self.datapoint_is_passed: bool = False
49
49
  self.category_id_mapping: Optional[Mapping[int, int]] = None
50
+ self.service_id = service_id
51
+ self.model_id = model_id
52
+ self.session_id: Optional[str] = None
50
53
 
51
54
  @property
52
55
  def datapoint(self) -> Image:
@@ -55,7 +58,7 @@ class DatapointManager:
55
58
  """
56
59
  if self._datapoint is not None:
57
60
  return self._datapoint
58
- raise ValueError("no datapoint passed")
61
+ raise ValueError("No datapoint passed")
59
62
 
60
63
  @datapoint.setter
61
64
  def datapoint(self, dp: Image) -> None:
@@ -154,6 +157,9 @@ class DatapointManager:
154
157
  bounding_box=box,
155
158
  category_id=str(detect_result.class_id),
156
159
  score=detect_result.score,
160
+ service_id=self.service_id,
161
+ model_id=self.model_id,
162
+ session_id=self.session_id,
157
163
  )
158
164
  if to_annotation_id is not None:
159
165
  parent_ann = self._cache_anns[to_annotation_id]
@@ -208,7 +214,14 @@ class DatapointManager:
208
214
  "annotation_id": annotation_id,
209
215
  },
210
216
  ) as annotation_context:
211
- cat_ann = CategoryAnnotation(category_name=category_name, category_id=str(category_id), score=score)
217
+ cat_ann = CategoryAnnotation(
218
+ category_name=category_name,
219
+ category_id=str(category_id),
220
+ score=score,
221
+ service_id=self.service_id,
222
+ model_id=self.model_id,
223
+ session_id=self.session_id,
224
+ )
212
225
  self._cache_anns[annotation_id].dump_sub_category(sub_cat_key, cat_ann)
213
226
  if annotation_context.context_error:
214
227
  return None
@@ -246,7 +259,13 @@ class DatapointManager:
246
259
  },
247
260
  ) as annotation_context:
248
261
  cont_ann = ContainerAnnotation(
249
- category_name=category_name, category_id=str(category_id), value=value, score=score
262
+ category_name=category_name,
263
+ category_id=str(category_id),
264
+ value=value,
265
+ score=score,
266
+ service_id=self.service_id,
267
+ model_id=self.model_id,
268
+ session_id=self.session_id,
250
269
  )
251
270
  self._cache_anns[annotation_id].dump_sub_category(sub_cat_key, cont_ann)
252
271
  if annotation_context.context_error:
@@ -257,7 +276,7 @@ class DatapointManager:
257
276
  self,
258
277
  summary_key: ObjectTypes,
259
278
  summary_name: ObjectTypes,
260
- summary_number: int,
279
+ summary_number: Optional[int] = None,
261
280
  summary_value: Optional[str] = None,
262
281
  summary_score: Optional[float] = None,
263
282
  annotation_id: Optional[str] = None,
@@ -294,16 +313,24 @@ class DatapointManager:
294
313
  "annotation_id": annotation_id,
295
314
  },
296
315
  ) as annotation_context:
297
- if summary_value:
316
+ if summary_value is not None:
298
317
  ann = ContainerAnnotation(
299
318
  category_name=summary_name,
300
- category_id=str(summary_number),
319
+ category_id=str(summary_number) if summary_number is not None else "",
301
320
  value=summary_value,
302
321
  score=summary_score,
322
+ service_id=self.service_id,
323
+ model_id=self.model_id,
324
+ session_id=self.session_id,
303
325
  )
304
326
  else:
305
327
  ann = CategoryAnnotation(
306
- category_name=summary_name, category_id=str(summary_number), score=summary_score
328
+ category_name=summary_name,
329
+ category_id=str(summary_number) if summary_number is not None else "",
330
+ score=summary_score,
331
+ service_id=self.service_id,
332
+ model_id=self.model_id,
333
+ session_id=self.session_id,
307
334
  )
308
335
  image.summary.dump_sub_category(summary_key, ann, image.image_id)
309
336
 
@@ -23,12 +23,14 @@ from abc import ABC, abstractmethod
23
23
  from collections import defaultdict
24
24
  from copy import deepcopy
25
25
  from typing import Any, Callable, DefaultDict, Dict, List, Mapping, Optional, Set, Union
26
+ from uuid import uuid1
26
27
 
27
28
  from ..dataflow import DataFlow, MapData
28
29
  from ..datapoint.image import Image
29
30
  from ..extern.base import ImageTransformer, ObjectDetector, PdfMiner, TextRecognizer
30
31
  from ..utils.context import timed_operation
31
32
  from ..utils.detection_types import JsonDict
33
+ from ..utils.identifier import get_uuid_from_str
32
34
  from .anngen import DatapointManager
33
35
 
34
36
 
@@ -58,8 +60,9 @@ class PipelineComponent(ABC):
58
60
  pipeline. Use something that describe the task of the pipeline.
59
61
  """
60
62
  self.name = name
63
+ self.service_id = self.get_service_id()
61
64
  self._meta_has_all_types()
62
- self.dp_manager = DatapointManager()
65
+ self.dp_manager = DatapointManager(self.service_id)
63
66
  self.timer_on = False
64
67
 
65
68
  @abstractmethod
@@ -75,7 +78,7 @@ class PipelineComponent(ABC):
75
78
  As a simplified interface `serve` does not have to return a dp. The data point is passed on within
76
79
  pipelines internally (via `pass_datapoint`).
77
80
  """
78
- raise NotImplementedError
81
+ raise NotImplementedError()
79
82
 
80
83
  def pass_datapoint(self, dp: Image) -> Image:
81
84
  """
@@ -109,7 +112,7 @@ class PipelineComponent(ABC):
109
112
  """
110
113
  Clone an instance
111
114
  """
112
- raise NotImplementedError
115
+ raise NotImplementedError()
113
116
 
114
117
  @abstractmethod
115
118
  def get_meta_annotation(self) -> JsonDict:
@@ -122,7 +125,7 @@ class PipelineComponent(ABC):
122
125
  `summaries` with values: A list of summary sub categories
123
126
  :return: Dict with meta infos as just described
124
127
  """
125
- raise NotImplementedError
128
+ raise NotImplementedError()
126
129
 
127
130
  def _meta_has_all_types(self) -> None:
128
131
  if not {"image_annotations", "sub_categories", "relationships", "summaries"}.issubset(
@@ -133,6 +136,12 @@ class PipelineComponent(ABC):
133
136
  f"Got {self.get_meta_annotation().keys()}"
134
137
  )
135
138
 
139
+ def get_service_id(self) -> str:
140
+ """
141
+ Get the service id of this pipeline component
142
+ """
143
+ return get_uuid_from_str(self.name)[:8]
144
+
136
145
 
137
146
  class PredictorPipelineComponent(PipelineComponent, ABC):
138
147
  """
@@ -151,10 +160,11 @@ class PredictorPipelineComponent(PipelineComponent, ABC):
151
160
  """
152
161
  self.predictor = predictor
153
162
  super().__init__(name)
163
+ self.dp_manager = DatapointManager(self.service_id, self.predictor.model_id)
154
164
 
155
165
  @abstractmethod
156
166
  def clone(self) -> "PredictorPipelineComponent":
157
- raise NotImplementedError
167
+ raise NotImplementedError()
158
168
 
159
169
 
160
170
  class LanguageModelPipelineComponent(PipelineComponent, ABC):
@@ -175,15 +185,15 @@ class LanguageModelPipelineComponent(PipelineComponent, ABC):
175
185
  """
176
186
 
177
187
  self.tokenizer = tokenizer
178
- self.mapping_to_lm_input_func = mapping_to_lm_input_func
179
188
  super().__init__(name)
189
+ self.mapping_to_lm_input_func = mapping_to_lm_input_func
180
190
 
181
191
  @abstractmethod
182
192
  def clone(self) -> "LanguageModelPipelineComponent":
183
193
  """
184
194
  Clone an instance
185
195
  """
186
- raise NotImplementedError
196
+ raise NotImplementedError()
187
197
 
188
198
 
189
199
  class ImageTransformPipelineComponent(PipelineComponent, ABC):
@@ -206,7 +216,7 @@ class ImageTransformPipelineComponent(PipelineComponent, ABC):
206
216
  """
207
217
  Clone an instance
208
218
  """
209
- raise NotImplementedError
219
+ raise NotImplementedError()
210
220
 
211
221
 
212
222
  class Pipeline(ABC):
@@ -228,7 +238,7 @@ class Pipeline(ABC):
228
238
 
229
239
  layout = LayoutPipeComponent(layout_detector ...)
230
240
  text = TextExtractPipeComponent(text_detector ...)
231
- simple_pipe = MyPipeline (pipeline_component = [layout, text])
241
+ simple_pipe = MyPipeline(pipeline_component = [layout, text])
232
242
  doc_dataflow = simple_pipe.analyze(input = path / to / dir)
233
243
 
234
244
  for page in doc_dataflow:
@@ -238,6 +248,18 @@ class Pipeline(ABC):
238
248
  model or already processed further).
239
249
 
240
250
  In addition to `analyze`, the internal `_entry` is used to bundle preprocessing steps.
251
+
252
+ It is possible to set a session id for the pipeline. This is useful for logging purposes. The session id can be
253
+ either passed to the pipeline via the `analyze` method or generated automatically.
254
+
255
+ To generate a session_id automatically:
256
+
257
+ **Example:**
258
+
259
+ pipe = MyPipeline(pipeline_component = [layout, text])
260
+ pipe.set_session_id = True
261
+
262
+ df = pipe.analyze(input = "path/to/dir") # session_id is generated automatically
241
263
  """
242
264
 
243
265
  def __init__(self, pipeline_component_list: List[PipelineComponent]) -> None:
@@ -245,6 +267,7 @@ class Pipeline(ABC):
245
267
  :param pipeline_component_list: A list of pipeline components.
246
268
  """
247
269
  self.pipe_component_list = pipeline_component_list
270
+ self.set_session_id = False
248
271
 
249
272
  @abstractmethod
250
273
  def _entry(self, **kwargs: Any) -> DataFlow:
@@ -254,14 +277,17 @@ class Pipeline(ABC):
254
277
 
255
278
  :param kwargs: Arguments, for dynamic customizing of the processing or for the transfer of processing types
256
279
  """
257
- raise NotImplementedError
280
+ raise NotImplementedError()
258
281
 
259
- def _build_pipe(self, df: DataFlow) -> DataFlow:
282
+ def _build_pipe(self, df: DataFlow, session_id: Optional[str] = None) -> DataFlow:
260
283
  """
261
284
  Composition of the backbone
262
285
  """
286
+ if session_id is None and self.set_session_id:
287
+ session_id = self.get_session_id()
263
288
  for component in self.pipe_component_list:
264
289
  component.timer_on = True
290
+ component.dp_manager.session_id = session_id
265
291
  df = component.predict_dataflow(df)
266
292
  return df
267
293
 
@@ -277,7 +303,7 @@ class Pipeline(ABC):
277
303
 
278
304
  can be triggered.
279
305
  """
280
- raise NotImplementedError
306
+ raise NotImplementedError()
281
307
 
282
308
  def get_meta_annotation(self) -> JsonDict:
283
309
  """
@@ -301,22 +327,30 @@ class Pipeline(ABC):
301
327
  for key, value in meta_anns["relationships"].items():
302
328
  pipeline_populations["relationships"][key].update(value)
303
329
  pipeline_populations["summaries"].extend(meta_anns["summaries"]) # type: ignore
304
-
330
+ pipeline_populations["sub_categories"] = dict(pipeline_populations["sub_categories"]) # type: ignore
331
+ pipeline_populations["relationships"] = dict(pipeline_populations["relationships"]) # type: ignore
305
332
  return pipeline_populations
306
333
 
307
334
  def get_pipeline_info(
308
- self, position: Optional[int] = None, name: Optional[str] = None
309
- ) -> Union[Mapping[int, str], str, int]:
335
+ self, service_id: Optional[str] = None, name: Optional[str] = None
336
+ ) -> Union[str, Mapping[str, str]]:
310
337
  """Get pipeline information: Returns a dictionary with a description of each pipeline component
311
- :param position: position of the pipeline component in the pipeline
338
+ :param service_id: service_id of the pipeline component to search for
312
339
  :param name: name of the pipeline component to search for
313
340
  :return: Either a full dictionary with position and name of all pipeline components or the name, if the position
314
341
  has been passed or the position if the name has been passed.
315
342
  """
316
- comp_info = {key: comp.name for key, comp in enumerate(self.pipe_component_list)}
343
+ comp_info = {comp.service_id: comp.name for comp in self.pipe_component_list}
317
344
  comp_info_name_as_key = {value: key for key, value in comp_info.items()}
318
- if position is not None:
319
- return comp_info[position]
345
+ if service_id is not None:
346
+ return comp_info[service_id]
320
347
  if name is not None:
321
348
  return comp_info_name_as_key[name]
322
349
  return comp_info
350
+
351
+ @staticmethod
352
+ def get_session_id() -> str:
353
+ """
354
+ Generate a session id
355
+ """
356
+ return str(uuid1())[:8]
@@ -24,9 +24,11 @@ from typing import Dict, List, Mapping, Optional, Sequence, Union
24
24
 
25
25
  import numpy as np
26
26
 
27
+ from ..datapoint.annotation import ImageAnnotation
28
+ from ..datapoint.box import crop_box_from_image
27
29
  from ..datapoint.image import Image
28
30
  from ..extern.base import DetectionResult, ObjectDetector, PdfMiner
29
- from ..utils.detection_types import JsonDict
31
+ from ..utils.detection_types import ImageType, JsonDict
30
32
  from ..utils.settings import ObjectTypes, Relationships
31
33
  from ..utils.transform import PadTransform
32
34
  from .base import PredictorPipelineComponent
@@ -181,18 +183,14 @@ class SubImageLayoutService(PredictorPipelineComponent):
181
183
  """
182
184
  sub_image_anns = dp.get_annotation_iter(category_names=self.sub_image_name)
183
185
  for sub_image_ann in sub_image_anns:
184
- if sub_image_ann.image is None:
185
- raise ValueError("sub_image_ann.image is None, but must be an image")
186
- np_image = sub_image_ann.image.image
187
- if self.padder:
188
- np_image = self.padder.apply_image(np_image)
189
- detect_result_list = self.predictor.predict(np_image)
186
+ np_image = self.prepare_np_image(sub_image_ann)
187
+ detect_result_list = self.predictor.predict(np_image) # type: ignore
190
188
  if self.padder and detect_result_list:
191
189
  boxes = np.array([detect_result.box for detect_result in detect_result_list])
192
190
  boxes_orig = self.padder.inverse_apply_coords(boxes)
193
191
  for idx, detect_result in enumerate(detect_result_list):
194
192
  detect_result.box = boxes_orig[idx, :].tolist()
195
- if self.detect_result_generator:
193
+ if self.detect_result_generator and sub_image_ann.image:
196
194
  self.detect_result_generator.width = sub_image_ann.image.width
197
195
  self.detect_result_generator.height = sub_image_ann.image.height
198
196
  detect_result_list = self.detect_result_generator.create_detection_result(detect_result_list)
@@ -235,3 +233,26 @@ class SubImageLayoutService(PredictorPipelineComponent):
235
233
  deepcopy(self.detect_result_generator),
236
234
  padder_clone,
237
235
  )
236
+
237
+ def prepare_np_image(self, sub_image_ann: ImageAnnotation) -> ImageType:
238
+ """Maybe crop and pad a np_array before passing it to the predictor.
239
+
240
+ Note that we currently assume a two-level hierarchy of images, i.e. we can crop a sub-image from the base
241
+ image (the original input), but we cannot crop a sub-image from an image which is itself a sub-image.
242
+
243
+ :param sub_image_ann: ImageAnnotation to be processed
244
+ :return: processed np_image
245
+ """
246
+ if sub_image_ann.image is None:
247
+ raise ValueError("sub_image_ann.image is None, but must be an datapoint.Image")
248
+ np_image = sub_image_ann.image.image
249
+ if np_image is None and self.dp_manager.datapoint.image is not None:
250
+ np_image = crop_box_from_image(
251
+ self.dp_manager.datapoint.image,
252
+ sub_image_ann.get_bounding_box(self.dp_manager.datapoint.image_id),
253
+ self.dp_manager.datapoint.width,
254
+ self.dp_manager.datapoint.height,
255
+ )
256
+ if self.padder:
257
+ np_image = self.padder.apply_image(np_image)
258
+ return np_image
@@ -93,8 +93,8 @@ class MatchingService(PipelineComponent):
93
93
 
94
94
  def __init__(
95
95
  self,
96
- parent_categories: Union[TypeOrStr, List[TypeOrStr]],
97
- child_categories: Union[TypeOrStr, List[TypeOrStr]],
96
+ parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
97
+ child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
98
98
  matching_rule: Literal["iou", "ioa"],
99
99
  threshold: float,
100
100
  use_weighted_intersections: bool = False,
@@ -112,8 +112,16 @@ class MatchingService(PipelineComponent):
112
112
  value calibrate the ioa.
113
113
  :param max_parent_only: Will assign to each child at most one parent with maximum ioa
114
114
  """
115
- self.parent_categories = parent_categories
116
- self.child_categories = child_categories
115
+ self.parent_categories = (
116
+ [get_type(parent_categories)] # type: ignore
117
+ if not isinstance(parent_categories, (list, set))
118
+ else [get_type(parent_category) for parent_category in parent_categories]
119
+ )
120
+ self.child_categories = (
121
+ [get_type(child_categories)] # type: ignore
122
+ if not isinstance(child_categories, (list, set))
123
+ else [get_type(child_category) for child_category in child_categories]
124
+ )
117
125
  assert matching_rule in ["iou", "ioa"], "segment rule must be either iou or ioa"
118
126
  self.matching_rule = matching_rule
119
127
  self.threshold = threshold
@@ -82,7 +82,6 @@ def _proto_process(
82
82
  else:
83
83
  path_tmp = path
84
84
  logger.info(LoggingRecord(f"Processing {file_name}", {"path": path_tmp, "df": path_tmp, "file_name": file_name}))
85
- # logger.info("Processing %s", file_name, {"path": path_tmp, "df": path_tmp, "file_name": file_name})
86
85
  return dp
87
86
 
88
87
 
@@ -221,9 +220,10 @@ class DoctectionPipe(Pipeline):
221
220
  """
222
221
 
223
222
  output = kwargs.get("output", "page")
223
+ session_id = kwargs.get("session_id")
224
224
  assert output in ("page", "image", "dict"), "output must be either page image or dict"
225
225
  df = self._entry(**kwargs)
226
- df = self._build_pipe(df)
226
+ df = self._build_pipe(df, session_id=session_id) # type: ignore
227
227
  if output == "page":
228
228
  df = self.dataflow_to_page(df)
229
229
  elif output == "dict":