deepdoctection 0.30-py3-none-any.whl → 0.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection might be problematic.

Files changed (120)
  1. deepdoctection/__init__.py +38 -29
  2. deepdoctection/analyzer/dd.py +36 -29
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/base.py +0 -19
  5. deepdoctection/dataflow/custom.py +4 -3
  6. deepdoctection/dataflow/custom_serialize.py +14 -5
  7. deepdoctection/dataflow/parallel_map.py +12 -11
  8. deepdoctection/dataflow/serialize.py +5 -4
  9. deepdoctection/datapoint/annotation.py +35 -13
  10. deepdoctection/datapoint/box.py +3 -5
  11. deepdoctection/datapoint/convert.py +3 -1
  12. deepdoctection/datapoint/image.py +79 -36
  13. deepdoctection/datapoint/view.py +152 -49
  14. deepdoctection/datasets/__init__.py +1 -4
  15. deepdoctection/datasets/adapter.py +6 -3
  16. deepdoctection/datasets/base.py +86 -11
  17. deepdoctection/datasets/dataflow_builder.py +1 -1
  18. deepdoctection/datasets/info.py +4 -4
  19. deepdoctection/datasets/instances/doclaynet.py +3 -2
  20. deepdoctection/datasets/instances/fintabnet.py +2 -1
  21. deepdoctection/datasets/instances/funsd.py +2 -1
  22. deepdoctection/datasets/instances/iiitar13k.py +5 -2
  23. deepdoctection/datasets/instances/layouttest.py +4 -8
  24. deepdoctection/datasets/instances/publaynet.py +2 -2
  25. deepdoctection/datasets/instances/pubtables1m.py +6 -3
  26. deepdoctection/datasets/instances/pubtabnet.py +2 -1
  27. deepdoctection/datasets/instances/rvlcdip.py +2 -1
  28. deepdoctection/datasets/instances/xfund.py +2 -1
  29. deepdoctection/eval/__init__.py +1 -4
  30. deepdoctection/eval/accmetric.py +1 -1
  31. deepdoctection/eval/base.py +5 -4
  32. deepdoctection/eval/cocometric.py +2 -1
  33. deepdoctection/eval/eval.py +19 -15
  34. deepdoctection/eval/tedsmetric.py +14 -11
  35. deepdoctection/eval/tp_eval_callback.py +14 -7
  36. deepdoctection/extern/__init__.py +2 -7
  37. deepdoctection/extern/base.py +39 -13
  38. deepdoctection/extern/d2detect.py +182 -90
  39. deepdoctection/extern/deskew.py +36 -9
  40. deepdoctection/extern/doctrocr.py +265 -83
  41. deepdoctection/extern/fastlang.py +49 -9
  42. deepdoctection/extern/hfdetr.py +106 -55
  43. deepdoctection/extern/hflayoutlm.py +441 -122
  44. deepdoctection/extern/hflm.py +225 -0
  45. deepdoctection/extern/model.py +56 -47
  46. deepdoctection/extern/pdftext.py +10 -5
  47. deepdoctection/extern/pt/__init__.py +1 -3
  48. deepdoctection/extern/pt/nms.py +6 -2
  49. deepdoctection/extern/pt/ptutils.py +27 -18
  50. deepdoctection/extern/tessocr.py +134 -22
  51. deepdoctection/extern/texocr.py +6 -2
  52. deepdoctection/extern/tp/tfutils.py +43 -9
  53. deepdoctection/extern/tp/tpcompat.py +14 -11
  54. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  55. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  56. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  58. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  60. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  61. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  62. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  67. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  68. deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
  69. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  70. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  71. deepdoctection/extern/tpdetect.py +54 -30
  72. deepdoctection/mapper/__init__.py +3 -8
  73. deepdoctection/mapper/d2struct.py +9 -7
  74. deepdoctection/mapper/hfstruct.py +7 -2
  75. deepdoctection/mapper/laylmstruct.py +164 -21
  76. deepdoctection/mapper/maputils.py +16 -3
  77. deepdoctection/mapper/misc.py +6 -3
  78. deepdoctection/mapper/prodigystruct.py +1 -1
  79. deepdoctection/mapper/pubstruct.py +10 -10
  80. deepdoctection/mapper/tpstruct.py +3 -3
  81. deepdoctection/pipe/__init__.py +1 -1
  82. deepdoctection/pipe/anngen.py +35 -8
  83. deepdoctection/pipe/base.py +53 -19
  84. deepdoctection/pipe/common.py +23 -13
  85. deepdoctection/pipe/concurrency.py +2 -1
  86. deepdoctection/pipe/doctectionpipe.py +2 -2
  87. deepdoctection/pipe/language.py +3 -2
  88. deepdoctection/pipe/layout.py +6 -3
  89. deepdoctection/pipe/lm.py +34 -66
  90. deepdoctection/pipe/order.py +142 -35
  91. deepdoctection/pipe/refine.py +26 -24
  92. deepdoctection/pipe/segment.py +21 -16
  93. deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
  94. deepdoctection/pipe/text.py +14 -8
  95. deepdoctection/pipe/transform.py +16 -9
  96. deepdoctection/train/__init__.py +6 -12
  97. deepdoctection/train/d2_frcnn_train.py +36 -28
  98. deepdoctection/train/hf_detr_train.py +26 -17
  99. deepdoctection/train/hf_layoutlm_train.py +133 -111
  100. deepdoctection/train/tp_frcnn_train.py +21 -19
  101. deepdoctection/utils/__init__.py +3 -0
  102. deepdoctection/utils/concurrency.py +1 -1
  103. deepdoctection/utils/context.py +2 -2
  104. deepdoctection/utils/env_info.py +41 -84
  105. deepdoctection/utils/error.py +84 -0
  106. deepdoctection/utils/file_utils.py +4 -15
  107. deepdoctection/utils/fs.py +7 -7
  108. deepdoctection/utils/logger.py +1 -0
  109. deepdoctection/utils/mocks.py +93 -0
  110. deepdoctection/utils/pdf_utils.py +5 -4
  111. deepdoctection/utils/settings.py +6 -1
  112. deepdoctection/utils/transform.py +1 -1
  113. deepdoctection/utils/utils.py +0 -6
  114. deepdoctection/utils/viz.py +48 -5
  115. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
  116. deepdoctection-0.32.dist-info/RECORD +146 -0
  117. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
  118. deepdoctection-0.30.dist-info/RECORD +0 -143
  119. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
  120. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
deepdoctection/pipe/base.py CHANGED
@@ -23,12 +23,14 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from copy import deepcopy
 from typing import Any, Callable, DefaultDict, Dict, List, Mapping, Optional, Set, Union
+from uuid import uuid1
 
 from ..dataflow import DataFlow, MapData
 from ..datapoint.image import Image
 from ..extern.base import ImageTransformer, ObjectDetector, PdfMiner, TextRecognizer
 from ..utils.context import timed_operation
 from ..utils.detection_types import JsonDict
+from ..utils.identifier import get_uuid_from_str
 from .anngen import DatapointManager
 
 
@@ -58,8 +60,9 @@ class PipelineComponent(ABC):
         pipeline. Use something that describe the task of the pipeline.
         """
         self.name = name
+        self.service_id = self.get_service_id()
         self._meta_has_all_types()
-        self.dp_manager = DatapointManager()
+        self.dp_manager = DatapointManager(self.service_id)
         self.timer_on = False
 
     @abstractmethod
@@ -75,7 +78,7 @@ class PipelineComponent(ABC):
         As a simplified interface `serve` does not have to return a dp. The data point is passed on within
         pipelines internally (via `pass_datapoint`).
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     def pass_datapoint(self, dp: Image) -> Image:
         """
@@ -109,7 +112,7 @@ class PipelineComponent(ABC):
         """
         Clone an instance
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @abstractmethod
     def get_meta_annotation(self) -> JsonDict:
@@ -122,7 +125,7 @@ class PipelineComponent(ABC):
         `summaries` with values: A list of summary sub categories
         :return: Dict with meta infos as just described
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     def _meta_has_all_types(self) -> None:
         if not {"image_annotations", "sub_categories", "relationships", "summaries"}.issubset(
@@ -133,6 +136,12 @@ class PipelineComponent(ABC):
                 f"Got {self.get_meta_annotation().keys()}"
             )
 
+    def get_service_id(self) -> str:
+        """
+        Get the generating model
+        """
+        return get_uuid_from_str(self.name)[:8]
+
 
 class PredictorPipelineComponent(PipelineComponent, ABC):
     """
@@ -151,10 +160,11 @@ class PredictorPipelineComponent(PipelineComponent, ABC):
         """
         self.predictor = predictor
         super().__init__(name)
+        self.dp_manager = DatapointManager(self.service_id, self.predictor.model_id)
 
     @abstractmethod
     def clone(self) -> "PredictorPipelineComponent":
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class LanguageModelPipelineComponent(PipelineComponent, ABC):
@@ -175,15 +185,15 @@ class LanguageModelPipelineComponent(PipelineComponent, ABC):
         """
 
         self.tokenizer = tokenizer
-        self.mapping_to_lm_input_func = mapping_to_lm_input_func
         super().__init__(name)
+        self.mapping_to_lm_input_func = mapping_to_lm_input_func
 
     @abstractmethod
     def clone(self) -> "LanguageModelPipelineComponent":
         """
         Clone an instance
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class ImageTransformPipelineComponent(PipelineComponent, ABC):
@@ -206,7 +216,7 @@ class ImageTransformPipelineComponent(PipelineComponent, ABC):
         """
         Clone an instance
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class Pipeline(ABC):
@@ -228,7 +238,7 @@ class Pipeline(ABC):
 
         layout = LayoutPipeComponent(layout_detector ...)
         text = TextExtractPipeComponent(text_detector ...)
-        simple_pipe = MyPipeline (pipeline_component = [layout, text])
+        simple_pipe = MyPipeline(pipeline_component = [layout, text])
         doc_dataflow = simple_pipe.analyze(input = path / to / dir)
 
         for page in doc_dataflow:
@@ -238,6 +248,18 @@ class Pipeline(ABC):
     model or already processed further).
 
     In addition to `analyze`, the internal `_entry` is used to bundle preprocessing steps.
+
+    It is possible to set a session id for the pipeline. This is useful for logging purposes. The session id can be
+    either passed to the pipeline via the `analyze` method or generated automatically.
+
+    To generate a session_id automatically:
+
+    **Example:**
+
+        pipe = MyPipeline(pipeline_component = [layout, text])
+        pipe.set_session_id = True
+
+        df = pipe.analyze(input = "path/to/dir")  # session_id is generated automatically
     """
 
     def __init__(self, pipeline_component_list: List[PipelineComponent]) -> None:
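The session id described above is read out further down in this diff by `DoctectionPipe.analyze` (via `kwargs.get("session_id")`). A usage sketch; `get_dd_analyzer`, `analyze(path=...)` and `reset_state()` are existing entry points of the package, only the session id handling is new in this release:

    import deepdoctection as dd

    analyzer = dd.get_dd_analyzer()
    analyzer.set_session_id = True             # auto-generate a session id per analyze() call
    df = analyzer.analyze(path="path/to/dir")  # alternatively: analyze(path=..., session_id="a1b2c3d4")
    df.reset_state()
    for page in df:
        print(page.file_name)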
@@ -245,6 +267,7 @@ class Pipeline(ABC):
         :param pipeline_component_list: A list of pipeline components.
         """
         self.pipe_component_list = pipeline_component_list
+        self.set_session_id = False
 
     @abstractmethod
     def _entry(self, **kwargs: Any) -> DataFlow:
@@ -254,14 +277,17 @@ class Pipeline(ABC):
 
         :param kwargs: Arguments, for dynamic customizing of the processing or for the transfer of processing types
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
-    def _build_pipe(self, df: DataFlow) -> DataFlow:
+    def _build_pipe(self, df: DataFlow, session_id: Optional[str] = None) -> DataFlow:
         """
         Composition of the backbone
         """
+        if session_id is None and self.set_session_id:
+            session_id = self.get_session_id()
         for component in self.pipe_component_list:
             component.timer_on = True
+            component.dp_manager.session_id = session_id
             df = component.predict_dataflow(df)
         return df
 
@@ -277,7 +303,7 @@ class Pipeline(ABC):
 
         can be triggered.
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     def get_meta_annotation(self) -> JsonDict:
         """
@@ -301,22 +327,30 @@ class Pipeline(ABC):
             for key, value in meta_anns["relationships"].items():
                 pipeline_populations["relationships"][key].update(value)
             pipeline_populations["summaries"].extend(meta_anns["summaries"])  # type: ignore
-
+        pipeline_populations["sub_categories"] = dict(pipeline_populations["sub_categories"])  # type: ignore
+        pipeline_populations["relationships"] = dict(pipeline_populations["relationships"])  # type: ignore
         return pipeline_populations
 
     def get_pipeline_info(
-        self, position: Optional[int] = None, name: Optional[str] = None
-    ) -> Union[Mapping[int, str], str, int]:
+        self, service_id: Optional[str] = None, name: Optional[str] = None
+    ) -> Union[str, Mapping[str, str]]:
         """Get pipeline information: Returns a dictionary with a description of each pipeline component
-        :param position: position of the pipeline component in the pipeline
+        :param service_id: service_id of the pipeline component to search for
        :param name: name of the pipeline component to search for
        :return: Either a full dictionary with position and name of all pipeline components or the name, if the position
        has been passed or the position if the name has been passed.
        """
-        comp_info = {key: comp.name for key, comp in enumerate(self.pipe_component_list)}
+        comp_info = {comp.service_id: comp.name for comp in self.pipe_component_list}
         comp_info_name_as_key = {value: key for key, value in comp_info.items()}
-        if position is not None:
-            return comp_info[position]
+        if service_id is not None:
+            return comp_info[service_id]
         if name is not None:
             return comp_info_name_as_key[name]
         return comp_info
+
+    @staticmethod
+    def get_session_id() -> str:
+        """
+        Get the generating a session id
+        """
+        return str(uuid1())[:8]
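With components keyed by `service_id`, `get_pipeline_info` now maps ids to names instead of list positions. A small self-contained mirror of the new lookup logic; the ids and names below are hypothetical:

    from typing import Mapping, Optional, Union

    def pipeline_info(
        comp_info: Mapping[str, str], service_id: Optional[str] = None, name: Optional[str] = None
    ) -> Union[str, Mapping[str, str]]:
        # service_id -> name, name -> service_id, or the full mapping if neither is given
        name_as_key = {value: key for key, value in comp_info.items()}
        if service_id is not None:
            return comp_info[service_id]
        if name is not None:
            return name_as_key[name]
        return comp_info

    components = {"3f2a9c1d": "image_layout", "8b7e01aa": "text_extract"}  # hypothetical ids and names
    assert pipeline_info(components, service_id="3f2a9c1d") == "image_layout"
    assert pipeline_info(components, name="text_extract") == "8b7e01aa"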
deepdoctection/pipe/common.py CHANGED
@@ -18,6 +18,10 @@
 """
 Module for common pipeline components
 """
+from __future__ import annotations
+
+import os
+
 from copy import copy, deepcopy
 from typing import List, Literal, Mapping, Optional, Sequence, Union
 
@@ -30,16 +34,14 @@ from ..mapper.maputils import MappingContextManager
 from ..mapper.match import match_anns_by_intersection
 from ..mapper.misc import to_image
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import detectron2_available, pytorch_available, tf_available
 from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
 from .base import PipelineComponent
 from .registry import pipeline_component_registry
 
-if tf_available():
-    from ..mapper.tpstruct import tf_nms_image_annotations as nms_image_annotations
-
-elif pytorch_available() and detectron2_available():
+if os.environ.get("DD_USE_TORCH"):
     from ..mapper.d2struct import pt_nms_image_annotations as nms_image_annotations
+elif os.environ.get("DD_USE_TF"):
+    from ..mapper.tpstruct import tf_nms_image_annotations as nms_image_annotations
 
 
 @pipeline_component_registry.register("ImageCroppingService")
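The NMS helper is now selected by environment variable instead of probing the installed packages. A usage sketch; the variable names come from the hunk above, and the value "1" is arbitrary since any non-empty value counts as set:

    import os

    # Choose the framework before the first deepdoctection import; the branch above
    # runs when deepdoctection.pipe.common is imported.
    os.environ["DD_USE_TORCH"] = "1"   # or: os.environ["DD_USE_TF"] = "1"

    import deepdoctection as dd  # noqa: E402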
@@ -64,7 +66,7 @@ class ImageCroppingService(PipelineComponent):
         for ann in dp.get_annotation(category_names=self.category_names):
             dp.image_ann_to_image(ann.annotation_id, crop_image=True)
 
-    def clone(self) -> "PipelineComponent":
+    def clone(self) -> PipelineComponent:
         return self.__class__(self.category_names)
 
     def get_meta_annotation(self) -> JsonDict:
@@ -93,8 +95,8 @@ class MatchingService(PipelineComponent):
 
     def __init__(
         self,
-        parent_categories: Union[TypeOrStr, List[TypeOrStr]],
-        child_categories: Union[TypeOrStr, List[TypeOrStr]],
+        parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+        child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
         matching_rule: Literal["iou", "ioa"],
         threshold: float,
         use_weighted_intersections: bool = False,
@@ -112,8 +114,16 @@ class MatchingService(PipelineComponent):
                                value calibrate the ioa.
         :param max_parent_only: Will assign to each child at most one parent with maximum ioa
         """
-        self.parent_categories = parent_categories
-        self.child_categories = child_categories
+        self.parent_categories = (
+            [get_type(parent_categories)]  # type: ignore
+            if not isinstance(parent_categories, (list, set))
+            else [get_type(parent_category) for parent_category in parent_categories]
+        )
+        self.child_categories = (
+            [get_type(child_categories)]  # type: ignore
+            if not isinstance(child_categories, (list, set))
+            else [get_type(child_category) for child_category in child_categories]
+        )
         assert matching_rule in ["iou", "ioa"], "segment rule must be either iou or ioa"
         self.matching_rule = matching_rule
         self.threshold = threshold
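`MatchingService` now normalizes `parent_categories` and `child_categories` at construction time: a single value and a sequence both end up as a list of resolved types. A plain-string mirror of that branch (the real code additionally resolves every entry with `get_type`):

    from typing import List, Sequence, Union

    def normalize(categories: Union[str, Sequence[str]]) -> List[str]:
        # Mirrors the constructor logic above; only list and set trigger the element-wise
        # branch, so e.g. a tuple would be wrapped as a single value.
        return [categories] if not isinstance(categories, (list, set)) else [c for c in categories]

    assert normalize("table") == ["table"]
    assert normalize(["table", "figure"]) == ["table", "figure"]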
@@ -217,7 +227,7 @@ class PageParsingService:
         """
         return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
 
-    def clone(self) -> "PageParsingService":
+    def clone(self) -> PageParsingService:
         """clone"""
         return self.__class__(
             deepcopy(self.text_container),
@@ -284,7 +294,7 @@ class AnnotationNmsService(PipelineComponent):
             if ann.annotation_id not in ann_ids_to_keep:
                 self.dp_manager.deactivate_annotation(ann.annotation_id)
 
-    def clone(self) -> "PipelineComponent":
+    def clone(self) -> PipelineComponent:
         return self.__class__(deepcopy(self.nms_pairs), self.threshold)
 
     def get_meta_annotation(self) -> JsonDict:
@@ -318,7 +328,7 @@ class ImageParsingService:
         """
         return MapData(df, self.pass_datapoint)
 
-    def clone(self) -> "ImageParsingService":
+    def clone(self) -> ImageParsingService:
         """clone"""
         return self.__class__(self.dpi)
 
deepdoctection/pipe/concurrency.py CHANGED
@@ -18,6 +18,7 @@
 """
 Module for multithreading tasks
 """
+from __future__ import annotations
 
 import itertools
 import queue
@@ -221,7 +222,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
     def serve(self, dp: Image) -> None:
         raise NotImplementedError("MultiThreadPipelineComponent does not follow the PipelineComponent implementation")
 
-    def clone(self) -> "MultiThreadPipelineComponent":
+    def clone(self) -> MultiThreadPipelineComponent:
         raise NotImplementedError("MultiThreadPipelineComponent does not allow cloning")
 
     def get_meta_annotation(self) -> JsonDict:
deepdoctection/pipe/doctectionpipe.py CHANGED
@@ -82,7 +82,6 @@ def _proto_process(
     else:
         path_tmp = path
     logger.info(LoggingRecord(f"Processing {file_name}", {"path": path_tmp, "df": path_tmp, "file_name": file_name}))
-    # logger.info("Processing %s", file_name, {"path": path_tmp, "df": path_tmp, "file_name": file_name})
     return dp
 
 
@@ -221,9 +220,10 @@ class DoctectionPipe(Pipeline):
         """
 
         output = kwargs.get("output", "page")
+        session_id = kwargs.get("session_id")
         assert output in ("page", "image", "dict"), "output must be either page image or dict"
         df = self._entry(**kwargs)
-        df = self._build_pipe(df)
+        df = self._build_pipe(df, session_id=session_id)  # type: ignore
         if output == "page":
             df = self.dataflow_to_page(df)
         elif output == "dict":
deepdoctection/pipe/language.py CHANGED
@@ -25,6 +25,7 @@ from ..datapoint.image import Image
 from ..datapoint.view import Page
 from ..extern.base import LanguageDetector, ObjectDetector
 from ..utils.detection_types import JsonDict
+from ..utils.error import ImageError
 from ..utils.settings import PageType, TypeOrStr, get_type
 from .base import PipelineComponent
 from .registry import pipeline_component_registry
@@ -86,7 +87,7 @@ class LanguageDetectionService(PipelineComponent):
             text = page.text_no_line_break
         else:
             if dp.image is None:
-                raise ValueError("dp.image cannot be None")
+                raise ImageError("image cannot be None")
             detect_result_list = self.text_detector.predict(dp.image)
             # this is a concatenation of all detection result. No reading order
             text = " ".join([result.text for result in detect_result_list if result.text is not None])
@@ -98,7 +99,7 @@ class LanguageDetectionService(PipelineComponent):
     def clone(self) -> PipelineComponent:
         predictor = self.predictor.clone()
         if not isinstance(predictor, LanguageDetector):
-            raise ValueError(f"Predictor must be of type LanguageDetector, but is of type {type(predictor)}")
+            raise TypeError(f"Predictor must be of type LanguageDetector, but is of type {type(predictor)}")
         return self.__class__(
             predictor,
             copy(self.text_container),
deepdoctection/pipe/layout.py CHANGED
@@ -18,6 +18,8 @@
 """
 Module for layout pipeline component
 """
+from __future__ import annotations
+
 from typing import Optional
 
 import numpy as np
@@ -25,6 +27,7 @@ import numpy as np
 from ..datapoint.image import Image
 from ..extern.base import ObjectDetector, PdfMiner
 from ..utils.detection_types import JsonDict
+from ..utils.error import ImageError
 from ..utils.transform import PadTransform
 from .base import PredictorPipelineComponent
 from .registry import pipeline_component_registry
@@ -79,7 +82,7 @@ class ImageLayoutService(PredictorPipelineComponent):
         if anns:
             return
         if dp.image is None:
-            raise ValueError("image cannot be None")
+            raise ImageError("image cannot be None")
         np_image = dp.image
         if self.padder:
             np_image = self.padder.apply_image(np_image)
@@ -108,11 +111,11 @@ class ImageLayoutService(PredictorPipelineComponent):
     def _get_name(predictor_name: str) -> str:
         return f"image_{predictor_name}"
 
-    def clone(self) -> "PredictorPipelineComponent":
+    def clone(self) -> PredictorPipelineComponent:
         predictor = self.predictor.clone()
         padder_clone = None
         if self.padder:
             padder_clone = self.padder.clone()
         if not isinstance(predictor, ObjectDetector):
-            raise ValueError(f"predictor must be of type ObjectDetector, but is of type {type(predictor)}")
+            raise TypeError(f"predictor must be of type ObjectDetector, but is of type {type(predictor)}")
         return self.__class__(predictor, self.to_image, self.crop_image, padder_clone, self.skip_if_layout_extracted)
deepdoctection/pipe/lm.py CHANGED
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# File: tokenclass.py
+# File: lm.py
 
 # Copyright 2021 Dr. Janis Meyer. All rights reserved.
 #
@@ -18,57 +18,19 @@
 """
 Module for token classification pipeline
 """
+from __future__ import annotations
 
 from copy import copy
-from typing import Any, List, Literal, Optional, Sequence, Union
+from typing import Any, Callable, List, Literal, Optional, Sequence, Union
 
 from ..datapoint.image import Image
 from ..extern.hflayoutlm import HFLayoutLmSequenceClassifierBase, HFLayoutLmTokenClassifierBase
-from ..mapper.laylmstruct import image_to_layoutlm_features
+from ..mapper.laylmstruct import image_to_layoutlm_features, image_to_lm_features
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import transformers_available
 from ..utils.settings import BioTag, LayoutType, ObjectTypes, PageType, TokenClasses, WordType
 from .base import LanguageModelPipelineComponent
 from .registry import pipeline_component_registry
 
-if transformers_available():
-    from transformers import LayoutLMTokenizerFast, RobertaTokenizerFast, XLMRobertaTokenizerFast
-
-    _ARCHITECTURES_TO_TOKENIZER = {
-        ("LayoutLMForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
-        ("LayoutLMv2ForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
-        ("LayoutLMv3ForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
-            "roberta-base", add_prefix_space=True
-        ),
-        ("LayoutLMv3ForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
-            "roberta-base", add_prefix_space=True
-        ),
-    }
-
-
-    def get_tokenizer_from_architecture(architecture_name: str, use_xlm_tokenizer: bool) -> Any:
-        """
-        We do not use the tokenizer for a particular model that the transformer library provides. Thie mapping therefore
-        returns the tokenizer that should be used for a particular model.
-
-        :param architecture_name: The model as stated in the transformer library.
-        :param use_xlm_tokenizer: True if one uses the LayoutXLM. (The model cannot be distinguished from LayoutLMv2).
-        :return: Tokenizer instance to use.
-        """
-        return _ARCHITECTURES_TO_TOKENIZER[(architecture_name, use_xlm_tokenizer)]
-
 
 @pipeline_component_registry.register("LMTokenClassifierService")
 class LMTokenClassifierService(LanguageModelPipelineComponent):
@@ -154,7 +116,8 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
         else:
             self.default_key = TokenClasses.other
         self.other_name_as_key = {self.default_key: categories_name_as_key[self.default_key]}
-        super().__init__(self._get_name(), tokenizer, image_to_layoutlm_features)
+        image_to_features_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
+        super().__init__(self._get_name(), tokenizer, image_to_features_func)
         self.required_kwargs = {
             "tokenizer": self.tokenizer,
             "padding": self.padding,
@@ -218,7 +181,9 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
                 word.annotation_id,
             )
 
-    def clone(self) -> "LMTokenClassifierService":
+    def clone(self) -> LMTokenClassifierService:
+        # ToDo: replace copying of tokenizer with a proper clone method. Otherwise we cannot run the evaluation with
+        # multiple threads
         return self.__class__(
             copy(self.tokenizer),
             self.language_model.clone(),
@@ -244,19 +209,20 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
         return f"lm_token_class_{self.language_model.name}"
 
     def _init_sanity_checks(self) -> None:
-        tokenizer_class = self.language_model.model.config.tokenizer_class
-        use_xlm_tokenizer = False
-        if tokenizer_class is not None:
-            use_xlm_tokenizer = True
-        tokenizer_reference = get_tokenizer_from_architecture(
-            self.language_model.model.__class__.__name__, use_xlm_tokenizer
-        )
-        if not isinstance(self.tokenizer, type(tokenizer_reference)):
-            raise ValueError(
-                f"You want to use {type(self.tokenizer)} but you should use {type(tokenizer_reference)} "
+        tokenizer_class_name = self.language_model.model.config.tokenizer_class
+        if tokenizer_class_name != self.tokenizer.__class__.__name__:
+            raise TypeError(
+                f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                 f"in this framework"
             )
 
+    @staticmethod
+    def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
+        """Replacing eval functions"""
+        return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
+            mapping_str
+        ]
+
 
 @pipeline_component_registry.register("LMSequenceClassifierService")
 class LMSequenceClassifierService(LanguageModelPipelineComponent):
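The hard-coded architecture-to-tokenizer table is gone; both services now simply compare the tokenizer's class name with the `tokenizer_class` string recorded on the model config. A minimal mirror of the new check, with dummy classes standing in for a real model config and tokenizer:

    class DummyConfig:
        tokenizer_class = "LayoutLMTokenizerFast"

    class LayoutLMTokenizerFast:  # stand-in for the transformers class of the same name
        pass

    config, tokenizer = DummyConfig(), LayoutLMTokenizerFast()
    if config.tokenizer_class != tokenizer.__class__.__name__:
        raise TypeError(f"You want to use {type(tokenizer)} but you should use {config.tokenizer_class}")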
@@ -315,7 +281,8 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
         self.padding = padding
         self.truncation = truncation
         self.return_overflowing_tokens = return_overflowing_tokens
-        super().__init__(self._get_name(), tokenizer, image_to_layoutlm_features)
+        image_to_features_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
+        super().__init__(self._get_name(), tokenizer, image_to_features_func)
         self.required_kwargs = {
             "tokenizer": self.tokenizer,
             "padding": self.padding,
@@ -335,7 +302,7 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
             PageType.document_type, lm_output.class_name, lm_output.class_id, None, lm_output.score
         )
 
-    def clone(self) -> "LMSequenceClassifierService":
+    def clone(self) -> LMSequenceClassifierService:
         return self.__class__(
             copy(self.tokenizer),
             self.language_model.clone(),
@@ -358,15 +325,16 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
         return f"lm_sequence_class_{self.language_model.name}"
 
     def _init_sanity_checks(self) -> None:
-        tokenizer_class = self.language_model.model.config.tokenizer_class
-        use_xlm_tokenizer = False
-        if tokenizer_class is not None:
-            use_xlm_tokenizer = True
-        tokenizer_reference = get_tokenizer_from_architecture(
-            self.language_model.model.__class__.__name__, use_xlm_tokenizer
-        )
-        if not isinstance(self.tokenizer, type(tokenizer_reference)):
-            raise ValueError(
-                f"You want to use {type(self.tokenizer)} but you should use {type(tokenizer_reference)} "
+        tokenizer_class_name = self.language_model.model.config.tokenizer_class
+        if tokenizer_class_name != self.tokenizer.__class__.__name__:
+            raise TypeError(
+                f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                 f"in this framework"
             )
+
+    @staticmethod
+    def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
+        """Replacing eval functions"""
+        return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
+            mapping_str
+        ]