deepdoctection-0.42.0-py3-none-any.whl → deepdoctection-0.43-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection might be problematic. See the registry page for this release for more details.

Files changed (124):
  1. deepdoctection/__init__.py +2 -1
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +904 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +157 -106
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +196 -113
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +25 -17
  104. deepdoctection/utils/env_info.py +85 -36
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -62
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.dist-info/RECORD +149 -0
  119. {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/WHEEL +1 -1
  120. deepdoctection/analyzer/_config.py +0 -146
  121. deepdoctection-0.42.0.dist-info/METADATA +0 -431
  122. deepdoctection-0.42.0.dist-info/RECORD +0 -148
  123. {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
@@ -15,7 +15,9 @@
15
15
  # See the License for the specific language governing permissions and
16
16
  # limitations under the License.
17
17
 
18
- """Factory for building the deepdoctection analyzer pipeline"""
18
+ """
19
+ `ServiceFactory` for building analyzers
20
+ """
19
21
 
20
22
 
21
23
  from os import environ
@@ -68,7 +70,7 @@ class ServiceFactory:
68
70
  """
69
71
  Factory class for building various components of the deepdoctection analyzer pipeline.
70
72
 
71
- This class uses the `cfg` configuration object from `_config.py`, which is an instance of the `AttrDict` class.
73
+ This class uses the `cfg` configuration object from `config.py`, which is an instance of the `AttrDict` class.
72
74
  The configuration is not passed explicitly in an `__init__` method but is accessed directly within the methods.
73
75
 
74
76
  The class provides static methods to build different services and detectors required for the pipeline, such as
@@ -78,7 +80,7 @@ class ServiceFactory:
78
80
 
79
81
  Extending the Class:
80
82
  This class can be extended by using inheritance and adding new methods or overriding existing ones.
81
- To extend the configuration attributes, you can modify the `cfg` object in `_config.py` to include new
83
+ To extend the configuration attributes, you can modify the `cfg` object in `config.py` to include new
82
84
  settings or parameters required for the new methods.
83
85
  """
84
86
 
@@ -87,11 +89,13 @@ class ServiceFactory:
87
89
  config: AttrDict,
88
90
  mode: str,
89
91
  ) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
90
- """Building a D2-Detector, a TP-Detector as Detr-Detector or a D2-Torch Tracing Detector according to
91
- the config
92
+ """
93
+ Building a D2-Detector, a TP-Detector as Detr-Detector or a D2-Torch Tracing Detector according to
94
+ the config.
92
95
 
93
- :param config: configuration object
94
- :param mode: either `LAYOUT`,`CELL` or `ITEM`
96
+ Args:
97
+ config: Configuration object.
98
+ mode: Either `LAYOUT`, `CELL`, or `ITEM`.
95
99
  """
96
100
  if config.LIB is None:
97
101
  raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
@@ -101,7 +105,7 @@ class ServiceFactory:
101
105
  if config.LIB == "TF"
102
106
  else (
103
107
  getattr(config.PT, mode).WEIGHTS
104
- if getattr(config.PT.ENFORCE_WEIGHTS,mode)
108
+ if getattr(config.PT.ENFORCE_WEIGHTS, mode)
105
109
  else getattr(config.PT, mode).WEIGHTS_TS
106
110
  )
107
111
  )
@@ -113,6 +117,8 @@ class ServiceFactory:
113
117
  config_path = ModelCatalog.get_full_path_configs(weights)
114
118
  weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
115
119
  profile = ModelCatalog.get_profile(weights)
120
+ if config.LIB == "PT" and profile.padding is not None:
121
+ getattr(config.PT, mode).PADDING = profile.padding
116
122
  categories = profile.categories if profile.categories is not None else {}
117
123
 
118
124
  if profile.model_wrapper in ("TPFrcnnDetector",):
@@ -156,40 +162,72 @@ class ServiceFactory:
156
162
  def build_layout_detector(
157
163
  config: AttrDict, mode: str
158
164
  ) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
159
- """Building a layout detector according to the config
165
+ """
166
+ Building a layout detector according to the config.
160
167
 
161
- :param config: configuration object
162
- :param mode: either `LAYOUT`,`CELL` or `ITEM`
168
+ Args:
169
+ config: Configuration object.
170
+ mode: Either `LAYOUT`, `CELL`, or `ITEM`.
163
171
  """
164
172
  return ServiceFactory._build_layout_detector(config, mode)
165
173
 
166
174
  @staticmethod
167
175
  def _build_rotation_detector() -> TesseractRotationTransformer:
168
- """Building a rotation detector"""
176
+ """
177
+ Building a rotation detector.
178
+
179
+ Returns:
180
+ TesseractRotationTransformer: Rotation detector instance.
181
+ """
169
182
  return TesseractRotationTransformer()
170
183
 
171
184
  @staticmethod
172
185
  def build_rotation_detector() -> TesseractRotationTransformer:
173
- """Building a rotation detector"""
186
+ """
187
+ Building a rotation detector.
188
+
189
+ Returns:
190
+ TesseractRotationTransformer: Rotation detector instance.
191
+ """
174
192
  return ServiceFactory._build_rotation_detector()
175
193
 
176
194
  @staticmethod
177
195
  def _build_transform_service(transform_predictor: ImageTransformer) -> SimpleTransformService:
178
- """Building a transform service with a given predictor"""
196
+ """
197
+ Building a transform service with a given predictor.
198
+
199
+ Args:
200
+ transform_predictor: Predictor for image transformation.
201
+
202
+ Returns:
203
+ SimpleTransformService: Transform service instance.
204
+ """
179
205
  return SimpleTransformService(transform_predictor)
180
206
 
181
207
  @staticmethod
182
208
  def build_transform_service(transform_predictor: ImageTransformer) -> SimpleTransformService:
183
- """Building a transform service with a given predictor"""
209
+ """
210
+ Building a transform service with a given predictor.
211
+
212
+ Args:
213
+ transform_predictor: Predictor for image transformation.
214
+
215
+ Returns:
216
+ SimpleTransformService: Transform service instance.
217
+ """
184
218
  return ServiceFactory._build_transform_service(transform_predictor)
185
219
 
186
220
  @staticmethod
187
221
  def _build_padder(config: AttrDict, mode: str) -> PadTransform:
188
- """Building a padder according to the config
222
+ """
223
+ Building a padder according to the config.
224
+
225
+ Args:
226
+ config: Configuration object.
227
+ mode: Either `LAYOUT`, `CELL`, or `ITEM`.
189
228
 
190
- :param config: configuration object
191
- :param mode: either `LAYOUT`,`CELL` or `ITEM`
192
- :return `PadTransform` instance
229
+ Returns:
230
+ PadTransform: `PadTransform` instance.
193
231
  """
194
232
  top, right, bottom, left = (
195
233
  getattr(config.PT, mode).PAD.TOP,
@@ -201,44 +239,61 @@ class ServiceFactory:
201
239
 
202
240
  @staticmethod
203
241
  def build_padder(config: AttrDict, mode: str) -> PadTransform:
204
- """Building a padder according to the config
242
+ """
243
+ Building a padder according to the config.
244
+
245
+ Args:
246
+ config: Configuration object.
247
+ mode: Either `LAYOUT`, `CELL`, or `ITEM`.
205
248
 
206
- :param config: configuration object
207
- :param mode: either `LAYOUT`,`CELL` or `ITEM`
208
- :return `PadTransform` instance
249
+ Returns:
250
+ PadTransform: `PadTransform` instance.
209
251
  """
210
252
  return ServiceFactory._build_padder(config, mode)
211
253
 
212
254
  @staticmethod
213
255
  def _build_layout_service(config: AttrDict, detector: ObjectDetector, mode: str) -> ImageLayoutService:
214
- """Building a layout service with a given detector
256
+ """
257
+ Building a layout service with a given detector.
258
+
259
+ Args:
260
+ config: Configuration object.
261
+ detector: Will be passed to the `ImageLayoutService`.
262
+ mode: Either `LAYOUT`, `CELL`, or `ITEM`.
215
263
 
216
- :param config: configuration object
217
- :param detector: will be passed to the `ImageLayoutService`
218
- :param mode: either `LAYOUT`,`CELL` or `ITEM`
219
- :return `ImageLayoutService` instance
264
+ Returns:
265
+ ImageLayoutService: `ImageLayoutService` instance.
220
266
  """
221
267
  padder = None
222
- if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
268
+ if getattr(config.PT, mode).PADDING:
223
269
  padder = ServiceFactory.build_padder(config, mode=mode)
224
270
  return ImageLayoutService(layout_detector=detector, to_image=True, crop_image=True, padder=padder)
225
271
 
226
272
  @staticmethod
227
273
  def build_layout_service(config: AttrDict, detector: ObjectDetector, mode: str) -> ImageLayoutService:
228
- """Building a layout service with a given detector
274
+ """
275
+ Building a layout service with a given detector.
276
+
277
+ Args:
278
+ config: Configuration object.
279
+ detector: Will be passed to the `ImageLayoutService`.
280
+ mode: Either `LAYOUT`, `CELL`, or `ITEM`.
229
281
 
230
- :param config: configuration object
231
- :param detector: will be passed to the `ImageLayoutService`
232
- :param mode: either `LAYOUT`,`CELL` or `ITEM`
233
- :return `ImageLayoutService` instance
282
+ Returns:
283
+ ImageLayoutService: `ImageLayoutService` instance.
234
284
  """
235
285
  return ServiceFactory._build_layout_service(config, detector, mode)
236
286
 
237
287
  @staticmethod
238
288
  def _build_layout_nms_service(config: AttrDict) -> AnnotationNmsService:
239
- """Building a NMS service for layout annotations
289
+ """
290
+ Building a NMS service for layout annotations.
291
+
292
+ Args:
293
+ config: Configuration object.
240
294
 
241
- :param config: configuration object
295
+ Returns:
296
+ AnnotationNmsService: NMS service instance.
242
297
  """
243
298
  if not isinstance(config.LAYOUT_NMS_PAIRS.COMBINATIONS, list) and not isinstance(
244
299
  config.LAYOUT_NMS_PAIRS.COMBINATIONS[0], list
@@ -252,21 +307,29 @@ class ServiceFactory:
252
307
 
253
308
  @staticmethod
254
309
  def build_layout_nms_service(config: AttrDict) -> AnnotationNmsService:
255
- """Building a NMS service for layout annotations
310
+ """
311
+ Building a NMS service for layout annotations.
256
312
 
257
- :param config: configuration object
313
+ Args:
314
+ config: Configuration object.
315
+
316
+ Returns:
317
+ AnnotationNmsService: NMS service instance.
258
318
  """
259
319
  return ServiceFactory._build_layout_nms_service(config)
260
320
 
261
321
  @staticmethod
262
322
  def _build_sub_image_service(config: AttrDict, detector: ObjectDetector, mode: str) -> SubImageLayoutService:
263
323
  """
264
- Building a sub image layout service with a given detector
324
+ Building a sub image layout service with a given detector.
325
+
326
+ Args:
327
+ config: Configuration object.
328
+ detector: Will be passed to the `SubImageLayoutService`.
329
+ mode: Either `LAYOUT`, `CELL`, or `ITEM`.
265
330
 
266
- :param config: configuration object
267
- :param detector: will be passed to the `SubImageLayoutService`
268
- :param mode: either `LAYOUT`,`CELL` or `ITEM`
269
- :return: `SubImageLayoutService` instance
331
+ Returns:
332
+ SubImageLayoutService: `SubImageLayoutService` instance.
270
333
  """
271
334
  exclude_category_names = []
272
335
  padder = None
@@ -290,21 +353,28 @@ class ServiceFactory:
290
353
  @staticmethod
291
354
  def build_sub_image_service(config: AttrDict, detector: ObjectDetector, mode: str) -> SubImageLayoutService:
292
355
  """
293
- Building a sub image layout service with a given detector
356
+ Building a sub image layout service with a given detector.
294
357
 
295
- :param config: configuration object
296
- :param detector: will be passed to the `SubImageLayoutService`
297
- :param mode: either `LAYOUT`,`CELL` or `ITEM`
298
- :return: `SubImageLayoutService` instance
358
+ Args:
359
+ config: Configuration object.
360
+ detector: Will be passed to the `SubImageLayoutService`.
361
+ mode: Either `LAYOUT`, `CELL`, or `ITEM`.
362
+
363
+ Returns:
364
+ SubImageLayoutService: `SubImageLayoutService` instance.
299
365
  """
300
366
  return ServiceFactory._build_sub_image_service(config, detector, mode)
301
367
 
302
368
  @staticmethod
303
369
  def _build_ocr_detector(config: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]:
304
370
  """
305
- Building OCR predictor
371
+ Building OCR predictor.
372
+
373
+ Args:
374
+ config: Configuration object.
306
375
 
307
- :param config: configuration object
376
+ Returns:
377
+ Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]: OCR detector instance.
308
378
  """
309
379
  if config.OCR.USE_TESSERACT:
310
380
  ocr_config_path = get_configs_dir_path() / config.OCR.CONFIG.TESSERACT
@@ -345,18 +415,26 @@ class ServiceFactory:
345
415
  @staticmethod
346
416
  def build_ocr_detector(config: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]:
347
417
  """
348
- Building OCR predictor
418
+ Building OCR predictor.
419
+
420
+ Args:
421
+ config: Configuration object.
349
422
 
350
- :param config: configuration object
423
+ Returns:
424
+ Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]: OCR detector instance.
351
425
  """
352
426
  return ServiceFactory._build_ocr_detector(config)
353
427
 
354
428
  @staticmethod
355
429
  def build_doctr_word_detector(config: AttrDict) -> DoctrTextlineDetector:
356
- """Building `DoctrTextlineDetector` instance
430
+ """
431
+ Building `DoctrTextlineDetector` instance.
432
+
433
+ Args:
434
+ config: Configuration object.
357
435
 
358
- :param config: configuration object
359
- :return: DoctrTextlineDetector
436
+ Returns:
437
+ DoctrTextlineDetector: Textline detector instance.
360
438
  """
361
439
  if config.LIB is None:
362
440
  raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
@@ -379,20 +457,22 @@ class ServiceFactory:
379
457
  """
380
458
  Build and return a table segmentation service based on the provided detector.
381
459
 
382
- Depending on the type of the detector, this method will return either a `PubtablesSegmentationService` or a
383
- `TableSegmentationService` instance. The selection is made as follows:
460
+ Note:
461
+ Depending on the type of the detector, this method will return either a `PubtablesSegmentationService` or a
462
+ `TableSegmentationService` instance. The selection is made as follows:
384
463
 
385
- - If the detector is an instance of `HFDetrDerivedDetector`, a `PubtablesSegmentationService` is created and
386
- returned. This service uses specific configuration parameters for segmentation, such as assignment rules,
387
- thresholds, and cell names defined in the `cfg` object.
388
- - For other detector types, a `TableSegmentationService` is created and returned. This service also uses
389
- configuration parameters from the `cfg` object but is tailored for different segmentation needs.
464
+ - If the detector is an instance of `HFDetrDerivedDetector`, a `PubtablesSegmentationService` is created and
465
+ returned. This service uses specific configuration parameters for segmentation, such as assignment rules,
466
+ thresholds, and cell names defined in the `cfg` object.
467
+ - For other detector types, a `TableSegmentationService` is created and returned. This service also uses
468
+ configuration parameters from the `cfg` object but is tailored for different segmentation needs.
390
469
 
391
- :param config: configuration object
392
- :param detector: An instance of `ObjectDetector` used to determine the type of table segmentation
393
- service to build.
394
- :return: An instance of either `PubtablesSegmentationService` or `TableSegmentationService` based on the
395
- detector type.
470
+ Args:
471
+ config: Configuration object.
472
+ detector: An instance of `ObjectDetector` used to determine the type of table segmentation service to build.
473
+
474
+ Returns:
475
+ Table segmentation service instance.
396
476
  """
397
477
  table_segmentation: Union[PubtablesSegmentationService, TableSegmentationService]
398
478
  if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
@@ -437,29 +517,35 @@ class ServiceFactory:
437
517
  """
438
518
  Build and return a table segmentation service based on the provided detector.
439
519
 
440
- Depending on the type of the detector, this method will return either a `PubtablesSegmentationService` or a
441
- `TableSegmentationService` instance. The selection is made as follows:
520
+ Note:
521
+ Depending on the type of the detector, this method will return either a `PubtablesSegmentationService` or a
522
+ `TableSegmentationService` instance. The selection is made as follows:
523
+
524
+ - If the detector is an instance of `HFDetrDerivedDetector`, a `PubtablesSegmentationService` is created and
525
+ returned. This service uses specific configuration parameters for segmentation, such as assignment rules,
526
+ thresholds, and cell names defined in the `cfg` object.
527
+ - For other detector types, a `TableSegmentationService` is created and returned. This service also uses
528
+ configuration parameters from the `cfg` object but is tailored for different segmentation needs.
442
529
 
443
- - If the detector is an instance of `HFDetrDerivedDetector`, a `PubtablesSegmentationService` is created and
444
- returned. This service uses specific configuration parameters for segmentation, such as assignment rules,
445
- thresholds, and cell names defined in the `cfg` object.
446
- - For other detector types, a `TableSegmentationService` is created and returned. This service also uses
447
- configuration parameters from the `cfg` object but is tailored for different segmentation needs.
530
+ Args:
531
+ config: Configuration object.
532
+ detector: An instance of `ObjectDetector` used to determine the type of table segmentation service to build.
448
533
 
449
- :param config: configuration object
450
- :param detector: An instance of `ObjectDetector` used to determine the type of table segmentation
451
- service to build.
452
- :return: An instance of either `PubtablesSegmentationService` or `TableSegmentationService` based on the
453
- detector type.
534
+ Returns:
535
+ Table segmentation service instance.
454
536
  """
455
537
  return ServiceFactory._build_table_segmentation_service(config, detector)
456
538
 
457
539
  @staticmethod
458
540
  def _build_table_refinement_service(config: AttrDict) -> TableSegmentationRefinementService:
459
- """Building a table segmentation refinement service
541
+ """
542
+ Building a table segmentation refinement service.
460
543
 
461
- :param config: configuration object
462
- :return: TableSegmentationRefinementService
544
+ Args:
545
+ config: Configuration object.
546
+
547
+ Returns:
548
+ TableSegmentationRefinementService: Refinement service instance.
463
549
  """
464
550
  return TableSegmentationRefinementService(
465
551
  [config.SEGMENTATION.TABLE_NAME],
@@ -468,19 +554,27 @@ class ServiceFactory:
468
554
 
469
555
  @staticmethod
470
556
  def build_table_refinement_service(config: AttrDict) -> TableSegmentationRefinementService:
471
- """Building a table segmentation refinement service
557
+ """
558
+ Building a table segmentation refinement service.
472
559
 
473
- :param config: configuration object
474
- :return: TableSegmentationRefinementService
560
+ Args:
561
+ config: Configuration object.
562
+
563
+ Returns:
564
+ TableSegmentationRefinementService: Refinement service instance.
475
565
  """
476
566
  return ServiceFactory._build_table_refinement_service(config)
477
567
 
478
568
  @staticmethod
479
569
  def _build_pdf_text_detector(config: AttrDict) -> PdfPlumberTextDetector:
480
- """Building a PDF text detector
570
+ """
571
+ Building a PDF text detector.
572
+
573
+ Args:
574
+ config: Configuration object.
481
575
 
482
- :param config: configuration object
483
- :return: PdfPlumberTextDetector
576
+ Returns:
577
+ PdfPlumberTextDetector: PDF text detector instance.
484
578
  """
485
579
  return PdfPlumberTextDetector(
486
580
  x_tolerance=config.PDF_MINER.X_TOLERANCE, y_tolerance=config.PDF_MINER.Y_TOLERANCE
@@ -488,46 +582,66 @@ class ServiceFactory:
488
582
 
489
583
  @staticmethod
490
584
  def build_pdf_text_detector(config: AttrDict) -> PdfPlumberTextDetector:
491
- """Building a PDF text detector
585
+ """
586
+ Building a PDF text detector.
587
+
588
+ Args:
589
+ config: Configuration object.
492
590
 
493
- :param config: configuration object
494
- :return: PdfPlumberTextDetector
591
+ Returns:
592
+ PdfPlumberTextDetector: PDF text detector instance.
495
593
  """
496
594
  return ServiceFactory._build_pdf_text_detector(config)
497
595
 
498
596
  @staticmethod
499
597
  def _build_pdf_miner_text_service(detector: PdfMiner) -> TextExtractionService:
500
- """Building a PDFMiner text extraction service
598
+ """
599
+ Building a PDFMiner text extraction service.
600
+
601
+ Args:
602
+ detector: PdfMiner instance.
501
603
 
502
- :param detector: PdfMiner
503
- :return: TextExtractionService
604
+ Returns:
605
+ TextExtractionService: Text extraction service instance.
504
606
  """
505
607
  return TextExtractionService(detector)
506
608
 
507
609
  @staticmethod
508
610
  def build_pdf_miner_text_service(detector: PdfMiner) -> TextExtractionService:
509
- """Building a PDFMiner text extraction service
611
+ """
612
+ Building a PDFMiner text extraction service.
510
613
 
511
- :param detector: PdfMiner
512
- :return: TextExtractionService
614
+ Args:
615
+ detector: PdfMiner instance.
616
+
617
+ Returns:
618
+ TextExtractionService: Text extraction service instance.
513
619
  """
514
620
  return ServiceFactory._build_pdf_miner_text_service(detector)
515
621
 
516
622
  @staticmethod
517
623
  def _build_doctr_word_detector_service(detector: DoctrTextlineDetector) -> ImageLayoutService:
518
- """Building a Doctr word detector service
624
+ """
625
+ Building a Doctr word detector service.
519
626
 
520
- :param detector: DoctrTextlineDetector
521
- :return: ImageLayoutService
627
+ Args:
628
+ detector: DoctrTextlineDetector instance.
629
+
630
+ Returns:
631
+ ImageLayoutService: Word detector service instance.
522
632
  """
523
633
  return ImageLayoutService(layout_detector=detector, to_image=True, crop_image=True)
524
634
 
525
635
  @staticmethod
526
636
  def build_doctr_word_detector_service(detector: DoctrTextlineDetector) -> ImageLayoutService:
527
- """Building a Doctr word detector service
637
+ """
638
+ Building a Doctr word detector service.
639
+
640
+ Args:
641
+ detector: DoctrTextlineDetector instance.
528
642
 
529
- :param detector: DoctrTextlineDetector
530
- :return: ImageLayoutService
643
+ Returns:
644
+ ImageLayoutService: Word detector service instance.
531
645
  """
532
646
  return ServiceFactory._build_doctr_word_detector_service(detector)
533
647
 
@@ -535,11 +649,15 @@ class ServiceFactory:
535
649
  def _build_text_extraction_service(
536
650
  config: AttrDict, detector: Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]
537
651
  ) -> TextExtractionService:
538
- """Building a text extraction service
652
+ """
653
+ Building a text extraction service.
654
+
655
+ Args:
656
+ config: Configuration object.
657
+ detector: OCR detector instance.
539
658
 
540
- :param config: configuration object
541
- :param detector: OCR detector
542
- :return: TextExtractionService
659
+ Returns:
660
+ TextExtractionService: Text extraction service instance.
543
661
  """
544
662
  return TextExtractionService(
545
663
  detector,
@@ -550,20 +668,28 @@ class ServiceFactory:
550
668
  def build_text_extraction_service(
551
669
  config: AttrDict, detector: Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]
552
670
  ) -> TextExtractionService:
553
- """Building a text extraction service
671
+ """
672
+ Building a text extraction service.
673
+
674
+ Args:
675
+ config: Configuration object.
676
+ detector: OCR detector instance.
554
677
 
555
- :param config: configuration object
556
- :param detector: OCR detector
557
- :return: TextExtractionService
678
+ Returns:
679
+ TextExtractionService: Text extraction service instance.
558
680
  """
559
681
  return ServiceFactory._build_text_extraction_service(config, detector)
560
682
 
561
683
  @staticmethod
562
684
  def _build_word_matching_service(config: AttrDict) -> MatchingService:
563
- """Building a word matching service
685
+ """
686
+ Building a word matching service.
564
687
 
565
- :param config: configuration object
566
- :return: MatchingService
688
+ Args:
689
+ config: Configuration object.
690
+
691
+ Returns:
692
+ MatchingService: Word matching service instance.
567
693
  """
568
694
  matcher = IntersectionMatcher(
569
695
  matching_rule=config.WORD_MATCHING.RULE,
@@ -591,19 +717,27 @@ class ServiceFactory:
591
717
 
592
718
  @staticmethod
593
719
  def build_word_matching_service(config: AttrDict) -> MatchingService:
594
- """Building a word matching service
720
+ """
721
+ Building a word matching service.
595
722
 
596
- :param config: configuration object
597
- :return: MatchingService
723
+ Args:
724
+ config: Configuration object.
725
+
726
+ Returns:
727
+ MatchingService: Word matching service instance.
598
728
  """
599
729
  return ServiceFactory._build_word_matching_service(config)
600
730
 
601
731
  @staticmethod
602
732
  def _build_layout_link_matching_service(config: AttrDict) -> MatchingService:
603
- """Building a word matching service
733
+ """
734
+ Building a layout link matching service.
735
+
736
+ Args:
737
+ config: Configuration object.
604
738
 
605
- :param config: configuration object
606
- :return: MatchingService
739
+ Returns:
740
+ MatchingService: Layout link matching service instance.
607
741
  """
608
742
  neighbor_matcher = NeighbourMatcher()
609
743
  family_compounds = [
@@ -620,15 +754,28 @@ class ServiceFactory:
620
754
 
621
755
  @staticmethod
622
756
  def build_layout_link_matching_service(config: AttrDict) -> MatchingService:
623
- """Building a word matching service
757
+ """
758
+ Building a layout link matching service.
759
+
760
+ Args:
761
+ config: Configuration object.
624
762
 
625
- :param config: configuration object
626
- :return: MatchingService
763
+ Returns:
764
+ MatchingService: Layout link matching service instance.
627
765
  """
628
766
  return ServiceFactory._build_layout_link_matching_service(config)
629
767
 
630
768
  @staticmethod
631
769
  def _build_line_matching_service(config: AttrDict) -> MatchingService:
770
+ """
771
+ Building a line matching service.
772
+
773
+ Args:
774
+ config: Configuration object.
775
+
776
+ Returns:
777
+ MatchingService: Line matching service instance.
778
+ """
632
779
  matcher = IntersectionMatcher(
633
780
  matching_rule=config.WORD_MATCHING.RULE,
634
781
  threshold=config.WORD_MATCHING.THRESHOLD,
@@ -648,19 +795,27 @@ class ServiceFactory:
648
795
 
649
796
  @staticmethod
650
797
  def build_line_matching_service(config: AttrDict) -> MatchingService:
651
- """Building a word matching service
798
+ """
799
+ Building a line matching service.
652
800
 
653
- :param config: configuration object
654
- :return: MatchingService
801
+ Args:
802
+ config: Configuration object.
803
+
804
+ Returns:
805
+ MatchingService: Line matching service instance.
655
806
  """
656
807
  return ServiceFactory._build_line_matching_service(config)
657
808
 
658
809
  @staticmethod
659
810
  def _build_text_order_service(config: AttrDict) -> TextOrderService:
660
- """Building a text order service
811
+ """
812
+ Building a text order service.
813
+
814
+ Args:
815
+ config: Configuration object.
661
816
 
662
- :param config: configuration object
663
- :return: TextOrderService instance
817
+ Returns:
818
+ TextOrderService: Text order service instance.
664
819
  """
665
820
  return TextOrderService(
666
821
  text_container=config.TEXT_CONTAINER,
@@ -675,19 +830,27 @@ class ServiceFactory:
675
830
 
676
831
  @staticmethod
677
832
  def build_text_order_service(config: AttrDict) -> TextOrderService:
678
- """Building a text order service
833
+ """
834
+ Building a text order service.
835
+
836
+ Args:
837
+ config: Configuration object.
679
838
 
680
- :param config: configuration object
681
- :return: TextOrderService instance
839
+ Returns:
840
+ TextOrderService: Text order service instance.
682
841
  """
683
842
  return ServiceFactory._build_text_order_service(config)
684
843
 
685
844
  @staticmethod
686
845
  def _build_page_parsing_service(config: AttrDict) -> PageParsingService:
687
- """Building a page parsing service
846
+ """
847
+ Building a page parsing service.
848
+
849
+ Args:
850
+ config: Configuration object.
688
851
 
689
- :param config: configuration object
690
- :return: PageParsingService instance
852
+ Returns:
853
+ PageParsingService: Page parsing service instance.
691
854
  """
692
855
  return PageParsingService(
693
856
  text_container=config.TEXT_CONTAINER,
@@ -697,20 +860,27 @@ class ServiceFactory:
697
860
 
698
861
  @staticmethod
699
862
  def build_page_parsing_service(config: AttrDict) -> PageParsingService:
700
- """Building a page parsing service
863
+ """
864
+ Building a page parsing service.
701
865
 
702
- :param config: configuration object
703
- :return: PageParsingService instance
866
+ Args:
867
+ config: Configuration object.
868
+
869
+ Returns:
870
+ PageParsingService: Page parsing service instance.
704
871
  """
705
872
  return ServiceFactory._build_page_parsing_service(config)
706
873
 
707
874
  @staticmethod
708
875
  def build_analyzer(config: AttrDict) -> DoctectionPipe:
709
876
  """
710
- Builds the analyzer with a given config
877
+ Builds the analyzer with a given config.
878
+
879
+ Args:
880
+ config: Configuration object.
711
881
 
712
- :param config: configuration object
713
- :return: Analyzer pipeline
882
+ Returns:
883
+ DoctectionPipe: Analyzer pipeline instance.
714
884
  """
715
885
  pipe_component_list: list[PipelineComponent] = []
716
886