deepdoctection 0.42.1__py3-none-any.whl → 0.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +2 -1
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +904 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +157 -106
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +196 -113
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +25 -17
- deepdoctection/utils/env_info.py +85 -36
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -62
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.dist-info/METADATA +376 -0
- deepdoctection-0.43.dist-info/RECORD +149 -0
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.1.dist-info/METADATA +0 -431
- deepdoctection-0.42.1.dist-info/RECORD +0 -148
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/WHEEL +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
|
@@ -15,7 +15,9 @@
|
|
|
15
15
|
# See the License for the specific language governing permissions and
|
|
16
16
|
# limitations under the License.
|
|
17
17
|
|
|
18
|
-
"""
|
|
18
|
+
"""
|
|
19
|
+
`ServiceFactory` for building analyzers
|
|
20
|
+
"""
|
|
19
21
|
|
|
20
22
|
|
|
21
23
|
from os import environ
|
|
@@ -68,7 +70,7 @@ class ServiceFactory:
|
|
|
68
70
|
"""
|
|
69
71
|
Factory class for building various components of the deepdoctection analyzer pipeline.
|
|
70
72
|
|
|
71
|
-
This class uses the `cfg` configuration object from `
|
|
73
|
+
This class uses the `cfg` configuration object from `config.py`, which is an instance of the `AttrDict` class.
|
|
72
74
|
The configuration is not passed explicitly in an `__init__` method but is accessed directly within the methods.
|
|
73
75
|
|
|
74
76
|
The class provides static methods to build different services and detectors required for the pipeline, such as
|
|
@@ -78,7 +80,7 @@ class ServiceFactory:
|
|
|
78
80
|
|
|
79
81
|
Extending the Class:
|
|
80
82
|
This class can be extended by using inheritance and adding new methods or overriding existing ones.
|
|
81
|
-
To extend the configuration attributes, you can modify the `cfg` object in `
|
|
83
|
+
To extend the configuration attributes, you can modify the `cfg` object in `config.py` to include new
|
|
82
84
|
settings or parameters required for the new methods.
|
|
83
85
|
"""
|
|
84
86
|
|
|
@@ -87,11 +89,13 @@ class ServiceFactory:
|
|
|
87
89
|
config: AttrDict,
|
|
88
90
|
mode: str,
|
|
89
91
|
) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
|
|
90
|
-
"""
|
|
91
|
-
|
|
92
|
+
"""
|
|
93
|
+
Building a D2-Detector, a TP-Detector, a Detr-Detector or a D2-Torch Tracing Detector according to
|
|
94
|
+
the config.
|
|
92
95
|
|
|
93
|
-
:
|
|
94
|
-
|
|
96
|
+
Args:
|
|
97
|
+
config: Configuration object.
|
|
98
|
+
mode: Either `LAYOUT`, `CELL`, or `ITEM`.
|
|
95
99
|
"""
|
|
96
100
|
if config.LIB is None:
|
|
97
101
|
raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
|
|
@@ -101,7 +105,7 @@ class ServiceFactory:
|
|
|
101
105
|
if config.LIB == "TF"
|
|
102
106
|
else (
|
|
103
107
|
getattr(config.PT, mode).WEIGHTS
|
|
104
|
-
if getattr(config.PT.ENFORCE_WEIGHTS,mode)
|
|
108
|
+
if getattr(config.PT.ENFORCE_WEIGHTS, mode)
|
|
105
109
|
else getattr(config.PT, mode).WEIGHTS_TS
|
|
106
110
|
)
|
|
107
111
|
)
|
|
@@ -113,6 +117,8 @@ class ServiceFactory:
|
|
|
113
117
|
config_path = ModelCatalog.get_full_path_configs(weights)
|
|
114
118
|
weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
|
|
115
119
|
profile = ModelCatalog.get_profile(weights)
|
|
120
|
+
if config.LIB == "PT" and profile.padding is not None:
|
|
121
|
+
getattr(config.PT, mode).PADDING = profile.padding
|
|
116
122
|
categories = profile.categories if profile.categories is not None else {}
|
|
117
123
|
|
|
118
124
|
if profile.model_wrapper in ("TPFrcnnDetector",):
|
|
@@ -156,40 +162,72 @@ class ServiceFactory:
|
|
|
156
162
|
def build_layout_detector(
|
|
157
163
|
config: AttrDict, mode: str
|
|
158
164
|
) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
|
|
159
|
-
"""
|
|
165
|
+
"""
|
|
166
|
+
Building a layout detector according to the config.
|
|
160
167
|
|
|
161
|
-
:
|
|
162
|
-
|
|
168
|
+
Args:
|
|
169
|
+
config: Configuration object.
|
|
170
|
+
mode: Either `LAYOUT`, `CELL`, or `ITEM`.
|
|
163
171
|
"""
|
|
164
172
|
return ServiceFactory._build_layout_detector(config, mode)
|
|
165
173
|
|
|
166
174
|
@staticmethod
|
|
167
175
|
def _build_rotation_detector() -> TesseractRotationTransformer:
|
|
168
|
-
"""
|
|
176
|
+
"""
|
|
177
|
+
Building a rotation detector.
|
|
178
|
+
|
|
179
|
+
Returns:
|
|
180
|
+
TesseractRotationTransformer: Rotation detector instance.
|
|
181
|
+
"""
|
|
169
182
|
return TesseractRotationTransformer()
|
|
170
183
|
|
|
171
184
|
@staticmethod
|
|
172
185
|
def build_rotation_detector() -> TesseractRotationTransformer:
|
|
173
|
-
"""
|
|
186
|
+
"""
|
|
187
|
+
Building a rotation detector.
|
|
188
|
+
|
|
189
|
+
Returns:
|
|
190
|
+
TesseractRotationTransformer: Rotation detector instance.
|
|
191
|
+
"""
|
|
174
192
|
return ServiceFactory._build_rotation_detector()
|
|
175
193
|
|
|
176
194
|
@staticmethod
|
|
177
195
|
def _build_transform_service(transform_predictor: ImageTransformer) -> SimpleTransformService:
|
|
178
|
-
"""
|
|
196
|
+
"""
|
|
197
|
+
Building a transform service with a given predictor.
|
|
198
|
+
|
|
199
|
+
Args:
|
|
200
|
+
transform_predictor: Predictor for image transformation.
|
|
201
|
+
|
|
202
|
+
Returns:
|
|
203
|
+
SimpleTransformService: Transform service instance.
|
|
204
|
+
"""
|
|
179
205
|
return SimpleTransformService(transform_predictor)
|
|
180
206
|
|
|
181
207
|
@staticmethod
|
|
182
208
|
def build_transform_service(transform_predictor: ImageTransformer) -> SimpleTransformService:
|
|
183
|
-
"""
|
|
209
|
+
"""
|
|
210
|
+
Building a transform service with a given predictor.
|
|
211
|
+
|
|
212
|
+
Args:
|
|
213
|
+
transform_predictor: Predictor for image transformation.
|
|
214
|
+
|
|
215
|
+
Returns:
|
|
216
|
+
SimpleTransformService: Transform service instance.
|
|
217
|
+
"""
|
|
184
218
|
return ServiceFactory._build_transform_service(transform_predictor)
|
|
185
219
|
|
|
186
220
|
@staticmethod
|
|
187
221
|
def _build_padder(config: AttrDict, mode: str) -> PadTransform:
|
|
188
|
-
"""
|
|
222
|
+
"""
|
|
223
|
+
Building a padder according to the config.
|
|
224
|
+
|
|
225
|
+
Args:
|
|
226
|
+
config: Configuration object.
|
|
227
|
+
mode: Either `LAYOUT`, `CELL`, or `ITEM`.
|
|
189
228
|
|
|
190
|
-
:
|
|
191
|
-
|
|
192
|
-
:return `PadTransform` instance
|
|
229
|
+
Returns:
|
|
230
|
+
PadTransform: `PadTransform` instance.
|
|
193
231
|
"""
|
|
194
232
|
top, right, bottom, left = (
|
|
195
233
|
getattr(config.PT, mode).PAD.TOP,
|
|
@@ -201,44 +239,61 @@ class ServiceFactory:
|
|
|
201
239
|
|
|
202
240
|
@staticmethod
|
|
203
241
|
def build_padder(config: AttrDict, mode: str) -> PadTransform:
|
|
204
|
-
"""
|
|
242
|
+
"""
|
|
243
|
+
Building a padder according to the config.
|
|
244
|
+
|
|
245
|
+
Args:
|
|
246
|
+
config: Configuration object.
|
|
247
|
+
mode: Either `LAYOUT`, `CELL`, or `ITEM`.
|
|
205
248
|
|
|
206
|
-
:
|
|
207
|
-
|
|
208
|
-
:return `PadTransform` instance
|
|
249
|
+
Returns:
|
|
250
|
+
PadTransform: `PadTransform` instance.
|
|
209
251
|
"""
|
|
210
252
|
return ServiceFactory._build_padder(config, mode)
|
|
211
253
|
|
|
212
254
|
@staticmethod
|
|
213
255
|
def _build_layout_service(config: AttrDict, detector: ObjectDetector, mode: str) -> ImageLayoutService:
|
|
214
|
-
"""
|
|
256
|
+
"""
|
|
257
|
+
Building a layout service with a given detector.
|
|
258
|
+
|
|
259
|
+
Args:
|
|
260
|
+
config: Configuration object.
|
|
261
|
+
detector: Will be passed to the `ImageLayoutService`.
|
|
262
|
+
mode: Either `LAYOUT`, `CELL`, or `ITEM`.
|
|
215
263
|
|
|
216
|
-
:
|
|
217
|
-
|
|
218
|
-
:param mode: either `LAYOUT`,`CELL` or `ITEM`
|
|
219
|
-
:return `ImageLayoutService` instance
|
|
264
|
+
Returns:
|
|
265
|
+
ImageLayoutService: `ImageLayoutService` instance.
|
|
220
266
|
"""
|
|
221
267
|
padder = None
|
|
222
|
-
if
|
|
268
|
+
if getattr(config.PT, mode).PADDING:
|
|
223
269
|
padder = ServiceFactory.build_padder(config, mode=mode)
|
|
224
270
|
return ImageLayoutService(layout_detector=detector, to_image=True, crop_image=True, padder=padder)
|
|
225
271
|
|
|
226
272
|
@staticmethod
|
|
227
273
|
def build_layout_service(config: AttrDict, detector: ObjectDetector, mode: str) -> ImageLayoutService:
|
|
228
|
-
"""
|
|
274
|
+
"""
|
|
275
|
+
Building a layout service with a given detector.
|
|
276
|
+
|
|
277
|
+
Args:
|
|
278
|
+
config: Configuration object.
|
|
279
|
+
detector: Will be passed to the `ImageLayoutService`.
|
|
280
|
+
mode: Either `LAYOUT`, `CELL`, or `ITEM`.
|
|
229
281
|
|
|
230
|
-
:
|
|
231
|
-
|
|
232
|
-
:param mode: either `LAYOUT`,`CELL` or `ITEM`
|
|
233
|
-
:return `ImageLayoutService` instance
|
|
282
|
+
Returns:
|
|
283
|
+
ImageLayoutService: `ImageLayoutService` instance.
|
|
234
284
|
"""
|
|
235
285
|
return ServiceFactory._build_layout_service(config, detector, mode)
|
|
236
286
|
|
|
237
287
|
@staticmethod
|
|
238
288
|
def _build_layout_nms_service(config: AttrDict) -> AnnotationNmsService:
|
|
239
|
-
"""
|
|
289
|
+
"""
|
|
290
|
+
Building an NMS service for layout annotations.
|
|
291
|
+
|
|
292
|
+
Args:
|
|
293
|
+
config: Configuration object.
|
|
240
294
|
|
|
241
|
-
:
|
|
295
|
+
Returns:
|
|
296
|
+
AnnotationNmsService: NMS service instance.
|
|
242
297
|
"""
|
|
243
298
|
if not isinstance(config.LAYOUT_NMS_PAIRS.COMBINATIONS, list) and not isinstance(
|
|
244
299
|
config.LAYOUT_NMS_PAIRS.COMBINATIONS[0], list
|
|
@@ -252,21 +307,29 @@ class ServiceFactory:
|
|
|
252
307
|
|
|
253
308
|
@staticmethod
|
|
254
309
|
def build_layout_nms_service(config: AttrDict) -> AnnotationNmsService:
|
|
255
|
-
"""
|
|
310
|
+
"""
|
|
311
|
+
Building an NMS service for layout annotations.
|
|
256
312
|
|
|
257
|
-
:
|
|
313
|
+
Args:
|
|
314
|
+
config: Configuration object.
|
|
315
|
+
|
|
316
|
+
Returns:
|
|
317
|
+
AnnotationNmsService: NMS service instance.
|
|
258
318
|
"""
|
|
259
319
|
return ServiceFactory._build_layout_nms_service(config)
|
|
260
320
|
|
|
261
321
|
@staticmethod
|
|
262
322
|
def _build_sub_image_service(config: AttrDict, detector: ObjectDetector, mode: str) -> SubImageLayoutService:
|
|
263
323
|
"""
|
|
264
|
-
Building a sub image layout service with a given detector
|
|
324
|
+
Building a sub image layout service with a given detector.
|
|
325
|
+
|
|
326
|
+
Args:
|
|
327
|
+
config: Configuration object.
|
|
328
|
+
detector: Will be passed to the `SubImageLayoutService`.
|
|
329
|
+
mode: Either `LAYOUT`, `CELL`, or `ITEM`.
|
|
265
330
|
|
|
266
|
-
:
|
|
267
|
-
|
|
268
|
-
:param mode: either `LAYOUT`,`CELL` or `ITEM`
|
|
269
|
-
:return: `SubImageLayoutService` instance
|
|
331
|
+
Returns:
|
|
332
|
+
SubImageLayoutService: `SubImageLayoutService` instance.
|
|
270
333
|
"""
|
|
271
334
|
exclude_category_names = []
|
|
272
335
|
padder = None
|
|
@@ -290,21 +353,28 @@ class ServiceFactory:
|
|
|
290
353
|
@staticmethod
|
|
291
354
|
def build_sub_image_service(config: AttrDict, detector: ObjectDetector, mode: str) -> SubImageLayoutService:
|
|
292
355
|
"""
|
|
293
|
-
Building a sub image layout service with a given detector
|
|
356
|
+
Building a sub image layout service with a given detector.
|
|
294
357
|
|
|
295
|
-
:
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
358
|
+
Args:
|
|
359
|
+
config: Configuration object.
|
|
360
|
+
detector: Will be passed to the `SubImageLayoutService`.
|
|
361
|
+
mode: Either `LAYOUT`, `CELL`, or `ITEM`.
|
|
362
|
+
|
|
363
|
+
Returns:
|
|
364
|
+
SubImageLayoutService: `SubImageLayoutService` instance.
|
|
299
365
|
"""
|
|
300
366
|
return ServiceFactory._build_sub_image_service(config, detector, mode)
|
|
301
367
|
|
|
302
368
|
@staticmethod
|
|
303
369
|
def _build_ocr_detector(config: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]:
|
|
304
370
|
"""
|
|
305
|
-
Building OCR predictor
|
|
371
|
+
Building OCR predictor.
|
|
372
|
+
|
|
373
|
+
Args:
|
|
374
|
+
config: Configuration object.
|
|
306
375
|
|
|
307
|
-
:
|
|
376
|
+
Returns:
|
|
377
|
+
Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]: OCR detector instance.
|
|
308
378
|
"""
|
|
309
379
|
if config.OCR.USE_TESSERACT:
|
|
310
380
|
ocr_config_path = get_configs_dir_path() / config.OCR.CONFIG.TESSERACT
|
|
@@ -345,18 +415,26 @@ class ServiceFactory:
|
|
|
345
415
|
@staticmethod
|
|
346
416
|
def build_ocr_detector(config: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]:
|
|
347
417
|
"""
|
|
348
|
-
Building OCR predictor
|
|
418
|
+
Building OCR predictor.
|
|
419
|
+
|
|
420
|
+
Args:
|
|
421
|
+
config: Configuration object.
|
|
349
422
|
|
|
350
|
-
:
|
|
423
|
+
Returns:
|
|
424
|
+
Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]: OCR detector instance.
|
|
351
425
|
"""
|
|
352
426
|
return ServiceFactory._build_ocr_detector(config)
|
|
353
427
|
|
|
354
428
|
@staticmethod
|
|
355
429
|
def build_doctr_word_detector(config: AttrDict) -> DoctrTextlineDetector:
|
|
356
|
-
"""
|
|
430
|
+
"""
|
|
431
|
+
Building `DoctrTextlineDetector` instance.
|
|
432
|
+
|
|
433
|
+
Args:
|
|
434
|
+
config: Configuration object.
|
|
357
435
|
|
|
358
|
-
:
|
|
359
|
-
|
|
436
|
+
Returns:
|
|
437
|
+
DoctrTextlineDetector: Textline detector instance.
|
|
360
438
|
"""
|
|
361
439
|
if config.LIB is None:
|
|
362
440
|
raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
|
|
@@ -379,20 +457,22 @@ class ServiceFactory:
|
|
|
379
457
|
"""
|
|
380
458
|
Build and return a table segmentation service based on the provided detector.
|
|
381
459
|
|
|
382
|
-
|
|
383
|
-
|
|
460
|
+
Note:
|
|
461
|
+
Depending on the type of the detector, this method will return either a `PubtablesSegmentationService` or a
|
|
462
|
+
`TableSegmentationService` instance. The selection is made as follows:
|
|
384
463
|
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
464
|
+
- If the detector is an instance of `HFDetrDerivedDetector`, a `PubtablesSegmentationService` is created and
|
|
465
|
+
returned. This service uses specific configuration parameters for segmentation, such as assignment rules,
|
|
466
|
+
thresholds, and cell names defined in the `cfg` object.
|
|
467
|
+
- For other detector types, a `TableSegmentationService` is created and returned. This service also uses
|
|
468
|
+
configuration parameters from the `cfg` object but is tailored for different segmentation needs.
|
|
390
469
|
|
|
391
|
-
:
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
470
|
+
Args:
|
|
471
|
+
config: Configuration object.
|
|
472
|
+
detector: An instance of `ObjectDetector` used to determine the type of table segmentation service to build.
|
|
473
|
+
|
|
474
|
+
Returns:
|
|
475
|
+
Table segmentation service instance.
|
|
396
476
|
"""
|
|
397
477
|
table_segmentation: Union[PubtablesSegmentationService, TableSegmentationService]
|
|
398
478
|
if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
|
|
@@ -437,29 +517,35 @@ class ServiceFactory:
|
|
|
437
517
|
"""
|
|
438
518
|
Build and return a table segmentation service based on the provided detector.
|
|
439
519
|
|
|
440
|
-
|
|
441
|
-
|
|
520
|
+
Note:
|
|
521
|
+
Depending on the type of the detector, this method will return either a `PubtablesSegmentationService` or a
|
|
522
|
+
`TableSegmentationService` instance. The selection is made as follows:
|
|
523
|
+
|
|
524
|
+
- If the detector is an instance of `HFDetrDerivedDetector`, a `PubtablesSegmentationService` is created and
|
|
525
|
+
returned. This service uses specific configuration parameters for segmentation, such as assignment rules,
|
|
526
|
+
thresholds, and cell names defined in the `cfg` object.
|
|
527
|
+
- For other detector types, a `TableSegmentationService` is created and returned. This service also uses
|
|
528
|
+
configuration parameters from the `cfg` object but is tailored for different segmentation needs.
|
|
442
529
|
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
- For other detector types, a `TableSegmentationService` is created and returned. This service also uses
|
|
447
|
-
configuration parameters from the `cfg` object but is tailored for different segmentation needs.
|
|
530
|
+
Args:
|
|
531
|
+
config: Configuration object.
|
|
532
|
+
detector: An instance of `ObjectDetector` used to determine the type of table segmentation service to build.
|
|
448
533
|
|
|
449
|
-
:
|
|
450
|
-
|
|
451
|
-
service to build.
|
|
452
|
-
:return: An instance of either `PubtablesSegmentationService` or `TableSegmentationService` based on the
|
|
453
|
-
detector type.
|
|
534
|
+
Returns:
|
|
535
|
+
Table segmentation service instance.
|
|
454
536
|
"""
|
|
455
537
|
return ServiceFactory._build_table_segmentation_service(config, detector)
|
|
456
538
|
|
|
457
539
|
@staticmethod
|
|
458
540
|
def _build_table_refinement_service(config: AttrDict) -> TableSegmentationRefinementService:
|
|
459
|
-
"""
|
|
541
|
+
"""
|
|
542
|
+
Building a table segmentation refinement service.
|
|
460
543
|
|
|
461
|
-
:
|
|
462
|
-
|
|
544
|
+
Args:
|
|
545
|
+
config: Configuration object.
|
|
546
|
+
|
|
547
|
+
Returns:
|
|
548
|
+
TableSegmentationRefinementService: Refinement service instance.
|
|
463
549
|
"""
|
|
464
550
|
return TableSegmentationRefinementService(
|
|
465
551
|
[config.SEGMENTATION.TABLE_NAME],
|
|
@@ -468,19 +554,27 @@ class ServiceFactory:
|
|
|
468
554
|
|
|
469
555
|
@staticmethod
|
|
470
556
|
def build_table_refinement_service(config: AttrDict) -> TableSegmentationRefinementService:
|
|
471
|
-
"""
|
|
557
|
+
"""
|
|
558
|
+
Building a table segmentation refinement service.
|
|
472
559
|
|
|
473
|
-
:
|
|
474
|
-
|
|
560
|
+
Args:
|
|
561
|
+
config: Configuration object.
|
|
562
|
+
|
|
563
|
+
Returns:
|
|
564
|
+
TableSegmentationRefinementService: Refinement service instance.
|
|
475
565
|
"""
|
|
476
566
|
return ServiceFactory._build_table_refinement_service(config)
|
|
477
567
|
|
|
478
568
|
@staticmethod
|
|
479
569
|
def _build_pdf_text_detector(config: AttrDict) -> PdfPlumberTextDetector:
|
|
480
|
-
"""
|
|
570
|
+
"""
|
|
571
|
+
Building a PDF text detector.
|
|
572
|
+
|
|
573
|
+
Args:
|
|
574
|
+
config: Configuration object.
|
|
481
575
|
|
|
482
|
-
:
|
|
483
|
-
|
|
576
|
+
Returns:
|
|
577
|
+
PdfPlumberTextDetector: PDF text detector instance.
|
|
484
578
|
"""
|
|
485
579
|
return PdfPlumberTextDetector(
|
|
486
580
|
x_tolerance=config.PDF_MINER.X_TOLERANCE, y_tolerance=config.PDF_MINER.Y_TOLERANCE
|
|
@@ -488,46 +582,66 @@ class ServiceFactory:
|
|
|
488
582
|
|
|
489
583
|
@staticmethod
|
|
490
584
|
def build_pdf_text_detector(config: AttrDict) -> PdfPlumberTextDetector:
|
|
491
|
-
"""
|
|
585
|
+
"""
|
|
586
|
+
Building a PDF text detector.
|
|
587
|
+
|
|
588
|
+
Args:
|
|
589
|
+
config: Configuration object.
|
|
492
590
|
|
|
493
|
-
:
|
|
494
|
-
|
|
591
|
+
Returns:
|
|
592
|
+
PdfPlumberTextDetector: PDF text detector instance.
|
|
495
593
|
"""
|
|
496
594
|
return ServiceFactory._build_pdf_text_detector(config)
|
|
497
595
|
|
|
498
596
|
@staticmethod
|
|
499
597
|
def _build_pdf_miner_text_service(detector: PdfMiner) -> TextExtractionService:
|
|
500
|
-
"""
|
|
598
|
+
"""
|
|
599
|
+
Building a PDFMiner text extraction service.
|
|
600
|
+
|
|
601
|
+
Args:
|
|
602
|
+
detector: PdfMiner instance.
|
|
501
603
|
|
|
502
|
-
:
|
|
503
|
-
|
|
604
|
+
Returns:
|
|
605
|
+
TextExtractionService: Text extraction service instance.
|
|
504
606
|
"""
|
|
505
607
|
return TextExtractionService(detector)
|
|
506
608
|
|
|
507
609
|
@staticmethod
|
|
508
610
|
def build_pdf_miner_text_service(detector: PdfMiner) -> TextExtractionService:
|
|
509
|
-
"""
|
|
611
|
+
"""
|
|
612
|
+
Building a PDFMiner text extraction service.
|
|
510
613
|
|
|
511
|
-
:
|
|
512
|
-
|
|
614
|
+
Args:
|
|
615
|
+
detector: PdfMiner instance.
|
|
616
|
+
|
|
617
|
+
Returns:
|
|
618
|
+
TextExtractionService: Text extraction service instance.
|
|
513
619
|
"""
|
|
514
620
|
return ServiceFactory._build_pdf_miner_text_service(detector)
|
|
515
621
|
|
|
516
622
|
@staticmethod
|
|
517
623
|
def _build_doctr_word_detector_service(detector: DoctrTextlineDetector) -> ImageLayoutService:
|
|
518
|
-
"""
|
|
624
|
+
"""
|
|
625
|
+
Building a Doctr word detector service.
|
|
519
626
|
|
|
520
|
-
:
|
|
521
|
-
|
|
627
|
+
Args:
|
|
628
|
+
detector: DoctrTextlineDetector instance.
|
|
629
|
+
|
|
630
|
+
Returns:
|
|
631
|
+
ImageLayoutService: Word detector service instance.
|
|
522
632
|
"""
|
|
523
633
|
return ImageLayoutService(layout_detector=detector, to_image=True, crop_image=True)
|
|
524
634
|
|
|
525
635
|
@staticmethod
|
|
526
636
|
def build_doctr_word_detector_service(detector: DoctrTextlineDetector) -> ImageLayoutService:
|
|
527
|
-
"""
|
|
637
|
+
"""
|
|
638
|
+
Building a Doctr word detector service.
|
|
639
|
+
|
|
640
|
+
Args:
|
|
641
|
+
detector: DoctrTextlineDetector instance.
|
|
528
642
|
|
|
529
|
-
:
|
|
530
|
-
|
|
643
|
+
Returns:
|
|
644
|
+
ImageLayoutService: Word detector service instance.
|
|
531
645
|
"""
|
|
532
646
|
return ServiceFactory._build_doctr_word_detector_service(detector)
|
|
533
647
|
|
|
@@ -535,11 +649,15 @@ class ServiceFactory:
|
|
|
535
649
|
def _build_text_extraction_service(
|
|
536
650
|
config: AttrDict, detector: Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]
|
|
537
651
|
) -> TextExtractionService:
|
|
538
|
-
"""
|
|
652
|
+
"""
|
|
653
|
+
Building a text extraction service.
|
|
654
|
+
|
|
655
|
+
Args:
|
|
656
|
+
config: Configuration object.
|
|
657
|
+
detector: OCR detector instance.
|
|
539
658
|
|
|
540
|
-
:
|
|
541
|
-
|
|
542
|
-
:return: TextExtractionService
|
|
659
|
+
Returns:
|
|
660
|
+
TextExtractionService: Text extraction service instance.
|
|
543
661
|
"""
|
|
544
662
|
return TextExtractionService(
|
|
545
663
|
detector,
|
|
@@ -550,20 +668,28 @@ class ServiceFactory:
|
|
|
550
668
|
def build_text_extraction_service(
|
|
551
669
|
config: AttrDict, detector: Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]
|
|
552
670
|
) -> TextExtractionService:
|
|
553
|
-
"""
|
|
671
|
+
"""
|
|
672
|
+
Building a text extraction service.
|
|
673
|
+
|
|
674
|
+
Args:
|
|
675
|
+
config: Configuration object.
|
|
676
|
+
detector: OCR detector instance.
|
|
554
677
|
|
|
555
|
-
:
|
|
556
|
-
|
|
557
|
-
:return: TextExtractionService
|
|
678
|
+
Returns:
|
|
679
|
+
TextExtractionService: Text extraction service instance.
|
|
558
680
|
"""
|
|
559
681
|
return ServiceFactory._build_text_extraction_service(config, detector)
|
|
560
682
|
|
|
561
683
|
@staticmethod
|
|
562
684
|
def _build_word_matching_service(config: AttrDict) -> MatchingService:
|
|
563
|
-
"""
|
|
685
|
+
"""
|
|
686
|
+
Building a word matching service.
|
|
564
687
|
|
|
565
|
-
:
|
|
566
|
-
|
|
688
|
+
Args:
|
|
689
|
+
config: Configuration object.
|
|
690
|
+
|
|
691
|
+
Returns:
|
|
692
|
+
MatchingService: Word matching service instance.
|
|
567
693
|
"""
|
|
568
694
|
matcher = IntersectionMatcher(
|
|
569
695
|
matching_rule=config.WORD_MATCHING.RULE,
|
|
@@ -591,19 +717,27 @@ class ServiceFactory:
|
|
|
591
717
|
|
|
592
718
|
@staticmethod
|
|
593
719
|
def build_word_matching_service(config: AttrDict) -> MatchingService:
|
|
594
|
-
"""
|
|
720
|
+
"""
|
|
721
|
+
Building a word matching service.
|
|
595
722
|
|
|
596
|
-
:
|
|
597
|
-
|
|
723
|
+
Args:
|
|
724
|
+
config: Configuration object.
|
|
725
|
+
|
|
726
|
+
Returns:
|
|
727
|
+
MatchingService: Word matching service instance.
|
|
598
728
|
"""
|
|
599
729
|
return ServiceFactory._build_word_matching_service(config)
|
|
600
730
|
|
|
601
731
|
@staticmethod
|
|
602
732
|
def _build_layout_link_matching_service(config: AttrDict) -> MatchingService:
|
|
603
|
-
"""
|
|
733
|
+
"""
|
|
734
|
+
Building a layout link matching service.
|
|
735
|
+
|
|
736
|
+
Args:
|
|
737
|
+
config: Configuration object.
|
|
604
738
|
|
|
605
|
-
:
|
|
606
|
-
|
|
739
|
+
Returns:
|
|
740
|
+
MatchingService: Layout link matching service instance.
|
|
607
741
|
"""
|
|
608
742
|
neighbor_matcher = NeighbourMatcher()
|
|
609
743
|
family_compounds = [
|
|
@@ -620,15 +754,28 @@ class ServiceFactory:
|
|
|
620
754
|
|
|
621
755
|
@staticmethod
|
|
622
756
|
def build_layout_link_matching_service(config: AttrDict) -> MatchingService:
|
|
623
|
-
"""
|
|
757
|
+
"""
|
|
758
|
+
Building a layout link matching service.
|
|
759
|
+
|
|
760
|
+
Args:
|
|
761
|
+
config: Configuration object.
|
|
624
762
|
|
|
625
|
-
:
|
|
626
|
-
|
|
763
|
+
Returns:
|
|
764
|
+
MatchingService: Layout link matching service instance.
|
|
627
765
|
"""
|
|
628
766
|
return ServiceFactory._build_layout_link_matching_service(config)
|
|
629
767
|
|
|
630
768
|
@staticmethod
|
|
631
769
|
def _build_line_matching_service(config: AttrDict) -> MatchingService:
|
|
770
|
+
"""
|
|
771
|
+
Building a line matching service.
|
|
772
|
+
|
|
773
|
+
Args:
|
|
774
|
+
config: Configuration object.
|
|
775
|
+
|
|
776
|
+
Returns:
|
|
777
|
+
MatchingService: Line matching service instance.
|
|
778
|
+
"""
|
|
632
779
|
matcher = IntersectionMatcher(
|
|
633
780
|
matching_rule=config.WORD_MATCHING.RULE,
|
|
634
781
|
threshold=config.WORD_MATCHING.THRESHOLD,
|
|
@@ -648,19 +795,27 @@ class ServiceFactory:
|
|
|
648
795
|
|
|
649
796
|
@staticmethod
|
|
650
797
|
def build_line_matching_service(config: AttrDict) -> MatchingService:
|
|
651
|
-
"""
|
|
798
|
+
"""
|
|
799
|
+
Building a line matching service.
|
|
652
800
|
|
|
653
|
-
:
|
|
654
|
-
|
|
801
|
+
Args:
|
|
802
|
+
config: Configuration object.
|
|
803
|
+
|
|
804
|
+
Returns:
|
|
805
|
+
MatchingService: Line matching service instance.
|
|
655
806
|
"""
|
|
656
807
|
return ServiceFactory._build_line_matching_service(config)
|
|
657
808
|
|
|
658
809
|
@staticmethod
|
|
659
810
|
def _build_text_order_service(config: AttrDict) -> TextOrderService:
|
|
660
|
-
"""
|
|
811
|
+
"""
|
|
812
|
+
Building a text order service.
|
|
813
|
+
|
|
814
|
+
Args:
|
|
815
|
+
config: Configuration object.
|
|
661
816
|
|
|
662
|
-
:
|
|
663
|
-
|
|
817
|
+
Returns:
|
|
818
|
+
TextOrderService: Text order service instance.
|
|
664
819
|
"""
|
|
665
820
|
return TextOrderService(
|
|
666
821
|
text_container=config.TEXT_CONTAINER,
|
|
@@ -675,19 +830,27 @@ class ServiceFactory:
|
|
|
675
830
|
|
|
676
831
|
@staticmethod
|
|
677
832
|
def build_text_order_service(config: AttrDict) -> TextOrderService:
|
|
678
|
-
"""
|
|
833
|
+
"""
|
|
834
|
+
Building a text order service.
|
|
835
|
+
|
|
836
|
+
Args:
|
|
837
|
+
config: Configuration object.
|
|
679
838
|
|
|
680
|
-
:
|
|
681
|
-
|
|
839
|
+
Returns:
|
|
840
|
+
TextOrderService: Text order service instance.
|
|
682
841
|
"""
|
|
683
842
|
return ServiceFactory._build_text_order_service(config)
|
|
684
843
|
|
|
685
844
|
@staticmethod
|
|
686
845
|
def _build_page_parsing_service(config: AttrDict) -> PageParsingService:
|
|
687
|
-
"""
|
|
846
|
+
"""
|
|
847
|
+
Building a page parsing service.
|
|
848
|
+
|
|
849
|
+
Args:
|
|
850
|
+
config: Configuration object.
|
|
688
851
|
|
|
689
|
-
:
|
|
690
|
-
|
|
852
|
+
Returns:
|
|
853
|
+
PageParsingService: Page parsing service instance.
|
|
691
854
|
"""
|
|
692
855
|
return PageParsingService(
|
|
693
856
|
text_container=config.TEXT_CONTAINER,
|
|
@@ -697,20 +860,27 @@ class ServiceFactory:
|
|
|
697
860
|
|
|
698
861
|
@staticmethod
|
|
699
862
|
def build_page_parsing_service(config: AttrDict) -> PageParsingService:
|
|
700
|
-
"""
|
|
863
|
+
"""
|
|
864
|
+
Building a page parsing service.
|
|
701
865
|
|
|
702
|
-
:
|
|
703
|
-
|
|
866
|
+
Args:
|
|
867
|
+
config: Configuration object.
|
|
868
|
+
|
|
869
|
+
Returns:
|
|
870
|
+
PageParsingService: Page parsing service instance.
|
|
704
871
|
"""
|
|
705
872
|
return ServiceFactory._build_page_parsing_service(config)
|
|
706
873
|
|
|
707
874
|
@staticmethod
|
|
708
875
|
def build_analyzer(config: AttrDict) -> DoctectionPipe:
|
|
709
876
|
"""
|
|
710
|
-
Builds the analyzer with a given config
|
|
877
|
+
Builds the analyzer with a given config.
|
|
878
|
+
|
|
879
|
+
Args:
|
|
880
|
+
config: Configuration object.
|
|
711
881
|
|
|
712
|
-
:
|
|
713
|
-
|
|
882
|
+
Returns:
|
|
883
|
+
DoctectionPipe: Analyzer pipeline instance.
|
|
714
884
|
"""
|
|
715
885
|
pipe_component_list: list[PipelineComponent] = []
|
|
716
886
|
|