deepdoctection-0.30-py3-none-any.whl → deepdoctection-0.32-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoctection/__init__.py +38 -29
- deepdoctection/analyzer/dd.py +36 -29
- deepdoctection/configs/conf_dd_one.yaml +34 -31
- deepdoctection/dataflow/base.py +0 -19
- deepdoctection/dataflow/custom.py +4 -3
- deepdoctection/dataflow/custom_serialize.py +14 -5
- deepdoctection/dataflow/parallel_map.py +12 -11
- deepdoctection/dataflow/serialize.py +5 -4
- deepdoctection/datapoint/annotation.py +35 -13
- deepdoctection/datapoint/box.py +3 -5
- deepdoctection/datapoint/convert.py +3 -1
- deepdoctection/datapoint/image.py +79 -36
- deepdoctection/datapoint/view.py +152 -49
- deepdoctection/datasets/__init__.py +1 -4
- deepdoctection/datasets/adapter.py +6 -3
- deepdoctection/datasets/base.py +86 -11
- deepdoctection/datasets/dataflow_builder.py +1 -1
- deepdoctection/datasets/info.py +4 -4
- deepdoctection/datasets/instances/doclaynet.py +3 -2
- deepdoctection/datasets/instances/fintabnet.py +2 -1
- deepdoctection/datasets/instances/funsd.py +2 -1
- deepdoctection/datasets/instances/iiitar13k.py +5 -2
- deepdoctection/datasets/instances/layouttest.py +4 -8
- deepdoctection/datasets/instances/publaynet.py +2 -2
- deepdoctection/datasets/instances/pubtables1m.py +6 -3
- deepdoctection/datasets/instances/pubtabnet.py +2 -1
- deepdoctection/datasets/instances/rvlcdip.py +2 -1
- deepdoctection/datasets/instances/xfund.py +2 -1
- deepdoctection/eval/__init__.py +1 -4
- deepdoctection/eval/accmetric.py +1 -1
- deepdoctection/eval/base.py +5 -4
- deepdoctection/eval/cocometric.py +2 -1
- deepdoctection/eval/eval.py +19 -15
- deepdoctection/eval/tedsmetric.py +14 -11
- deepdoctection/eval/tp_eval_callback.py +14 -7
- deepdoctection/extern/__init__.py +2 -7
- deepdoctection/extern/base.py +39 -13
- deepdoctection/extern/d2detect.py +182 -90
- deepdoctection/extern/deskew.py +36 -9
- deepdoctection/extern/doctrocr.py +265 -83
- deepdoctection/extern/fastlang.py +49 -9
- deepdoctection/extern/hfdetr.py +106 -55
- deepdoctection/extern/hflayoutlm.py +441 -122
- deepdoctection/extern/hflm.py +225 -0
- deepdoctection/extern/model.py +56 -47
- deepdoctection/extern/pdftext.py +10 -5
- deepdoctection/extern/pt/__init__.py +1 -3
- deepdoctection/extern/pt/nms.py +6 -2
- deepdoctection/extern/pt/ptutils.py +27 -18
- deepdoctection/extern/tessocr.py +134 -22
- deepdoctection/extern/texocr.py +6 -2
- deepdoctection/extern/tp/tfutils.py +43 -9
- deepdoctection/extern/tp/tpcompat.py +14 -11
- deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
- deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
- deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
- deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
- deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
- deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
- deepdoctection/extern/tpdetect.py +54 -30
- deepdoctection/mapper/__init__.py +3 -8
- deepdoctection/mapper/d2struct.py +9 -7
- deepdoctection/mapper/hfstruct.py +7 -2
- deepdoctection/mapper/laylmstruct.py +164 -21
- deepdoctection/mapper/maputils.py +16 -3
- deepdoctection/mapper/misc.py +6 -3
- deepdoctection/mapper/prodigystruct.py +1 -1
- deepdoctection/mapper/pubstruct.py +10 -10
- deepdoctection/mapper/tpstruct.py +3 -3
- deepdoctection/pipe/__init__.py +1 -1
- deepdoctection/pipe/anngen.py +35 -8
- deepdoctection/pipe/base.py +53 -19
- deepdoctection/pipe/common.py +23 -13
- deepdoctection/pipe/concurrency.py +2 -1
- deepdoctection/pipe/doctectionpipe.py +2 -2
- deepdoctection/pipe/language.py +3 -2
- deepdoctection/pipe/layout.py +6 -3
- deepdoctection/pipe/lm.py +34 -66
- deepdoctection/pipe/order.py +142 -35
- deepdoctection/pipe/refine.py +26 -24
- deepdoctection/pipe/segment.py +21 -16
- deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
- deepdoctection/pipe/text.py +14 -8
- deepdoctection/pipe/transform.py +16 -9
- deepdoctection/train/__init__.py +6 -12
- deepdoctection/train/d2_frcnn_train.py +36 -28
- deepdoctection/train/hf_detr_train.py +26 -17
- deepdoctection/train/hf_layoutlm_train.py +133 -111
- deepdoctection/train/tp_frcnn_train.py +21 -19
- deepdoctection/utils/__init__.py +3 -0
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +2 -2
- deepdoctection/utils/env_info.py +41 -84
- deepdoctection/utils/error.py +84 -0
- deepdoctection/utils/file_utils.py +4 -15
- deepdoctection/utils/fs.py +7 -7
- deepdoctection/utils/logger.py +1 -0
- deepdoctection/utils/mocks.py +93 -0
- deepdoctection/utils/pdf_utils.py +5 -4
- deepdoctection/utils/settings.py +6 -1
- deepdoctection/utils/transform.py +1 -1
- deepdoctection/utils/utils.py +0 -6
- deepdoctection/utils/viz.py +48 -5
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
- deepdoctection-0.32.dist-info/RECORD +146 -0
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
- deepdoctection-0.30.dist-info/RECORD +0 -143
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
deepdoctection/__init__.py
CHANGED
@@ -19,15 +19,13 @@ import os
 import sys
 from typing import TYPE_CHECKING
 
-from
-
-from .utils.env_info import auto_select_lib_and_device
+from .utils.env_info import collect_env_info
 from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
-from .utils.logger import logger
+from .utils.logger import LoggingRecord, logger
 
 # pylint: enable=wrong-import-position
 
-__version__ = 0.
+__version__ = 0.32
 
 _IMPORT_STRUCTURE = {
     "analyzer": [
@@ -179,8 +177,10 @@ _IMPORT_STRUCTURE = {
         "Jdeskewer",
         "DoctrTextlineDetector",
         "DoctrTextRecognizer",
+        "DocTrRotationTransformer",
         "FasttextLangDetector",
         "HFDetrDerivedDetector",
+        "get_tokenizer_from_architecture",
         "HFLayoutLmTokenClassifierBase",
         "HFLayoutLmTokenClassifier",
         "HFLayoutLmv2TokenClassifier",
@@ -188,12 +188,16 @@ _IMPORT_STRUCTURE = {
         "HFLayoutLmSequenceClassifier",
         "HFLayoutLmv2SequenceClassifier",
         "HFLayoutLmv3SequenceClassifier",
+        "HFLiltTokenClassifier",
+        "HFLiltSequenceClassifier",
+        "HFLmSequenceClassifier",
         "ModelProfile",
         "ModelCatalog",
         "print_model_infos",
         "ModelDownloadManager",
         "PdfPlumberTextDetector",
         "TesseractOcrDetector",
+        "TesseractRotationTransformer",
         "TextractOcrDetector",
         "TPFrcnnDetector",
     ],
@@ -266,11 +270,11 @@ _IMPORT_STRUCTURE = {
         "DoctectionPipe",
         "LanguageDetectionService",
         "ImageLayoutService",
-        "get_tokenizer_from_architecture",
         "LMTokenClassifierService",
         "LMSequenceClassifierService",
         "OrderGenerator",
         "TextLineGenerator",
+        "TextLineService",
         "TextOrderService",
         "TableSegmentationRefinementService",
         "generate_html_string",
@@ -279,7 +283,7 @@ _IMPORT_STRUCTURE = {
         "PubtablesSegmentationService",
         "SegmentationResult",
         "TextExtractionService",
-        "
+        "SimpleTransformService",
     ],
     "train": [
         "D2Trainer",
@@ -295,14 +299,13 @@ _IMPORT_STRUCTURE = {
         "save_tmp_file",
         "timed_operation",
         "collect_env_info",
-        "get_device",
-        "auto_select_lib_and_device",
         "auto_select_viz_library",
         "get_tensorflow_requirement",
         "tf_addons_available",
         "get_tf_addons_requirements",
         "tensorpack_available",
         "get_tensorpack_requirement",
+        "pytorch_available",
         "get_pytorch_requirement",
         "lxml_available",
         "get_lxml_requirement",
@@ -416,25 +419,31 @@ _IMPORT_STRUCTURE = {
     ],
 }
 
+# Setting some environment variables so that standard functions can be invoked with available hardware
+env_info = collect_env_info()
+logger.debug(LoggingRecord(msg=env_info))
 
-
-
-
-
-
-
-
-
-
-
-
-
-    except Exception:  # pylint: disable=W0703
-        pass
+if os.environ.get("PYTORCH_AVAILABLE") and os.environ.get("DD_USE_TORCH") is None:
+    os.environ["DD_USE_TORCH"] = "1"
+    os.environ["USE_TORCH"] = "1"
+if os.environ.get("TENSORFLOW_AVAILABLE") and os.environ.get("DD_USE_TF") is None:
+    os.environ["DD_USE_TF"] = "1"
+    os.environ["USE_TF"] = "1"
+if os.environ.get("DD_USE_TORCH") and os.environ.get("DD_USE_TF"):
+    logger.warning(
+        "Both DD_USE_TORCH and DD_USE_TF are set. Defaulting to PyTorch. If you want a different "
+        "behaviour, set DD_USE_TORCH to None before importing deepdoctection."
+    )
+    os.environ.pop("DD_USE_TF")
+    os.environ.pop("USE_TF")
 
-
-
+if not os.environ.get("PYTORCH_AVAILABLE") and not os.environ.get("TENSORFLOW_AVAILABLE"):
+    logger.warning(
+        LoggingRecord(
+            msg="Neither Tensorflow or Pytorch are available. You will not be able to use any Deep Learning "
+            "model from the library."
+        )
+    )
 
 
 # Direct imports for type-checking
@@ -442,10 +451,10 @@ if TYPE_CHECKING:
     from .analyzer import *
     from .dataflow import *
     from .datapoint import *
-    from .datasets import *
+    from .datasets import *  # type: ignore
     from .eval import *
-    from .extern import *
-    from .mapper import *
+    from .extern import *  # type: ignore
+    from .mapper import *  # type: ignore
     from .pipe import *
     from .train import *
     from .utils import *
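
The import-time block above replaces the old silent try/except around auto_select_lib_and_device: the framework is now chosen through environment variables that users can set themselves. A minimal sketch of pinning the framework before the first import (variable names taken from the hunk above; the behaviour when both frameworks are installed follows the warning branch):

import os

# Force the PyTorch stack; DD_USE_TF would select TensorFlow instead.
os.environ["DD_USE_TORCH"] = "1"
os.environ.pop("DD_USE_TF", None)  # avoid the "both are set" warning branch

import deepdoctection as dd  # collect_env_info() runs once during this import

print(dd.__version__)  # 0.32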
deepdoctection/analyzer/dd.py
CHANGED
@@ -23,51 +23,43 @@ Module for **deep**doctection analyzer.
 -user factory with a reduced config setting
 """
 
-import ast
 import os
 from os import environ
 from shutil import copyfile
 from typing import List, Optional, Union
 
+from lazy_imports import try_import
+
 from ..extern.base import ObjectDetector
+from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
 from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
+from ..extern.hfdetr import HFDetrDerivedDetector
 from ..extern.model import ModelCatalog, ModelDownloadManager
 from ..extern.pdftext import PdfPlumberTextDetector
+from ..extern.pt.ptutils import get_torch_device
 from ..extern.tessocr import TesseractOcrDetector
 from ..extern.texocr import TextractOcrDetector
+from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
+from ..extern.tpdetect import TPFrcnnDetector
 from ..pipe.base import PipelineComponent
-from ..pipe.cell import DetectResultGenerator, SubImageLayoutService
 from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
 from ..pipe.doctectionpipe import DoctectionPipe
 from ..pipe.layout import ImageLayoutService
 from ..pipe.order import TextOrderService
 from ..pipe.refine import TableSegmentationRefinementService
 from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
+from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
 from ..pipe.text import TextExtractionService
 from ..utils.detection_types import Pathlike
-from ..utils.
-from ..utils.file_utils import
-    boto3_available,
-    detectron2_available,
-    pytorch_available,
-    tensorpack_available,
-    tf_available,
-)
+from ..utils.error import DependencyError
+from ..utils.file_utils import detectron2_available, tensorpack_available
 from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
 from ..utils.logger import LoggingRecord, logger
 from ..utils.metacfg import AttrDict, set_config_by_yaml
 from ..utils.settings import CellType, LayoutType
 from ..utils.transform import PadTransform
 
-
-from ..extern.tp.tfutils import disable_tp_layer_logging
-from ..extern.tpdetect import TPFrcnnDetector
-
-if pytorch_available():
-    from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
-    from ..extern.hfdetr import HFDetrDerivedDetector
-
-if boto3_available():
+with try_import() as image_guard:
     from botocore.config import Config  # type: ignore
 
 
@@ -113,11 +105,12 @@ def config_sanity_checks(cfg: AttrDict) -> None:
     """Some config sanity checks"""
     if cfg.USE_PDF_MINER and cfg.USE_OCR and cfg.OCR.USE_DOCTR:
         raise ValueError("Configuration USE_PDF_MINER= True and USE_OCR=True and USE_DOCTR=True is not allowed")
-    if cfg.
-
-
-
-
+    if cfg.USE_OCR:
+        if cfg.OCR.USE_TESSERACT + cfg.OCR.USE_DOCTR + cfg.OCR.USE_TEXTRACT != 1:
+            raise ValueError(
+                "Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True "
+                "and set the other two to False. Only one OCR system can be activated."
+            )
 
 
 def build_detector(
@@ -343,11 +336,20 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
         pipe_component_list.append(table_segmentation)
 
         if cfg.USE_TABLE_REFINEMENT:
-            table_segmentation_refinement = TableSegmentationRefinementService(
+            table_segmentation_refinement = TableSegmentationRefinementService(
+                [LayoutType.table, LayoutType.table_rotated],
+                [
+                    LayoutType.cell,
+                    CellType.column_header,
+                    CellType.projected_row_header,
+                    CellType.spanning,
+                    CellType.row_header,
+                ],
+            )
             pipe_component_list.append(table_segmentation_refinement)
 
     if cfg.USE_PDF_MINER:
-        pdf_text = PdfPlumberTextDetector()
+        pdf_text = PdfPlumberTextDetector(x_tolerance=cfg.PDF_MINER.X_TOLERANCE, y_tolerance=cfg.PDF_MINER.Y_TOLERANCE)
         d_text = TextExtractionService(pdf_text)
         pipe_component_list.append(d_text)
 
@@ -400,7 +402,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
 
 
 def get_dd_analyzer(
-    reset_config_file: bool =
+    reset_config_file: bool = True,
     config_overwrite: Optional[List[str]] = None,
     path_config_file: Optional[Pathlike] = None,
 ) -> DoctectionPipe:
@@ -429,8 +431,13 @@ def get_dd_analyzer(
     :return: A DoctectionPipe instance with given configs
     """
     config_overwrite = [] if config_overwrite is None else config_overwrite
-    lib = "TF" if
-
+    lib = "TF" if os.environ.get("DD_USE_TF") else "PT"
+    if lib == "TF":
+        device = get_tf_device()
+    elif lib == "PT":
+        device = get_torch_device()
+    else:
+        raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
     dd_one_config_path = maybe_copy_config_to_cache(
         get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
     )
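
config_sanity_checks now enforces that exactly one of the three OCR flags is active, and get_dd_analyzer resolves the device from DD_USE_TF/DD_USE_TORCH instead of probing installed packages. A hedged usage sketch (the KEY=value overwrite strings and the analyze() entry point are the library's existing conventions; the input path is a placeholder):

import deepdoctection as dd

analyzer = dd.get_dd_analyzer(
    config_overwrite=[
        "OCR.USE_TESSERACT=False",  # exactly one OCR flag may be True,
        "OCR.USE_DOCTR=True",       # otherwise config_sanity_checks raises a ValueError
        "OCR.USE_TEXTRACT=False",
    ]
)
df = analyzer.analyze(path="/path/to/document.pdf")  # placeholder path
df.reset_state()
for page in df:
    print(page.text)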
deepdoctection/configs/conf_dd_one.yaml
CHANGED
@@ -1,38 +1,38 @@
 USE_LAYOUT: True
 USE_TABLE_SEGMENTATION: True
 TF:
-
-
-
-
-
-
-
-
-
+  LAYOUT:
+    WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
+    FILTER:
+  CELL:
+    WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
+    FILTER:
+  ITEM:
+    WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
+    FILTER:
 PT:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  LAYOUT:
+    WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
+    WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
+    FILTER:
+    PAD:
+      TOP: 60
+      RIGHT: 60
+      BOTTOM: 60
+      LEFT: 60
+  ITEM:
+    WEIGHTS: item/d2_model_1639999_item_inf_only.pt
+    WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
+    FILTER:
+    PAD:
+      TOP: 60
+      RIGHT: 60
+      BOTTOM: 60
+      LEFT: 60
+  CELL:
+    WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
+    WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
+    FILTER:
 LAYOUT_NMS_PAIRS:
   COMBINATIONS:
   THRESHOLDS:
@@ -48,6 +48,9 @@ SEGMENTATION:
   STRETCH_RULE: equal
 USE_TABLE_REFINEMENT: True
 USE_PDF_MINER: False
+PDF_MINER:
+  X_TOLERANCE: 3
+  Y_TOLERANCE: 3
 USE_OCR: True
 OCR:
   USE_TESSERACT: True
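
The new PDF_MINER block feeds the PdfPlumberTextDetector(x_tolerance=..., y_tolerance=...) call added in the dd.py hunk above. A small sketch of reading the profile with the same metacfg helper the factory imports; the path is a placeholder for wherever your copy of conf_dd_one.yaml lives, and the attribute-style access assumes set_config_by_yaml returns an AttrDict as in dd.py:

from deepdoctection.utils.metacfg import set_config_by_yaml

path_to_profile = "conf_dd_one.yaml"  # placeholder: your local copy of the profile
cfg = set_config_by_yaml(path_to_profile)
print(cfg.PDF_MINER.X_TOLERANCE, cfg.PDF_MINER.Y_TOLERANCE)  # 3 3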
deepdoctection/dataflow/base.py
CHANGED
@@ -17,25 +17,6 @@ from typing import Any, Iterator, no_type_check
 from ..utils.utils import get_rng
 
 
-class DataFlowTerminated(BaseException):
-    """
-    An exception indicating that the DataFlow is unable to produce any more
-    data, i.e. something wrong happened so that calling `__iter__`
-    cannot give a valid iterator anymore.
-    In most DataFlow this will never be raised.
-    """
-
-
-class DataFlowResetStateNotCalled(BaseException):
-    """
-    An exception indicating that `reset_state()` has not been called before starting
-    iteration.
-    """
-
-    def __init__(self) -> None:
-        super().__init__("Iterating a dataflow requires .reset_state() to be called first")
-
-
 class DataFlowReentrantGuard:
     """
     A tool to enforce non-reentrancy.
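
Both exception classes move to the new deepdoctection/utils/error.py (+84 lines in the file list) and pick up an Error suffix, as the imports in the sibling dataflow modules below confirm. A hedged migration sketch for downstream code that caught them under the 0.30 names:

try:  # 0.32 and later: relocated and renamed with an "Error" suffix
    from deepdoctection.utils.error import (
        DataFlowResetStateNotCalledError,
        DataFlowTerminatedError,
    )
except ImportError:  # 0.30 fallback: the classes removed in this hunk
    from deepdoctection.dataflow.base import (  # type: ignore
        DataFlowResetStateNotCalled as DataFlowResetStateNotCalledError,
        DataFlowTerminated as DataFlowTerminatedError,
    )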
deepdoctection/dataflow/custom.py
CHANGED
@@ -25,10 +25,11 @@ from typing import Any, Callable, Iterable, Iterator, List, Optional
 
 import numpy as np
 
+from ..utils.error import DataFlowResetStateNotCalledError
 from ..utils.logger import LoggingRecord, logger
 from ..utils.tqdm import get_tqdm
 from ..utils.utils import get_rng
-from .base import DataFlow, DataFlowReentrantGuard,
+from .base import DataFlow, DataFlowReentrantGuard, ProxyDataFlow
 from .serialize import DataFromIterable, DataFromList
 
 __all__ = ["CacheData", "CustomDataFromList", "CustomDataFromIterable"]
@@ -65,7 +66,7 @@ class CacheData(ProxyDataFlow):
 
     def __iter__(self) -> Iterator[Any]:
         if self._guard is None:
-            raise
+            raise DataFlowResetStateNotCalledError()
 
         with self._guard:
             if self.buffer:
@@ -139,7 +140,7 @@ class CustomDataFromList(DataFromList):
 
     def __iter__(self) -> Iterator[Any]:
         if self.rng is None:
-            raise
+            raise DataFlowResetStateNotCalledError()
         if self.rebalance_func is not None:
             lst_tmp = self.rebalance_func(self.lst)
             logger.info(LoggingRecord(f"CustomDataFromList: subset size after re-balancing: {len(lst_tmp)}"))
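
CacheData and CustomDataFromList now raise the shared DataFlowResetStateNotCalledError instead of a bare raise. A usage sketch of the contract, assuming the tensorpack-style constructors and that both classes are re-exported from deepdoctection.dataflow:

from deepdoctection.dataflow import CacheData, DataFromList

df = CacheData(DataFromList([{"page": i} for i in range(3)], shuffle=False))
df.reset_state()    # mandatory; iterating first now raises DataFlowResetStateNotCalledError
cached = list(df)   # the first pass fills the cache buffer, later passes replay it
print(len(cached))  # 3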
deepdoctection/dataflow/custom_serialize.py
CHANGED
@@ -27,13 +27,16 @@ from pathlib import Path
 from typing import DefaultDict, Dict, List, Optional, Sequence, Union
 
 from jsonlines import Reader, Writer
+from tabulate import tabulate
+from termcolor import colored
 
 from ..utils.context import timed_operation
 from ..utils.detection_types import JsonDict, Pathlike
+from ..utils.error import FileExtensionError
 from ..utils.identifier import get_uuid_from_str
 from ..utils.pdf_utils import PDFStreamer
 from ..utils.tqdm import get_tqdm
-from ..utils.utils import
+from ..utils.utils import is_file_extension
 from .base import DataFlow
 from .common import FlattenData, JoinData, MapData
 from .custom import CacheData, CustomDataFromIterable, CustomDataFromList
@@ -223,7 +226,7 @@ class SerializerFiles:
         """
         Not implemented
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class CocoParser:
@@ -283,8 +286,14 @@ class CocoParser:
         """
         Print information about the annotation file.
         """
+        rows = []
         for key, value in self.dataset["info"].items():
-
+            row = [key, value]
+            rows.append(row)
+
+        header = ["key", "value"]
+        table = tabulate(rows, headers=header, tablefmt="fancy_grid", stralign="left", numalign="left")
+        print(colored(table, "cyan"))
 
     def get_ann_ids(
         self,
@@ -499,7 +508,7 @@ class SerializerCoco:
         """
         Not implemented
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class SerializerPdfDoc:
@@ -547,7 +556,7 @@ class SerializerPdfDoc:
         """
         Not implemented
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @staticmethod
     def split(path: Pathlike, path_target: Optional[Pathlike] = None, max_datapoint: Optional[int] = None) -> None:
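
CocoParser.info() switches from a bare loop to a colored tabulate rendering, which makes tabulate and termcolor imports of this module. A standalone sketch of the same rendering with stand-in data:

from tabulate import tabulate
from termcolor import colored

info = {"description": "stand-in COCO annotation file", "version": "1.0"}  # placeholder data
rows = [[key, value] for key, value in info.items()]
table = tabulate(rows, headers=["key", "value"], tablefmt="fancy_grid", stralign="left", numalign="left")
print(colored(table, "cyan"))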
deepdoctection/dataflow/parallel_map.py
CHANGED
@@ -28,8 +28,9 @@ from typing import Any, Callable, Iterator, List, no_type_check
 import zmq
 
 from ..utils.concurrency import StoppableThread, enable_death_signal, start_proc_mask_signal
+from ..utils.error import DataFlowTerminatedError
 from ..utils.logger import LoggingRecord, logger
-from .base import DataFlow, DataFlowReentrantGuard,
+from .base import DataFlow, DataFlowReentrantGuard, ProxyDataFlow
 from .common import RepeatedData
 from .serialize import PickleSerializer
 
@@ -49,14 +50,14 @@ def _zmq_catch_error(name):
         yield
     except zmq.ContextTerminated as exc:
         logger.info(LoggingRecord(f"_zmq_catch_error: [{name}] Context terminated."))
-        raise
+        raise DataFlowTerminatedError() from exc
     except zmq.ZMQError as exc:
         if exc.errno == errno.ENOTSOCK:  # socket closed
             logger.info(LoggingRecord(f"_zmq_catch_error: [{name}] Socket closed."))
-            raise
-        raise ValueError from exc
+            raise DataFlowTerminatedError() from exc
+        raise ValueError() from exc
     except Exception as exc:
-        raise ValueError from exc
+        raise ValueError() from exc
 
 
 @no_type_check
@@ -78,8 +79,8 @@ def _get_pipe_name(name):
 class _ParallelMapData(ProxyDataFlow, ABC):
     def __init__(self, df: DataFlow, buffer_size: int, strict: bool = False) -> None:
         super().__init__(df)
-        if
-            raise ValueError("buffer_size must be a positive number")
+        if buffer_size <= 0:
+            raise ValueError(f"buffer_size must be a positive number, got {buffer_size}")
         self._buffer_size = buffer_size
         self._buffer_occupancy = 0  # actual #elements in buffer, only useful in strict mode
         self._strict = strict
@@ -95,12 +96,12 @@ class _ParallelMapData(ProxyDataFlow, ABC):
     @no_type_check
     @abstractmethod
     def _recv(self):
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @no_type_check
     @abstractmethod
     def _send(self, dp: Any):
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @no_type_check
     def _recv_filter_none(self):
@@ -398,8 +399,8 @@ class MultiProcessMapData(_ParallelMapData, _MultiProcessZMQDataFlow):
 
         _ParallelMapData.__init__(self, df, buffer_size, strict)
         _MultiProcessZMQDataFlow.__init__(self)
-        if
-            raise ValueError("num_proc must be a positive number")
+        if num_proc <= 0:
+            raise ValueError(f"num_proc must be a positive number, got {num_proc}")
         self.num_proc = num_proc
         self.map_func = map_func
         self._strict = strict
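
The truncated guards from 0.30 become explicit "<= 0" checks that include the offending value in the message. A hedged sketch of the constructor validation, assuming the tensorpack-derived positional order df, num_proc, map_func and package-level re-exports:

from deepdoctection.dataflow import DataFromList, MultiProcessMapData

df = DataFromList([1, 2, 3], shuffle=False)
try:
    MultiProcessMapData(df, num_proc=0, map_func=lambda dp: dp)
except ValueError as err:
    print(err)  # num_proc must be a positive number, got 0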
deepdoctection/dataflow/serialize.py
CHANGED
@@ -16,7 +16,8 @@ from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union
 
 import numpy as np
 
-from .
+from ..utils.error import DataFlowResetStateNotCalledError
+from .base import DataFlow, RNGDataFlow
 
 
 class DataFromList(RNGDataFlow):
@@ -44,7 +45,7 @@ class DataFromList(RNGDataFlow):
             for k in idxs:
                 yield self.lst[k]
         else:
-            raise
+            raise DataFlowResetStateNotCalledError()
 
 
 class DataFromIterable(DataFlow):
@@ -63,7 +64,7 @@ class DataFromIterable(DataFlow):
 
     def __len__(self) -> int:
         if self._len is None:
-            raise NotImplementedError
+            raise NotImplementedError()
         return self._len
 
     def __iter__(self) -> Iterator[Any]:
@@ -107,7 +108,7 @@ class FakeData(RNGDataFlow):
 
     def __iter__(self) -> Iterator[Any]:
         if self.rng is None:
-            raise
+            raise DataFlowResetStateNotCalledError()
         if self.random:
             for _ in range(self._size):
                 val = []