deepdoctection 0.32__tar.gz → 0.33__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic.
- {deepdoctection-0.32 → deepdoctection-0.33}/PKG-INFO +4 -4
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/__init__.py +3 -23
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/analyzer/dd.py +47 -42
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/common.py +9 -5
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/custom.py +5 -5
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/custom_serialize.py +75 -18
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/parallel_map.py +3 -3
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/serialize.py +4 -4
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/stats.py +3 -3
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datapoint/annotation.py +39 -55
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datapoint/box.py +7 -7
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datapoint/convert.py +6 -6
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datapoint/image.py +43 -37
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datapoint/view.py +175 -151
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/adapter.py +30 -24
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/base.py +9 -9
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/dataflow_builder.py +3 -3
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/info.py +23 -25
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/doclaynet.py +48 -49
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/fintabnet.py +44 -45
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/funsd.py +23 -23
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/iiitar13k.py +8 -8
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/layouttest.py +2 -2
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/publaynet.py +3 -3
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/pubtables1m.py +18 -18
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/pubtabnet.py +30 -29
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/rvlcdip.py +28 -29
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/xfund.py +24 -25
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/save.py +6 -6
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/accmetric.py +32 -33
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/base.py +8 -9
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/cocometric.py +13 -12
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/eval.py +26 -26
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/tedsmetric.py +16 -12
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/tp_eval_callback.py +7 -16
- deepdoctection-0.33/deepdoctection/extern/base.py +644 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/d2detect.py +69 -89
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/deskew.py +11 -10
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/doctrocr.py +81 -64
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/fastlang.py +23 -16
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/hfdetr.py +53 -38
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/hflayoutlm.py +216 -155
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/hflm.py +35 -30
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/model.py +432 -255
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/pdftext.py +15 -15
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/pt/ptutils.py +4 -2
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tessocr.py +39 -38
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/texocr.py +14 -16
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tfutils.py +16 -2
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpcompat.py +11 -7
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tpdetect.py +40 -45
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/cats.py +27 -29
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/cocostruct.py +10 -10
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/d2struct.py +20 -21
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/hfstruct.py +7 -7
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/laylmstruct.py +22 -24
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/maputils.py +9 -10
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/match.py +2 -2
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/misc.py +5 -6
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/pascalstruct.py +4 -4
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/prodigystruct.py +5 -5
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/pubstruct.py +84 -92
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/tpstruct.py +3 -3
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/xfundstruct.py +33 -33
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/anngen.py +12 -14
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/base.py +52 -106
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/common.py +63 -52
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/concurrency.py +14 -10
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/doctectionpipe.py +24 -21
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/language.py +20 -25
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/layout.py +18 -16
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/lm.py +49 -47
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/order.py +63 -65
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/refine.py +102 -109
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/segment.py +156 -161
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/sub_layout.py +49 -39
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/text.py +37 -36
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/transform.py +19 -16
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/train/d2_frcnn_train.py +27 -25
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/train/hf_detr_train.py +22 -18
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/train/hf_layoutlm_train.py +49 -48
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/train/tp_frcnn_train.py +10 -11
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/concurrency.py +1 -1
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/context.py +13 -6
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/develop.py +4 -4
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/env_info.py +51 -13
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/file_utils.py +6 -11
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/fs.py +22 -18
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/identifier.py +2 -2
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/logger.py +15 -15
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/metacfg.py +7 -7
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/pdf_utils.py +11 -11
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/settings.py +185 -182
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/tqdm.py +1 -1
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/transform.py +14 -9
- deepdoctection-0.33/deepdoctection/utils/types.py +104 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/utils.py +7 -7
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/viz.py +70 -69
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection.egg-info/PKG-INFO +4 -4
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection.egg-info/SOURCES.txt +1 -1
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection.egg-info/requires.txt +3 -3
- {deepdoctection-0.32 → deepdoctection-0.33}/setup.cfg +3 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/setup.py +1 -1
- deepdoctection-0.32/deepdoctection/extern/base.py +0 -439
- deepdoctection-0.32/deepdoctection/utils/detection_types.py +0 -68
- {deepdoctection-0.32 → deepdoctection-0.33}/LICENSE +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/README.md +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/analyzer/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/configs/conf_dd_one.yaml +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/base.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datapoint/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/registry.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/registry.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/pt/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/pt/nms.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/registry.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/py.typed +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/train/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/error.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/mocks.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection.egg-info/top_level.txt +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/tests/test_utils.py +0 -0
{deepdoctection-0.32 → deepdoctection-0.33}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepdoctection
-Version: 0.32
+Version: 0.33
 Summary: Repository for Document AI
 Home-page: https://github.com/deepdoctection/deepdoctection
 Author: Dr. Janis Meyer
@@ -23,7 +23,7 @@ Requires-Dist: jsonlines==3.1.0
 Requires-Dist: lazy-imports==0.3.1
 Requires-Dist: mock==4.0.3
 Requires-Dist: networkx>=2.7.1
-Requires-Dist: numpy
+Requires-Dist: numpy<2.0,>=1.21
 Requires-Dist: packaging>=20.0
 Requires-Dist: Pillow>=10.0.0
 Requires-Dist: pypdf>=3.16.0
@@ -40,7 +40,7 @@ Requires-Dist: jsonlines==3.1.0; extra == "tf"
 Requires-Dist: lazy-imports==0.3.1; extra == "tf"
 Requires-Dist: mock==4.0.3; extra == "tf"
 Requires-Dist: networkx>=2.7.1; extra == "tf"
-Requires-Dist: numpy; extra == "tf"
+Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
 Requires-Dist: packaging>=20.0; extra == "tf"
 Requires-Dist: Pillow>=10.0.0; extra == "tf"
 Requires-Dist: pypdf>=3.16.0; extra == "tf"
@@ -70,7 +70,7 @@ Requires-Dist: jsonlines==3.1.0; extra == "pt"
 Requires-Dist: lazy-imports==0.3.1; extra == "pt"
 Requires-Dist: mock==4.0.3; extra == "pt"
 Requires-Dist: networkx>=2.7.1; extra == "pt"
-Requires-Dist: numpy; extra == "pt"
+Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
 Requires-Dist: packaging>=20.0; extra == "pt"
 Requires-Dist: Pillow>=10.0.0; extra == "pt"
 Requires-Dist: pypdf>=3.16.0; extra == "pt"

{deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/__init__.py

@@ -25,7 +25,7 @@ from .utils.logger import LoggingRecord, logger
 
 # pylint: enable=wrong-import-position
 
-__version__ = 0.32
+__version__ = 0.33
 
 _IMPORT_STRUCTURE = {
     "analyzer": [
@@ -160,6 +160,8 @@ _IMPORT_STRUCTURE = {
         "EvalCallback",
     ],
     "extern": [
+        "ModelCategories",
+        "NerModelCategories",
         "PredictorBase",
         "DetectionResult",
         "ObjectDetector",
@@ -423,28 +425,6 @@ _IMPORT_STRUCTURE = {
 env_info = collect_env_info()
 logger.debug(LoggingRecord(msg=env_info))
 
-if os.environ.get("PYTORCH_AVAILABLE") and os.environ.get("DD_USE_TORCH") is None:
-    os.environ["DD_USE_TORCH"] = "1"
-    os.environ["USE_TORCH"] = "1"
-if os.environ.get("TENSORFLOW_AVAILABLE") and os.environ.get("DD_USE_TF") is None:
-    os.environ["DD_USE_TF"] = "1"
-    os.environ["USE_TF"] = "1"
-if os.environ.get("DD_USE_TORCH") and os.environ.get("DD_USE_TF"):
-    logger.warning(
-        "Both DD_USE_TORCH and DD_USE_TF are set. Defaulting to PyTorch. If you want a different "
-        "behaviour, set DD_USE_TORCH to None before importing deepdoctection."
-    )
-    os.environ.pop("DD_USE_TF")
-    os.environ.pop("USE_TF")
-
-if not os.environ.get("PYTORCH_AVAILABLE") and not os.environ.get("TENSORFLOW_AVAILABLE"):
-    logger.warning(
-        LoggingRecord(
-            msg="Neither Tensorflow or Pytorch are available. You will not be able to use any Deep Learning "
-            "model from the library."
-        )
-    )
-
 
 # Direct imports for type-checking
 if TYPE_CHECKING:

{deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/analyzer/dd.py

@@ -23,10 +23,12 @@ Module for **deep**doctection analyzer.
 -user factory with a reduced config setting
 """
 
+from __future__ import annotations
+
 import os
 from os import environ
 from shutil import copyfile
-from typing import
+from typing import Optional, Union
 
 from lazy_imports import try_import
 
@@ -50,7 +52,7 @@ from ..pipe.refine import TableSegmentationRefinementService
 from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
 from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
 from ..pipe.text import TextExtractionService
-from ..utils.
+from ..utils.env_info import ENV_VARS_TRUE
 from ..utils.error import DependencyError
 from ..utils.file_utils import detectron2_available, tensorpack_available
 from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
@@ -58,6 +60,7 @@ from ..utils.logger import LoggingRecord, logger
 from ..utils.metacfg import AttrDict, set_config_by_yaml
 from ..utils.settings import CellType, LayoutType
 from ..utils.transform import PadTransform
+from ..utils.types import PathLikeOrStr
 
 with try_import() as image_guard:
     from botocore.config import Config  # type: ignore
@@ -81,7 +84,7 @@ _TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"
 
 
 def maybe_copy_config_to_cache(
-    package_path:
+    package_path: PathLikeOrStr, configs_dir_path: PathLikeOrStr, file_name: str, force_copy: bool = True
 ) -> str:
     """
     Initial copying of various files
@@ -115,7 +118,7 @@ def config_sanity_checks(cfg: AttrDict) -> None:
 
 def build_detector(
     cfg: AttrDict, mode: str
-) -> Union[
+) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
     """Building a D2-Detector, a TP-Detector as Detr-Detector or a D2-Torch Tracing Detector according to
     the config
 
@@ -133,8 +136,8 @@ def build_detector(
     config_path = ModelCatalog.get_full_path_configs(weights)
     weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
     profile = ModelCatalog.get_profile(weights)
-    categories = profile.categories
-
+    categories = profile.categories if profile.categories is not None else {}
+
     if profile.model_wrapper in ("TPFrcnnDetector",):
         return TPFrcnnDetector(config_path, weights_path, categories, filter_categories=filter_categories)
     if profile.model_wrapper in ("D2FrcnnDetector",):
@@ -202,11 +205,13 @@ def build_sub_image_service(detector: ObjectDetector, cfg: AttrDict, mode: str)
     padder = None
     if mode == "ITEM":
         if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
-            exclude_category_ids.extend([
+            exclude_category_ids.extend([1, 3, 4, 5, 6])
             padder = build_padder(cfg, mode)
-    detect_result_generator = DetectResultGenerator(
+    detect_result_generator = DetectResultGenerator(
+        categories=detector.categories.categories, exclude_category_ids=exclude_category_ids
+    )
     return SubImageLayoutService(
-        detector, [LayoutType.
+        detector, [LayoutType.TABLE, LayoutType.TABLE_ROTATED], None, detect_result_generator, padder
     )
 
 
@@ -233,9 +238,9 @@ def build_ocr(cfg: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer,
         )
     if cfg.OCR.USE_TEXTRACT:
         credentials_kwargs = {
-            "aws_access_key_id": environ.get("ACCESS_KEY"),
-            "aws_secret_access_key": environ.get("SECRET_KEY"),
-            "config": Config(region_name=environ.get("REGION")),
+            "aws_access_key_id": environ.get("ACCESS_KEY", None),
+            "aws_secret_access_key": environ.get("SECRET_KEY", None),
+            "config": Config(region_name=environ.get("REGION", None)),
         }
         return TextractOcrDetector(**credentials_kwargs)
     raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")
@@ -260,7 +265,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
     :param cfg: A configuration
     :return: Analyzer pipeline
     """
-    pipe_component_list:
+    pipe_component_list: list[PipelineComponent] = []
 
    if cfg.USE_LAYOUT:
         d_layout = build_detector(cfg, "LAYOUT")
@@ -300,22 +305,22 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
             cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
             cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
             cfg.SEGMENTATION.CELL_CATEGORY_ID,
-            LayoutType.
+            LayoutType.TABLE,
             [
-                CellType.
-                CellType.
-                CellType.
-                CellType.
-                LayoutType.
+                CellType.SPANNING,
+                CellType.ROW_HEADER,
+                CellType.COLUMN_HEADER,
+                CellType.PROJECTED_ROW_HEADER,
+                LayoutType.CELL,
             ],
             [
-                CellType.
-                CellType.
-                CellType.
-                CellType.
+                CellType.SPANNING,
+                CellType.ROW_HEADER,
+                CellType.COLUMN_HEADER,
+                CellType.PROJECTED_ROW_HEADER,
             ],
-            [LayoutType.
-            [CellType.
+            [LayoutType.ROW, LayoutType.COLUMN],
+            [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
             stretch_rule=cfg.SEGMENTATION.STRETCH_RULE,
         )
         pipe_component_list.append(pubtables)
@@ -327,23 +332,23 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
             cfg.SEGMENTATION.FULL_TABLE_TILING,
             cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
             cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
-            LayoutType.
-            [CellType.
-            [LayoutType.
-            [CellType.
+            LayoutType.TABLE,
+            [CellType.HEADER, CellType.BODY, LayoutType.CELL],
+            [LayoutType.ROW, LayoutType.COLUMN],
+            [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
             cfg.SEGMENTATION.STRETCH_RULE,
         )
         pipe_component_list.append(table_segmentation)
 
     if cfg.USE_TABLE_REFINEMENT:
         table_segmentation_refinement = TableSegmentationRefinementService(
-            [LayoutType.
+            [LayoutType.TABLE, LayoutType.TABLE_ROTATED],
             [
-                LayoutType.
-                CellType.
-                CellType.
-                CellType.
-                CellType.
+                LayoutType.CELL,
+                CellType.COLUMN_HEADER,
+                CellType.PROJECTED_ROW_HEADER,
+                CellType.SPANNING,
+                CellType.ROW_HEADER,
             ],
         )
         pipe_component_list.append(table_segmentation_refinement)
@@ -363,7 +368,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
 
     ocr = build_ocr(cfg)
     skip_if_text_extracted = cfg.USE_PDF_MINER
-    extract_from_roi = LayoutType.
+    extract_from_roi = LayoutType.WORD if cfg.OCR.USE_DOCTR else None
     text = TextExtractionService(
         ocr, skip_if_text_extracted=skip_if_text_extracted, extract_from_roi=extract_from_roi
     )
@@ -372,7 +377,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
     if cfg.USE_PDF_MINER or cfg.USE_OCR:
         match = MatchingService(
             parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
-            child_categories=LayoutType.
+            child_categories=LayoutType.WORD,
             matching_rule=cfg.WORD_MATCHING.RULE,
             threshold=cfg.WORD_MATCHING.THRESHOLD,
             max_parent_only=cfg.WORD_MATCHING.MAX_PARENT_ONLY,
@@ -380,7 +385,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
         pipe_component_list.append(match)
 
         order = TextOrderService(
-            text_container=LayoutType.
+            text_container=LayoutType.WORD,
             text_block_categories=cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES,
             floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
             include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
@@ -392,7 +397,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
         pipe_component_list.append(order)
 
     page_parsing_service = PageParsingService(
-        text_container=LayoutType.
+        text_container=LayoutType.WORD,
         floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
         include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
     )
@@ -403,8 +408,8 @@
 
 def get_dd_analyzer(
     reset_config_file: bool = True,
-    config_overwrite: Optional[
-    path_config_file: Optional[
+    config_overwrite: Optional[list[str]] = None,
+    path_config_file: Optional[PathLikeOrStr] = None,
 ) -> DoctectionPipe:
     """
     Factory function for creating the built-in **deep**doctection analyzer.
@@ -431,7 +436,7 @@ def get_dd_analyzer(
     :return: A DoctectionPipe instance with given configs
     """
     config_overwrite = [] if config_overwrite is None else config_overwrite
-    lib = "TF" if os.environ.get("DD_USE_TF") else "PT"
+    lib = "TF" if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE else "PT"
     if lib == "TF":
         device = get_tf_device()
     elif lib == "PT":
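The last hunk above swaps a plain truthiness test for a membership test against `ENV_VARS_TRUE`. Any non-empty string is truthy in Python, so the old check treated even `DD_USE_TF=0` as a request for TensorFlow. A minimal sketch of the difference, assuming `ENV_VARS_TRUE` is a set of accepted truthy spellings (the real constant lives in `deepdoctection/utils/env_info.py`; its exact members are an assumption here):

```python
import os

# Assumed shape of the constant from deepdoctection.utils.env_info.
ENV_VARS_TRUE = {"1", "ON", "YES", "TRUE"}

os.environ["DD_USE_TF"] = "0"

# Old check: any non-empty value, including "0", selects TF.
lib_old = "TF" if os.environ.get("DD_USE_TF") else "PT"

# New check: only an explicit truthy spelling selects TF.
lib_new = "TF" if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE else "PT"

print(lib_old, lib_new)  # TF PT
```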

{deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/common.py

@@ -12,7 +12,7 @@ Some DataFlow classes for transforming and processing datapoints. Many classes h
 """
 import itertools
 from copy import copy
-from typing import Any, Callable, Iterator,
+from typing import Any, Callable, Iterator, Union
 
 import tqdm
 
@@ -164,6 +164,10 @@ class RepeatedData(ProxyDataFlow):
             Set to -1 to repeat ``ds`` infinite times.
         """
         self.num = num
+        if self.num != -1:
+            self.dfs = itertools.tee(df, self.num)
+        else:
+            self.dfs = ()
         super().__init__(df)
 
     def __len__(self) -> int:
@@ -180,8 +184,8 @@ class RepeatedData(ProxyDataFlow):
             while True:
                 yield from self.df
         else:
-            for
-                yield from
+            for df in self.dfs:
+                yield from df
 
 
 class ConcatData(DataFlow):
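The `RepeatedData` change above pre-splits the wrapped dataflow with `itertools.tee`, so a finite repetition no longer requires the source to be re-iterable. A standalone sketch of the mechanism with plain iterators (no `DataFlow` machinery):

```python
import itertools

def repeated(source, num):
    # Mirrors the pattern above: num == -1 repeats forever (and needs a
    # re-iterable source); otherwise tee the stream num times up front
    # and drain the independent copies one after another.
    if num == -1:
        while True:
            yield from source
    else:
        for replica in itertools.tee(source, num):
            yield from replica

one_shot = (x * x for x in range(3))  # a generator that can be consumed only once
print(list(repeated(one_shot, 2)))    # [0, 1, 4, 0, 1, 4]
```

Draining the tee copies sequentially forces `tee` to buffer the entire stream, so the design trades memory for the ability to repeat a one-shot source a known number of times.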
{deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/common.py (continued)

@@ -197,7 +201,7 @@ class ConcatData(DataFlow):
         df = ConcatData([df_1,df_2])
     """
 
-    def __init__(self, df_lists:
+    def __init__(self, df_lists: list[DataFlow]) -> None:
         """
         :param df_lists: a list of DataFlow.
         """
@@ -233,7 +237,7 @@ class JoinData(DataFlow):
     `JoinData` will stop once the first Dataflow throws a StopIteration
     """
 
-    def __init__(self, df_lists:
+    def __init__(self, df_lists: list[DataFlow]) -> None:
         """
         :param df_lists: a list of DataFlow. When these dataflows have different sizes, JoinData will stop when any
             of them is exhausted.

{deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/custom.py

@@ -21,7 +21,7 @@ from
 
 <https://github.com/tensorpack/dataflow/blob/master/dataflow/dataflow/common.py>
 """
-from typing import Any, Callable, Iterable, Iterator,
+from typing import Any, Callable, Iterable, Iterator, Optional
 
 import numpy as np
 
@@ -54,7 +54,7 @@ class CacheData(ProxyDataFlow):
         :param shuffle: whether to shuffle the cache before yielding from it.
         """
         self.shuffle = shuffle
-        self.buffer:
+        self.buffer: list[Any] = []
         self._guard: Optional[DataFlowReentrantGuard] = None
         self.rng = get_rng(self)
         super().__init__(df)
@@ -78,7 +78,7 @@ class CacheData(ProxyDataFlow):
             yield dp
             self.buffer.append(dp)
 
-    def get_cache(self) ->
+    def get_cache(self) -> list[Any]:
        """
         get the cache of the whole dataflow as a list
 
@@ -115,10 +115,10 @@ class CustomDataFromList(DataFromList):
 
     def __init__(
         self,
-        lst:
+        lst: list[Any],
         shuffle: bool = False,
         max_datapoints: Optional[int] = None,
-        rebalance_func: Optional[Callable[[
+        rebalance_func: Optional[Callable[[list[Any]], list[Any]]] = None,
     ):
         """
         :param lst: the input list. Each element represents a datapoint.
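`CustomDataFromList` now types `rebalance_func` as `Callable[[list[Any]], list[Any]]`: a hook that receives the full input list and returns a possibly filtered or re-weighted list before datapoints are served. A hypothetical example of such a hook, independent of the class itself:

```python
import random
from typing import Any

def undersample_majority(samples: list[Any]) -> list[Any]:
    # Hypothetical rebalance_func: cap every label at the size of the
    # smallest class so that all labels are equally represented.
    by_label: dict[str, list[Any]] = {}
    for sample in samples:
        by_label.setdefault(sample["label"], []).append(sample)
    cap = min(len(group) for group in by_label.values())
    balanced = [s for group in by_label.values() for s in random.sample(group, cap)]
    random.shuffle(balanced)
    return balanced

data = [{"label": "text"}] * 8 + [{"label": "table"}] * 2
print(len(undersample_majority(data)))  # 4 -> two datapoints per label
```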

{deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/custom_serialize.py

@@ -19,23 +19,25 @@
 Methods that convert incoming data to dataflows.
 """
 
+from __future__ import annotations
+
 import itertools
 import json
 import os
 from collections import defaultdict
 from pathlib import Path
-from typing import DefaultDict, Dict, List, Optional, Sequence, Union
+from typing import Any, DefaultDict, Dict, Iterator, List, Optional, Sequence, TextIO, Union
 
 from jsonlines import Reader, Writer
 from tabulate import tabulate
 from termcolor import colored
 
 from ..utils.context import timed_operation
-from ..utils.detection_types import JsonDict, Pathlike
 from ..utils.error import FileExtensionError
 from ..utils.identifier import get_uuid_from_str
 from ..utils.pdf_utils import PDFStreamer
 from ..utils.tqdm import get_tqdm
+from ..utils.types import JsonDict, PathLikeOrStr
 from ..utils.utils import is_file_extension
 from .base import DataFlow
 from .common import FlattenData, JoinData, MapData
@@ -53,6 +55,59 @@ def _reset_df_and_get_length(df: DataFlow) -> int:
     return length
 
 
+class FileClosingIterator:
+    """
+    A custom iterator that closes the file object once the iteration is complete.
+
+    This iterator is used to ensure that the file object is properly closed after
+    reading the data from it. It is used in the context of reading data from a file
+    in a streaming manner, where the data is not loaded into memory all at once.
+
+    **Example:**
+
+        file = open(path, "r")
+        iterator = Reader(file)
+        closing_iterator = FileClosingIterator(file, iter(iterator))
+
+        df = CustomDataFromIterable(closing_iterator, max_datapoints=max_datapoints)  # set up a dataflow
+
+    """
+
+    def __init__(self, file_obj: TextIO, iterator: Iterator[Any]):
+        """
+        Initializes the FileClosingIterator with a file object and its iterator.
+
+        :param file_obj (TextIO): The file object to read data from.
+        :param iterator (Iterator): The actual iterator of the file object.
+        """
+        self.file_obj = file_obj
+        self.iterator = iterator
+
+    def __iter__(self) -> FileClosingIterator:
+        """
+        Returns the iterator object itself.
+
+        :return: FileClosingIterator: The instance of the class itself.
+        """
+        return self
+
+    def __next__(self) -> Any:
+        """
+        Returns the next item from the file object's iterator.
+        Closes the file object if the iteration is finished.
+
+        :return: The next item from the file object's iterator.
+
+        Raises:
+            StopIteration: If there are no more items to return.
+        """
+        try:
+            return next(self.iterator)
+        except StopIteration as exc:
+            self.file_obj.close()
+            raise StopIteration from exc
+
+
 class SerializerJsonlines:
     """
     Serialize a dataflow from a jsonlines file. Alternatively, save a dataflow of JSON objects to a .jsonl file.
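The wrapper added above is self-contained, so the pattern can be exercised without any deepdoctection imports. A minimal sketch against a hypothetical `samples.jsonl`:

```python
import json

class FileClosingIterator:
    # Same pattern as the class added in the hunk above:
    # close the underlying file once iteration is exhausted.
    def __init__(self, file_obj, iterator):
        self.file_obj = file_obj
        self.iterator = iterator

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.iterator)
        except StopIteration:
            self.file_obj.close()
            raise

fh = open("samples.jsonl", "r", encoding="utf-8")  # hypothetical input file
records = FileClosingIterator(fh, (json.loads(line) for line in fh))
total = sum(1 for _ in records)  # stream once, record by record
print(total, fh.closed)          # the handle is closed automatically -> True
```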
{deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/custom_serialize.py (continued)

@@ -66,7 +121,7 @@ class SerializerJsonlines:
     """
 
     @staticmethod
-    def load(path:
+    def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> CustomDataFromIterable:
         """
         :param path: a path to a .jsonl file.
         :param max_datapoints: Will stop the iteration once max_datapoints have been streamed
@@ -75,10 +130,11 @@ class SerializerJsonlines:
         """
         file = open(path, "r")  # pylint: disable=W1514,R1732
         iterator = Reader(file)
-
+        closing_iterator = FileClosingIterator(file, iter(iterator))
+        return CustomDataFromIterable(closing_iterator, max_datapoints=max_datapoints)
 
     @staticmethod
-    def save(df: DataFlow, path:
+    def save(df: DataFlow, path: PathLikeOrStr, file_name: str, max_datapoints: Optional[int] = None) -> None:
         """
         Writes a dataflow iteratively to a .jsonl file. Every datapoint must be a dict where all items are serializable.
         As the length of the dataflow cannot be determined in every case max_datapoint prevents generating an
@@ -120,7 +176,7 @@ class SerializerTabsepFiles:
     """
 
     @staticmethod
-    def load(path:
+    def load(path: PathLikeOrStr, max_datapoins: Optional[int] = None) -> CustomDataFromList:
         """
         :param path: a path to a .txt file.
         :param max_datapoins: Will stop the iteration once max_datapoints have been streamed
@@ -133,7 +189,7 @@ class SerializerTabsepFiles:
         return CustomDataFromList(file_list, max_datapoints=max_datapoins)
 
     @staticmethod
-    def save(df: DataFlow, path:
+    def save(df: DataFlow, path: PathLikeOrStr, file_name: str, max_datapoints: Optional[int] = None) -> None:
         """
         Writes a dataflow iteratively to a .txt file. Every datapoint must be a string.
         As the length of the dataflow cannot be determined in every case max_datapoint prevents generating an
@@ -168,7 +224,7 @@ class SerializerFiles:
 
     @staticmethod
     def load(
-        path:
+        path: PathLikeOrStr,
         file_type: Union[str, Sequence[str]],
         max_datapoints: Optional[int] = None,
         shuffle: Optional[bool] = False,
@@ -190,15 +246,14 @@ class SerializerFiles:
         df2: DataFlow
         df3: DataFlow
 
-
-        path = Path(path)
+        path = Path(path)
         if not path.exists():
             raise NotADirectoryError(f"The path {path} to the directory or file does not exist")
 
         if shuffle:
             sort = False
-        it1 = os.walk(path, topdown=False)
-        it2 = os.walk(path, topdown=False)
+        it1 = os.walk(os.fspath(path), topdown=False)
+        it2 = os.walk(os.fspath(path), topdown=False)
         df1 = CustomDataFromIterable(it1)
         df2 = CustomDataFromIterable(it2)
         df1 = MapData(df1, lambda dp: None if len(dp[2]) == 0 else dp)
@@ -237,7 +292,7 @@ class CocoParser:
     :param annotation_file: location of annotation file
     """
 
-    def __init__(self, annotation_file: Optional[
+    def __init__(self, annotation_file: Optional[PathLikeOrStr] = None) -> None:
         self.dataset: JsonDict = {}
         self.anns: Dict[int, JsonDict] = {}
         self.cats: Dict[int, JsonDict] = {}
@@ -465,7 +520,7 @@ class SerializerCoco:
     """
 
     @staticmethod
-    def load(path:
+    def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
         """
         Loads a .json file and generates a dataflow.
 
@@ -478,7 +533,7 @@ class SerializerCoco:
 
         {'image':{'id',...},'annotations':[{'id':…,'bbox':...}]}
 
-        for each
+        for each image id. We use the type hint CocoDatapointDict to describe this dictionary
 
         :param max_datapoints: Will stop the iteration once max_datapoints have been streamed.
         :param path: a path to a .json file.
@@ -525,7 +580,7 @@ class SerializerPdfDoc:
     """
 
     @staticmethod
-    def load(path:
+    def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
         """
         Loads the document page wise and returns a dataflow accordingly.
 
@@ -552,14 +607,16 @@ class SerializerPdfDoc:
         return df
 
     @staticmethod
-    def save(path:
+    def save(path: PathLikeOrStr) -> None:
         """
         Not implemented
         """
         raise NotImplementedError()
 
     @staticmethod
-    def split(
+    def split(
+        path: PathLikeOrStr, path_target: Optional[PathLikeOrStr] = None, max_datapoint: Optional[int] = None
+    ) -> None:
         """
         Split a document into single pages.
         """

{deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/parallel_map.py

@@ -23,7 +23,7 @@ import uuid
 import weakref
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
-from typing import Any, Callable, Iterator,
+from typing import Any, Callable, Iterator, no_type_check
 
 import zmq
 
@@ -236,7 +236,7 @@ class MultiThreadMapData(_ParallelMapData):
         self._strict = strict
         self.num_thread = num_thread
         self.map_func = map_func
-        self._threads:
+        self._threads: list[Any] = []
         self._evt = None
 
     def reset_state(self) -> None:
@@ -284,7 +284,7 @@ class _MultiProcessZMQDataFlow(DataFlow, ABC):
         if os.name == "nt":
             raise EnvironmentError("ZMQ IPC doesn't support windows")
         self._reset_done = False
-        self._procs:
+        self._procs: list[Any] = []
         self.context = None
         self.socket = None
 

{deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/serialize.py

@@ -12,7 +12,7 @@ Some DataFlow classes for serialization. Many classes have been taken from
 
 import pickle
 from copy import copy
-from typing import Any, Iterable, Iterator,
+from typing import Any, Iterable, Iterator, Optional, Union
 
 import numpy as np
 
@@ -23,7 +23,7 @@ from .base import DataFlow, RNGDataFlow
 class DataFromList(RNGDataFlow):
     """Wrap a list of datapoints to a DataFlow"""
 
-    def __init__(self, lst:
+    def __init__(self, lst: list[Any], shuffle: bool = True) -> None:
         """
         :param lst: input list. Each element is a datapoint.
         :param shuffle: shuffle data.
@@ -79,11 +79,11 @@ class FakeData(RNGDataFlow):
 
     def __init__(
         self,
-        shapes:
+        shapes: list[Union[list[Any], tuple[Any]]],
         size: int = 1000,
         random: bool = True,
         dtype: str = "float32",
-        domain:
+        domain: tuple[Union[float, int], Union[float, int]] = (0, 1),
     ):
         """
         :param shapes: a list of lists/tuples. Shapes of each component.