deepdoctection 0.43.6__tar.gz → 0.44.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/PKG-INFO +4 -4
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/__init__.py +5 -1
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datapoint/__init__.py +1 -1
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datapoint/image.py +50 -1
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datapoint/view.py +149 -54
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/base.py +196 -51
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/fastlang.py +4 -2
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/laylmstruct.py +7 -7
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/base.py +29 -25
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/common.py +2 -2
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/concurrency.py +2 -2
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/language.py +2 -2
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/layout.py +2 -2
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/lm.py +13 -3
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/order.py +9 -5
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/refine.py +7 -7
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/segment.py +30 -30
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/sub_layout.py +2 -2
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/text.py +10 -5
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/transform.py +2 -4
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/file_utils.py +34 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/types.py +0 -1
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection.egg-info/PKG-INFO +4 -4
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection.egg-info/requires.txt +3 -3
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/setup.py +1 -1
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/LICENSE +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/README.md +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/analyzer/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/analyzer/config.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/analyzer/dd.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/analyzer/factory.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/configs/conf_dd_one.yaml +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/configs/profiles.jsonl +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/dataflow/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/dataflow/base.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/dataflow/common.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/dataflow/custom.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/dataflow/custom_serialize.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/dataflow/parallel_map.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/dataflow/serialize.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/dataflow/stats.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datapoint/annotation.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datapoint/box.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datapoint/convert.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/adapter.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/dataflow_builder.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/info.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/doclaynet.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/fintabnet.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/funsd.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/iiitar13k.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/layouttest.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/publaynet.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/pubtables1m.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/pubtabnet.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/rvlcdip.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/xfund.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/registry.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/save.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/eval/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/eval/accmetric.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/eval/base.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/eval/cocometric.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/eval/eval.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/eval/registry.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/eval/tedsmetric.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/eval/tp_eval_callback.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/base.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/d2detect.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/deskew.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/doctrocr.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/hfdetr.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/hflayoutlm.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/hflm.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/model.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/pdftext.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/pt/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/pt/nms.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/pt/ptutils.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tessocr.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/texocr.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tfutils.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpcompat.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tpdetect.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/cats.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/cocostruct.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/d2struct.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/hfstruct.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/maputils.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/match.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/misc.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/pascalstruct.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/prodigystruct.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/pubstruct.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/tpstruct.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/xfundstruct.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/anngen.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/doctectionpipe.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/registry.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/py.typed +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/train/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/train/d2_frcnn_train.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/train/hf_detr_train.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/train/hf_layoutlm_train.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/train/tp_frcnn_train.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/concurrency.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/context.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/develop.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/env_info.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/error.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/fs.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/identifier.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/logger.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/metacfg.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/mocks.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/pdf_utils.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/settings.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/tqdm.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/transform.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/utils.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/viz.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection.egg-info/SOURCES.txt +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection.egg-info/top_level.txt +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/setup.cfg +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.1}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: deepdoctection
|
|
3
|
-
Version: 0.43.6
|
|
3
|
+
Version: 0.44.1
|
|
4
4
|
Summary: Repository for Document AI
|
|
5
5
|
Home-page: https://github.com/deepdoctection/deepdoctection
|
|
6
6
|
Author: Dr. Janis Meyer
|
|
@@ -27,7 +27,7 @@ Requires-Dist: networkx>=2.7.1
|
|
|
27
27
|
Requires-Dist: numpy<2.0,>=1.21
|
|
28
28
|
Requires-Dist: packaging>=20.0
|
|
29
29
|
Requires-Dist: Pillow>=10.0.0
|
|
30
|
-
Requires-Dist: pypdf>=
|
|
30
|
+
Requires-Dist: pypdf>=6.0.0
|
|
31
31
|
Requires-Dist: pypdfium2>=4.30.0
|
|
32
32
|
Requires-Dist: pyyaml>=6.0.1
|
|
33
33
|
Requires-Dist: pyzmq>=16
|
|
@@ -46,7 +46,7 @@ Requires-Dist: networkx>=2.7.1; extra == "tf"
|
|
|
46
46
|
Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
|
|
47
47
|
Requires-Dist: packaging>=20.0; extra == "tf"
|
|
48
48
|
Requires-Dist: Pillow>=10.0.0; extra == "tf"
|
|
49
|
-
Requires-Dist: pypdf>=
|
|
49
|
+
Requires-Dist: pypdf>=6.0.0; extra == "tf"
|
|
50
50
|
Requires-Dist: pypdfium2>=4.30.0; extra == "tf"
|
|
51
51
|
Requires-Dist: pyyaml>=6.0.1; extra == "tf"
|
|
52
52
|
Requires-Dist: pyzmq>=16; extra == "tf"
|
|
@@ -78,7 +78,7 @@ Requires-Dist: networkx>=2.7.1; extra == "pt"
|
|
|
78
78
|
Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
|
|
79
79
|
Requires-Dist: packaging>=20.0; extra == "pt"
|
|
80
80
|
Requires-Dist: Pillow>=10.0.0; extra == "pt"
|
|
81
|
-
Requires-Dist: pypdf>=
|
|
81
|
+
Requires-Dist: pypdf>=6.0.0; extra == "pt"
|
|
82
82
|
Requires-Dist: pypdfium2>=4.30.0; extra == "pt"
|
|
83
83
|
Requires-Dist: pyyaml>=6.0.1; extra == "pt"
|
|
84
84
|
Requires-Dist: pyzmq>=16; extra == "pt"
|
|
@@ -25,7 +25,7 @@ from .utils.logger import LoggingRecord, logger
|
|
|
25
25
|
|
|
26
26
|
# pylint: enable=wrong-import-position
|
|
27
27
|
|
|
28
|
-
__version__ = "0.43.6"
|
|
28
|
+
__version__ = "0.44.1"
|
|
29
29
|
|
|
30
30
|
_IMPORT_STRUCTURE = {
|
|
31
31
|
"analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
|
|
@@ -92,6 +92,7 @@ _IMPORT_STRUCTURE = {
|
|
|
92
92
|
"convert_pdf_bytes_to_np_array_v2",
|
|
93
93
|
"as_dict",
|
|
94
94
|
"ImageAnnotationBaseView",
|
|
95
|
+
"MetaAnnotation",
|
|
95
96
|
"Image",
|
|
96
97
|
"Word",
|
|
97
98
|
"Layout",
|
|
@@ -105,6 +106,7 @@ _IMPORT_STRUCTURE = {
|
|
|
105
106
|
"DatasetAdapter",
|
|
106
107
|
"DatasetBase",
|
|
107
108
|
"MergeDataset",
|
|
109
|
+
"DatasetCard",
|
|
108
110
|
"CustomDataset",
|
|
109
111
|
"DataFlowBaseBuilder",
|
|
110
112
|
"DatasetInfo",
|
|
@@ -313,6 +315,8 @@ _IMPORT_STRUCTURE = {
|
|
|
313
315
|
"get_apted_requirement",
|
|
314
316
|
"distance_available",
|
|
315
317
|
"get_distance_requirement",
|
|
318
|
+
"numpy_v1_available",
|
|
319
|
+
"get_numpy_v1_requirement",
|
|
316
320
|
"transformers_available",
|
|
317
321
|
"get_transformers_requirement",
|
|
318
322
|
"detectron2_available",
|
|
@@ -25,7 +25,7 @@ from collections import defaultdict
|
|
|
25
25
|
from dataclasses import dataclass, field
|
|
26
26
|
from os import environ, fspath
|
|
27
27
|
from pathlib import Path
|
|
28
|
-
from typing import Any, Optional, Sequence, Union, no_type_check
|
|
28
|
+
from typing import Any, Optional, Sequence, TypedDict, Union, no_type_check
|
|
29
29
|
|
|
30
30
|
import numpy as np
|
|
31
31
|
from numpy import uint8
|
|
@@ -40,6 +40,55 @@ from .box import crop_box_from_image, global_to_local_coords, intersection_box
|
|
|
40
40
|
from .convert import as_dict, convert_b64_to_np_array, convert_np_array_to_b64, convert_pdf_bytes_to_np_array_v2
|
|
41
41
|
|
|
42
42
|
|
|
43
|
+
class MetaAnnotationDict(TypedDict):
|
|
44
|
+
"""MetaAnnotationDict"""
|
|
45
|
+
|
|
46
|
+
image_annotations: list[str]
|
|
47
|
+
sub_categories: dict[str, dict[str, list[str]]]
|
|
48
|
+
relationships: dict[str, list[str]]
|
|
49
|
+
summaries: list[str]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass(frozen=True)
|
|
53
|
+
class MetaAnnotation:
|
|
54
|
+
"""
|
|
55
|
+
An immutable dataclass that stores information about what `Image` are being
|
|
56
|
+
modified through a pipeline component.
|
|
57
|
+
|
|
58
|
+
Attributes:
|
|
59
|
+
image_annotations: Tuple of `ObjectTypes` representing image annotations.
|
|
60
|
+
sub_categories: Dictionary mapping `ObjectTypes` to dicts of `ObjectTypes` to sets of `ObjectTypes`
|
|
61
|
+
for sub-categories.
|
|
62
|
+
relationships: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for relationships.
|
|
63
|
+
summaries: Tuple of `ObjectTypes` representing summaries.
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
image_annotations: tuple[ObjectTypes, ...] = field(default=())
|
|
67
|
+
sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = field(default_factory=dict)
|
|
68
|
+
relationships: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
|
|
69
|
+
summaries: tuple[ObjectTypes, ...] = field(default=())
|
|
70
|
+
|
|
71
|
+
def as_dict(self) -> MetaAnnotationDict:
|
|
72
|
+
"""
|
|
73
|
+
Returns the MetaAnnotation as a dictionary, with all `ObjectTypes` converted to strings.
|
|
74
|
+
|
|
75
|
+
Returns:
|
|
76
|
+
A dictionary representation of the MetaAnnotation where all `ObjectTypes` are converted to strings.
|
|
77
|
+
"""
|
|
78
|
+
return {
|
|
79
|
+
"image_annotations": [obj.value for obj in self.image_annotations],
|
|
80
|
+
"sub_categories": {
|
|
81
|
+
outer_key.value: {
|
|
82
|
+
inner_key.value: [val.value for val in inner_values]
|
|
83
|
+
for inner_key, inner_values in outer_value.items()
|
|
84
|
+
}
|
|
85
|
+
for outer_key, outer_value in self.sub_categories.items()
|
|
86
|
+
},
|
|
87
|
+
"relationships": {key.value: [val.value for val in values] for key, values in self.relationships.items()},
|
|
88
|
+
"summaries": [obj.value for obj in self.summaries],
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
|
|
43
92
|
@dataclass
|
|
44
93
|
class Image:
|
|
45
94
|
"""
|
|
@@ -42,13 +42,60 @@ from ..utils.settings import (
|
|
|
42
42
|
get_type,
|
|
43
43
|
)
|
|
44
44
|
from ..utils.transform import ResizeTransform, box_to_point4, point4_to_box
|
|
45
|
-
from ..utils.types import HTML, AnnotationDict, Chunks, ImageDict, PathLikeOrStr, PixelValues,
|
|
45
|
+
from ..utils.types import HTML, AnnotationDict, Chunks, ImageDict, PathLikeOrStr, PixelValues, csv
|
|
46
46
|
from ..utils.viz import draw_boxes, interactive_imshow, viz_handler
|
|
47
47
|
from .annotation import CategoryAnnotation, ContainerAnnotation, ImageAnnotation, ann_from_dict
|
|
48
48
|
from .box import BoundingBox, crop_box_from_image
|
|
49
49
|
from .image import Image
|
|
50
50
|
|
|
51
51
|
|
|
52
|
+
@dataclass(frozen=True)
|
|
53
|
+
class Text_:
|
|
54
|
+
"""
|
|
55
|
+
Immutable dataclass for storing structured text extraction results.
|
|
56
|
+
|
|
57
|
+
Attributes:
|
|
58
|
+
text: The concatenated text string.
|
|
59
|
+
words: List of word strings.
|
|
60
|
+
ann_ids: List of annotation IDs for each word.
|
|
61
|
+
token_classes: List of token class names for each word.
|
|
62
|
+
token_class_ann_ids: List of annotation IDs for each token class.
|
|
63
|
+
token_tags: List of token tag names for each word.
|
|
64
|
+
token_tag_ann_ids: List of annotation IDs for each token tag.
|
|
65
|
+
token_class_ids: List of token class IDs.
|
|
66
|
+
token_tag_ids: List of token tag IDs.
|
|
67
|
+
"""
|
|
68
|
+
|
|
69
|
+
text: str = ""
|
|
70
|
+
words: list[str] = field(default_factory=list)
|
|
71
|
+
ann_ids: list[str] = field(default_factory=list)
|
|
72
|
+
token_classes: list[str] = field(default_factory=list)
|
|
73
|
+
token_class_ann_ids: list[str] = field(default_factory=list)
|
|
74
|
+
token_tags: list[str] = field(default_factory=list)
|
|
75
|
+
token_tag_ann_ids: list[str] = field(default_factory=list)
|
|
76
|
+
token_class_ids: list[str] = field(default_factory=list)
|
|
77
|
+
token_tag_ids: list[str] = field(default_factory=list)
|
|
78
|
+
|
|
79
|
+
def as_dict(self) -> dict[str, Union[list[str], str]]:
|
|
80
|
+
"""
|
|
81
|
+
Returns the Text_ as a dictionary.
|
|
82
|
+
|
|
83
|
+
Returns:
|
|
84
|
+
A dictionary representation of the Text_ dataclass.
|
|
85
|
+
"""
|
|
86
|
+
return {
|
|
87
|
+
"text": self.text,
|
|
88
|
+
"words": self.words,
|
|
89
|
+
"ann_ids": self.ann_ids,
|
|
90
|
+
"token_classes": self.token_classes,
|
|
91
|
+
"token_class_ann_ids": self.token_class_ann_ids,
|
|
92
|
+
"token_tags": self.token_tags,
|
|
93
|
+
"token_tag_ann_ids": self.token_tag_ann_ids,
|
|
94
|
+
"token_class_ids": self.token_class_ids,
|
|
95
|
+
"token_tag_ids": self.token_tag_ids,
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
|
|
52
99
|
class ImageAnnotationBaseView(ImageAnnotation):
|
|
53
100
|
"""
|
|
54
101
|
Consumption class for having easier access to categories added to an `ImageAnnotation`.
|
|
@@ -263,13 +310,28 @@ class Layout(ImageAnnotationBaseView):
|
|
|
263
310
|
"""
|
|
264
311
|
words = self.get_ordered_words()
|
|
265
312
|
if words:
|
|
266
|
-
|
|
313
|
+
(
|
|
314
|
+
characters,
|
|
315
|
+
ann_ids,
|
|
316
|
+
token_classes,
|
|
317
|
+
token_class_ann_ids,
|
|
318
|
+
token_tags,
|
|
319
|
+
token_tag_ann_ids,
|
|
320
|
+
token_classes_ids,
|
|
321
|
+
token_tag_ids,
|
|
322
|
+
) = map(list, zip(
|
|
267
323
|
*[
|
|
268
324
|
(
|
|
269
325
|
word.characters,
|
|
270
326
|
word.annotation_id,
|
|
271
327
|
word.token_class,
|
|
328
|
+
word.get_sub_category(WordType.TOKEN_CLASS).annotation_id
|
|
329
|
+
if WordType.TOKEN_CLASS in word.sub_categories
|
|
330
|
+
else None,
|
|
272
331
|
word.token_tag,
|
|
332
|
+
word.get_sub_category(WordType.TOKEN_TAG).annotation_id
|
|
333
|
+
if WordType.TOKEN_TAG in word.sub_categories
|
|
334
|
+
else None,
|
|
273
335
|
word.get_sub_category(WordType.TOKEN_CLASS).category_id
|
|
274
336
|
if WordType.TOKEN_CLASS in word.sub_categories
|
|
275
337
|
else None,
|
|
@@ -279,25 +341,40 @@ class Layout(ImageAnnotationBaseView):
|
|
|
279
341
|
)
|
|
280
342
|
for word in words
|
|
281
343
|
]
|
|
282
|
-
)
|
|
344
|
+
))
|
|
283
345
|
else:
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
346
|
+
(
|
|
347
|
+
characters,
|
|
348
|
+
ann_ids,
|
|
349
|
+
token_classes,
|
|
350
|
+
token_class_ann_ids,
|
|
351
|
+
token_tags,
|
|
352
|
+
token_tag_ann_ids,
|
|
353
|
+
token_classes_ids,
|
|
354
|
+
token_tag_ids,
|
|
355
|
+
) = (
|
|
356
|
+
[],
|
|
357
|
+
[],
|
|
358
|
+
[],
|
|
359
|
+
[],
|
|
360
|
+
[],
|
|
361
|
+
[],
|
|
362
|
+
[],
|
|
363
|
+
[],
|
|
291
364
|
)
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
"
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
365
|
+
|
|
366
|
+
return Text_(
|
|
367
|
+
text=" ".join(characters), # type: ignore
|
|
368
|
+
words=characters, # type: ignore
|
|
369
|
+
ann_ids=ann_ids, # type: ignore
|
|
370
|
+
token_classes=token_classes, # type: ignore
|
|
371
|
+
token_class_ann_ids=token_class_ann_ids, # type: ignore
|
|
372
|
+
token_tags=token_tags, # type: ignore
|
|
373
|
+
token_tag_ann_ids=token_tag_ann_ids, # type: ignore
|
|
374
|
+
token_class_ids=token_classes_ids, # type: ignore
|
|
375
|
+
token_tag_ids=token_tag_ids, # type: ignore
|
|
376
|
+
)
|
|
377
|
+
|
|
301
378
|
|
|
302
379
|
def get_attribute_names(self) -> set[str]:
|
|
303
380
|
attr_names = (
|
|
@@ -590,14 +667,16 @@ class Table(Layout):
|
|
|
590
667
|
|
|
591
668
|
@property
|
|
592
669
|
def csv_(self) -> list[list[list[Text_]]]:
|
|
670
|
+
"""
|
|
671
|
+
Returns:
|
|
672
|
+
A csv-style representation of a table as list of lists of cell.text_.
|
|
673
|
+
"""
|
|
593
674
|
cells = self.cells
|
|
594
675
|
table_list = [[[] for _ in range(self.number_of_columns)] for _ in range(self.number_of_rows)] # type: ignore
|
|
595
676
|
for cell in cells:
|
|
596
677
|
table_list[cell.row_number - 1][cell.column_number - 1].append(cell.text_) # type: ignore
|
|
597
678
|
return table_list
|
|
598
679
|
|
|
599
|
-
|
|
600
|
-
|
|
601
680
|
def __str__(self) -> str:
|
|
602
681
|
out = " ".join([" ".join(row + ["\n"]) for row in self.csv])
|
|
603
682
|
return out
|
|
@@ -624,26 +703,34 @@ class Table(Layout):
|
|
|
624
703
|
words: list[str] = []
|
|
625
704
|
ann_ids: list[str] = []
|
|
626
705
|
token_classes: list[str] = []
|
|
706
|
+
token_class_ann_ids: list[str] = []
|
|
627
707
|
token_tags: list[str] = []
|
|
708
|
+
token_tag_ann_ids: list[str] = []
|
|
628
709
|
token_class_ids: list[str] = []
|
|
629
710
|
token_tag_ids: list[str] = []
|
|
630
711
|
for cell in cells:
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
"
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
712
|
+
text_ = cell.text_
|
|
713
|
+
text.append(text_.text)
|
|
714
|
+
words.extend(text_.words)
|
|
715
|
+
ann_ids.extend(text_.ann_ids)
|
|
716
|
+
token_classes.extend(text_.token_classes)
|
|
717
|
+
token_class_ann_ids.extend(text_.token_class_ann_ids)
|
|
718
|
+
token_tags.extend(text_.token_tags)
|
|
719
|
+
token_tag_ann_ids.extend(text_.token_tag_ann_ids)
|
|
720
|
+
token_class_ids.extend(text_.token_class_ids)
|
|
721
|
+
token_tag_ids.extend(text_.token_tag_ids)
|
|
722
|
+
return Text_(
|
|
723
|
+
text=" ".join(text),
|
|
724
|
+
words=words,
|
|
725
|
+
ann_ids=ann_ids,
|
|
726
|
+
token_classes=token_classes,
|
|
727
|
+
token_class_ann_ids=token_class_ann_ids,
|
|
728
|
+
token_tags=token_tags,
|
|
729
|
+
token_tag_ann_ids=token_tag_ann_ids,
|
|
730
|
+
token_class_ids=token_class_ids,
|
|
731
|
+
token_tag_ids=token_tag_ids,
|
|
732
|
+
)
|
|
733
|
+
|
|
647
734
|
|
|
648
735
|
@property
|
|
649
736
|
def words(self) -> list[ImageAnnotationBaseView]:
|
|
@@ -1051,7 +1138,7 @@ class Page(Image):
|
|
|
1051
1138
|
|
|
1052
1139
|
```python
|
|
1053
1140
|
{"text": text string,
|
|
1054
|
-
"
|
|
1141
|
+
"words": list of single words,
|
|
1055
1142
|
"annotation_ids": word annotation ids}
|
|
1056
1143
|
```
|
|
1057
1144
|
"""
|
|
@@ -1060,26 +1147,34 @@ class Page(Image):
|
|
|
1060
1147
|
words: list[str] = []
|
|
1061
1148
|
ann_ids: list[str] = []
|
|
1062
1149
|
token_classes: list[str] = []
|
|
1150
|
+
token_class_ann_ids: list[str] = []
|
|
1063
1151
|
token_tags: list[str] = []
|
|
1152
|
+
token_tag_ann_ids: list[str] = []
|
|
1064
1153
|
token_class_ids: list[str] = []
|
|
1065
1154
|
token_tag_ids: list[str] = []
|
|
1066
1155
|
for block in block_with_order:
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
"
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1156
|
+
text_ = block.text_
|
|
1157
|
+
text.append(text_.text) # type: ignore
|
|
1158
|
+
words.extend(text_.words) # type: ignore
|
|
1159
|
+
ann_ids.extend(text_.ann_ids) # type: ignore
|
|
1160
|
+
token_classes.extend(text_.token_classes) # type: ignore
|
|
1161
|
+
token_class_ann_ids.extend(text_.token_class_ann_ids) # type: ignore
|
|
1162
|
+
token_tags.extend(text_.token_tags) # type: ignore
|
|
1163
|
+
token_tag_ann_ids.extend(text_.token_tag_ann_ids) # type: ignore
|
|
1164
|
+
token_class_ids.extend(text_.token_class_ids) # type: ignore
|
|
1165
|
+
token_tag_ids.extend(text_.token_tag_ids) # type: ignore
|
|
1166
|
+
return Text_(
|
|
1167
|
+
text=" ".join(text),
|
|
1168
|
+
words=words,
|
|
1169
|
+
ann_ids=ann_ids,
|
|
1170
|
+
token_classes=token_classes,
|
|
1171
|
+
token_class_ann_ids=token_class_ann_ids,
|
|
1172
|
+
token_tags=token_tags,
|
|
1173
|
+
token_tag_ann_ids=token_tag_ann_ids,
|
|
1174
|
+
token_class_ids=token_class_ids,
|
|
1175
|
+
token_tag_ids=token_tag_ann_ids,
|
|
1176
|
+
)
|
|
1177
|
+
|
|
1083
1178
|
|
|
1084
1179
|
def get_layout_context(self, annotation_id: str, context_size: int = 3) -> list[ImageAnnotationBaseView]:
|
|
1085
1180
|
"""
|