deepdoctection 0.33__tar.gz → 0.34__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- {deepdoctection-0.33 → deepdoctection-0.34}/PKG-INFO +4 -1
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/__init__.py +6 -3
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/analyzer/dd.py +39 -31
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datapoint/annotation.py +40 -2
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datapoint/image.py +117 -41
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datapoint/view.py +1 -1
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/base.py +1 -1
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/fintabnet.py +1 -1
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/xfund.py +29 -7
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/eval/eval.py +7 -1
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/model.py +2 -1
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/cats.py +11 -13
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/cocostruct.py +6 -2
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/d2struct.py +2 -1
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/laylmstruct.py +1 -1
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/match.py +31 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/misc.py +1 -1
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/prodigystruct.py +1 -1
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/anngen.py +27 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/base.py +23 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/common.py +123 -38
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/segment.py +1 -1
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/sub_layout.py +1 -1
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/env_info.py +1 -1
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/fs.py +27 -4
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/pdf_utils.py +28 -3
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/settings.py +3 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection.egg-info/PKG-INFO +4 -1
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection.egg-info/requires.txt +3 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/setup.cfg +3 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/setup.py +3 -1
- {deepdoctection-0.33 → deepdoctection-0.34}/LICENSE +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/README.md +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/analyzer/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/configs/conf_dd_one.yaml +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/dataflow/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/dataflow/base.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/dataflow/common.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/dataflow/custom.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/dataflow/custom_serialize.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/dataflow/parallel_map.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/dataflow/serialize.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/dataflow/stats.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datapoint/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datapoint/box.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datapoint/convert.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/adapter.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/dataflow_builder.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/info.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/doclaynet.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/funsd.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/iiitar13k.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/layouttest.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/publaynet.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/pubtables1m.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/pubtabnet.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/rvlcdip.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/registry.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/save.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/eval/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/eval/accmetric.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/eval/base.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/eval/cocometric.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/eval/registry.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/eval/tedsmetric.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/eval/tp_eval_callback.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/base.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/d2detect.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/deskew.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/doctrocr.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/fastlang.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/hfdetr.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/hflayoutlm.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/hflm.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/pdftext.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/pt/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/pt/nms.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/pt/ptutils.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tessocr.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/texocr.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tfutils.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpcompat.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tpdetect.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/hfstruct.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/maputils.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/pascalstruct.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/pubstruct.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/tpstruct.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/xfundstruct.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/concurrency.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/doctectionpipe.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/language.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/layout.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/lm.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/order.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/refine.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/registry.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/text.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/transform.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/py.typed +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/train/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/train/d2_frcnn_train.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/train/hf_detr_train.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/train/hf_layoutlm_train.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/train/tp_frcnn_train.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/concurrency.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/context.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/develop.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/error.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/file_utils.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/identifier.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/logger.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/metacfg.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/mocks.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/tqdm.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/transform.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/types.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/utils.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/viz.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection.egg-info/SOURCES.txt +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection.egg-info/top_level.txt +0 -0
- {deepdoctection-0.33 → deepdoctection-0.34}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: deepdoctection
|
|
3
|
-
Version: 0.33
|
|
3
|
+
Version: 0.34
|
|
4
4
|
Summary: Repository for Document AI
|
|
5
5
|
Home-page: https://github.com/deepdoctection/deepdoctection
|
|
6
6
|
Author: Dr. Janis Meyer
|
|
@@ -29,6 +29,7 @@ Requires-Dist: Pillow>=10.0.0
|
|
|
29
29
|
Requires-Dist: pypdf>=3.16.0
|
|
30
30
|
Requires-Dist: pyyaml>=6.0.1
|
|
31
31
|
Requires-Dist: pyzmq>=16
|
|
32
|
+
Requires-Dist: scipy>=1.13.1
|
|
32
33
|
Requires-Dist: termcolor>=1.1
|
|
33
34
|
Requires-Dist: tabulate>=0.7.7
|
|
34
35
|
Requires-Dist: tqdm==4.64.0
|
|
@@ -46,6 +47,7 @@ Requires-Dist: Pillow>=10.0.0; extra == "tf"
|
|
|
46
47
|
Requires-Dist: pypdf>=3.16.0; extra == "tf"
|
|
47
48
|
Requires-Dist: pyyaml>=6.0.1; extra == "tf"
|
|
48
49
|
Requires-Dist: pyzmq>=16; extra == "tf"
|
|
50
|
+
Requires-Dist: scipy>=1.13.1; extra == "tf"
|
|
49
51
|
Requires-Dist: termcolor>=1.1; extra == "tf"
|
|
50
52
|
Requires-Dist: tabulate>=0.7.7; extra == "tf"
|
|
51
53
|
Requires-Dist: tqdm==4.64.0; extra == "tf"
|
|
@@ -76,6 +78,7 @@ Requires-Dist: Pillow>=10.0.0; extra == "pt"
|
|
|
76
78
|
Requires-Dist: pypdf>=3.16.0; extra == "pt"
|
|
77
79
|
Requires-Dist: pyyaml>=6.0.1; extra == "pt"
|
|
78
80
|
Requires-Dist: pyzmq>=16; extra == "pt"
|
|
81
|
+
Requires-Dist: scipy>=1.13.1; extra == "pt"
|
|
79
82
|
Requires-Dist: termcolor>=1.1; extra == "pt"
|
|
80
83
|
Requires-Dist: tabulate>=0.7.7; extra == "pt"
|
|
81
84
|
Requires-Dist: tqdm==4.64.0; extra == "pt"
|
|
@@ -15,7 +15,6 @@ if importlib.util.find_spec("dotenv") is not None:
|
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
# pylint: disable=wrong-import-position
|
|
18
|
-
import os
|
|
19
18
|
import sys
|
|
20
19
|
from typing import TYPE_CHECKING
|
|
21
20
|
|
|
@@ -25,11 +24,10 @@ from .utils.logger import LoggingRecord, logger
|
|
|
25
24
|
|
|
26
25
|
# pylint: enable=wrong-import-position
|
|
27
26
|
|
|
28
|
-
__version__ = 0.33
|
|
27
|
+
__version__ = 0.34
|
|
29
28
|
|
|
30
29
|
_IMPORT_STRUCTURE = {
|
|
31
30
|
"analyzer": [
|
|
32
|
-
"maybe_copy_config_to_cache",
|
|
33
31
|
"config_sanity_checks",
|
|
34
32
|
"build_detector",
|
|
35
33
|
"build_padder",
|
|
@@ -76,6 +74,7 @@ _IMPORT_STRUCTURE = {
|
|
|
76
74
|
],
|
|
77
75
|
"datapoint": [
|
|
78
76
|
"ann_from_dict",
|
|
77
|
+
"AnnotationMap",
|
|
79
78
|
"Annotation",
|
|
80
79
|
"CategoryAnnotation",
|
|
81
80
|
"ImageAnnotation",
|
|
@@ -237,6 +236,7 @@ _IMPORT_STRUCTURE = {
|
|
|
237
236
|
"LabelSummarizer",
|
|
238
237
|
"curry",
|
|
239
238
|
"match_anns_by_intersection",
|
|
239
|
+
"match_anns_by_distance",
|
|
240
240
|
"to_image",
|
|
241
241
|
"maybe_load_image",
|
|
242
242
|
"maybe_remove_image",
|
|
@@ -265,6 +265,8 @@ _IMPORT_STRUCTURE = {
|
|
|
265
265
|
"DetectResultGenerator",
|
|
266
266
|
"SubImageLayoutService",
|
|
267
267
|
"ImageCroppingService",
|
|
268
|
+
"IntersectionMatcher",
|
|
269
|
+
"NeighbourMatcher",
|
|
268
270
|
"MatchingService",
|
|
269
271
|
"PageParsingService",
|
|
270
272
|
"AnnotationNmsService",
|
|
@@ -364,6 +366,7 @@ _IMPORT_STRUCTURE = {
|
|
|
364
366
|
"get_configs_dir_path",
|
|
365
367
|
"get_weights_dir_path",
|
|
366
368
|
"get_dataset_dir_path",
|
|
369
|
+
"maybe_copy_config_to_cache",
|
|
367
370
|
"is_uuid_like",
|
|
368
371
|
"get_uuid_from_str",
|
|
369
372
|
"get_uuid",
|
|
@@ -27,7 +27,6 @@ from __future__ import annotations
|
|
|
27
27
|
|
|
28
28
|
import os
|
|
29
29
|
from os import environ
|
|
30
|
-
from shutil import copyfile
|
|
31
30
|
from typing import Optional, Union
|
|
32
31
|
|
|
33
32
|
from lazy_imports import try_import
|
|
@@ -44,7 +43,7 @@ from ..extern.texocr import TextractOcrDetector
|
|
|
44
43
|
from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
|
|
45
44
|
from ..extern.tpdetect import TPFrcnnDetector
|
|
46
45
|
from ..pipe.base import PipelineComponent
|
|
47
|
-
from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
|
|
46
|
+
from ..pipe.common import AnnotationNmsService, IntersectionMatcher, MatchingService, PageParsingService
|
|
48
47
|
from ..pipe.doctectionpipe import DoctectionPipe
|
|
49
48
|
from ..pipe.layout import ImageLayoutService
|
|
50
49
|
from ..pipe.order import TextOrderService
|
|
@@ -55,10 +54,10 @@ from ..pipe.text import TextExtractionService
|
|
|
55
54
|
from ..utils.env_info import ENV_VARS_TRUE
|
|
56
55
|
from ..utils.error import DependencyError
|
|
57
56
|
from ..utils.file_utils import detectron2_available, tensorpack_available
|
|
58
|
-
from ..utils.fs import get_configs_dir_path, get_package_path,
|
|
57
|
+
from ..utils.fs import get_configs_dir_path, get_package_path, maybe_copy_config_to_cache
|
|
59
58
|
from ..utils.logger import LoggingRecord, logger
|
|
60
59
|
from ..utils.metacfg import AttrDict, set_config_by_yaml
|
|
61
|
-
from ..utils.settings import CellType, LayoutType
|
|
60
|
+
from ..utils.settings import CellType, LayoutType, Relationships
|
|
62
61
|
from ..utils.transform import PadTransform
|
|
63
62
|
from ..utils.types import PathLikeOrStr
|
|
64
63
|
|
|
@@ -67,7 +66,6 @@ with try_import() as image_guard:
|
|
|
67
66
|
|
|
68
67
|
|
|
69
68
|
__all__ = [
|
|
70
|
-
"maybe_copy_config_to_cache",
|
|
71
69
|
"config_sanity_checks",
|
|
72
70
|
"build_detector",
|
|
73
71
|
"build_padder",
|
|
@@ -77,31 +75,37 @@ __all__ = [
|
|
|
77
75
|
"build_doctr_word",
|
|
78
76
|
"get_dd_analyzer",
|
|
79
77
|
"build_analyzer",
|
|
78
|
+
"set_config_by_yaml",
|
|
80
79
|
]
|
|
81
80
|
|
|
82
81
|
_DD_ONE = "deepdoctection/configs/conf_dd_one.yaml"
|
|
83
82
|
_TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
:
|
|
97
|
-
"""
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
83
|
+
_MODEL_CHOICES = {
|
|
84
|
+
"layout": [
|
|
85
|
+
"layout/d2_model_0829999_layout_inf_only.pt",
|
|
86
|
+
"xrf_layout/model_final_inf_only.pt",
|
|
87
|
+
"microsoft/table-transformer-detection/pytorch_model.bin",
|
|
88
|
+
],
|
|
89
|
+
"segmentation": [
|
|
90
|
+
"item/model-1620000_inf_only.data-00000-of-00001",
|
|
91
|
+
"xrf_item/model_final_inf_only.pt",
|
|
92
|
+
"microsoft/table-transformer-structure-recognition/pytorch_model.bin",
|
|
93
|
+
"deepdoctection/tatr_tab_struct_v2/pytorch_model.bin",
|
|
94
|
+
],
|
|
95
|
+
"ocr": ["Tesseract", "DocTr", "Textract"],
|
|
96
|
+
"doctr_word": ["doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"],
|
|
97
|
+
"doctr_recognition": [
|
|
98
|
+
"doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt",
|
|
99
|
+
"doctr/crnn_vgg16_bn/pt/pytorch_model.bin",
|
|
100
|
+
],
|
|
101
|
+
"llm": ["gpt-3.5-turbo", "gpt-4"],
|
|
102
|
+
"segmentation_choices": {
|
|
103
|
+
"item/model-1620000_inf_only.data-00000-of-00001": "cell/model-1800000_inf_only.data-00000-of-00001",
|
|
104
|
+
"xrf_item/model_final_inf_only.pt": "xrf_cell/model_final_inf_only.pt",
|
|
105
|
+
"microsoft/table-transformer-structure-recognition/pytorch_model.bin": None,
|
|
106
|
+
"deepdoctection/tatr_tab_struct_v2/pytorch_model.bin": None,
|
|
107
|
+
},
|
|
108
|
+
}
|
|
105
109
|
|
|
106
110
|
|
|
107
111
|
def config_sanity_checks(cfg: AttrDict) -> None:
|
|
@@ -375,13 +379,17 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
375
379
|
pipe_component_list.append(text)
|
|
376
380
|
|
|
377
381
|
if cfg.USE_PDF_MINER or cfg.USE_OCR:
|
|
378
|
-
|
|
379
|
-
parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
|
|
380
|
-
child_categories=LayoutType.WORD,
|
|
382
|
+
matcher = IntersectionMatcher(
|
|
381
383
|
matching_rule=cfg.WORD_MATCHING.RULE,
|
|
382
384
|
threshold=cfg.WORD_MATCHING.THRESHOLD,
|
|
383
385
|
max_parent_only=cfg.WORD_MATCHING.MAX_PARENT_ONLY,
|
|
384
386
|
)
|
|
387
|
+
match = MatchingService(
|
|
388
|
+
parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
|
|
389
|
+
child_categories=LayoutType.WORD,
|
|
390
|
+
matcher=matcher,
|
|
391
|
+
relationship_key=Relationships.CHILD,
|
|
392
|
+
)
|
|
385
393
|
pipe_component_list.append(match)
|
|
386
394
|
|
|
387
395
|
order = TextOrderService(
|
|
@@ -444,9 +452,9 @@ def get_dd_analyzer(
|
|
|
444
452
|
else:
|
|
445
453
|
raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
|
|
446
454
|
dd_one_config_path = maybe_copy_config_to_cache(
|
|
447
|
-
get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
|
|
455
|
+
get_package_path(), get_configs_dir_path() / "dd", _DD_ONE, reset_config_file
|
|
448
456
|
)
|
|
449
|
-
maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path(), _TESSERACT)
|
|
457
|
+
maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path() / "dd", _TESSERACT)
|
|
450
458
|
|
|
451
459
|
# Set up of the configuration and logging
|
|
452
460
|
cfg = set_config_by_yaml(dd_one_config_path if not path_config_file else path_config_file)
|
|
@@ -21,6 +21,7 @@ Dataclass for annotations and their derived classes.
|
|
|
21
21
|
from __future__ import annotations
|
|
22
22
|
|
|
23
23
|
from abc import ABC, abstractmethod
|
|
24
|
+
from collections import defaultdict
|
|
24
25
|
from dataclasses import dataclass, field
|
|
25
26
|
from typing import Optional, Union, no_type_check
|
|
26
27
|
|
|
@@ -66,6 +67,16 @@ def ann_from_dict(cls, **kwargs: AnnotationDict):
|
|
|
66
67
|
return ann
|
|
67
68
|
|
|
68
69
|
|
|
70
|
+
@dataclass(frozen=True)
|
|
71
|
+
class AnnotationMap:
|
|
72
|
+
"""AnnotationMap to store all sub categories, relationship keys and summary keys of an annotation"""
|
|
73
|
+
|
|
74
|
+
image_annotation_id: str
|
|
75
|
+
sub_category_key: Optional[ObjectTypes] = None
|
|
76
|
+
relationship_key: Optional[ObjectTypes] = None
|
|
77
|
+
summary_key: Optional[ObjectTypes] = None
|
|
78
|
+
|
|
79
|
+
|
|
69
80
|
@dataclass
|
|
70
81
|
class Annotation(ABC):
|
|
71
82
|
"""
|
|
@@ -397,7 +408,8 @@ class CategoryAnnotation(Annotation):
|
|
|
397
408
|
except ValueError:
|
|
398
409
|
logger.warning(LoggingRecord(f"Relationship {key} cannot be removed because it does not exist"))
|
|
399
410
|
else:
|
|
400
|
-
self.relationships
|
|
411
|
+
if key in self.relationships:
|
|
412
|
+
self.relationships[key].clear()
|
|
401
413
|
|
|
402
414
|
def get_defining_attributes(self) -> list[str]:
|
|
403
415
|
return ["category_name", "category_id"]
|
|
@@ -409,7 +421,7 @@ class CategoryAnnotation(Annotation):
|
|
|
409
421
|
|
|
410
422
|
:return: list of attributes.
|
|
411
423
|
"""
|
|
412
|
-
return []
|
|
424
|
+
return ["_category_name"]
|
|
413
425
|
|
|
414
426
|
@classmethod
|
|
415
427
|
def from_dict(cls, **kwargs: AnnotationDict) -> CategoryAnnotation:
|
|
@@ -470,6 +482,32 @@ class ImageAnnotation(CategoryAnnotation):
|
|
|
470
482
|
return self.image.summary.get_sub_category(key)
|
|
471
483
|
raise AnnotationError(f"Summary does not exist for {self.annotation_id} and key: {key}")
|
|
472
484
|
|
|
485
|
+
def get_annotation_map(self) -> defaultdict[str, list[AnnotationMap]]:
|
|
486
|
+
"""
|
|
487
|
+
Returns a defaultdict with annotation ids as keys and a list of AnnotationMap instances as values for all sub
|
|
488
|
+
categories, relationships and image summaries.
|
|
489
|
+
:return: defaultdict with annotation ids as keys and a list of AnnotationMap instances as values.
|
|
490
|
+
"""
|
|
491
|
+
annotation_id_dict = defaultdict(list)
|
|
492
|
+
annotation_id_dict[self.annotation_id].append(AnnotationMap(image_annotation_id=self.annotation_id))
|
|
493
|
+
for sub_cat_key in self.sub_categories:
|
|
494
|
+
sub_cat = self.get_sub_category(sub_cat_key)
|
|
495
|
+
annotation_id_dict[sub_cat.annotation_id].append(
|
|
496
|
+
AnnotationMap(image_annotation_id=self.annotation_id, sub_category_key=sub_cat_key)
|
|
497
|
+
)
|
|
498
|
+
if self.image is not None:
|
|
499
|
+
for summary_cat_key in self.image.summary.sub_categories:
|
|
500
|
+
summary_cat = self.get_summary(summary_cat_key)
|
|
501
|
+
annotation_id_dict[summary_cat.annotation_id].append(
|
|
502
|
+
AnnotationMap(image_annotation_id=self.annotation_id, summary_key=summary_cat_key)
|
|
503
|
+
)
|
|
504
|
+
for rel_key in self.relationships:
|
|
505
|
+
for rel_ann_ids in self.get_relationship(rel_key):
|
|
506
|
+
annotation_id_dict[rel_ann_ids].append(
|
|
507
|
+
AnnotationMap(image_annotation_id=self.annotation_id, relationship_key=rel_key)
|
|
508
|
+
)
|
|
509
|
+
return annotation_id_dict
|
|
510
|
+
|
|
473
511
|
|
|
474
512
|
@dataclass
|
|
475
513
|
class ContainerAnnotation(CategoryAnnotation):
|
|
@@ -21,10 +21,11 @@ Dataclass Image
|
|
|
21
21
|
from __future__ import annotations
|
|
22
22
|
|
|
23
23
|
import json
|
|
24
|
+
from collections import defaultdict
|
|
24
25
|
from dataclasses import dataclass, field
|
|
25
26
|
from os import environ
|
|
26
27
|
from pathlib import Path
|
|
27
|
-
from typing import Any, Iterable, Optional, Sequence, Union, no_type_check
|
|
28
|
+
from typing import Any, Optional, Sequence, Union, no_type_check
|
|
28
29
|
|
|
29
30
|
import numpy as np
|
|
30
31
|
from numpy import uint8
|
|
@@ -33,7 +34,7 @@ from ..utils.error import AnnotationError, BoundingBoxError, ImageError, UUIDErr
|
|
|
33
34
|
from ..utils.identifier import get_uuid, is_uuid_like
|
|
34
35
|
from ..utils.settings import ObjectTypes, SummaryType, get_type
|
|
35
36
|
from ..utils.types import ImageDict, PathLikeOrStr, PixelValues
|
|
36
|
-
from .annotation import Annotation, BoundingBox, CategoryAnnotation, ImageAnnotation
|
|
37
|
+
from .annotation import Annotation, AnnotationMap, BoundingBox, CategoryAnnotation, ImageAnnotation
|
|
37
38
|
from .box import crop_box_from_image, global_to_local_coords, intersection_box
|
|
38
39
|
from .convert import as_dict, convert_b64_to_np_array, convert_np_array_to_b64, convert_pdf_bytes_to_np_array_v2
|
|
39
40
|
|
|
@@ -303,6 +304,15 @@ class Image:
|
|
|
303
304
|
|
|
304
305
|
return self.embeddings[image_id]
|
|
305
306
|
|
|
307
|
+
def remove_embedding(self, image_id: str) -> None:
|
|
308
|
+
"""
|
|
309
|
+
Remove an embedding from the image.
|
|
310
|
+
|
|
311
|
+
:param image_id: uuid string of the embedding image
|
|
312
|
+
"""
|
|
313
|
+
if image_id in self.embeddings:
|
|
314
|
+
self.embeddings.pop(image_id)
|
|
315
|
+
|
|
306
316
|
def _self_embedding(self) -> None:
|
|
307
317
|
if self._bbox is not None:
|
|
308
318
|
self.set_embedding(self.image_id, self._bbox)
|
|
@@ -387,39 +397,6 @@ class Image:
|
|
|
387
397
|
|
|
388
398
|
return list(anns)
|
|
389
399
|
|
|
390
|
-
def get_annotation_iter(
|
|
391
|
-
self,
|
|
392
|
-
category_names: Optional[Union[str, ObjectTypes, Sequence[Union[str, ObjectTypes]]]] = None,
|
|
393
|
-
annotation_ids: Optional[Union[str, Sequence[str]]] = None,
|
|
394
|
-
service_id: Optional[Union[str, Sequence[str]]] = None,
|
|
395
|
-
model_id: Optional[Union[str, Sequence[str]]] = None,
|
|
396
|
-
session_ids: Optional[Union[str, Sequence[str]]] = None,
|
|
397
|
-
ignore_inactive: bool = True,
|
|
398
|
-
) -> Iterable[ImageAnnotation]:
|
|
399
|
-
"""
|
|
400
|
-
Get annotation as an iterator. Same as `get_annotation` but returns an iterator instead of a list.
|
|
401
|
-
|
|
402
|
-
:param category_names: A single name or list of names
|
|
403
|
-
:param annotation_ids: A single id or list of ids
|
|
404
|
-
:param service_id: A single service name or list of service names
|
|
405
|
-
:param model_id: A single model name or list of model names
|
|
406
|
-
:param session_ids: A single session id or list of session ids
|
|
407
|
-
:param ignore_inactive: If set to `True` only active annotations are returned.
|
|
408
|
-
|
|
409
|
-
:return: A (possibly empty) list of annotations
|
|
410
|
-
"""
|
|
411
|
-
|
|
412
|
-
return iter(
|
|
413
|
-
self.get_annotation(
|
|
414
|
-
category_names=category_names,
|
|
415
|
-
annotation_ids=annotation_ids,
|
|
416
|
-
service_id=service_id,
|
|
417
|
-
model_id=model_id,
|
|
418
|
-
session_ids=session_ids,
|
|
419
|
-
ignore_inactive=ignore_inactive,
|
|
420
|
-
)
|
|
421
|
-
)
|
|
422
|
-
|
|
423
400
|
def as_dict(self) -> dict[str, Any]:
|
|
424
401
|
"""
|
|
425
402
|
Returns the full image dataclass as dict. Uses the custom `convert.as_dict` to disregard attributes
|
|
@@ -441,7 +418,7 @@ class Image:
|
|
|
441
418
|
A list of attributes to suspend from as_dict creation.
|
|
442
419
|
"""
|
|
443
420
|
|
|
444
|
-
return ["_image"]
|
|
421
|
+
return ["_image", "_annotation_ids"]
|
|
445
422
|
|
|
446
423
|
def define_annotation_id(self, annotation: Annotation) -> str:
|
|
447
424
|
"""
|
|
@@ -456,7 +433,11 @@ class Image:
|
|
|
456
433
|
attributes_values = [str(getattr(annotation, attribute)) for attribute in attributes]
|
|
457
434
|
return get_uuid(*attributes_values, str(self.image_id))
|
|
458
435
|
|
|
459
|
-
def remove(
|
|
436
|
+
def remove(
|
|
437
|
+
self,
|
|
438
|
+
annotation_ids: Optional[Union[str, list[str]]] = None,
|
|
439
|
+
service_ids: Optional[Union[str, list[str]]] = None,
|
|
440
|
+
) -> None:
|
|
460
441
|
"""
|
|
461
442
|
Instead of removing consider deactivating annotations.
|
|
462
443
|
|
|
@@ -464,9 +445,66 @@ class Image:
|
|
|
464
445
|
|
|
465
446
|
:param annotation: The annotation to remove
|
|
466
447
|
"""
|
|
448
|
+
ann_id_to_annotation_maps = self.get_annotation_id_to_annotation_maps()
|
|
449
|
+
|
|
450
|
+
if annotation_ids is not None:
|
|
451
|
+
annotation_ids = [annotation_ids] if isinstance(annotation_ids, str) else annotation_ids
|
|
452
|
+
|
|
453
|
+
for ann_id in annotation_ids:
|
|
454
|
+
if ann_id not in ann_id_to_annotation_maps:
|
|
455
|
+
raise ImageError(f"Annotation with id {ann_id} not found")
|
|
456
|
+
annotation_maps = ann_id_to_annotation_maps[ann_id]
|
|
457
|
+
|
|
458
|
+
for annotation_map in annotation_maps:
|
|
459
|
+
self._remove_by_annotation_id(ann_id, annotation_map)
|
|
460
|
+
|
|
461
|
+
if service_ids is not None:
|
|
462
|
+
service_ids = [service_ids] if isinstance(service_ids, str) else service_ids
|
|
463
|
+
service_id_to_annotation_id = self.get_service_id_to_annotation_id()
|
|
464
|
+
|
|
465
|
+
for service_id in service_ids:
|
|
466
|
+
if service_id not in service_id_to_annotation_id:
|
|
467
|
+
raise ImageError(f"Service id {service_id} not found")
|
|
468
|
+
annotation_ids = service_id_to_annotation_id[service_id]
|
|
469
|
+
|
|
470
|
+
for ann_id in annotation_ids:
|
|
471
|
+
if ann_id not in ann_id_to_annotation_maps:
|
|
472
|
+
raise ImageError(f"Annotation with id {ann_id} not found")
|
|
473
|
+
annotation_maps = ann_id_to_annotation_maps[ann_id]
|
|
474
|
+
|
|
475
|
+
for annotation_map in annotation_maps:
|
|
476
|
+
self._remove_by_annotation_id(ann_id, annotation_map)
|
|
477
|
+
|
|
478
|
+
def _remove_by_annotation_id(self, annotation_id: str, location_dict: AnnotationMap) -> None:
|
|
479
|
+
image_annotation_id = location_dict.image_annotation_id
|
|
480
|
+
annotations = self.get_annotation(annotation_ids=image_annotation_id)
|
|
481
|
+
if not annotations:
|
|
482
|
+
return
|
|
483
|
+
# There can only be one annotation with a given id
|
|
484
|
+
annotation = annotations[0]
|
|
485
|
+
|
|
486
|
+
if (
|
|
487
|
+
location_dict.sub_category_key is None
|
|
488
|
+
and location_dict.relationship_key is None
|
|
489
|
+
and location_dict.summary_key is None
|
|
490
|
+
):
|
|
491
|
+
self.annotations.remove(annotation)
|
|
492
|
+
self._annotation_ids.remove(annotation.annotation_id)
|
|
493
|
+
|
|
494
|
+
sub_category_key = location_dict.sub_category_key
|
|
495
|
+
|
|
496
|
+
if sub_category_key is not None:
|
|
497
|
+
annotation.remove_sub_category(sub_category_key)
|
|
498
|
+
|
|
499
|
+
relationship_key = location_dict.relationship_key
|
|
467
500
|
|
|
468
|
-
|
|
469
|
-
|
|
501
|
+
if relationship_key is not None:
|
|
502
|
+
annotation.remove_relationship(relationship_key, annotation_id)
|
|
503
|
+
|
|
504
|
+
summary_key = location_dict.summary_key
|
|
505
|
+
if summary_key is not None:
|
|
506
|
+
if annotation.image is not None:
|
|
507
|
+
annotation.image.summary.remove_sub_category(summary_key)
|
|
470
508
|
|
|
471
509
|
def image_ann_to_image(self, annotation_id: str, crop_image: bool = False) -> None:
|
|
472
510
|
"""
|
|
@@ -580,6 +618,7 @@ class Image:
|
|
|
580
618
|
if summary_dict := kwargs.get("_summary", kwargs.get("summary")):
|
|
581
619
|
image.summary = CategoryAnnotation.from_dict(**summary_dict)
|
|
582
620
|
image.summary.category_name = SummaryType.SUMMARY
|
|
621
|
+
|
|
583
622
|
return image
|
|
584
623
|
|
|
585
624
|
@classmethod
|
|
@@ -645,7 +684,7 @@ class Image:
|
|
|
645
684
|
highest_hierarchy_only: bool = False,
|
|
646
685
|
path: Optional[PathLikeOrStr] = None,
|
|
647
686
|
dry: bool = False,
|
|
648
|
-
) -> Optional[ImageDict]:
|
|
687
|
+
) -> Optional[Union[ImageDict, str]]:
|
|
649
688
|
"""
|
|
650
689
|
Export image as dictionary. As numpy array cannot be serialized `image` values will be converted into
|
|
651
690
|
base64 encodings.
|
|
@@ -677,8 +716,45 @@ class Image:
|
|
|
677
716
|
return export_dict
|
|
678
717
|
with open(path_json, "w", encoding="UTF-8") as file:
|
|
679
718
|
json.dump(export_dict, file, indent=2)
|
|
680
|
-
return
|
|
719
|
+
return path_json
|
|
681
720
|
|
|
682
721
|
def get_categories_from_current_state(self) -> set[str]:
|
|
683
722
|
"""Returns all active dumped categories"""
|
|
684
723
|
return {ann.category_name for ann in self.get_annotation()}
|
|
724
|
+
|
|
725
|
+
def get_service_id_to_annotation_id(self) -> defaultdict[str, list[str]]:
|
|
726
|
+
"""
|
|
727
|
+
Returns a dictionary with service ids as keys and lists of annotation ids that have been generated by the
|
|
728
|
+
service
|
|
729
|
+
:return: default with service ids as keys and lists of annotation ids as values
|
|
730
|
+
"""
|
|
731
|
+
service_id_dict = defaultdict(list)
|
|
732
|
+
for ann in self.get_annotation():
|
|
733
|
+
if ann.service_id:
|
|
734
|
+
service_id_dict[ann.service_id].append(ann.annotation_id)
|
|
735
|
+
for sub_cat_key in ann.sub_categories:
|
|
736
|
+
sub_cat = ann.get_sub_category(sub_cat_key)
|
|
737
|
+
if sub_cat.service_id:
|
|
738
|
+
service_id_dict[sub_cat.service_id].append(sub_cat.annotation_id)
|
|
739
|
+
if ann.image is not None:
|
|
740
|
+
for summary_cat_key in ann.image.summary:
|
|
741
|
+
summary_cat = ann.get_summary(summary_cat_key)
|
|
742
|
+
if summary_cat.service_id:
|
|
743
|
+
service_id_dict[summary_cat.service_id].append(summary_cat.annotation_id)
|
|
744
|
+
|
|
745
|
+
return service_id_dict
|
|
746
|
+
|
|
747
|
+
def get_annotation_id_to_annotation_maps(self) -> defaultdict[str, list[AnnotationMap]]:
|
|
748
|
+
"""
|
|
749
|
+
Returns a dictionary with annotation ids as keys and lists of AnnotationMap as values. The range of ids
|
|
750
|
+
is the union of all ImageAnnotation, CategoryAnnotation and ContainerAnnotation of the image.
|
|
751
|
+
|
|
752
|
+
:return: default dict with annotation ids as keys and lists of AnnotationMap as values
|
|
753
|
+
"""
|
|
754
|
+
all_ann_id_dict = defaultdict(list)
|
|
755
|
+
for ann in self.get_annotation():
|
|
756
|
+
ann_id_dict = ann.get_annotation_map()
|
|
757
|
+
for key, val in ann_id_dict.items():
|
|
758
|
+
all_ann_id_dict[key].extend(val)
|
|
759
|
+
|
|
760
|
+
return all_ann_id_dict
|
|
@@ -971,7 +971,7 @@ class Page(Image):
|
|
|
971
971
|
highest_hierarchy_only: bool = False,
|
|
972
972
|
path: Optional[PathLikeOrStr] = None,
|
|
973
973
|
dry: bool = False,
|
|
974
|
-
) -> Optional[ImageDict]:
|
|
974
|
+
) -> Optional[Union[ImageDict, str]]:
|
|
975
975
|
"""
|
|
976
976
|
Export image as dictionary. As numpy array cannot be serialized `image` values will be converted into
|
|
977
977
|
base64 encodings.
|
|
@@ -451,7 +451,7 @@ class CustomDataset(DatasetBase):
|
|
|
451
451
|
return self.dataflow_builder
|
|
452
452
|
|
|
453
453
|
@staticmethod
|
|
454
|
-
def from_dataset_card(file_path:
|
|
454
|
+
def from_dataset_card(file_path: PathLikeOrStr, dataflow_builder: Type[DataFlowBaseBuilder]) -> CustomDataset:
|
|
455
455
|
"""
|
|
456
456
|
This static method creates a CustomDataset instance from a dataset card.
|
|
457
457
|
|
|
@@ -264,7 +264,7 @@ class FintabnetBuilder(DataFlowBaseBuilder):
|
|
|
264
264
|
add_summary=True,
|
|
265
265
|
),
|
|
266
266
|
)
|
|
267
|
-
df = MapData(df, lambda dp: [ann.image for ann in dp.
|
|
267
|
+
df = MapData(df, lambda dp: [ann.image for ann in dp.get_annotation(category_names=LayoutType.TABLE)])
|
|
268
268
|
df = FlattenData(df)
|
|
269
269
|
df = MapData(df, lambda dp: dp[0])
|
|
270
270
|
|
|
@@ -180,13 +180,35 @@ class XfundBuilder(DataFlowBaseBuilder):
|
|
|
180
180
|
"answer": TokenClasses.ANSWER,
|
|
181
181
|
"header": TokenClasses.HEADER,
|
|
182
182
|
}
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
183
|
+
if LayoutType.WORD in self.categories.get_categories(filtered=True, name_as_key=True):
|
|
184
|
+
ner_token_to_id_mapping = self.categories.get_sub_categories(
|
|
185
|
+
categories=LayoutType.WORD,
|
|
186
|
+
sub_categories={LayoutType.WORD: [WordType.TOKEN_TAG, WordType.TAG, WordType.TOKEN_CLASS]},
|
|
187
|
+
keys=False,
|
|
188
|
+
values_as_dict=True,
|
|
189
|
+
name_as_key=True,
|
|
190
|
+
)
|
|
191
|
+
else:
|
|
192
|
+
ner_token_to_id_mapping = {
|
|
193
|
+
LayoutType.WORD: {
|
|
194
|
+
WordType.TAG: {BioTag.BEGIN: 3, BioTag.INSIDE: 1, BioTag.OUTSIDE: 2},
|
|
195
|
+
WordType.TOKEN_CLASS: {
|
|
196
|
+
TokenClasses.ANSWER: 3,
|
|
197
|
+
TokenClasses.HEADER: 4,
|
|
198
|
+
TokenClasses.OTHER: 1,
|
|
199
|
+
TokenClasses.QUESTION: 2,
|
|
200
|
+
},
|
|
201
|
+
WordType.TOKEN_TAG: {
|
|
202
|
+
TokenClassWithTag.B_ANSWER: 1,
|
|
203
|
+
TokenClassWithTag.B_HEADER: 2,
|
|
204
|
+
TokenClassWithTag.B_QUESTION: 3,
|
|
205
|
+
TokenClassWithTag.I_ANSWER: 4,
|
|
206
|
+
TokenClassWithTag.I_HEADER: 5,
|
|
207
|
+
TokenClassWithTag.I_QUESTION: 6,
|
|
208
|
+
BioTag.OUTSIDE: 7,
|
|
209
|
+
},
|
|
210
|
+
}
|
|
211
|
+
}
|
|
190
212
|
df = MapData(
|
|
191
213
|
df,
|
|
192
214
|
xfund_to_image(
|
|
@@ -293,6 +293,8 @@ class Evaluator:
|
|
|
293
293
|
show_words = kwargs.pop("show_words", False)
|
|
294
294
|
show_token_class = kwargs.pop("show_token_class", True)
|
|
295
295
|
ignore_default_token_class = kwargs.pop("ignore_default_token_class", False)
|
|
296
|
+
floating_text_block_categories = kwargs.pop("floating_text_block_categories", None)
|
|
297
|
+
include_residual_text_containers = kwargs.pop("include_residual_Text_containers", True)
|
|
296
298
|
|
|
297
299
|
df_gt = self.dataset.dataflow.build(**kwargs)
|
|
298
300
|
df_pr = self.dataset.dataflow.build(**kwargs)
|
|
@@ -301,7 +303,11 @@ class Evaluator:
|
|
|
301
303
|
df_pr = MapData(df_pr, deepcopy)
|
|
302
304
|
df_pr = self._clean_up_predict_dataflow_annotations(df_pr)
|
|
303
305
|
|
|
304
|
-
page_parsing_component = PageParsingService(
|
|
306
|
+
page_parsing_component = PageParsingService(
|
|
307
|
+
text_container=LayoutType.WORD,
|
|
308
|
+
floating_text_block_categories=floating_text_block_categories, # type: ignore
|
|
309
|
+
include_residual_text_container=bool(include_residual_text_containers),
|
|
310
|
+
)
|
|
305
311
|
df_gt = page_parsing_component.predict_dataflow(df_gt)
|
|
306
312
|
|
|
307
313
|
if self.pipe_component:
|
|
@@ -1051,7 +1051,8 @@ class ModelCatalog:
|
|
|
1051
1051
|
with jsonlines.open(path) as reader:
|
|
1052
1052
|
for obj in reader:
|
|
1053
1053
|
if not obj["name"] in ModelCatalog.CATALOG:
|
|
1054
|
-
|
|
1054
|
+
categories = obj.get("categories") or {}
|
|
1055
|
+
obj["categories"] = {int(key): get_type(val) for key, val in categories.items()}
|
|
1055
1056
|
ModelCatalog.register(obj["name"], ModelProfile(**obj))
|
|
1056
1057
|
|
|
1057
1058
|
@staticmethod
|