deepdoctection 0.43.6__tar.gz → 0.44.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/PKG-INFO +1 -1
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/__init__.py +3 -1
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datapoint/__init__.py +1 -1
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datapoint/image.py +49 -1
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datapoint/view.py +4 -2
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/base.py +195 -51
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/pipe/base.py +29 -25
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/pipe/common.py +2 -2
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/pipe/concurrency.py +2 -2
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/pipe/language.py +2 -2
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/pipe/layout.py +2 -2
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/pipe/lm.py +13 -3
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/pipe/order.py +9 -5
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/pipe/refine.py +7 -7
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/pipe/segment.py +30 -30
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/pipe/sub_layout.py +2 -2
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/pipe/text.py +10 -5
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/pipe/transform.py +2 -4
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection.egg-info/PKG-INFO +1 -1
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/LICENSE +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/README.md +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/analyzer/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/analyzer/config.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/analyzer/dd.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/analyzer/factory.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/configs/conf_dd_one.yaml +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/configs/profiles.jsonl +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/dataflow/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/dataflow/base.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/dataflow/common.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/dataflow/custom.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/dataflow/custom_serialize.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/dataflow/parallel_map.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/dataflow/serialize.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/dataflow/stats.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datapoint/annotation.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datapoint/box.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datapoint/convert.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/adapter.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/dataflow_builder.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/info.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/doclaynet.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/fintabnet.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/funsd.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/iiitar13k.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/layouttest.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/publaynet.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/pubtables1m.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/pubtabnet.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/rvlcdip.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/xfund.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/registry.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/datasets/save.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/eval/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/eval/accmetric.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/eval/base.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/eval/cocometric.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/eval/eval.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/eval/registry.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/eval/tedsmetric.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/eval/tp_eval_callback.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/base.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/d2detect.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/deskew.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/doctrocr.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/fastlang.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/hfdetr.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/hflayoutlm.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/hflm.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/model.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/pdftext.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/pt/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/pt/nms.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/pt/ptutils.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tessocr.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/texocr.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tfutils.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpcompat.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/extern/tpdetect.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/mapper/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/mapper/cats.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/mapper/cocostruct.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/mapper/d2struct.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/mapper/hfstruct.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/mapper/laylmstruct.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/mapper/maputils.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/mapper/match.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/mapper/misc.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/mapper/pascalstruct.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/mapper/prodigystruct.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/mapper/pubstruct.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/mapper/tpstruct.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/mapper/xfundstruct.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/pipe/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/pipe/anngen.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/pipe/doctectionpipe.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/pipe/registry.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/py.typed +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/train/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/train/d2_frcnn_train.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/train/hf_detr_train.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/train/hf_layoutlm_train.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/train/tp_frcnn_train.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/__init__.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/concurrency.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/context.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/develop.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/env_info.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/error.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/file_utils.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/fs.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/identifier.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/logger.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/metacfg.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/mocks.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/pdf_utils.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/settings.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/tqdm.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/transform.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/types.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/utils.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection/utils/viz.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection.egg-info/SOURCES.txt +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection.egg-info/requires.txt +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/deepdoctection.egg-info/top_level.txt +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/setup.cfg +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/setup.py +0 -0
- {deepdoctection-0.43.6 → deepdoctection-0.44.0}/tests/test_utils.py +0 -0
|
@@ -25,7 +25,7 @@ from .utils.logger import LoggingRecord, logger
|
|
|
25
25
|
|
|
26
26
|
# pylint: enable=wrong-import-position
|
|
27
27
|
|
|
28
|
-
__version__ = "0.43.6"
|
|
28
|
+
__version__ = "0.44.0"
|
|
29
29
|
|
|
30
30
|
_IMPORT_STRUCTURE = {
|
|
31
31
|
"analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
|
|
@@ -92,6 +92,7 @@ _IMPORT_STRUCTURE = {
|
|
|
92
92
|
"convert_pdf_bytes_to_np_array_v2",
|
|
93
93
|
"as_dict",
|
|
94
94
|
"ImageAnnotationBaseView",
|
|
95
|
+
"MetaAnnotation",
|
|
95
96
|
"Image",
|
|
96
97
|
"Word",
|
|
97
98
|
"Layout",
|
|
@@ -105,6 +106,7 @@ _IMPORT_STRUCTURE = {
|
|
|
105
106
|
"DatasetAdapter",
|
|
106
107
|
"DatasetBase",
|
|
107
108
|
"MergeDataset",
|
|
109
|
+
"DatasetCard",
|
|
108
110
|
"CustomDataset",
|
|
109
111
|
"DataFlowBaseBuilder",
|
|
110
112
|
"DatasetInfo",
|
|
@@ -25,7 +25,7 @@ from collections import defaultdict
|
|
|
25
25
|
from dataclasses import dataclass, field
|
|
26
26
|
from os import environ, fspath
|
|
27
27
|
from pathlib import Path
|
|
28
|
-
from typing import Any, Optional, Sequence, Union, no_type_check
|
|
28
|
+
from typing import Any, Optional, Sequence, TypedDict, Union, no_type_check
|
|
29
29
|
|
|
30
30
|
import numpy as np
|
|
31
31
|
from numpy import uint8
|
|
@@ -40,6 +40,54 @@ from .box import crop_box_from_image, global_to_local_coords, intersection_box
|
|
|
40
40
|
from .convert import as_dict, convert_b64_to_np_array, convert_np_array_to_b64, convert_pdf_bytes_to_np_array_v2
|
|
41
41
|
|
|
42
42
|
|
|
43
|
+
class MetaAnnotationDict(TypedDict):
|
|
44
|
+
"""MetaAnnotationDict"""
|
|
45
|
+
image_annotations: list[str]
|
|
46
|
+
sub_categories: dict[str, dict[str, list[str]]]
|
|
47
|
+
relationships: dict[str, list[str]]
|
|
48
|
+
summaries: list[str]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass(frozen=True)
|
|
52
|
+
class MetaAnnotation:
|
|
53
|
+
"""
|
|
54
|
+
An immutable dataclass that stores information about what `Image` are being
|
|
55
|
+
modified through a pipeline component.
|
|
56
|
+
|
|
57
|
+
Attributes:
|
|
58
|
+
image_annotations: Tuple of `ObjectTypes` representing image annotations.
|
|
59
|
+
sub_categories: Dictionary mapping `ObjectTypes` to dicts of `ObjectTypes` to sets of `ObjectTypes`
|
|
60
|
+
for sub-categories.
|
|
61
|
+
relationships: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for relationships.
|
|
62
|
+
summaries: Tuple of `ObjectTypes` representing summaries.
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
image_annotations: tuple[ObjectTypes, ...] = field(default=())
|
|
66
|
+
sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = field(default_factory=dict)
|
|
67
|
+
relationships: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
|
|
68
|
+
summaries: tuple[ObjectTypes, ...] = field(default=())
|
|
69
|
+
|
|
70
|
+
def as_dict(self) -> MetaAnnotationDict:
|
|
71
|
+
"""
|
|
72
|
+
Returns the MetaAnnotation as a dictionary, with all `ObjectTypes` converted to strings.
|
|
73
|
+
|
|
74
|
+
Returns:
|
|
75
|
+
A dictionary representation of the MetaAnnotation where all `ObjectTypes` are converted to strings.
|
|
76
|
+
"""
|
|
77
|
+
return {
|
|
78
|
+
"image_annotations": [obj.value for obj in self.image_annotations],
|
|
79
|
+
"sub_categories": {
|
|
80
|
+
outer_key.value: {
|
|
81
|
+
inner_key.value: [val.value for val in inner_values]
|
|
82
|
+
for inner_key, inner_values in outer_value.items()
|
|
83
|
+
}
|
|
84
|
+
for outer_key, outer_value in self.sub_categories.items()
|
|
85
|
+
},
|
|
86
|
+
"relationships": {key.value: [val.value for val in values] for key, values in self.relationships.items()},
|
|
87
|
+
"summaries": [obj.value for obj in self.summaries],
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
|
|
43
91
|
@dataclass
|
|
44
92
|
class Image:
|
|
45
93
|
"""
|
|
@@ -590,14 +590,16 @@ class Table(Layout):
|
|
|
590
590
|
|
|
591
591
|
@property
|
|
592
592
|
def csv_(self) -> list[list[list[Text_]]]:
|
|
593
|
+
"""
|
|
594
|
+
Returns:
|
|
595
|
+
A csv-style representation of a table as list of lists of cell.text_.
|
|
596
|
+
"""
|
|
593
597
|
cells = self.cells
|
|
594
598
|
table_list = [[[] for _ in range(self.number_of_columns)] for _ in range(self.number_of_rows)] # type: ignore
|
|
595
599
|
for cell in cells:
|
|
596
600
|
table_list[cell.row_number - 1][cell.column_number - 1].append(cell.text_) # type: ignore
|
|
597
601
|
return table_list
|
|
598
602
|
|
|
599
|
-
|
|
600
|
-
|
|
601
603
|
def __str__(self) -> str:
|
|
602
604
|
out = " ".join([" ".join(row + ["\n"]) for row in self.csv])
|
|
603
605
|
return out
|
|
@@ -25,14 +25,15 @@ import os
|
|
|
25
25
|
import pprint
|
|
26
26
|
from abc import ABC, abstractmethod
|
|
27
27
|
from collections import defaultdict
|
|
28
|
+
from dataclasses import dataclass, field
|
|
28
29
|
from inspect import signature
|
|
29
30
|
from pathlib import Path
|
|
30
|
-
from typing import Any, Mapping, Optional, Sequence, Type, Union
|
|
31
|
+
from typing import Any, Mapping, Optional, Sequence, Type, TypedDict, Union
|
|
31
32
|
|
|
32
33
|
import numpy as np
|
|
33
34
|
|
|
34
35
|
from ..dataflow import CacheData, ConcatData, CustomDataFromList, DataFlow
|
|
35
|
-
from ..datapoint.image import Image
|
|
36
|
+
from ..datapoint.image import Image, MetaAnnotation
|
|
36
37
|
from ..utils.logger import LoggingRecord, logger
|
|
37
38
|
from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
|
|
38
39
|
from ..utils.types import PathLikeOrStr
|
|
@@ -405,6 +406,193 @@ class MergeDataset(DatasetBase):
|
|
|
405
406
|
self._dataflow_builder.categories = self._categories()
|
|
406
407
|
|
|
407
408
|
|
|
409
|
+
class DatasetCardDict(TypedDict):
|
|
410
|
+
"""DatasetCardDict"""
|
|
411
|
+
name: str
|
|
412
|
+
dataset_type: Union[str, Any]
|
|
413
|
+
location: str
|
|
414
|
+
init_categories: Sequence[Any]
|
|
415
|
+
init_sub_categories: dict[Any, dict[Any, list[Any]]]
|
|
416
|
+
annotation_files: Optional[dict[Any, Union[Any, Sequence[Any]]]]
|
|
417
|
+
description: str
|
|
418
|
+
service_id_to_meta_annotation: dict[str, Any]
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
# Usage:
|
|
422
|
+
# def as_dict(self, ...) -> DatasetCardDict:
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
@dataclass
|
|
426
|
+
class DatasetCard:
|
|
427
|
+
"""
|
|
428
|
+
An immutable dataclass representing the metadata of a dataset, including categories, sub-categories,
|
|
429
|
+
storage location, annotation files, and description. It facilitates management and consistency checks
|
|
430
|
+
for annotations generated by pipeline components.
|
|
431
|
+
|
|
432
|
+
Attributes:
|
|
433
|
+
name: Name of the dataset.
|
|
434
|
+
dataset_type: Type of the dataset as `ObjectTypes`.
|
|
435
|
+
location: Storage location of the dataset as `Path`.
|
|
436
|
+
init_categories: List of all initial categories (`ObjectTypes`) present in the dataset.
|
|
437
|
+
init_sub_categories: Mapping from main categories to sub-categories and their possible values.
|
|
438
|
+
annotation_files: Optional mapping from split names to annotation files.
|
|
439
|
+
description: Description of the dataset.
|
|
440
|
+
service_id_to_meta_annotation: Mapping from service IDs to `MetaAnnotation` objects, storing
|
|
441
|
+
annotation structure for different pipeline components.
|
|
442
|
+
"""
|
|
443
|
+
|
|
444
|
+
name: str
|
|
445
|
+
dataset_type: ObjectTypes
|
|
446
|
+
location: Path
|
|
447
|
+
init_categories: list[ObjectTypes] = field(default_factory=list)
|
|
448
|
+
init_sub_categories: dict[ObjectTypes, dict[ObjectTypes, list[ObjectTypes]]] = field(default_factory=dict)
|
|
449
|
+
annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None
|
|
450
|
+
description: str = field(default="")
|
|
451
|
+
service_id_to_meta_annotation: dict[str, MetaAnnotation] = field(default_factory=dict)
|
|
452
|
+
|
|
453
|
+
def save_dataset_card(self, file_path: Union[str, Path]) -> None:
|
|
454
|
+
"""Save the DatasetCard instance as a JSON file."""
|
|
455
|
+
with open(file_path, "w", encoding="utf-8") as f:
|
|
456
|
+
json.dump(self.as_dict(), f, indent=4)
|
|
457
|
+
|
|
458
|
+
@staticmethod
|
|
459
|
+
def load_dataset_card(file_path: PathLikeOrStr) -> DatasetCard:
|
|
460
|
+
"""Load a DatasetCard instance from a JSON file."""
|
|
461
|
+
with open(file_path, "r", encoding="utf-8") as f:
|
|
462
|
+
data = json.load(f)
|
|
463
|
+
service_id_to_meta_annotation = {}
|
|
464
|
+
if "service_id_to_meta_annotation" in data:
|
|
465
|
+
for service_id, meta_ann_dict in data.pop("service_id_to_meta_annotation").items():
|
|
466
|
+
meta_ann_dict["image_annotations"] = tuple(
|
|
467
|
+
get_type(cat) for cat in meta_ann_dict["image_annotations"]
|
|
468
|
+
)
|
|
469
|
+
meta_ann_dict["sub_categories"] = {
|
|
470
|
+
get_type(cat): {
|
|
471
|
+
get_type(sub_cat): set({get_type(value) for value in values})
|
|
472
|
+
for sub_cat, values in sub_cats.items()
|
|
473
|
+
}
|
|
474
|
+
for cat, sub_cats in meta_ann_dict["sub_categories"].items()
|
|
475
|
+
}
|
|
476
|
+
meta_ann_dict["relationships"] = {
|
|
477
|
+
get_type(key): set({get_type(value) for value in values})
|
|
478
|
+
for key, values in meta_ann_dict["relationships"].items()
|
|
479
|
+
}
|
|
480
|
+
meta_ann_dict["summaries"] = tuple(get_type(val) for val in meta_ann_dict["summaries"])
|
|
481
|
+
service_id_to_meta_annotation[service_id] = MetaAnnotation(**meta_ann_dict)
|
|
482
|
+
data["service_id_to_meta_annotation"] = service_id_to_meta_annotation
|
|
483
|
+
return DatasetCard(**data)
|
|
484
|
+
|
|
485
|
+
def as_dict(self, keep_object_types: bool = False) -> DatasetCardDict:
|
|
486
|
+
"""Convert the DatasetCard to a dictionary."""
|
|
487
|
+
if keep_object_types:
|
|
488
|
+
return {
|
|
489
|
+
"name": self.name,
|
|
490
|
+
"dataset_type": self.dataset_type,
|
|
491
|
+
"location": self.location.as_posix(),
|
|
492
|
+
"init_categories": self.init_categories,
|
|
493
|
+
"init_sub_categories": self.init_sub_categories,
|
|
494
|
+
"annotation_files": self.annotation_files, # type: ignore
|
|
495
|
+
"description": self.description,
|
|
496
|
+
"service_id_to_meta_annotation": {
|
|
497
|
+
key: val.as_dict() for key, val in self.service_id_to_meta_annotation.items()
|
|
498
|
+
},
|
|
499
|
+
}
|
|
500
|
+
return {
|
|
501
|
+
"name": self.name,
|
|
502
|
+
"dataset_type": self.dataset_type.value,
|
|
503
|
+
"location": self.location.as_posix(),
|
|
504
|
+
"init_categories": [cat.value for cat in self.init_categories],
|
|
505
|
+
"init_sub_categories": {
|
|
506
|
+
cat.value: {
|
|
507
|
+
sub_cat.value: list({value.value for value in values}) for sub_cat, values in sub_cats.items()
|
|
508
|
+
}
|
|
509
|
+
for cat, sub_cats in self.init_sub_categories.items()
|
|
510
|
+
},
|
|
511
|
+
"annotation_files": self.annotation_files, # type: ignore
|
|
512
|
+
"description": self.description,
|
|
513
|
+
"service_id_to_meta_annotation": {
|
|
514
|
+
key: val.as_dict() for key, val in self.service_id_to_meta_annotation.items()
|
|
515
|
+
},
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
def update_from_pipeline(
|
|
519
|
+
self, meta_annotations: MetaAnnotation, service_id_to_meta_annotation: Mapping[str, MetaAnnotation]
|
|
520
|
+
) -> None:
|
|
521
|
+
"""
|
|
522
|
+
Update the initial categories, sub-categories, and service ID to `MetaAnnotation` mapping
|
|
523
|
+
based on the results from a pipeline.
|
|
524
|
+
|
|
525
|
+
```python
|
|
526
|
+
analyzer = dd.get_dd_analyzer(config_overwrite=["USE_OCR=True","USE_TABLE_SEGMENTATION=True"])
|
|
527
|
+
meta_annotations = analyzer.get_meta_annotation()
|
|
528
|
+
service_id_to_meta_annotation = analyzer.get_service_id_to_meta_annotation()
|
|
529
|
+
card.update_from_pipeline(meta_annotations, service_id_to_meta_annotation)
|
|
530
|
+
```
|
|
531
|
+
|
|
532
|
+
Args:
|
|
533
|
+
meta_annotations: A `MetaAnnotation` object containing new or updated categories and sub-categories.
|
|
534
|
+
service_id_to_meta_annotation: A mapping from service IDs to `MetaAnnotation` objects generated by the
|
|
535
|
+
pipeline.
|
|
536
|
+
|
|
537
|
+
Adds any missing categories, sub-categories, and values to the respective attributes of the instance.
|
|
538
|
+
"""
|
|
539
|
+
for category in meta_annotations.image_annotations:
|
|
540
|
+
if category not in self.init_categories:
|
|
541
|
+
self.init_categories.append(category)
|
|
542
|
+
for cat, sub_cats in meta_annotations.sub_categories.items():
|
|
543
|
+
if cat not in self.init_sub_categories:
|
|
544
|
+
self.init_sub_categories[cat] = {}
|
|
545
|
+
for sub_cat, values in sub_cats.items():
|
|
546
|
+
if sub_cat not in self.init_sub_categories[cat]:
|
|
547
|
+
self.init_sub_categories[cat][sub_cat] = []
|
|
548
|
+
for value in values:
|
|
549
|
+
if value not in self.init_sub_categories[cat][sub_cat]:
|
|
550
|
+
self.init_sub_categories[cat][sub_cat].append(value)
|
|
551
|
+
|
|
552
|
+
for service_id, meta_annotation in service_id_to_meta_annotation.items():
|
|
553
|
+
if service_id not in self.service_id_to_meta_annotation:
|
|
554
|
+
self.service_id_to_meta_annotation[service_id] = meta_annotation
|
|
555
|
+
|
|
556
|
+
def __post_init__(self) -> None:
|
|
557
|
+
"""
|
|
558
|
+
Perform internal consistency checks ensuring `init_categories` and
|
|
559
|
+
`init_sub_categories` align with `service_id_to_meta_annotation`.
|
|
560
|
+
"""
|
|
561
|
+
self.dataset_type = get_type(self.dataset_type)
|
|
562
|
+
self.location = Path(self.location)
|
|
563
|
+
self.init_categories = [get_type(cat) for cat in self.init_categories]
|
|
564
|
+
self.init_sub_categories = {
|
|
565
|
+
get_type(outer_key): {
|
|
566
|
+
get_type(inner_key): [get_type(value) for value in inner_values]
|
|
567
|
+
for inner_key, inner_values in outer_value.items()
|
|
568
|
+
}
|
|
569
|
+
for outer_key, outer_value in self.init_sub_categories.items()
|
|
570
|
+
}
|
|
571
|
+
|
|
572
|
+
if self.service_id_to_meta_annotation is None:
|
|
573
|
+
return
|
|
574
|
+
|
|
575
|
+
# Check compatibility of image_annotations with init_categories
|
|
576
|
+
for service_id, meta_annotation in self.service_id_to_meta_annotation.items():
|
|
577
|
+
for annotation in meta_annotation.image_annotations:
|
|
578
|
+
if annotation not in self.init_categories:
|
|
579
|
+
raise ValueError(
|
|
580
|
+
f"Image annotation '{annotation}' in service ID '{service_id}' is not "
|
|
581
|
+
f"present in `init_categories`."
|
|
582
|
+
)
|
|
583
|
+
|
|
584
|
+
# Check compatibility of sub_categories
|
|
585
|
+
for cat, sub_cats in meta_annotation.sub_categories.items():
|
|
586
|
+
if not (
|
|
587
|
+
cat in self.init_sub_categories
|
|
588
|
+
and all(sub_cat in self.init_sub_categories[cat] for sub_cat in sub_cats)
|
|
589
|
+
):
|
|
590
|
+
raise ValueError(
|
|
591
|
+
f"Sub-categories for category '{cat}' in service ID '{service_id}' "
|
|
592
|
+
f"do not match with `init_sub_categories`."
|
|
593
|
+
)
|
|
594
|
+
|
|
595
|
+
|
|
408
596
|
class CustomDataset(DatasetBase):
|
|
409
597
|
"""
|
|
410
598
|
A simple dataset interface that implements the boilerplate code and reduces complexity by merely leaving
|
|
@@ -512,53 +700,9 @@ class CustomDataset(DatasetBase):
|
|
|
512
700
|
Returns:
|
|
513
701
|
A CustomDataset instance created from the dataset card.
|
|
514
702
|
"""
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
meta_data["init_categories"] = [get_type(cat) for cat in meta_data["init_categories"]]
|
|
521
|
-
meta_data["init_sub_categories"] = (
|
|
522
|
-
{
|
|
523
|
-
get_type(cat): {
|
|
524
|
-
get_type(sub_cat_key): [get_type(sub_cat_value) for sub_cat_value in sub_cat_values]
|
|
525
|
-
for sub_cat_key, sub_cat_values in sub_cats.items()
|
|
526
|
-
}
|
|
527
|
-
for cat, sub_cats in meta_data["init_sub_categories"].items()
|
|
528
|
-
}
|
|
529
|
-
if meta_data["init_sub_categories"] is not None
|
|
530
|
-
else None
|
|
703
|
+
dataset_card = DatasetCard.load_dataset_card(file_path)
|
|
704
|
+
dataset_card_as_dict = dataset_card.as_dict(True)
|
|
705
|
+
dataset_card_as_dict.pop("service_id_to_meta_annotation") # type: ignore # pylint: disable=E1123
|
|
706
|
+
return CustomDataset( # pylint: disable=E1123
|
|
707
|
+
**dataset_card_as_dict, dataflow_builder=dataflow_builder # type: ignore
|
|
531
708
|
)
|
|
532
|
-
return CustomDataset(**meta_data, dataflow_builder=dataflow_builder)
|
|
533
|
-
|
|
534
|
-
def as_dict(self) -> Mapping[str, Any]:
|
|
535
|
-
"""
|
|
536
|
-
Return:
|
|
537
|
-
The meta-data of the dataset as a dictionary.
|
|
538
|
-
"""
|
|
539
|
-
return {
|
|
540
|
-
"name": self.name,
|
|
541
|
-
"dataset_type": self.type,
|
|
542
|
-
"location": str(self.location),
|
|
543
|
-
"annotation_files": self.annotation_files,
|
|
544
|
-
"init_categories": [cat.value for cat in self.init_categories],
|
|
545
|
-
"init_sub_categories": {
|
|
546
|
-
cat.value: {
|
|
547
|
-
sub_cat_key.value: [sub_cat_value.value for sub_cat_value in sub_cat_values]
|
|
548
|
-
for sub_cat_key, sub_cat_values in sub_cats.items()
|
|
549
|
-
}
|
|
550
|
-
for cat, sub_cats in self.init_sub_categories.items()
|
|
551
|
-
}
|
|
552
|
-
if self.init_sub_categories is not None
|
|
553
|
-
else None,
|
|
554
|
-
}
|
|
555
|
-
|
|
556
|
-
def save_dataset_card(self, file_path: str) -> None:
|
|
557
|
-
"""
|
|
558
|
-
Save the dataset card to a `JSON` file.
|
|
559
|
-
|
|
560
|
-
Args:
|
|
561
|
-
file_path: file_path
|
|
562
|
-
"""
|
|
563
|
-
with open(file_path, "w", encoding="UTF-8") as file:
|
|
564
|
-
json.dump(self.as_dict(), file, indent=4)
|
|
@@ -23,12 +23,11 @@ from __future__ import annotations
|
|
|
23
23
|
|
|
24
24
|
from abc import ABC, abstractmethod
|
|
25
25
|
from collections import defaultdict
|
|
26
|
-
from dataclasses import dataclass, field
|
|
27
26
|
from typing import Any, Callable, Mapping, Optional, Union
|
|
28
27
|
from uuid import uuid1
|
|
29
28
|
|
|
30
29
|
from ..dataflow import DataFlow, MapData
|
|
31
|
-
from ..datapoint.image import Image
|
|
30
|
+
from ..datapoint.image import Image, MetaAnnotation
|
|
32
31
|
from ..mapper.misc import curry
|
|
33
32
|
from ..utils.context import timed_operation
|
|
34
33
|
from ..utils.identifier import get_uuid_from_str
|
|
@@ -37,25 +36,6 @@ from ..utils.types import DP
|
|
|
37
36
|
from .anngen import DatapointManager
|
|
38
37
|
|
|
39
38
|
|
|
40
|
-
@dataclass(frozen=True)
|
|
41
|
-
class MetaAnnotation:
|
|
42
|
-
"""
|
|
43
|
-
A immutable dataclass that stores information about what `Image` are being
|
|
44
|
-
modified through a pipeline component.
|
|
45
|
-
|
|
46
|
-
Attributes:
|
|
47
|
-
image_annotations: Tuple of `ObjectTypes` representing image annotations.
|
|
48
|
-
sub_categories: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for sub-categories.
|
|
49
|
-
relationships: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for relationships.
|
|
50
|
-
summaries: Tuple of `ObjectTypes` representing summaries.
|
|
51
|
-
"""
|
|
52
|
-
|
|
53
|
-
image_annotations: tuple[ObjectTypes, ...] = field(default=())
|
|
54
|
-
sub_categories: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
|
|
55
|
-
relationships: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
|
|
56
|
-
summaries: tuple[ObjectTypes, ...] = field(default=())
|
|
57
|
-
|
|
58
|
-
|
|
59
39
|
class PipelineComponent(ABC):
|
|
60
40
|
"""
|
|
61
41
|
Base class for pipeline components.
|
|
@@ -427,15 +407,24 @@ class Pipeline(ABC):
|
|
|
427
407
|
as well as summaries (list with sub categories).
|
|
428
408
|
"""
|
|
429
409
|
image_annotations: list[ObjectTypes] = []
|
|
430
|
-
sub_categories = defaultdict(set)
|
|
431
|
-
relationships = defaultdict(set)
|
|
410
|
+
sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = {}
|
|
411
|
+
relationships = defaultdict(set[ObjectTypes]) # type: ignore
|
|
432
412
|
summaries: list[ObjectTypes] = []
|
|
433
413
|
for component in self.pipe_component_list:
|
|
434
414
|
meta_anns = component.get_meta_annotation()
|
|
435
415
|
image_annotations.extend(meta_anns.image_annotations)
|
|
436
416
|
for key, value in meta_anns.sub_categories.items():
|
|
437
|
-
sub_categories[key].update(value)
|
|
438
|
-
|
|
417
|
+
sub_dict = meta_anns.sub_categories[key]
|
|
418
|
+
for sub_cat, sub_cat_value in value.items():
|
|
419
|
+
if sub_cat in sub_dict:
|
|
420
|
+
sub_dict[sub_cat].update(sub_cat_value)
|
|
421
|
+
else:
|
|
422
|
+
sub_dict[sub_cat] = {sub_cat_value} # type: ignore
|
|
423
|
+
if key in sub_categories:
|
|
424
|
+
sub_categories[key].update(sub_dict)
|
|
425
|
+
else:
|
|
426
|
+
sub_categories[key] = sub_dict
|
|
427
|
+
for key, value in meta_anns.relationships.items(): # type: ignore
|
|
439
428
|
relationships[key].update(value)
|
|
440
429
|
summaries.extend(meta_anns.summaries)
|
|
441
430
|
return MetaAnnotation(
|
|
@@ -445,6 +434,21 @@ class Pipeline(ABC):
|
|
|
445
434
|
summaries=tuple(summaries),
|
|
446
435
|
)
|
|
447
436
|
|
|
437
|
+
def get_service_id_to_meta_annotation(self) -> Mapping[str, MetaAnnotation]:
|
|
438
|
+
"""
|
|
439
|
+
Collects meta annotations from all pipeline components and return a dict of service id to its meta annotation.
|
|
440
|
+
|
|
441
|
+
Returns:
|
|
442
|
+
`service_id` to `MetaAnnotation` with information about image annotations (list), sub categories (dict with
|
|
443
|
+
category names and generated sub categories), relationships (dict with category names and generated
|
|
444
|
+
relationships) as well as summaries (list with sub categories).
|
|
445
|
+
"""
|
|
446
|
+
service_id_to_meta_annotation = {}
|
|
447
|
+
for component in self.pipe_component_list:
|
|
448
|
+
meta_anns = component.get_meta_annotation()
|
|
449
|
+
service_id_to_meta_annotation[component.service_id] = meta_anns
|
|
450
|
+
return service_id_to_meta_annotation
|
|
451
|
+
|
|
448
452
|
def get_pipeline_info(
|
|
449
453
|
self, service_id: Optional[str] = None, name: Optional[str] = None
|
|
450
454
|
) -> Union[str, Mapping[str, str]]:
|
|
@@ -28,13 +28,13 @@ from typing import Literal, Mapping, Optional, Sequence, Union
|
|
|
28
28
|
import numpy as np
|
|
29
29
|
|
|
30
30
|
from ..dataflow import DataFlow, MapData
|
|
31
|
-
from ..datapoint.image import Image
|
|
31
|
+
from ..datapoint.image import Image, MetaAnnotation
|
|
32
32
|
from ..datapoint.view import IMAGE_DEFAULTS, Page
|
|
33
33
|
from ..extern.base import DetectionResult
|
|
34
34
|
from ..mapper.match import match_anns_by_distance, match_anns_by_intersection
|
|
35
35
|
from ..mapper.misc import to_image
|
|
36
36
|
from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
|
|
37
|
-
from .base import MetaAnnotation, PipelineComponent
|
|
37
|
+
from .base import PipelineComponent
|
|
38
38
|
from .registry import pipeline_component_registry
|
|
39
39
|
|
|
40
40
|
if os.environ.get("DD_USE_TORCH"):
|
|
@@ -29,11 +29,11 @@ from typing import Callable, Optional, Sequence, Union
|
|
|
29
29
|
import tqdm
|
|
30
30
|
|
|
31
31
|
from ..dataflow import DataFlow, MapData
|
|
32
|
-
from ..datapoint.image import Image
|
|
32
|
+
from ..datapoint.image import Image, MetaAnnotation
|
|
33
33
|
from ..utils.context import timed_operation
|
|
34
34
|
from ..utils.tqdm import get_tqdm
|
|
35
35
|
from ..utils.types import QueueType, TqdmType
|
|
36
|
-
from .base import MetaAnnotation, PipelineComponent
|
|
36
|
+
from .base import PipelineComponent
|
|
37
37
|
from .common import ImageParsingService, PageParsingService
|
|
38
38
|
from .registry import pipeline_component_registry
|
|
39
39
|
|
|
@@ -20,12 +20,12 @@ Module for language detection pipeline component
|
|
|
20
20
|
"""
|
|
21
21
|
from typing import Optional, Sequence
|
|
22
22
|
|
|
23
|
-
from ..datapoint.image import Image
|
|
23
|
+
from ..datapoint.image import Image, MetaAnnotation
|
|
24
24
|
from ..datapoint.view import ImageDefaults, Page
|
|
25
25
|
from ..extern.base import LanguageDetector, ObjectDetector
|
|
26
26
|
from ..utils.error import ImageError
|
|
27
27
|
from ..utils.settings import PageType, TypeOrStr, get_type
|
|
28
|
-
from .base import MetaAnnotation, PipelineComponent
|
|
28
|
+
from .base import PipelineComponent
|
|
29
29
|
from .registry import pipeline_component_registry
|
|
30
30
|
|
|
31
31
|
|
|
@@ -24,13 +24,13 @@ from typing import Optional, Sequence, Union
|
|
|
24
24
|
|
|
25
25
|
import numpy as np
|
|
26
26
|
|
|
27
|
-
from ..datapoint.image import Image
|
|
27
|
+
from ..datapoint.image import Image, MetaAnnotation
|
|
28
28
|
from ..extern.base import ObjectDetector, PdfMiner
|
|
29
29
|
from ..mapper.misc import curry
|
|
30
30
|
from ..utils.error import ImageError
|
|
31
31
|
from ..utils.settings import ObjectTypes
|
|
32
32
|
from ..utils.transform import PadTransform
|
|
33
|
-
from .base import MetaAnnotation, PipelineComponent
|
|
33
|
+
from .base import PipelineComponent
|
|
34
34
|
from .registry import pipeline_component_registry
|
|
35
35
|
|
|
36
36
|
|
|
@@ -23,11 +23,11 @@ from __future__ import annotations
|
|
|
23
23
|
from copy import copy
|
|
24
24
|
from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Sequence, Union
|
|
25
25
|
|
|
26
|
-
from ..datapoint.image import Image
|
|
26
|
+
from ..datapoint.image import Image, MetaAnnotation
|
|
27
27
|
from ..extern.base import SequenceClassResult
|
|
28
28
|
from ..mapper.laylmstruct import image_to_layoutlm_features, image_to_lm_features
|
|
29
29
|
from ..utils.settings import BioTag, LayoutType, ObjectTypes, PageType, TokenClasses, WordType
|
|
30
|
-
from .base import MetaAnnotation, PipelineComponent
|
|
30
|
+
from .base import PipelineComponent
|
|
31
31
|
from .registry import pipeline_component_registry
|
|
32
32
|
|
|
33
33
|
if TYPE_CHECKING:
|
|
@@ -246,7 +246,17 @@ class LMTokenClassifierService(PipelineComponent):
|
|
|
246
246
|
def get_meta_annotation(self) -> MetaAnnotation:
|
|
247
247
|
return MetaAnnotation(
|
|
248
248
|
image_annotations=(),
|
|
249
|
-
sub_categories={
|
|
249
|
+
sub_categories={
|
|
250
|
+
LayoutType.WORD: {
|
|
251
|
+
WordType.TOKEN_CLASS: set(self.language_model.categories.categories_semantics) # type: ignore
|
|
252
|
+
if self.language_model.categories.categories_semantics
|
|
253
|
+
else [],
|
|
254
|
+
WordType.TAG: set(self.language_model.categories.categories_bio) # type: ignore
|
|
255
|
+
if self.language_model.categories.categories_bio
|
|
256
|
+
else [],
|
|
257
|
+
WordType.TOKEN_TAG: set(self.language_model.categories.get_categories(as_dict=False)),
|
|
258
|
+
}
|
|
259
|
+
},
|
|
250
260
|
relationships={},
|
|
251
261
|
summaries=(),
|
|
252
262
|
)
|
|
@@ -31,11 +31,11 @@ import numpy as np
|
|
|
31
31
|
|
|
32
32
|
from ..datapoint.annotation import ImageAnnotation
|
|
33
33
|
from ..datapoint.box import BoundingBox, merge_boxes
|
|
34
|
-
from ..datapoint.image import Image
|
|
34
|
+
from ..datapoint.image import Image, MetaAnnotation
|
|
35
35
|
from ..datapoint.view import IMAGE_DEFAULTS
|
|
36
36
|
from ..extern.base import DetectionResult
|
|
37
37
|
from ..extern.tp.tpfrcnn.utils.np_box_ops import ioa as np_ioa
|
|
38
|
-
from ..pipe.base import MetaAnnotation, PipelineComponent
|
|
38
|
+
from ..pipe.base import PipelineComponent
|
|
39
39
|
from ..pipe.registry import pipeline_component_registry
|
|
40
40
|
from ..utils.logger import LoggingRecord, logger
|
|
41
41
|
from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
|
|
@@ -611,8 +611,8 @@ class TextLineService(TextLineServiceMixin):
|
|
|
611
611
|
"""
|
|
612
612
|
return MetaAnnotation(
|
|
613
613
|
image_annotations=(LayoutType.LINE,),
|
|
614
|
-
sub_categories={
|
|
615
|
-
relationships={},
|
|
614
|
+
sub_categories={},
|
|
615
|
+
relationships={LayoutType.LINE: {Relationships.CHILD}},
|
|
616
616
|
summaries=(),
|
|
617
617
|
)
|
|
618
618
|
|
|
@@ -818,7 +818,11 @@ class TextOrderService(TextLineServiceMixin):
|
|
|
818
818
|
anns_with_reading_order = list(copy(self.floating_text_block_categories)) + add_category
|
|
819
819
|
return MetaAnnotation(
|
|
820
820
|
image_annotations=tuple(image_annotations),
|
|
821
|
-
sub_categories={
|
|
821
|
+
sub_categories={ # type: ignore
|
|
822
|
+
category: {Relationships.READING_ORDER: {Relationships.READING_ORDER}}
|
|
823
|
+
for category in anns_with_reading_order
|
|
824
|
+
}
|
|
825
|
+
| {self.text_container: {Relationships.READING_ORDER: {Relationships.READING_ORDER}}},
|
|
822
826
|
relationships={},
|
|
823
827
|
summaries=(),
|
|
824
828
|
)
|
|
@@ -31,12 +31,12 @@ import networkx as nx # type: ignore
|
|
|
31
31
|
|
|
32
32
|
from ..datapoint.annotation import ImageAnnotation
|
|
33
33
|
from ..datapoint.box import merge_boxes
|
|
34
|
-
from ..datapoint.image import Image
|
|
34
|
+
from ..datapoint.image import Image, MetaAnnotation
|
|
35
35
|
from ..extern.base import DetectionResult
|
|
36
36
|
from ..mapper.maputils import MappingContextManager
|
|
37
37
|
from ..utils.error import ImageError
|
|
38
38
|
from ..utils.settings import CellType, LayoutType, ObjectTypes, Relationships, TableType, get_type
|
|
39
|
-
from .base import MetaAnnotation, PipelineComponent
|
|
39
|
+
from .base import PipelineComponent
|
|
40
40
|
from .registry import pipeline_component_registry
|
|
41
41
|
|
|
42
42
|
__all__ = ["TableSegmentationRefinementService", "generate_html_string"]
|
|
@@ -537,12 +537,12 @@ class TableSegmentationRefinementService(PipelineComponent):
|
|
|
537
537
|
image_annotations=(),
|
|
538
538
|
sub_categories={
|
|
539
539
|
LayoutType.CELL: {
|
|
540
|
-
CellType.ROW_NUMBER,
|
|
541
|
-
CellType.COLUMN_NUMBER,
|
|
542
|
-
CellType.ROW_SPAN,
|
|
543
|
-
CellType.COLUMN_SPAN,
|
|
540
|
+
CellType.ROW_NUMBER: {CellType.ROW_NUMBER},
|
|
541
|
+
CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER},
|
|
542
|
+
CellType.ROW_SPAN: {CellType.ROW_SPAN},
|
|
543
|
+
CellType.COLUMN_SPAN: {CellType.COLUMN_SPAN},
|
|
544
544
|
},
|
|
545
|
-
LayoutType.TABLE: {TableType.HTML},
|
|
545
|
+
LayoutType.TABLE: {TableType.HTML: {TableType.HTML}},
|
|
546
546
|
},
|
|
547
547
|
relationships={},
|
|
548
548
|
summaries=(),
|