deepdoctection 0.43.5__tar.gz → 0.44.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/PKG-INFO +3 -3
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/README.md +2 -2
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/__init__.py +3 -1
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/analyzer/config.py +1 -1
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/configs/profiles.jsonl +1 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datapoint/__init__.py +1 -1
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datapoint/image.py +49 -1
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datapoint/view.py +27 -13
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/base.py +195 -51
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/model.py +1 -1
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/base.py +29 -25
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/common.py +2 -2
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/concurrency.py +2 -2
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/language.py +2 -2
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/layout.py +2 -2
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/lm.py +13 -3
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/order.py +9 -5
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/refine.py +7 -7
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/segment.py +30 -30
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/sub_layout.py +2 -2
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/text.py +10 -5
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/transform.py +2 -4
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection.egg-info/PKG-INFO +3 -3
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/LICENSE +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/analyzer/__init__.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/analyzer/dd.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/analyzer/factory.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/configs/conf_dd_one.yaml +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/__init__.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/base.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/common.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/custom.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/custom_serialize.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/parallel_map.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/serialize.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/stats.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datapoint/annotation.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datapoint/box.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datapoint/convert.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/__init__.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/adapter.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/dataflow_builder.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/info.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/__init__.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/doclaynet.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/fintabnet.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/funsd.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/iiitar13k.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/layouttest.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/publaynet.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/pubtables1m.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/pubtabnet.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/rvlcdip.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/xfund.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/registry.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/save.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/__init__.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/accmetric.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/base.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/cocometric.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/eval.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/registry.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/tedsmetric.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/tp_eval_callback.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/__init__.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/base.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/d2detect.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/deskew.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/doctrocr.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/fastlang.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/hfdetr.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/hflayoutlm.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/hflm.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/pdftext.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/pt/__init__.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/pt/nms.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/pt/ptutils.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tessocr.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/texocr.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/__init__.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tfutils.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpcompat.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tpdetect.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/__init__.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/cats.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/cocostruct.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/d2struct.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/hfstruct.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/laylmstruct.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/maputils.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/match.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/misc.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/pascalstruct.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/prodigystruct.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/pubstruct.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/tpstruct.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/xfundstruct.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/__init__.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/anngen.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/doctectionpipe.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/registry.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/py.typed +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/train/__init__.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/train/d2_frcnn_train.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/train/hf_detr_train.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/train/hf_layoutlm_train.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/train/tp_frcnn_train.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/__init__.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/concurrency.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/context.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/develop.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/env_info.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/error.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/file_utils.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/fs.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/identifier.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/logger.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/metacfg.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/mocks.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/pdf_utils.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/settings.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/tqdm.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/transform.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/types.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/utils.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/viz.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection.egg-info/SOURCES.txt +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection.egg-info/requires.txt +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection.egg-info/top_level.txt +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/setup.cfg +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/setup.py +0 -0
- {deepdoctection-0.43.5 → deepdoctection-0.44.0}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: deepdoctection
|
|
3
|
-
Version: 0.43.5
|
|
3
|
+
Version: 0.44.0
|
|
4
4
|
Summary: Repository for Document AI
|
|
5
5
|
Home-page: https://github.com/deepdoctection/deepdoctection
|
|
6
6
|
Author: Dr. Janis Meyer
|
|
@@ -321,7 +321,7 @@ For a simple setup which is enough to parse documents with the default setting,
|
|
|
321
321
|
|
|
322
322
|
```
|
|
323
323
|
pip install transformers
|
|
324
|
-
pip install python-doctr
|
|
324
|
+
pip install python-doctr==0.9.0
|
|
325
325
|
pip install deepdoctection
|
|
326
326
|
```
|
|
327
327
|
|
|
@@ -329,7 +329,7 @@ pip install deepdoctection
|
|
|
329
329
|
|
|
330
330
|
```
|
|
331
331
|
pip install tensorpack
|
|
332
|
-
pip install python-doctr
|
|
332
|
+
pip install python-doctr==0.9.0
|
|
333
333
|
pip install deepdoctection
|
|
334
334
|
```
|
|
335
335
|
|
|
@@ -178,7 +178,7 @@ For a simple setup which is enough to parse documents with the default setting,
|
|
|
178
178
|
|
|
179
179
|
```
|
|
180
180
|
pip install transformers
|
|
181
|
-
pip install python-doctr
|
|
181
|
+
pip install python-doctr==0.9.0
|
|
182
182
|
pip install deepdoctection
|
|
183
183
|
```
|
|
184
184
|
|
|
@@ -186,7 +186,7 @@ pip install deepdoctection
|
|
|
186
186
|
|
|
187
187
|
```
|
|
188
188
|
pip install tensorpack
|
|
189
|
-
pip install python-doctr
|
|
189
|
+
pip install python-doctr==0.9.0
|
|
190
190
|
pip install deepdoctection
|
|
191
191
|
```
|
|
192
192
|
|
|
@@ -25,7 +25,7 @@ from .utils.logger import LoggingRecord, logger
|
|
|
25
25
|
|
|
26
26
|
# pylint: enable=wrong-import-position
|
|
27
27
|
|
|
28
|
-
__version__ = "0.43.5"
|
|
28
|
+
__version__ = "0.44.0"
|
|
29
29
|
|
|
30
30
|
_IMPORT_STRUCTURE = {
|
|
31
31
|
"analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
|
|
@@ -92,6 +92,7 @@ _IMPORT_STRUCTURE = {
|
|
|
92
92
|
"convert_pdf_bytes_to_np_array_v2",
|
|
93
93
|
"as_dict",
|
|
94
94
|
"ImageAnnotationBaseView",
|
|
95
|
+
"MetaAnnotation",
|
|
95
96
|
"Image",
|
|
96
97
|
"Word",
|
|
97
98
|
"Layout",
|
|
@@ -105,6 +106,7 @@ _IMPORT_STRUCTURE = {
|
|
|
105
106
|
"DatasetAdapter",
|
|
106
107
|
"DatasetBase",
|
|
107
108
|
"MergeDataset",
|
|
109
|
+
"DatasetCard",
|
|
108
110
|
"CustomDataset",
|
|
109
111
|
"DataFlowBaseBuilder",
|
|
110
112
|
"DatasetInfo",
|
|
@@ -629,7 +629,7 @@ cfg.PT.ENFORCE_WEIGHTS.ITEM = True
|
|
|
629
629
|
|
|
630
630
|
# Specifies the PyTorch model weights for item detection.
|
|
631
631
|
# Use either .pt or .safetensors files.
|
|
632
|
-
cfg.PT.ITEM.WEIGHTS = "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin"
|
|
632
|
+
cfg.PT.ITEM.WEIGHTS = "deepdoctection/tatr_tab_struct_v2/model.safetensors"
|
|
633
633
|
|
|
634
634
|
# Specifies the TorchScript model for item detection.
|
|
635
635
|
# Use .ts files for deployment without model implementation dependencies.
|
|
@@ -30,3 +30,4 @@
|
|
|
30
30
|
{"name": "Felix92/doctr-torch-parseq-multilingual-v1/pytorch_model.bin", "description": "", "size": [63286381], "tp_model": false, "config": "Felix92/doctr-torch-parseq-multilingual-v1/config.json", "preprocessor_config": null, "hf_repo_id": "Felix92/doctr-torch-parseq-multilingual-v1", "hf_model_name": "pytorch_model.bin", "hf_config_file": ["config.json"], "urls": null, "categories": {}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "DoctrTextRecognizer", "architecture": "parseq", "padding": null}
|
|
31
31
|
{"name": "doctr/crnn_vgg16_bn/pt/master-fde31e4a.pt", "description": "MASTER", "size": [63286381], "tp_model": false, "config": null, "preprocessor_config": null, "hf_repo_id": null, "hf_model_name": null, "hf_config_file": null, "urls": ["https://doctr-static.mindee.com/models?id=v0.7.0/master-fde31e4a.pt&src=0"], "categories": {}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "DoctrTextRecognizer", "architecture": "master", "padding": null}
|
|
32
32
|
{"name": "Aryn/deformable-detr-DocLayNet/model.safetensors", "description": "Deformable DEtection TRansformer (DETR), trained on DocLayNet (including 80k annotated pages in 11 classes).", "size": [115511753], "tp_model": false, "config": "Aryn/deformable-detr-DocLayNet/config.json", "preprocessor_config": "Aryn/deformable-detr-DocLayNet/preprocessor_config.json", "hf_repo_id": "Aryn/deformable-detr-DocLayNet", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "default_type", "2": "caption", "11": "text", "12": "title", "3": "footnote", "4": "formula", "5": "list_item", "6": "page_footer", "7": "page_header", "8": "figure", "9": "section_header", "10": "table"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
|
|
33
|
+
{"name": "deepdoctection/tatr_tab_struct_v2/model.safetensors", "description": "Table Transformer (DETR) model trained on PubTables1M. It was introduced in the paper Aligning benchmark datasets for table structure recognition by Smock et al. This model is devoted to table structure recognition and assumes to receive a slightly croppedtable as input. It will predict rows, column and spanning cells. Use a padding of around 5 pixels. This artefact has been converted from deepdoctection/tatr_tab_struct_v2/pytorch_model.bin and should be used to reduce security issues", "size": [115511753], "tp_model": false, "config": "deepdoctection/tatr_tab_struct_v2/config.json", "preprocessor_config": "deepdoctection/tatr_tab_struct_v2/preprocessor_config.json", "hf_repo_id": "deepdoctection/tatr_tab_struct_v2", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "table", "2": "column", "3": "row", "4": "column_header", "5": "projected_row_header", "6": "spanning"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
|
|
@@ -25,7 +25,7 @@ from collections import defaultdict
|
|
|
25
25
|
from dataclasses import dataclass, field
|
|
26
26
|
from os import environ, fspath
|
|
27
27
|
from pathlib import Path
|
|
28
|
-
from typing import Any, Optional, Sequence, Union, no_type_check
|
|
28
|
+
from typing import Any, Optional, Sequence, TypedDict, Union, no_type_check
|
|
29
29
|
|
|
30
30
|
import numpy as np
|
|
31
31
|
from numpy import uint8
|
|
@@ -40,6 +40,54 @@ from .box import crop_box_from_image, global_to_local_coords, intersection_box
|
|
|
40
40
|
from .convert import as_dict, convert_b64_to_np_array, convert_np_array_to_b64, convert_pdf_bytes_to_np_array_v2
|
|
41
41
|
|
|
42
42
|
|
|
43
|
+
class MetaAnnotationDict(TypedDict):
|
|
44
|
+
"""MetaAnnotationDict"""
|
|
45
|
+
image_annotations: list[str]
|
|
46
|
+
sub_categories: dict[str, dict[str, list[str]]]
|
|
47
|
+
relationships: dict[str, list[str]]
|
|
48
|
+
summaries: list[str]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass(frozen=True)
|
|
52
|
+
class MetaAnnotation:
|
|
53
|
+
"""
|
|
54
|
+
An immutable dataclass that stores information about what `Image` are being
|
|
55
|
+
modified through a pipeline component.
|
|
56
|
+
|
|
57
|
+
Attributes:
|
|
58
|
+
image_annotations: Tuple of `ObjectTypes` representing image annotations.
|
|
59
|
+
sub_categories: Dictionary mapping `ObjectTypes` to dicts of `ObjectTypes` to sets of `ObjectTypes`
|
|
60
|
+
for sub-categories.
|
|
61
|
+
relationships: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for relationships.
|
|
62
|
+
summaries: Tuple of `ObjectTypes` representing summaries.
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
image_annotations: tuple[ObjectTypes, ...] = field(default=())
|
|
66
|
+
sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = field(default_factory=dict)
|
|
67
|
+
relationships: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
|
|
68
|
+
summaries: tuple[ObjectTypes, ...] = field(default=())
|
|
69
|
+
|
|
70
|
+
def as_dict(self) -> MetaAnnotationDict:
|
|
71
|
+
"""
|
|
72
|
+
Returns the MetaAnnotation as a dictionary, with all `ObjectTypes` converted to strings.
|
|
73
|
+
|
|
74
|
+
Returns:
|
|
75
|
+
A dictionary representation of the MetaAnnotation where all `ObjectTypes` are converted to strings.
|
|
76
|
+
"""
|
|
77
|
+
return {
|
|
78
|
+
"image_annotations": [obj.value for obj in self.image_annotations],
|
|
79
|
+
"sub_categories": {
|
|
80
|
+
outer_key.value: {
|
|
81
|
+
inner_key.value: [val.value for val in inner_values]
|
|
82
|
+
for inner_key, inner_values in outer_value.items()
|
|
83
|
+
}
|
|
84
|
+
for outer_key, outer_value in self.sub_categories.items()
|
|
85
|
+
},
|
|
86
|
+
"relationships": {key.value: [val.value for val in values] for key, values in self.relationships.items()},
|
|
87
|
+
"summaries": [obj.value for obj in self.summaries],
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
|
|
43
91
|
@dataclass
|
|
44
92
|
class Image:
|
|
45
93
|
"""
|
|
@@ -195,7 +195,9 @@ class Word(ImageAnnotationBaseView):
|
|
|
195
195
|
attr_names = (
|
|
196
196
|
set(WordType)
|
|
197
197
|
.union(super().get_attribute_names())
|
|
198
|
-
.union(
|
|
198
|
+
.union(
|
|
199
|
+
{Relationships.READING_ORDER, Relationships.LAYOUT_LINK, Relationships.LINK, Relationships.SUCCESSOR}
|
|
200
|
+
)
|
|
199
201
|
)
|
|
200
202
|
return {attr_name.value if isinstance(attr_name, ObjectTypes) else attr_name for attr_name in attr_names}
|
|
201
203
|
|
|
@@ -384,16 +386,10 @@ class Table(Layout):
|
|
|
384
386
|
Returns:
|
|
385
387
|
A list of a table cells.
|
|
386
388
|
"""
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
LayoutType.CELL,
|
|
392
|
-
CellType.HEADER,
|
|
393
|
-
CellType.BODY,
|
|
394
|
-
CellType.SPANNING,
|
|
395
|
-
],
|
|
396
|
-
)
|
|
389
|
+
cell_anns: list[Cell] = []
|
|
390
|
+
for row_number in range(1, self.number_of_rows + 1): # type: ignore
|
|
391
|
+
cell_anns.extend(self.row(row_number)) # type: ignore
|
|
392
|
+
|
|
397
393
|
return cell_anns
|
|
398
394
|
|
|
399
395
|
@property
|
|
@@ -592,6 +588,18 @@ class Table(Layout):
|
|
|
592
588
|
)
|
|
593
589
|
return table_list
|
|
594
590
|
|
|
591
|
+
@property
|
|
592
|
+
def csv_(self) -> list[list[list[Text_]]]:
|
|
593
|
+
"""
|
|
594
|
+
Returns:
|
|
595
|
+
A csv-style representation of a table as list of lists of cell.text_.
|
|
596
|
+
"""
|
|
597
|
+
cells = self.cells
|
|
598
|
+
table_list = [[[] for _ in range(self.number_of_columns)] for _ in range(self.number_of_rows)] # type: ignore
|
|
599
|
+
for cell in cells:
|
|
600
|
+
table_list[cell.row_number - 1][cell.column_number - 1].append(cell.text_) # type: ignore
|
|
601
|
+
return table_list
|
|
602
|
+
|
|
595
603
|
def __str__(self) -> str:
|
|
596
604
|
out = " ".join([" ".join(row + ["\n"]) for row in self.csv])
|
|
597
605
|
return out
|
|
@@ -599,7 +607,13 @@ class Table(Layout):
|
|
|
599
607
|
@property
|
|
600
608
|
def text(self) -> str:
|
|
601
609
|
try:
|
|
602
|
-
|
|
610
|
+
cells = self.cells
|
|
611
|
+
if not cells:
|
|
612
|
+
return super().text
|
|
613
|
+
text_list: list[str] = []
|
|
614
|
+
for cell in cells:
|
|
615
|
+
text_list.append(cell.text)
|
|
616
|
+
return " ".join(text_list)
|
|
603
617
|
except (TypeError, AnnotationError):
|
|
604
618
|
return super().text
|
|
605
619
|
|
|
@@ -616,7 +630,7 @@ class Table(Layout):
|
|
|
616
630
|
token_class_ids: list[str] = []
|
|
617
631
|
token_tag_ids: list[str] = []
|
|
618
632
|
for cell in cells:
|
|
619
|
-
text.
|
|
633
|
+
text.append(cell.text_["text"])
|
|
620
634
|
words.extend(cell.text_["words"])
|
|
621
635
|
ann_ids.extend(cell.text_["ann_ids"])
|
|
622
636
|
token_classes.extend(cell.text_["token_classes"])
|
|
@@ -25,14 +25,15 @@ import os
|
|
|
25
25
|
import pprint
|
|
26
26
|
from abc import ABC, abstractmethod
|
|
27
27
|
from collections import defaultdict
|
|
28
|
+
from dataclasses import dataclass, field
|
|
28
29
|
from inspect import signature
|
|
29
30
|
from pathlib import Path
|
|
30
|
-
from typing import Any, Mapping, Optional, Sequence, Type, Union
|
|
31
|
+
from typing import Any, Mapping, Optional, Sequence, Type, TypedDict, Union
|
|
31
32
|
|
|
32
33
|
import numpy as np
|
|
33
34
|
|
|
34
35
|
from ..dataflow import CacheData, ConcatData, CustomDataFromList, DataFlow
|
|
35
|
-
from ..datapoint.image import Image
|
|
36
|
+
from ..datapoint.image import Image, MetaAnnotation
|
|
36
37
|
from ..utils.logger import LoggingRecord, logger
|
|
37
38
|
from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
|
|
38
39
|
from ..utils.types import PathLikeOrStr
|
|
@@ -405,6 +406,193 @@ class MergeDataset(DatasetBase):
|
|
|
405
406
|
self._dataflow_builder.categories = self._categories()
|
|
406
407
|
|
|
407
408
|
|
|
409
|
+
class DatasetCardDict(TypedDict):
|
|
410
|
+
"""DatasetCardDict"""
|
|
411
|
+
name: str
|
|
412
|
+
dataset_type: Union[str, Any]
|
|
413
|
+
location: str
|
|
414
|
+
init_categories: Sequence[Any]
|
|
415
|
+
init_sub_categories: dict[Any, dict[Any, list[Any]]]
|
|
416
|
+
annotation_files: Optional[dict[Any, Union[Any, Sequence[Any]]]]
|
|
417
|
+
description: str
|
|
418
|
+
service_id_to_meta_annotation: dict[str, Any]
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
# Usage:
|
|
422
|
+
# def as_dict(self, ...) -> DatasetCardDict:
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
@dataclass
|
|
426
|
+
class DatasetCard:
|
|
427
|
+
"""
|
|
428
|
+
An immutable dataclass representing the metadata of a dataset, including categories, sub-categories,
|
|
429
|
+
storage location, annotation files, and description. It facilitates management and consistency checks
|
|
430
|
+
for annotations generated by pipeline components.
|
|
431
|
+
|
|
432
|
+
Attributes:
|
|
433
|
+
name: Name of the dataset.
|
|
434
|
+
dataset_type: Type of the dataset as `ObjectTypes`.
|
|
435
|
+
location: Storage location of the dataset as `Path`.
|
|
436
|
+
init_categories: List of all initial categories (`ObjectTypes`) present in the dataset.
|
|
437
|
+
init_sub_categories: Mapping from main categories to sub-categories and their possible values.
|
|
438
|
+
annotation_files: Optional mapping from split names to annotation files.
|
|
439
|
+
description: Description of the dataset.
|
|
440
|
+
service_id_to_meta_annotation: Mapping from service IDs to `MetaAnnotation` objects, storing
|
|
441
|
+
annotation structure for different pipeline components.
|
|
442
|
+
"""
|
|
443
|
+
|
|
444
|
+
name: str
|
|
445
|
+
dataset_type: ObjectTypes
|
|
446
|
+
location: Path
|
|
447
|
+
init_categories: list[ObjectTypes] = field(default_factory=list)
|
|
448
|
+
init_sub_categories: dict[ObjectTypes, dict[ObjectTypes, list[ObjectTypes]]] = field(default_factory=dict)
|
|
449
|
+
annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None
|
|
450
|
+
description: str = field(default="")
|
|
451
|
+
service_id_to_meta_annotation: dict[str, MetaAnnotation] = field(default_factory=dict)
|
|
452
|
+
|
|
453
|
+
def save_dataset_card(self, file_path: Union[str, Path]) -> None:
|
|
454
|
+
"""Save the DatasetCard instance as a JSON file."""
|
|
455
|
+
with open(file_path, "w", encoding="utf-8") as f:
|
|
456
|
+
json.dump(self.as_dict(), f, indent=4)
|
|
457
|
+
|
|
458
|
+
@staticmethod
|
|
459
|
+
def load_dataset_card(file_path: PathLikeOrStr) -> DatasetCard:
|
|
460
|
+
"""Load a DatasetCard instance from a JSON file."""
|
|
461
|
+
with open(file_path, "r", encoding="utf-8") as f:
|
|
462
|
+
data = json.load(f)
|
|
463
|
+
service_id_to_meta_annotation = {}
|
|
464
|
+
if "service_id_to_meta_annotation" in data:
|
|
465
|
+
for service_id, meta_ann_dict in data.pop("service_id_to_meta_annotation").items():
|
|
466
|
+
meta_ann_dict["image_annotations"] = tuple(
|
|
467
|
+
get_type(cat) for cat in meta_ann_dict["image_annotations"]
|
|
468
|
+
)
|
|
469
|
+
meta_ann_dict["sub_categories"] = {
|
|
470
|
+
get_type(cat): {
|
|
471
|
+
get_type(sub_cat): set({get_type(value) for value in values})
|
|
472
|
+
for sub_cat, values in sub_cats.items()
|
|
473
|
+
}
|
|
474
|
+
for cat, sub_cats in meta_ann_dict["sub_categories"].items()
|
|
475
|
+
}
|
|
476
|
+
meta_ann_dict["relationships"] = {
|
|
477
|
+
get_type(key): set({get_type(value) for value in values})
|
|
478
|
+
for key, values in meta_ann_dict["relationships"].items()
|
|
479
|
+
}
|
|
480
|
+
meta_ann_dict["summaries"] = tuple(get_type(val) for val in meta_ann_dict["summaries"])
|
|
481
|
+
service_id_to_meta_annotation[service_id] = MetaAnnotation(**meta_ann_dict)
|
|
482
|
+
data["service_id_to_meta_annotation"] = service_id_to_meta_annotation
|
|
483
|
+
return DatasetCard(**data)
|
|
484
|
+
|
|
485
|
+
def as_dict(self, keep_object_types: bool = False) -> DatasetCardDict:
|
|
486
|
+
"""Convert the DatasetCard to a dictionary."""
|
|
487
|
+
if keep_object_types:
|
|
488
|
+
return {
|
|
489
|
+
"name": self.name,
|
|
490
|
+
"dataset_type": self.dataset_type,
|
|
491
|
+
"location": self.location.as_posix(),
|
|
492
|
+
"init_categories": self.init_categories,
|
|
493
|
+
"init_sub_categories": self.init_sub_categories,
|
|
494
|
+
"annotation_files": self.annotation_files, # type: ignore
|
|
495
|
+
"description": self.description,
|
|
496
|
+
"service_id_to_meta_annotation": {
|
|
497
|
+
key: val.as_dict() for key, val in self.service_id_to_meta_annotation.items()
|
|
498
|
+
},
|
|
499
|
+
}
|
|
500
|
+
return {
|
|
501
|
+
"name": self.name,
|
|
502
|
+
"dataset_type": self.dataset_type.value,
|
|
503
|
+
"location": self.location.as_posix(),
|
|
504
|
+
"init_categories": [cat.value for cat in self.init_categories],
|
|
505
|
+
"init_sub_categories": {
|
|
506
|
+
cat.value: {
|
|
507
|
+
sub_cat.value: list({value.value for value in values}) for sub_cat, values in sub_cats.items()
|
|
508
|
+
}
|
|
509
|
+
for cat, sub_cats in self.init_sub_categories.items()
|
|
510
|
+
},
|
|
511
|
+
"annotation_files": self.annotation_files, # type: ignore
|
|
512
|
+
"description": self.description,
|
|
513
|
+
"service_id_to_meta_annotation": {
|
|
514
|
+
key: val.as_dict() for key, val in self.service_id_to_meta_annotation.items()
|
|
515
|
+
},
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
def update_from_pipeline(
|
|
519
|
+
self, meta_annotations: MetaAnnotation, service_id_to_meta_annotation: Mapping[str, MetaAnnotation]
|
|
520
|
+
) -> None:
|
|
521
|
+
"""
|
|
522
|
+
Update the initial categories, sub-categories, and service ID to `MetaAnnotation` mapping
|
|
523
|
+
based on the results from a pipeline.
|
|
524
|
+
|
|
525
|
+
```python
|
|
526
|
+
analyzer = dd.get_dd_analyzer(config_overwrite=["USE_OCR=True","USE_TABLE_SEGMENTATION=True"])
|
|
527
|
+
meta_annotations = analyzer.get_meta_annotation()
|
|
528
|
+
service_id_to_meta_annotation = analyzer.get_service_id_to_meta_annotation()
|
|
529
|
+
card.update_from_pipeline(meta_annotations, service_id_to_meta_annotation)
|
|
530
|
+
```
|
|
531
|
+
|
|
532
|
+
Args:
|
|
533
|
+
meta_annotations: A `MetaAnnotation` object containing new or updated categories and sub-categories.
|
|
534
|
+
service_id_to_meta_annotation: A mapping from service IDs to `MetaAnnotation` objects generated by the
|
|
535
|
+
pipeline.
|
|
536
|
+
|
|
537
|
+
Adds any missing categories, sub-categories, and values to the respective attributes of the instance.
|
|
538
|
+
"""
|
|
539
|
+
for category in meta_annotations.image_annotations:
|
|
540
|
+
if category not in self.init_categories:
|
|
541
|
+
self.init_categories.append(category)
|
|
542
|
+
for cat, sub_cats in meta_annotations.sub_categories.items():
|
|
543
|
+
if cat not in self.init_sub_categories:
|
|
544
|
+
self.init_sub_categories[cat] = {}
|
|
545
|
+
for sub_cat, values in sub_cats.items():
|
|
546
|
+
if sub_cat not in self.init_sub_categories[cat]:
|
|
547
|
+
self.init_sub_categories[cat][sub_cat] = []
|
|
548
|
+
for value in values:
|
|
549
|
+
if value not in self.init_sub_categories[cat][sub_cat]:
|
|
550
|
+
self.init_sub_categories[cat][sub_cat].append(value)
|
|
551
|
+
|
|
552
|
+
for service_id, meta_annotation in service_id_to_meta_annotation.items():
|
|
553
|
+
if service_id not in self.service_id_to_meta_annotation:
|
|
554
|
+
self.service_id_to_meta_annotation[service_id] = meta_annotation
|
|
555
|
+
|
|
556
|
+
def __post_init__(self) -> None:
|
|
557
|
+
"""
|
|
558
|
+
Perform internal consistency checks ensuring `init_categories` and
|
|
559
|
+
`init_sub_categories` align with `service_id_to_meta_annotation`.
|
|
560
|
+
"""
|
|
561
|
+
self.dataset_type = get_type(self.dataset_type)
|
|
562
|
+
self.location = Path(self.location)
|
|
563
|
+
self.init_categories = [get_type(cat) for cat in self.init_categories]
|
|
564
|
+
self.init_sub_categories = {
|
|
565
|
+
get_type(outer_key): {
|
|
566
|
+
get_type(inner_key): [get_type(value) for value in inner_values]
|
|
567
|
+
for inner_key, inner_values in outer_value.items()
|
|
568
|
+
}
|
|
569
|
+
for outer_key, outer_value in self.init_sub_categories.items()
|
|
570
|
+
}
|
|
571
|
+
|
|
572
|
+
if self.service_id_to_meta_annotation is None:
|
|
573
|
+
return
|
|
574
|
+
|
|
575
|
+
# Check compatibility of image_annotations with init_categories
|
|
576
|
+
for service_id, meta_annotation in self.service_id_to_meta_annotation.items():
|
|
577
|
+
for annotation in meta_annotation.image_annotations:
|
|
578
|
+
if annotation not in self.init_categories:
|
|
579
|
+
raise ValueError(
|
|
580
|
+
f"Image annotation '{annotation}' in service ID '{service_id}' is not "
|
|
581
|
+
f"present in `init_categories`."
|
|
582
|
+
)
|
|
583
|
+
|
|
584
|
+
# Check compatibility of sub_categories
|
|
585
|
+
for cat, sub_cats in meta_annotation.sub_categories.items():
|
|
586
|
+
if not (
|
|
587
|
+
cat in self.init_sub_categories
|
|
588
|
+
and all(sub_cat in self.init_sub_categories[cat] for sub_cat in sub_cats)
|
|
589
|
+
):
|
|
590
|
+
raise ValueError(
|
|
591
|
+
f"Sub-categories for category '{cat}' in service ID '{service_id}' "
|
|
592
|
+
f"do not match with `init_sub_categories`."
|
|
593
|
+
)
|
|
594
|
+
|
|
595
|
+
|
|
408
596
|
class CustomDataset(DatasetBase):
|
|
409
597
|
"""
|
|
410
598
|
A simple dataset interface that implements the boilerplate code and reduces complexity by merely leaving
|
|
@@ -512,53 +700,9 @@ class CustomDataset(DatasetBase):
|
|
|
512
700
|
Returns:
|
|
513
701
|
A CustomDataset instance created from the dataset card.
|
|
514
702
|
"""
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
meta_data["init_categories"] = [get_type(cat) for cat in meta_data["init_categories"]]
|
|
521
|
-
meta_data["init_sub_categories"] = (
|
|
522
|
-
{
|
|
523
|
-
get_type(cat): {
|
|
524
|
-
get_type(sub_cat_key): [get_type(sub_cat_value) for sub_cat_value in sub_cat_values]
|
|
525
|
-
for sub_cat_key, sub_cat_values in sub_cats.items()
|
|
526
|
-
}
|
|
527
|
-
for cat, sub_cats in meta_data["init_sub_categories"].items()
|
|
528
|
-
}
|
|
529
|
-
if meta_data["init_sub_categories"] is not None
|
|
530
|
-
else None
|
|
703
|
+
dataset_card = DatasetCard.load_dataset_card(file_path)
|
|
704
|
+
dataset_card_as_dict = dataset_card.as_dict(True)
|
|
705
|
+
dataset_card_as_dict.pop("service_id_to_meta_annotation") # type: ignore # pylint: disable=E1123
|
|
706
|
+
return CustomDataset( # pylint: disable=E1123
|
|
707
|
+
**dataset_card_as_dict, dataflow_builder=dataflow_builder # type: ignore
|
|
531
708
|
)
|
|
532
|
-
return CustomDataset(**meta_data, dataflow_builder=dataflow_builder)
|
|
533
|
-
|
|
534
|
-
def as_dict(self) -> Mapping[str, Any]:
|
|
535
|
-
"""
|
|
536
|
-
Return:
|
|
537
|
-
The meta-data of the dataset as a dictionary.
|
|
538
|
-
"""
|
|
539
|
-
return {
|
|
540
|
-
"name": self.name,
|
|
541
|
-
"dataset_type": self.type,
|
|
542
|
-
"location": str(self.location),
|
|
543
|
-
"annotation_files": self.annotation_files,
|
|
544
|
-
"init_categories": [cat.value for cat in self.init_categories],
|
|
545
|
-
"init_sub_categories": {
|
|
546
|
-
cat.value: {
|
|
547
|
-
sub_cat_key.value: [sub_cat_value.value for sub_cat_value in sub_cat_values]
|
|
548
|
-
for sub_cat_key, sub_cat_values in sub_cats.items()
|
|
549
|
-
}
|
|
550
|
-
for cat, sub_cats in self.init_sub_categories.items()
|
|
551
|
-
}
|
|
552
|
-
if self.init_sub_categories is not None
|
|
553
|
-
else None,
|
|
554
|
-
}
|
|
555
|
-
|
|
556
|
-
def save_dataset_card(self, file_path: str) -> None:
|
|
557
|
-
"""
|
|
558
|
-
Save the dataset card to a `JSON` file.
|
|
559
|
-
|
|
560
|
-
Args:
|
|
561
|
-
file_path: file_path
|
|
562
|
-
"""
|
|
563
|
-
with open(file_path, "w", encoding="UTF-8") as file:
|
|
564
|
-
json.dump(self.as_dict(), file, indent=4)
|
|
@@ -306,7 +306,7 @@ class ModelCatalog:
|
|
|
306
306
|
|
|
307
307
|
# Loading default profiles
|
|
308
308
|
dd_profile_path = maybe_copy_config_to_cache(
|
|
309
|
-
get_package_path(), get_cache_dir_path(), "deepdoctection/configs/profiles.jsonl",
|
|
309
|
+
get_package_path(), get_cache_dir_path(), "deepdoctection/configs/profiles.jsonl", True
|
|
310
310
|
)
|
|
311
311
|
ModelCatalog.load_profiles_from_file(dd_profile_path)
|
|
312
312
|
# Additional profiles can be added
|
|
@@ -23,12 +23,11 @@ from __future__ import annotations
|
|
|
23
23
|
|
|
24
24
|
from abc import ABC, abstractmethod
|
|
25
25
|
from collections import defaultdict
|
|
26
|
-
from dataclasses import dataclass, field
|
|
27
26
|
from typing import Any, Callable, Mapping, Optional, Union
|
|
28
27
|
from uuid import uuid1
|
|
29
28
|
|
|
30
29
|
from ..dataflow import DataFlow, MapData
|
|
31
|
-
from ..datapoint.image import Image
|
|
30
|
+
from ..datapoint.image import Image, MetaAnnotation
|
|
32
31
|
from ..mapper.misc import curry
|
|
33
32
|
from ..utils.context import timed_operation
|
|
34
33
|
from ..utils.identifier import get_uuid_from_str
|
|
@@ -37,25 +36,6 @@ from ..utils.types import DP
|
|
|
37
36
|
from .anngen import DatapointManager
|
|
38
37
|
|
|
39
38
|
|
|
40
|
-
@dataclass(frozen=True)
|
|
41
|
-
class MetaAnnotation:
|
|
42
|
-
"""
|
|
43
|
-
A immutable dataclass that stores information about what `Image` are being
|
|
44
|
-
modified through a pipeline component.
|
|
45
|
-
|
|
46
|
-
Attributes:
|
|
47
|
-
image_annotations: Tuple of `ObjectTypes` representing image annotations.
|
|
48
|
-
sub_categories: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for sub-categories.
|
|
49
|
-
relationships: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for relationships.
|
|
50
|
-
summaries: Tuple of `ObjectTypes` representing summaries.
|
|
51
|
-
"""
|
|
52
|
-
|
|
53
|
-
image_annotations: tuple[ObjectTypes, ...] = field(default=())
|
|
54
|
-
sub_categories: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
|
|
55
|
-
relationships: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
|
|
56
|
-
summaries: tuple[ObjectTypes, ...] = field(default=())
|
|
57
|
-
|
|
58
|
-
|
|
59
39
|
class PipelineComponent(ABC):
|
|
60
40
|
"""
|
|
61
41
|
Base class for pipeline components.
|
|
@@ -427,15 +407,24 @@ class Pipeline(ABC):
|
|
|
427
407
|
as well as summaries (list with sub categories).
|
|
428
408
|
"""
|
|
429
409
|
image_annotations: list[ObjectTypes] = []
|
|
430
|
-
sub_categories =
|
|
431
|
-
relationships = defaultdict(set)
|
|
410
|
+
sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = {}
|
|
411
|
+
relationships = defaultdict(set[ObjectTypes]) # type: ignore
|
|
432
412
|
summaries: list[ObjectTypes] = []
|
|
433
413
|
for component in self.pipe_component_list:
|
|
434
414
|
meta_anns = component.get_meta_annotation()
|
|
435
415
|
image_annotations.extend(meta_anns.image_annotations)
|
|
436
416
|
for key, value in meta_anns.sub_categories.items():
|
|
437
|
-
sub_categories[key]
|
|
438
|
-
|
|
417
|
+
sub_dict = meta_anns.sub_categories[key]
|
|
418
|
+
for sub_cat, sub_cat_value in value.items():
|
|
419
|
+
if sub_cat in sub_dict:
|
|
420
|
+
sub_dict[sub_cat].update(sub_cat_value)
|
|
421
|
+
else:
|
|
422
|
+
sub_dict[sub_cat] = {sub_cat_value} # type: ignore
|
|
423
|
+
if key in sub_categories:
|
|
424
|
+
sub_categories[key].update(sub_dict)
|
|
425
|
+
else:
|
|
426
|
+
sub_categories[key] = sub_dict
|
|
427
|
+
for key, value in meta_anns.relationships.items(): # type: ignore
|
|
439
428
|
relationships[key].update(value)
|
|
440
429
|
summaries.extend(meta_anns.summaries)
|
|
441
430
|
return MetaAnnotation(
|
|
@@ -445,6 +434,21 @@ class Pipeline(ABC):
|
|
|
445
434
|
summaries=tuple(summaries),
|
|
446
435
|
)
|
|
447
436
|
|
|
437
|
+
def get_service_id_to_meta_annotation(self) -> Mapping[str, MetaAnnotation]:
|
|
438
|
+
"""
|
|
439
|
+
Collects meta annotations from all pipeline components and return a dict of service id to its meta annotation.
|
|
440
|
+
|
|
441
|
+
Returns:
|
|
442
|
+
`service_id` to `MetaAnnotation` with information about image annotations (list), sub categories (dict with
|
|
443
|
+
category names and generated sub categories), relationships (dict with category names and generated
|
|
444
|
+
relationships) as well as summaries (list with sub categories).
|
|
445
|
+
"""
|
|
446
|
+
service_id_to_meta_annotation = {}
|
|
447
|
+
for component in self.pipe_component_list:
|
|
448
|
+
meta_anns = component.get_meta_annotation()
|
|
449
|
+
service_id_to_meta_annotation[component.service_id] = meta_anns
|
|
450
|
+
return service_id_to_meta_annotation
|
|
451
|
+
|
|
448
452
|
def get_pipeline_info(
|
|
449
453
|
self, service_id: Optional[str] = None, name: Optional[str] = None
|
|
450
454
|
) -> Union[str, Mapping[str, str]]:
|
|
@@ -28,13 +28,13 @@ from typing import Literal, Mapping, Optional, Sequence, Union
|
|
|
28
28
|
import numpy as np
|
|
29
29
|
|
|
30
30
|
from ..dataflow import DataFlow, MapData
|
|
31
|
-
from ..datapoint.image import Image
|
|
31
|
+
from ..datapoint.image import Image, MetaAnnotation
|
|
32
32
|
from ..datapoint.view import IMAGE_DEFAULTS, Page
|
|
33
33
|
from ..extern.base import DetectionResult
|
|
34
34
|
from ..mapper.match import match_anns_by_distance, match_anns_by_intersection
|
|
35
35
|
from ..mapper.misc import to_image
|
|
36
36
|
from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
|
|
37
|
-
from .base import
|
|
37
|
+
from .base import PipelineComponent
|
|
38
38
|
from .registry import pipeline_component_registry
|
|
39
39
|
|
|
40
40
|
if os.environ.get("DD_USE_TORCH"):
|
|
@@ -29,11 +29,11 @@ from typing import Callable, Optional, Sequence, Union
|
|
|
29
29
|
import tqdm
|
|
30
30
|
|
|
31
31
|
from ..dataflow import DataFlow, MapData
|
|
32
|
-
from ..datapoint.image import Image
|
|
32
|
+
from ..datapoint.image import Image, MetaAnnotation
|
|
33
33
|
from ..utils.context import timed_operation
|
|
34
34
|
from ..utils.tqdm import get_tqdm
|
|
35
35
|
from ..utils.types import QueueType, TqdmType
|
|
36
|
-
from .base import
|
|
36
|
+
from .base import PipelineComponent
|
|
37
37
|
from .common import ImageParsingService, PageParsingService
|
|
38
38
|
from .registry import pipeline_component_registry
|
|
39
39
|
|
|
@@ -20,12 +20,12 @@ Module for language detection pipeline component
|
|
|
20
20
|
"""
|
|
21
21
|
from typing import Optional, Sequence
|
|
22
22
|
|
|
23
|
-
from ..datapoint.image import Image
|
|
23
|
+
from ..datapoint.image import Image, MetaAnnotation
|
|
24
24
|
from ..datapoint.view import ImageDefaults, Page
|
|
25
25
|
from ..extern.base import LanguageDetector, ObjectDetector
|
|
26
26
|
from ..utils.error import ImageError
|
|
27
27
|
from ..utils.settings import PageType, TypeOrStr, get_type
|
|
28
|
-
from .base import
|
|
28
|
+
from .base import PipelineComponent
|
|
29
29
|
from .registry import pipeline_component_registry
|
|
30
30
|
|
|
31
31
|
|