deepdoctection 0.37.3__tar.gz → 0.38__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- {deepdoctection-0.37.3 → deepdoctection-0.38}/PKG-INFO +12 -2
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/__init__.py +1 -1
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/analyzer/_config.py +2 -1
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/analyzer/factory.py +9 -4
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/configs/conf_dd_one.yaml +126 -85
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datapoint/box.py +2 -4
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datapoint/image.py +11 -4
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datapoint/view.py +124 -36
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/hfdetr.py +4 -3
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/pipe/doctectionpipe.py +1 -1
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/pipe/refine.py +6 -13
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/pipe/segment.py +229 -46
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/pipe/sub_layout.py +40 -22
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection.egg-info/PKG-INFO +12 -2
- {deepdoctection-0.37.3 → deepdoctection-0.38}/setup.cfg +1 -1
- {deepdoctection-0.37.3 → deepdoctection-0.38}/setup.py +1 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/LICENSE +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/README.md +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/analyzer/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/analyzer/dd.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/dataflow/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/dataflow/base.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/dataflow/common.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/dataflow/custom.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/dataflow/custom_serialize.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/dataflow/parallel_map.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/dataflow/serialize.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/dataflow/stats.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datapoint/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datapoint/annotation.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datapoint/convert.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/adapter.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/base.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/dataflow_builder.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/info.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/instances/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/instances/doclaynet.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/instances/fintabnet.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/instances/funsd.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/instances/iiitar13k.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/instances/layouttest.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/instances/publaynet.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/instances/pubtables1m.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/instances/pubtabnet.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/instances/rvlcdip.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/instances/xfund.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/registry.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/datasets/save.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/eval/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/eval/accmetric.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/eval/base.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/eval/cocometric.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/eval/eval.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/eval/registry.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/eval/tedsmetric.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/eval/tp_eval_callback.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/base.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/d2detect.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/deskew.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/doctrocr.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/fastlang.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/hflayoutlm.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/hflm.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/model.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/pdftext.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/pt/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/pt/nms.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/pt/ptutils.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tessocr.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/texocr.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tfutils.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpcompat.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/extern/tpdetect.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/mapper/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/mapper/cats.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/mapper/cocostruct.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/mapper/d2struct.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/mapper/hfstruct.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/mapper/laylmstruct.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/mapper/maputils.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/mapper/match.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/mapper/misc.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/mapper/pascalstruct.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/mapper/prodigystruct.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/mapper/pubstruct.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/mapper/tpstruct.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/mapper/xfundstruct.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/pipe/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/pipe/anngen.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/pipe/base.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/pipe/common.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/pipe/concurrency.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/pipe/language.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/pipe/layout.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/pipe/lm.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/pipe/order.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/pipe/registry.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/pipe/text.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/pipe/transform.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/py.typed +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/train/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/train/d2_frcnn_train.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/train/hf_detr_train.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/train/hf_layoutlm_train.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/train/tp_frcnn_train.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/__init__.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/concurrency.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/context.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/develop.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/env_info.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/error.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/file_utils.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/fs.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/identifier.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/logger.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/metacfg.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/mocks.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/pdf_utils.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/settings.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/tqdm.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/transform.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/types.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/utils.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection/utils/viz.py +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection.egg-info/SOURCES.txt +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection.egg-info/requires.txt +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/deepdoctection.egg-info/top_level.txt +0 -0
- {deepdoctection-0.37.3 → deepdoctection-0.38}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: deepdoctection
|
|
3
|
-
Version: 0.37.3
|
|
3
|
+
Version: 0.38
|
|
4
4
|
Summary: Repository for Document AI
|
|
5
5
|
Home-page: https://github.com/deepdoctection/deepdoctection
|
|
6
6
|
Author: Dr. Janis Meyer
|
|
@@ -127,6 +127,16 @@ Requires-Dist: types-urllib3>=1.26.25.14; extra == "dev"
|
|
|
127
127
|
Provides-Extra: test
|
|
128
128
|
Requires-Dist: pytest==8.0.2; extra == "test"
|
|
129
129
|
Requires-Dist: pytest-cov; extra == "test"
|
|
130
|
+
Dynamic: author
|
|
131
|
+
Dynamic: classifier
|
|
132
|
+
Dynamic: description
|
|
133
|
+
Dynamic: description-content-type
|
|
134
|
+
Dynamic: home-page
|
|
135
|
+
Dynamic: license
|
|
136
|
+
Dynamic: provides-extra
|
|
137
|
+
Dynamic: requires-dist
|
|
138
|
+
Dynamic: requires-python
|
|
139
|
+
Dynamic: summary
|
|
130
140
|
|
|
131
141
|
|
|
132
142
|
<p align="center">
|
|
@@ -91,7 +91,8 @@ cfg.SEGMENTATION.PUBTABLES_SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUM
|
|
|
91
91
|
cfg.SEGMENTATION.CELL_NAMES = [CellType.HEADER, CellType.BODY, LayoutType.CELL]
|
|
92
92
|
cfg.SEGMENTATION.ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
|
|
93
93
|
cfg.SEGMENTATION.SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
|
|
94
|
-
|
|
94
|
+
cfg.SEGMENTATION.PUBTABLES_ITEM_HEADER_CELL_NAMES = [CellType.COLUMN_HEADER, CellType.ROW_HEADER]
|
|
95
|
+
cfg.SEGMENTATION.PUBTABLES_ITEM_HEADER_THRESHOLDS = [0.6, 0.0001]
|
|
95
96
|
cfg.SEGMENTATION.STRETCH_RULE = "equal"
|
|
96
97
|
|
|
97
98
|
cfg.USE_TABLE_REFINEMENT = True
|
|
@@ -51,7 +51,7 @@ from ..pipe.transform import SimpleTransformService
|
|
|
51
51
|
from ..utils.file_utils import detectron2_available
|
|
52
52
|
from ..utils.fs import get_configs_dir_path
|
|
53
53
|
from ..utils.metacfg import AttrDict
|
|
54
|
-
from ..utils.settings import LayoutType, Relationships
|
|
54
|
+
from ..utils.settings import CellType, LayoutType, Relationships
|
|
55
55
|
from ..utils.transform import PadTransform
|
|
56
56
|
|
|
57
57
|
with try_import() as image_guard:
|
|
@@ -264,14 +264,17 @@ class ServiceFactory:
|
|
|
264
264
|
:param mode: either `LAYOUT`,`CELL` or `ITEM`
|
|
265
265
|
:return: `SubImageLayoutService` instance
|
|
266
266
|
"""
|
|
267
|
-
|
|
267
|
+
exclude_category_names = []
|
|
268
268
|
padder = None
|
|
269
269
|
if mode == "ITEM":
|
|
270
270
|
if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
|
|
271
|
-
|
|
271
|
+
exclude_category_names.extend(
|
|
272
|
+
[LayoutType.TABLE, CellType.COLUMN_HEADER, CellType.PROJECTED_ROW_HEADER, CellType.SPANNING]
|
|
273
|
+
)
|
|
272
274
|
padder = ServiceFactory.build_padder(config, mode)
|
|
273
275
|
detect_result_generator = DetectResultGenerator(
|
|
274
|
-
|
|
276
|
+
categories_name_as_key=detector.categories.get_categories(as_dict=True, name_as_key=True),
|
|
277
|
+
exclude_category_names=exclude_category_names,
|
|
275
278
|
)
|
|
276
279
|
return SubImageLayoutService(
|
|
277
280
|
sub_image_detector=detector,
|
|
@@ -399,6 +402,8 @@ class ServiceFactory:
|
|
|
399
402
|
spanning_cell_names=config.SEGMENTATION.PUBTABLES_SPANNING_CELL_NAMES,
|
|
400
403
|
item_names=config.SEGMENTATION.PUBTABLES_ITEM_NAMES,
|
|
401
404
|
sub_item_names=config.SEGMENTATION.PUBTABLES_SUB_ITEM_NAMES,
|
|
405
|
+
item_header_cell_names=config.SEGMENTATION.PUBTABLES_ITEM_HEADER_CELL_NAMES,
|
|
406
|
+
item_header_thresholds=config.SEGMENTATION.PUBTABLES_ITEM_HEADER_THRESHOLDS,
|
|
402
407
|
stretch_rule=config.SEGMENTATION.STRETCH_RULE,
|
|
403
408
|
)
|
|
404
409
|
|
|
@@ -1,104 +1,145 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
1
|
+
DEVICE: null
|
|
2
|
+
LANGUAGE: null
|
|
3
|
+
LAYOUT_LINK:
|
|
4
|
+
CHILD_CATEGORIES: []
|
|
5
|
+
PARENTAL_CATEGORIES: []
|
|
6
|
+
LAYOUT_NMS_PAIRS:
|
|
7
|
+
COMBINATIONS: null
|
|
8
|
+
PRIORITY: null
|
|
9
|
+
THRESHOLDS: null
|
|
10
|
+
LIB: null
|
|
11
|
+
OCR:
|
|
12
|
+
CONFIG:
|
|
13
|
+
TESSERACT: dd/conf_tesseract.yaml
|
|
14
|
+
USE_DOCTR: false
|
|
15
|
+
USE_TESSERACT: true
|
|
16
|
+
USE_TEXTRACT: false
|
|
17
|
+
WEIGHTS:
|
|
18
|
+
DOCTR_RECOGNITION:
|
|
19
|
+
PT: doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt
|
|
20
|
+
TF: doctr/crnn_vgg16_bn/tf/crnn_vgg16_bn-76b7f2c6.zip
|
|
21
|
+
DOCTR_WORD:
|
|
22
|
+
PT: doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt
|
|
23
|
+
TF: doctr/db_resnet50/tf/db_resnet50-adcafc63.zip
|
|
24
|
+
PDF_MINER:
|
|
25
|
+
X_TOLERANCE: 3
|
|
26
|
+
Y_TOLERANCE: 3
|
|
27
|
+
PT:
|
|
8
28
|
CELL:
|
|
9
|
-
|
|
10
|
-
|
|
29
|
+
FILTER: null
|
|
30
|
+
WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
|
|
31
|
+
WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
|
|
11
32
|
ITEM:
|
|
12
|
-
|
|
13
|
-
FILTER:
|
|
14
|
-
PT:
|
|
15
|
-
LAYOUT:
|
|
16
|
-
WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
|
|
17
|
-
WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
|
|
18
|
-
FILTER:
|
|
33
|
+
FILTER: null
|
|
19
34
|
PAD:
|
|
20
|
-
TOP: 60
|
|
21
|
-
RIGHT: 60
|
|
22
35
|
BOTTOM: 60
|
|
23
36
|
LEFT: 60
|
|
24
|
-
|
|
37
|
+
RIGHT: 60
|
|
38
|
+
TOP: 60
|
|
25
39
|
WEIGHTS: item/d2_model_1639999_item_inf_only.pt
|
|
26
40
|
WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
|
|
27
|
-
|
|
41
|
+
LAYOUT:
|
|
42
|
+
FILTER: null
|
|
28
43
|
PAD:
|
|
29
|
-
TOP: 60
|
|
30
|
-
RIGHT: 60
|
|
31
44
|
BOTTOM: 60
|
|
32
45
|
LEFT: 60
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
LAYOUT_NMS_PAIRS:
|
|
38
|
-
COMBINATIONS:
|
|
39
|
-
THRESHOLDS:
|
|
40
|
-
PRIORITY:
|
|
46
|
+
RIGHT: 60
|
|
47
|
+
TOP: 60
|
|
48
|
+
WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
|
|
49
|
+
WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
|
|
41
50
|
SEGMENTATION:
|
|
42
51
|
ASSIGNMENT_RULE: ioa
|
|
43
|
-
THRESHOLD_ROWS: 0.4
|
|
44
|
-
THRESHOLD_COLS: 0.4
|
|
45
|
-
FULL_TABLE_TILING: True
|
|
46
|
-
REMOVE_IOU_THRESHOLD_ROWS: 0.001
|
|
47
|
-
REMOVE_IOU_THRESHOLD_COLS: 0.001
|
|
48
52
|
CELL_CATEGORY_ID: 12
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
53
|
+
CELL_NAMES:
|
|
54
|
+
- header
|
|
55
|
+
- body
|
|
56
|
+
- cell
|
|
57
|
+
FULL_TABLE_TILING: true
|
|
58
|
+
ITEM_NAMES:
|
|
59
|
+
- row
|
|
60
|
+
- column
|
|
61
|
+
PUBTABLES_CELL_NAMES:
|
|
62
|
+
- spanning
|
|
63
|
+
- row_header
|
|
64
|
+
- column_header
|
|
65
|
+
- projected_row_header
|
|
66
|
+
- cell
|
|
67
|
+
PUBTABLES_ITEM_NAMES:
|
|
68
|
+
- row
|
|
69
|
+
- column
|
|
70
|
+
PUBTABLES_SPANNING_CELL_NAMES:
|
|
71
|
+
- spanning
|
|
72
|
+
- row_header
|
|
73
|
+
- column_header
|
|
74
|
+
- projected_row_header
|
|
75
|
+
PUBTABLES_SUB_ITEM_NAMES:
|
|
76
|
+
- row_number
|
|
77
|
+
- column_number
|
|
78
|
+
PUBTABLES_ITEM_HEADER_CELL_NAMES:
|
|
75
79
|
- column_header
|
|
76
|
-
- projected_row_header
|
|
77
|
-
- spanning
|
|
78
80
|
- row_header
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
81
|
+
PUBTABLES_ITEM_HEADER_THRESHOLDS:
|
|
82
|
+
- 0.6
|
|
83
|
+
- 0.0001
|
|
84
|
+
REMOVE_IOU_THRESHOLD_COLS: 0.001
|
|
85
|
+
REMOVE_IOU_THRESHOLD_ROWS: 0.001
|
|
86
|
+
STRETCH_RULE: equal
|
|
87
|
+
SUB_ITEM_NAMES:
|
|
88
|
+
- row_number
|
|
89
|
+
- column_number
|
|
90
|
+
TABLE_NAME: table
|
|
91
|
+
THRESHOLD_COLS: 0.4
|
|
92
|
+
THRESHOLD_ROWS: 0.4
|
|
93
|
+
TEXT_CONTAINER: word
|
|
82
94
|
TEXT_ORDERING:
|
|
83
|
-
TEXT_BLOCK_CATEGORIES:
|
|
84
|
-
- title
|
|
85
|
-
- text
|
|
86
|
-
- list
|
|
87
|
-
- cell
|
|
88
|
-
- column_header
|
|
89
|
-
- projected_row_header
|
|
90
|
-
- spanning
|
|
91
|
-
- row_header
|
|
92
|
-
FLOATING_TEXT_BLOCK_CATEGORIES:
|
|
93
|
-
- title
|
|
94
|
-
- text
|
|
95
|
-
- list
|
|
96
|
-
INCLUDE_RESIDUAL_TEXT_CONTAINER: False
|
|
97
|
-
STARTING_POINT_TOLERANCE: 0.005
|
|
98
95
|
BROKEN_LINE_TOLERANCE: 0.003
|
|
96
|
+
FLOATING_TEXT_BLOCK_CATEGORIES:
|
|
97
|
+
- text
|
|
98
|
+
- title
|
|
99
|
+
- figure
|
|
100
|
+
- list
|
|
99
101
|
HEIGHT_TOLERANCE: 2.0
|
|
102
|
+
INCLUDE_RESIDUAL_TEXT_CONTAINER: false
|
|
100
103
|
PARAGRAPH_BREAK: 0.035
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
104
|
+
STARTING_POINT_TOLERANCE: 0.005
|
|
105
|
+
TEXT_BLOCK_CATEGORIES:
|
|
106
|
+
- text
|
|
107
|
+
- title
|
|
108
|
+
- list
|
|
109
|
+
- cell
|
|
110
|
+
- figure
|
|
111
|
+
- column_header
|
|
112
|
+
- projected_row_header
|
|
113
|
+
- spanning
|
|
114
|
+
- row_header
|
|
115
|
+
TF:
|
|
116
|
+
CELL:
|
|
117
|
+
FILTER: null
|
|
118
|
+
WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
|
|
119
|
+
ITEM:
|
|
120
|
+
FILTER: null
|
|
121
|
+
WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
|
|
122
|
+
LAYOUT:
|
|
123
|
+
FILTER: null
|
|
124
|
+
WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
|
|
125
|
+
USE_LAYOUT: true
|
|
126
|
+
USE_LAYOUT_LINK: false
|
|
127
|
+
USE_LAYOUT_NMS: false
|
|
128
|
+
USE_OCR: true
|
|
129
|
+
USE_PDF_MINER: false
|
|
130
|
+
USE_ROTATOR: false
|
|
131
|
+
USE_TABLE_REFINEMENT: true
|
|
132
|
+
USE_TABLE_SEGMENTATION: true
|
|
133
|
+
WORD_MATCHING:
|
|
134
|
+
MAX_PARENT_ONLY: true
|
|
135
|
+
PARENTAL_CATEGORIES:
|
|
136
|
+
- text
|
|
137
|
+
- title
|
|
138
|
+
- list
|
|
139
|
+
- cell
|
|
140
|
+
- column_header
|
|
141
|
+
- projected_row_header
|
|
142
|
+
- spanning
|
|
143
|
+
- row_header
|
|
144
|
+
RULE: ioa
|
|
145
|
+
THRESHOLD: 0.6
|
|
@@ -491,10 +491,8 @@ def global_to_local_coords(global_box: BoundingBox, embedding_box: BoundingBox)
|
|
|
491
491
|
|
|
492
492
|
def merge_boxes(*boxes: BoundingBox) -> BoundingBox:
|
|
493
493
|
"""
|
|
494
|
-
Generating the smallest box containing an arbitrary tuple/list of boxes.
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
:param boxes: An arbitrary tuple/list of bounding boxes `BoundingBox` all having absolute_coords="True".
|
|
494
|
+
Generating the smallest box containing an arbitrary tuple/list of boxes.
|
|
495
|
+
:param boxes: An arbitrary tuple/list of bounding boxes `BoundingBox`.
|
|
498
496
|
"""
|
|
499
497
|
absolute_coords = boxes[0].absolute_coords
|
|
500
498
|
assert all(box.absolute_coords == absolute_coords for box in boxes), "all boxes must have same absolute_coords"
|
|
@@ -428,7 +428,7 @@ class Image:
|
|
|
428
428
|
A list of attributes to suspend from as_dict creation.
|
|
429
429
|
"""
|
|
430
430
|
|
|
431
|
-
return ["
|
|
431
|
+
return ["_annotation_ids", "_category_name"]
|
|
432
432
|
|
|
433
433
|
def define_annotation_id(self, annotation: Annotation) -> str:
|
|
434
434
|
"""
|
|
@@ -572,24 +572,31 @@ class Image:
|
|
|
572
572
|
ann = self.get_annotation(annotation_ids=annotation_id)[0]
|
|
573
573
|
if ann.image is None:
|
|
574
574
|
raise ImageError("When adding sub images to ImageAnnotation then ImageAnnotation.image must not be None")
|
|
575
|
-
|
|
576
|
-
box = ann.bounding_box.to_list("xyxy")
|
|
575
|
+
box = ann.get_bounding_box(self.image_id).to_list("xyxy")
|
|
577
576
|
proposals = self.get_annotation(category_names)
|
|
578
577
|
points = np.array([prop.get_bounding_box(self.image_id).center for prop in proposals])
|
|
578
|
+
if not points.size:
|
|
579
|
+
return
|
|
579
580
|
ann_ids = np.array([prop.annotation_id for prop in proposals])
|
|
580
581
|
indices = np.where(
|
|
581
582
|
(box[0] < points[:, 0]) & (box[1] < points[:, 1]) & (box[2] > points[:, 0]) & (box[3] > points[:, 1])
|
|
582
583
|
)[0]
|
|
583
584
|
selected_ids = ann_ids[indices]
|
|
584
585
|
sub_images = self.get_annotation(annotation_ids=selected_ids.tolist())
|
|
586
|
+
ann_box = ann.get_bounding_box(self.image_id)
|
|
587
|
+
if not ann_box.absolute_coords:
|
|
588
|
+
ann_box = ann_box.transform(self.width, self.height, absolute_coords=True)
|
|
585
589
|
for sub_image in sub_images:
|
|
586
590
|
if sub_image.image is None:
|
|
587
591
|
raise ImageError(
|
|
588
592
|
"When setting an embedding to ImageAnnotation then ImageAnnotation.image must not be None"
|
|
589
593
|
)
|
|
594
|
+
sub_image_box = sub_image.get_bounding_box(self.image_id)
|
|
595
|
+
if not sub_image_box.absolute_coords:
|
|
596
|
+
sub_image_box = sub_image_box.transform(self.width, self.height, absolute_coords=True)
|
|
590
597
|
sub_image.image.set_embedding(
|
|
591
598
|
annotation_id,
|
|
592
|
-
global_to_local_coords(
|
|
599
|
+
global_to_local_coords(sub_image_box, ann_box),
|
|
593
600
|
)
|
|
594
601
|
ann.image.dump(sub_image)
|
|
595
602
|
|
|
@@ -28,7 +28,7 @@ import numpy as np
|
|
|
28
28
|
from typing_extensions import LiteralString
|
|
29
29
|
|
|
30
30
|
from ..utils.error import AnnotationError, ImageError
|
|
31
|
-
from ..utils.logger import LoggingRecord, logger
|
|
31
|
+
from ..utils.logger import LoggingRecord, log_once, logger
|
|
32
32
|
from ..utils.settings import (
|
|
33
33
|
CellType,
|
|
34
34
|
LayoutType,
|
|
@@ -282,25 +282,103 @@ class Table(Layout):
|
|
|
282
282
|
"""
|
|
283
283
|
|
|
284
284
|
@property
|
|
285
|
-
def cells(self) -> list[
|
|
285
|
+
def cells(self) -> list[Cell]:
|
|
286
286
|
"""
|
|
287
287
|
A list of a table cells.
|
|
288
288
|
"""
|
|
289
289
|
all_relation_ids = self.get_relationship(Relationships.CHILD)
|
|
290
|
-
cell_anns = self.base_page.get_annotation(
|
|
290
|
+
cell_anns: list[Cell] = self.base_page.get_annotation( # type: ignore
|
|
291
291
|
annotation_ids=all_relation_ids,
|
|
292
292
|
category_names=[
|
|
293
293
|
LayoutType.CELL,
|
|
294
294
|
CellType.HEADER,
|
|
295
295
|
CellType.BODY,
|
|
296
|
-
CellType.PROJECTED_ROW_HEADER,
|
|
297
296
|
CellType.SPANNING,
|
|
298
|
-
CellType.ROW_HEADER,
|
|
299
|
-
CellType.COLUMN_HEADER,
|
|
300
297
|
],
|
|
301
298
|
)
|
|
302
299
|
return cell_anns
|
|
303
300
|
|
|
301
|
+
@property
|
|
302
|
+
def column_header_cells(self) -> list[Cell]:
|
|
303
|
+
"""
|
|
304
|
+
Retrieve a list of cells that are column headers in the table.
|
|
305
|
+
|
|
306
|
+
This property filters and sorts the cells in the table to return only those that are column headers.
|
|
307
|
+
The cells are sorted by their column number.
|
|
308
|
+
|
|
309
|
+
:return: A list of `Cell` objects that are column headers.
|
|
310
|
+
"""
|
|
311
|
+
all_relation_ids = self.get_relationship(Relationships.CHILD)
|
|
312
|
+
all_cells: list[Cell] = self.base_page.get_annotation( # type: ignore
|
|
313
|
+
category_names=[LayoutType.CELL, CellType.SPANNING], annotation_ids=all_relation_ids
|
|
314
|
+
)
|
|
315
|
+
headers = list(filter(lambda cell: CellType.COLUMN_HEADER in cell.sub_categories, all_cells))
|
|
316
|
+
headers.sort(key=lambda x: x.column_number) # type: ignore
|
|
317
|
+
return headers
|
|
318
|
+
|
|
319
|
+
@property
|
|
320
|
+
def row_header_cells(self) -> list[Cell]:
|
|
321
|
+
"""
|
|
322
|
+
Retrieve a list of cells that are row headers in the table.
|
|
323
|
+
|
|
324
|
+
This property filters and sorts the cells in the table to return only those that are row headers.
|
|
325
|
+
The cells are sorted by their column number.
|
|
326
|
+
|
|
327
|
+
:return: A list of `Cell` objects that are row headers.
|
|
328
|
+
"""
|
|
329
|
+
all_relation_ids = self.get_relationship(Relationships.CHILD)
|
|
330
|
+
all_cells: list[Cell] = self.base_page.get_annotation( # type: ignore
|
|
331
|
+
category_names=[LayoutType.CELL, CellType.SPANNING], annotation_ids=all_relation_ids
|
|
332
|
+
)
|
|
333
|
+
row_header_cells = list(filter(lambda cell: CellType.ROW_HEADER in cell.sub_categories, all_cells))
|
|
334
|
+
row_header_cells.sort(key=lambda x: x.column_number) # type: ignore
|
|
335
|
+
return row_header_cells
|
|
336
|
+
|
|
337
|
+
def kv_header_rows(self, row_number: int) -> Mapping[str, str]:
|
|
338
|
+
"""
|
|
339
|
+
For a given row number, returns a dictionary mapping column headers to cell values in that row.
|
|
340
|
+
|
|
341
|
+
This method retrieves all cells in the specified row and matches them with their corresponding column headers.
|
|
342
|
+
It then creates a key-value pair where the key is a tuple containing the column number and header text,
|
|
343
|
+
and the value is the cell text.
|
|
344
|
+
|
|
345
|
+
:param row_number: The row number for which to retrieve the key-value pairs.
|
|
346
|
+
:return: A dictionary where keys are tuples of (column number, header text) and values are cell texts.
|
|
347
|
+
|
|
348
|
+
Example:
|
|
349
|
+
If the table has the following structure:
|
|
350
|
+
| Header1 | Header2 |
|
|
351
|
+
|---------|---------|
|
|
352
|
+
| Value1 | Value2 |
|
|
353
|
+
| Value3 | Value4 |
|
|
354
|
+
|
|
355
|
+
Calling kv_header_rows(1) would return:
|
|
356
|
+
{
|
|
357
|
+
(1, 'Header1'): 'Value1',
|
|
358
|
+
(2, 'Header2'): 'Value2'
|
|
359
|
+
}
|
|
360
|
+
"""
|
|
361
|
+
all_relation_ids = self.get_relationship(Relationships.CHILD)
|
|
362
|
+
all_cells = self.base_page.get_annotation(
|
|
363
|
+
category_names=[LayoutType.CELL, CellType.SPANNING], annotation_ids=all_relation_ids
|
|
364
|
+
)
|
|
365
|
+
row_cells = list(
|
|
366
|
+
filter(
|
|
367
|
+
lambda c: row_number in (c.row_number, c.row_number + c.row_span), all_cells # type: ignore
|
|
368
|
+
)
|
|
369
|
+
)
|
|
370
|
+
row_cells.sort(key=lambda c: c.column_number) # type: ignore
|
|
371
|
+
column_header_cells = self.column_header_cells
|
|
372
|
+
|
|
373
|
+
kv_dict: Mapping[str, str] = {}
|
|
374
|
+
for cell in row_cells:
|
|
375
|
+
for header in column_header_cells:
|
|
376
|
+
if (cell.column_number == header.column_number and # type: ignore
|
|
377
|
+
cell.annotation_id != header.annotation_id): # type: ignore
|
|
378
|
+
kv_dict[(header.column_number, header.text)] = cell.text # type: ignore
|
|
379
|
+
break
|
|
380
|
+
return kv_dict
|
|
381
|
+
|
|
304
382
|
@property
|
|
305
383
|
def rows(self) -> list[ImageAnnotationBaseView]:
|
|
306
384
|
"""
|
|
@@ -335,7 +413,7 @@ class Table(Layout):
|
|
|
335
413
|
try:
|
|
336
414
|
html_index = html_list.index(cell.annotation_id)
|
|
337
415
|
html_list.pop(html_index)
|
|
338
|
-
html_list.insert(html_index, cell.text)
|
|
416
|
+
html_list.insert(html_index, cell.text)
|
|
339
417
|
except ValueError:
|
|
340
418
|
logger.warning(LoggingRecord("html construction not possible", {"annotation_id": cell.annotation_id}))
|
|
341
419
|
|
|
@@ -357,6 +435,12 @@ class Table(Layout):
|
|
|
357
435
|
cells = self.cells
|
|
358
436
|
table_list = [["" for _ in range(self.number_of_columns)] for _ in range(self.number_of_rows)] # type: ignore
|
|
359
437
|
for cell in cells:
|
|
438
|
+
if cell.category_name == CellType.SPANNING:
|
|
439
|
+
log_once(
|
|
440
|
+
"Table has spanning cells. This implies, that the .csv output will not be correct."
|
|
441
|
+
"To prevent spanning cell table creation set PT.ITEM.FILTER=['table','spanning'] ",
|
|
442
|
+
"error",
|
|
443
|
+
)
|
|
360
444
|
table_list[cell.row_number - 1][cell.column_number - 1] = ( # type: ignore
|
|
361
445
|
table_list[cell.row_number - 1][cell.column_number - 1] + cell.text + " " # type: ignore
|
|
362
446
|
)
|
|
@@ -386,13 +470,13 @@ class Table(Layout):
|
|
|
386
470
|
token_class_ids: list[str] = []
|
|
387
471
|
token_tag_ids: list[str] = []
|
|
388
472
|
for cell in cells:
|
|
389
|
-
text.extend(cell.text_["text"])
|
|
390
|
-
words.extend(cell.text_["words"])
|
|
391
|
-
ann_ids.extend(cell.text_["ann_ids"])
|
|
392
|
-
token_classes.extend(cell.text_["token_classes"])
|
|
393
|
-
token_tags.extend(cell.text_["token_tags"])
|
|
394
|
-
token_class_ids.extend(cell.text_["token_class_ids"])
|
|
395
|
-
token_tag_ids.extend(cell.text_["token_tag_ids"])
|
|
473
|
+
text.extend(cell.text_["text"])
|
|
474
|
+
words.extend(cell.text_["words"])
|
|
475
|
+
ann_ids.extend(cell.text_["ann_ids"])
|
|
476
|
+
token_classes.extend(cell.text_["token_classes"])
|
|
477
|
+
token_tags.extend(cell.text_["token_tags"])
|
|
478
|
+
token_class_ids.extend(cell.text_["token_class_ids"])
|
|
479
|
+
token_tag_ids.extend(cell.text_["token_tag_ids"])
|
|
396
480
|
return {
|
|
397
481
|
"text": " ".join(text),
|
|
398
482
|
"words": words,
|
|
@@ -414,7 +498,7 @@ class Table(Layout):
|
|
|
414
498
|
if not cells:
|
|
415
499
|
return super().words
|
|
416
500
|
for cell in cells:
|
|
417
|
-
all_words.extend(cell.words)
|
|
501
|
+
all_words.extend(cell.words)
|
|
418
502
|
return all_words
|
|
419
503
|
|
|
420
504
|
def get_ordered_words(self) -> list[ImageAnnotationBaseView]:
|
|
@@ -424,7 +508,7 @@ class Table(Layout):
|
|
|
424
508
|
all_words = []
|
|
425
509
|
cells.sort(key=lambda x: (x.ROW_NUMBER, x.COLUMN_NUMBER))
|
|
426
510
|
for cell in cells:
|
|
427
|
-
all_words.extend(cell.get_ordered_words())
|
|
511
|
+
all_words.extend(cell.get_ordered_words())
|
|
428
512
|
return all_words
|
|
429
513
|
except (TypeError, AnnotationError):
|
|
430
514
|
return super().get_ordered_words()
|
|
@@ -436,10 +520,10 @@ IMAGE_ANNOTATION_TO_LAYOUTS: dict[ObjectTypes, Type[Union[Layout, Table, Word]]]
|
|
|
436
520
|
LayoutType.TABLE_ROTATED: Table,
|
|
437
521
|
LayoutType.WORD: Word,
|
|
438
522
|
LayoutType.CELL: Cell,
|
|
439
|
-
CellType.PROJECTED_ROW_HEADER: Cell,
|
|
440
523
|
CellType.SPANNING: Cell,
|
|
441
524
|
CellType.ROW_HEADER: Cell,
|
|
442
525
|
CellType.COLUMN_HEADER: Cell,
|
|
526
|
+
CellType.PROJECTED_ROW_HEADER: Cell,
|
|
443
527
|
}
|
|
444
528
|
|
|
445
529
|
|
|
@@ -465,10 +549,7 @@ IMAGE_DEFAULTS: ImageDefaults = {
|
|
|
465
549
|
LayoutType.LIST,
|
|
466
550
|
LayoutType.CELL,
|
|
467
551
|
LayoutType.FIGURE,
|
|
468
|
-
CellType.COLUMN_HEADER,
|
|
469
|
-
CellType.PROJECTED_ROW_HEADER,
|
|
470
552
|
CellType.SPANNING,
|
|
471
|
-
CellType.ROW_HEADER,
|
|
472
553
|
),
|
|
473
554
|
}
|
|
474
555
|
|
|
@@ -851,6 +932,16 @@ class Page(Image):
|
|
|
851
932
|
"""
|
|
852
933
|
return self._make_text(False)
|
|
853
934
|
|
|
935
|
+
def _ann_viz_bbox(self, ann: ImageAnnotationBaseView) -> list[float]:
|
|
936
|
+
"""
|
|
937
|
+
Get the bounding box as list and in absolute coordinates of the base page.
|
|
938
|
+
"""
|
|
939
|
+
bounding_box = ann.get_bounding_box(self.image_id)
|
|
940
|
+
|
|
941
|
+
if not bounding_box.absolute_coords:
|
|
942
|
+
bounding_box = bounding_box.transform(self.width, self.height, absolute_coords=True)
|
|
943
|
+
return bounding_box.to_list(mode="xyxy")
|
|
944
|
+
|
|
854
945
|
@no_type_check
|
|
855
946
|
def viz(
|
|
856
947
|
self,
|
|
@@ -886,6 +977,7 @@ class Page(Image):
|
|
|
886
977
|
:param show_tables: Will display all tables boxes as well as cells, rows and columns
|
|
887
978
|
:param show_layouts: Will display all other layout components.
|
|
888
979
|
:param show_figures: Will display all figures
|
|
980
|
+
:param show_residual_layouts: Will display all residual layouts
|
|
889
981
|
:param show_cells: Will display cells within tables. (Only available if `show_tables=True`)
|
|
890
982
|
:param show_table_structure: Will display rows and columns
|
|
891
983
|
:param show_words: Will display bounding boxes around words labeled with token class and bio tag (experimental)
|
|
@@ -910,50 +1002,46 @@ class Page(Image):
|
|
|
910
1002
|
if debug_kwargs:
|
|
911
1003
|
anns = self.get_annotation(category_names=list(debug_kwargs.keys()))
|
|
912
1004
|
for ann in anns:
|
|
913
|
-
box_stack.append(ann
|
|
1005
|
+
box_stack.append(self._ann_viz_bbox(ann))
|
|
914
1006
|
category_names_list.append(str(getattr(ann, debug_kwargs[ann.category_name])))
|
|
915
1007
|
|
|
916
1008
|
if show_layouts and not debug_kwargs:
|
|
917
1009
|
for item in self.layouts:
|
|
918
|
-
box_stack.append(item
|
|
1010
|
+
box_stack.append(self._ann_viz_bbox(item))
|
|
919
1011
|
category_names_list.append(item.category_name.value)
|
|
920
1012
|
|
|
921
1013
|
if show_figures and not debug_kwargs:
|
|
922
1014
|
for item in self.figures:
|
|
923
|
-
box_stack.append(item
|
|
1015
|
+
box_stack.append(self._ann_viz_bbox(item))
|
|
924
1016
|
category_names_list.append(item.category_name.value)
|
|
925
1017
|
|
|
926
1018
|
if show_tables and not debug_kwargs:
|
|
927
1019
|
for table in self.tables:
|
|
928
|
-
box_stack.append(table
|
|
1020
|
+
box_stack.append(self._ann_viz_bbox(table))
|
|
929
1021
|
category_names_list.append(LayoutType.TABLE.value)
|
|
930
1022
|
if show_cells:
|
|
931
1023
|
for cell in table.cells:
|
|
932
1024
|
if cell.category_name in {
|
|
933
1025
|
LayoutType.CELL,
|
|
934
|
-
CellType.PROJECTED_ROW_HEADER,
|
|
935
1026
|
CellType.SPANNING,
|
|
936
|
-
CellType.ROW_HEADER,
|
|
937
|
-
CellType.COLUMN_HEADER,
|
|
938
1027
|
}:
|
|
939
1028
|
cells_found = True
|
|
940
|
-
box_stack.append(cell
|
|
1029
|
+
box_stack.append(self._ann_viz_bbox(cell))
|
|
941
1030
|
category_names_list.append(None)
|
|
942
1031
|
if show_table_structure:
|
|
943
1032
|
rows = table.rows
|
|
944
1033
|
cols = table.columns
|
|
945
1034
|
for row in rows:
|
|
946
|
-
box_stack.append(row
|
|
1035
|
+
box_stack.append(self._ann_viz_bbox(row))
|
|
947
1036
|
category_names_list.append(None)
|
|
948
1037
|
for col in cols:
|
|
949
|
-
box_stack.append(col
|
|
1038
|
+
box_stack.append(self._ann_viz_bbox(col))
|
|
950
1039
|
category_names_list.append(None)
|
|
951
1040
|
|
|
952
1041
|
if show_cells and not cells_found and not debug_kwargs:
|
|
953
|
-
for ann in self.
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
category_names_list.append(None)
|
|
1042
|
+
for ann in self.get_annotation(category_names=[LayoutType.CELL, CellType.SPANNING]):
|
|
1043
|
+
box_stack.append(self._ann_viz_bbox(ann))
|
|
1044
|
+
category_names_list.append(None)
|
|
957
1045
|
|
|
958
1046
|
if show_words and not debug_kwargs:
|
|
959
1047
|
all_words = []
|
|
@@ -965,7 +1053,7 @@ class Page(Image):
|
|
|
965
1053
|
all_words = self.get_annotation(category_names=LayoutType.WORD)
|
|
966
1054
|
if not ignore_default_token_class:
|
|
967
1055
|
for word in all_words:
|
|
968
|
-
box_stack.append(word
|
|
1056
|
+
box_stack.append(self._ann_viz_bbox(word))
|
|
969
1057
|
if show_token_class:
|
|
970
1058
|
category_names_list.append(word.token_class.value if word.token_class is not None else None)
|
|
971
1059
|
else:
|
|
@@ -973,7 +1061,7 @@ class Page(Image):
|
|
|
973
1061
|
else:
|
|
974
1062
|
for word in all_words:
|
|
975
1063
|
if word.token_class is not None and word.token_class != TokenClasses.OTHER:
|
|
976
|
-
box_stack.append(word
|
|
1064
|
+
box_stack.append(self._ann_viz_bbox(word))
|
|
977
1065
|
if show_token_class:
|
|
978
1066
|
category_names_list.append(word.token_class.value if word.token_class is not None else None)
|
|
979
1067
|
else:
|