deepdoctection 0.33__tar.gz → 0.34__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (152)
  1. {deepdoctection-0.33 → deepdoctection-0.34}/PKG-INFO +4 -1
  2. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/__init__.py +6 -3
  3. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/analyzer/dd.py +39 -31
  4. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datapoint/annotation.py +40 -2
  5. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datapoint/image.py +117 -41
  6. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datapoint/view.py +1 -1
  7. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/base.py +1 -1
  8. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/fintabnet.py +1 -1
  9. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/xfund.py +29 -7
  10. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/eval/eval.py +7 -1
  11. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/model.py +2 -1
  12. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/cats.py +11 -13
  13. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/cocostruct.py +6 -2
  14. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/d2struct.py +2 -1
  15. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/laylmstruct.py +1 -1
  16. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/match.py +31 -0
  17. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/misc.py +1 -1
  18. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/prodigystruct.py +1 -1
  19. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/anngen.py +27 -0
  20. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/base.py +23 -0
  21. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/common.py +123 -38
  22. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/segment.py +1 -1
  23. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/sub_layout.py +1 -1
  24. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/env_info.py +1 -1
  25. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/fs.py +27 -4
  26. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/pdf_utils.py +28 -3
  27. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/settings.py +3 -0
  28. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection.egg-info/PKG-INFO +4 -1
  29. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection.egg-info/requires.txt +3 -0
  30. {deepdoctection-0.33 → deepdoctection-0.34}/setup.cfg +3 -0
  31. {deepdoctection-0.33 → deepdoctection-0.34}/setup.py +3 -1
  32. {deepdoctection-0.33 → deepdoctection-0.34}/LICENSE +0 -0
  33. {deepdoctection-0.33 → deepdoctection-0.34}/README.md +0 -0
  34. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/analyzer/__init__.py +0 -0
  35. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/configs/__init__.py +0 -0
  36. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/configs/conf_dd_one.yaml +0 -0
  37. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/configs/conf_tesseract.yaml +0 -0
  38. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/dataflow/__init__.py +0 -0
  39. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/dataflow/base.py +0 -0
  40. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/dataflow/common.py +0 -0
  41. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/dataflow/custom.py +0 -0
  42. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/dataflow/custom_serialize.py +0 -0
  43. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/dataflow/parallel_map.py +0 -0
  44. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/dataflow/serialize.py +0 -0
  45. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/dataflow/stats.py +0 -0
  46. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datapoint/__init__.py +0 -0
  47. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datapoint/box.py +0 -0
  48. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datapoint/convert.py +0 -0
  49. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/__init__.py +0 -0
  50. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/adapter.py +0 -0
  51. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/dataflow_builder.py +0 -0
  52. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/info.py +0 -0
  53. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/__init__.py +0 -0
  54. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/doclaynet.py +0 -0
  55. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/funsd.py +0 -0
  56. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/iiitar13k.py +0 -0
  57. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/layouttest.py +0 -0
  58. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/publaynet.py +0 -0
  59. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/pubtables1m.py +0 -0
  60. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/pubtabnet.py +0 -0
  61. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/rvlcdip.py +0 -0
  62. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
  63. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
  64. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/registry.py +0 -0
  65. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/save.py +0 -0
  66. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/eval/__init__.py +0 -0
  67. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/eval/accmetric.py +0 -0
  68. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/eval/base.py +0 -0
  69. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/eval/cocometric.py +0 -0
  70. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/eval/registry.py +0 -0
  71. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/eval/tedsmetric.py +0 -0
  72. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/eval/tp_eval_callback.py +0 -0
  73. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/__init__.py +0 -0
  74. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/base.py +0 -0
  75. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/d2detect.py +0 -0
  76. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/deskew.py +0 -0
  77. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/doctrocr.py +0 -0
  78. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/fastlang.py +0 -0
  79. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/hfdetr.py +0 -0
  80. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/hflayoutlm.py +0 -0
  81. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/hflm.py +0 -0
  82. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/pdftext.py +0 -0
  83. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/pt/__init__.py +0 -0
  84. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/pt/nms.py +0 -0
  85. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/pt/ptutils.py +0 -0
  86. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tessocr.py +0 -0
  87. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/texocr.py +0 -0
  88. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/__init__.py +0 -0
  89. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tfutils.py +0 -0
  90. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpcompat.py +0 -0
  91. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
  92. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
  93. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
  94. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -0
  95. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
  96. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -0
  97. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
  98. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -0
  99. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
  100. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -0
  101. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -0
  102. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -0
  103. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
  104. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
  105. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -0
  106. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
  107. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
  108. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
  109. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/tpdetect.py +0 -0
  110. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/__init__.py +0 -0
  111. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/hfstruct.py +0 -0
  112. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/maputils.py +0 -0
  113. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/pascalstruct.py +0 -0
  114. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/pubstruct.py +0 -0
  115. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/tpstruct.py +0 -0
  116. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/mapper/xfundstruct.py +0 -0
  117. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/__init__.py +0 -0
  118. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/concurrency.py +0 -0
  119. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/doctectionpipe.py +0 -0
  120. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/language.py +0 -0
  121. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/layout.py +0 -0
  122. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/lm.py +0 -0
  123. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/order.py +0 -0
  124. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/refine.py +0 -0
  125. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/registry.py +0 -0
  126. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/text.py +0 -0
  127. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/pipe/transform.py +0 -0
  128. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/py.typed +0 -0
  129. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/train/__init__.py +0 -0
  130. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/train/d2_frcnn_train.py +0 -0
  131. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/train/hf_detr_train.py +0 -0
  132. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/train/hf_layoutlm_train.py +0 -0
  133. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/train/tp_frcnn_train.py +0 -0
  134. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/__init__.py +0 -0
  135. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/concurrency.py +0 -0
  136. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/context.py +0 -0
  137. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/develop.py +0 -0
  138. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/error.py +0 -0
  139. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/file_utils.py +0 -0
  140. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/identifier.py +0 -0
  141. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/logger.py +0 -0
  142. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/metacfg.py +0 -0
  143. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/mocks.py +0 -0
  144. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/tqdm.py +0 -0
  145. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/transform.py +0 -0
  146. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/types.py +0 -0
  147. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/utils.py +0 -0
  148. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/utils/viz.py +0 -0
  149. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection.egg-info/SOURCES.txt +0 -0
  150. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection.egg-info/dependency_links.txt +0 -0
  151. {deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection.egg-info/top_level.txt +0 -0
  152. {deepdoctection-0.33 → deepdoctection-0.34}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepdoctection
3
- Version: 0.33
3
+ Version: 0.34
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -29,6 +29,7 @@ Requires-Dist: Pillow>=10.0.0
29
29
  Requires-Dist: pypdf>=3.16.0
30
30
  Requires-Dist: pyyaml>=6.0.1
31
31
  Requires-Dist: pyzmq>=16
32
+ Requires-Dist: scipy>=1.13.1
32
33
  Requires-Dist: termcolor>=1.1
33
34
  Requires-Dist: tabulate>=0.7.7
34
35
  Requires-Dist: tqdm==4.64.0
@@ -46,6 +47,7 @@ Requires-Dist: Pillow>=10.0.0; extra == "tf"
46
47
  Requires-Dist: pypdf>=3.16.0; extra == "tf"
47
48
  Requires-Dist: pyyaml>=6.0.1; extra == "tf"
48
49
  Requires-Dist: pyzmq>=16; extra == "tf"
50
+ Requires-Dist: scipy>=1.13.1; extra == "tf"
49
51
  Requires-Dist: termcolor>=1.1; extra == "tf"
50
52
  Requires-Dist: tabulate>=0.7.7; extra == "tf"
51
53
  Requires-Dist: tqdm==4.64.0; extra == "tf"
@@ -76,6 +78,7 @@ Requires-Dist: Pillow>=10.0.0; extra == "pt"
76
78
  Requires-Dist: pypdf>=3.16.0; extra == "pt"
77
79
  Requires-Dist: pyyaml>=6.0.1; extra == "pt"
78
80
  Requires-Dist: pyzmq>=16; extra == "pt"
81
+ Requires-Dist: scipy>=1.13.1; extra == "pt"
79
82
  Requires-Dist: termcolor>=1.1; extra == "pt"
80
83
  Requires-Dist: tabulate>=0.7.7; extra == "pt"
81
84
  Requires-Dist: tqdm==4.64.0; extra == "pt"
@@ -15,7 +15,6 @@ if importlib.util.find_spec("dotenv") is not None:
15
15
 
16
16
 
17
17
  # pylint: disable=wrong-import-position
18
- import os
19
18
  import sys
20
19
  from typing import TYPE_CHECKING
21
20
 
@@ -25,11 +24,10 @@ from .utils.logger import LoggingRecord, logger
25
24
 
26
25
  # pylint: enable=wrong-import-position
27
26
 
28
- __version__ = 0.33
27
+ __version__ = 0.34
29
28
 
30
29
  _IMPORT_STRUCTURE = {
31
30
  "analyzer": [
32
- "maybe_copy_config_to_cache",
33
31
  "config_sanity_checks",
34
32
  "build_detector",
35
33
  "build_padder",
@@ -76,6 +74,7 @@ _IMPORT_STRUCTURE = {
76
74
  ],
77
75
  "datapoint": [
78
76
  "ann_from_dict",
77
+ "AnnotationMap",
79
78
  "Annotation",
80
79
  "CategoryAnnotation",
81
80
  "ImageAnnotation",
@@ -237,6 +236,7 @@ _IMPORT_STRUCTURE = {
237
236
  "LabelSummarizer",
238
237
  "curry",
239
238
  "match_anns_by_intersection",
239
+ "match_anns_by_distance",
240
240
  "to_image",
241
241
  "maybe_load_image",
242
242
  "maybe_remove_image",
@@ -265,6 +265,8 @@ _IMPORT_STRUCTURE = {
265
265
  "DetectResultGenerator",
266
266
  "SubImageLayoutService",
267
267
  "ImageCroppingService",
268
+ "IntersectionMatcher",
269
+ "NeighbourMatcher",
268
270
  "MatchingService",
269
271
  "PageParsingService",
270
272
  "AnnotationNmsService",
@@ -364,6 +366,7 @@ _IMPORT_STRUCTURE = {
364
366
  "get_configs_dir_path",
365
367
  "get_weights_dir_path",
366
368
  "get_dataset_dir_path",
369
+ "maybe_copy_config_to_cache",
367
370
  "is_uuid_like",
368
371
  "get_uuid_from_str",
369
372
  "get_uuid",
@@ -27,7 +27,6 @@ from __future__ import annotations
27
27
 
28
28
  import os
29
29
  from os import environ
30
- from shutil import copyfile
31
30
  from typing import Optional, Union
32
31
 
33
32
  from lazy_imports import try_import
@@ -44,7 +43,7 @@ from ..extern.texocr import TextractOcrDetector
44
43
  from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
45
44
  from ..extern.tpdetect import TPFrcnnDetector
46
45
  from ..pipe.base import PipelineComponent
47
- from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
46
+ from ..pipe.common import AnnotationNmsService, IntersectionMatcher, MatchingService, PageParsingService
48
47
  from ..pipe.doctectionpipe import DoctectionPipe
49
48
  from ..pipe.layout import ImageLayoutService
50
49
  from ..pipe.order import TextOrderService
@@ -55,10 +54,10 @@ from ..pipe.text import TextExtractionService
55
54
  from ..utils.env_info import ENV_VARS_TRUE
56
55
  from ..utils.error import DependencyError
57
56
  from ..utils.file_utils import detectron2_available, tensorpack_available
58
- from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
57
+ from ..utils.fs import get_configs_dir_path, get_package_path, maybe_copy_config_to_cache
59
58
  from ..utils.logger import LoggingRecord, logger
60
59
  from ..utils.metacfg import AttrDict, set_config_by_yaml
61
- from ..utils.settings import CellType, LayoutType
60
+ from ..utils.settings import CellType, LayoutType, Relationships
62
61
  from ..utils.transform import PadTransform
63
62
  from ..utils.types import PathLikeOrStr
64
63
 
@@ -67,7 +66,6 @@ with try_import() as image_guard:
67
66
 
68
67
 
69
68
  __all__ = [
70
- "maybe_copy_config_to_cache",
71
69
  "config_sanity_checks",
72
70
  "build_detector",
73
71
  "build_padder",
@@ -77,31 +75,37 @@ __all__ = [
77
75
  "build_doctr_word",
78
76
  "get_dd_analyzer",
79
77
  "build_analyzer",
78
+ "set_config_by_yaml",
80
79
  ]
81
80
 
82
81
  _DD_ONE = "deepdoctection/configs/conf_dd_one.yaml"
83
82
  _TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"
84
-
85
-
86
- def maybe_copy_config_to_cache(
87
- package_path: PathLikeOrStr, configs_dir_path: PathLikeOrStr, file_name: str, force_copy: bool = True
88
- ) -> str:
89
- """
90
- Initial copying of various files
91
- :param package_path: base path to directory of source file `file_name`
92
- :param configs_dir_path: base path to target directory
93
- :param file_name: file to copy
94
- :param force_copy: If file is already in target directory, will re-copy the file
95
-
96
- :return: path to the copied file_name
97
- """
98
-
99
- absolute_path_source = os.path.join(package_path, file_name)
100
- absolute_path = os.path.join(configs_dir_path, os.path.join("dd", os.path.split(file_name)[1]))
101
- mkdir_p(os.path.split(absolute_path)[0])
102
- if not os.path.isfile(absolute_path) or force_copy:
103
- copyfile(absolute_path_source, absolute_path)
104
- return absolute_path
83
+ _MODEL_CHOICES = {
84
+ "layout": [
85
+ "layout/d2_model_0829999_layout_inf_only.pt",
86
+ "xrf_layout/model_final_inf_only.pt",
87
+ "microsoft/table-transformer-detection/pytorch_model.bin",
88
+ ],
89
+ "segmentation": [
90
+ "item/model-1620000_inf_only.data-00000-of-00001",
91
+ "xrf_item/model_final_inf_only.pt",
92
+ "microsoft/table-transformer-structure-recognition/pytorch_model.bin",
93
+ "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin",
94
+ ],
95
+ "ocr": ["Tesseract", "DocTr", "Textract"],
96
+ "doctr_word": ["doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"],
97
+ "doctr_recognition": [
98
+ "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt",
99
+ "doctr/crnn_vgg16_bn/pt/pytorch_model.bin",
100
+ ],
101
+ "llm": ["gpt-3.5-turbo", "gpt-4"],
102
+ "segmentation_choices": {
103
+ "item/model-1620000_inf_only.data-00000-of-00001": "cell/model-1800000_inf_only.data-00000-of-00001",
104
+ "xrf_item/model_final_inf_only.pt": "xrf_cell/model_final_inf_only.pt",
105
+ "microsoft/table-transformer-structure-recognition/pytorch_model.bin": None,
106
+ "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin": None,
107
+ },
108
+ }
105
109
 
106
110
 
107
111
  def config_sanity_checks(cfg: AttrDict) -> None:
@@ -375,13 +379,17 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
375
379
  pipe_component_list.append(text)
376
380
 
377
381
  if cfg.USE_PDF_MINER or cfg.USE_OCR:
378
- match = MatchingService(
379
- parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
380
- child_categories=LayoutType.WORD,
382
+ matcher = IntersectionMatcher(
381
383
  matching_rule=cfg.WORD_MATCHING.RULE,
382
384
  threshold=cfg.WORD_MATCHING.THRESHOLD,
383
385
  max_parent_only=cfg.WORD_MATCHING.MAX_PARENT_ONLY,
384
386
  )
387
+ match = MatchingService(
388
+ parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
389
+ child_categories=LayoutType.WORD,
390
+ matcher=matcher,
391
+ relationship_key=Relationships.CHILD,
392
+ )
385
393
  pipe_component_list.append(match)
386
394
 
387
395
  order = TextOrderService(
@@ -444,9 +452,9 @@ def get_dd_analyzer(
444
452
  else:
445
453
  raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
446
454
  dd_one_config_path = maybe_copy_config_to_cache(
447
- get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
455
+ get_package_path(), get_configs_dir_path() / "dd", _DD_ONE, reset_config_file
448
456
  )
449
- maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path(), _TESSERACT)
457
+ maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path() / "dd", _TESSERACT)
450
458
 
451
459
  # Set up of the configuration and logging
452
460
  cfg = set_config_by_yaml(dd_one_config_path if not path_config_file else path_config_file)
@@ -21,6 +21,7 @@ Dataclass for annotations and their derived classes.
21
21
  from __future__ import annotations
22
22
 
23
23
  from abc import ABC, abstractmethod
24
+ from collections import defaultdict
24
25
  from dataclasses import dataclass, field
25
26
  from typing import Optional, Union, no_type_check
26
27
 
@@ -66,6 +67,16 @@ def ann_from_dict(cls, **kwargs: AnnotationDict):
66
67
  return ann
67
68
 
68
69
 
70
+ @dataclass(frozen=True)
71
+ class AnnotationMap:
72
+ """AnnotationMap to store all sub categories, relationship keys and summary keys of an annotation"""
73
+
74
+ image_annotation_id: str
75
+ sub_category_key: Optional[ObjectTypes] = None
76
+ relationship_key: Optional[ObjectTypes] = None
77
+ summary_key: Optional[ObjectTypes] = None
78
+
79
+
69
80
  @dataclass
70
81
  class Annotation(ABC):
71
82
  """
@@ -397,7 +408,8 @@ class CategoryAnnotation(Annotation):
397
408
  except ValueError:
398
409
  logger.warning(LoggingRecord(f"Relationship {key} cannot be removed because it does not exist"))
399
410
  else:
400
- self.relationships[key].clear()
411
+ if key in self.relationships:
412
+ self.relationships[key].clear()
401
413
 
402
414
  def get_defining_attributes(self) -> list[str]:
403
415
  return ["category_name", "category_id"]
@@ -409,7 +421,7 @@ class CategoryAnnotation(Annotation):
409
421
 
410
422
  :return: list of attributes.
411
423
  """
412
- return []
424
+ return ["_category_name"]
413
425
 
414
426
  @classmethod
415
427
  def from_dict(cls, **kwargs: AnnotationDict) -> CategoryAnnotation:
@@ -470,6 +482,32 @@ class ImageAnnotation(CategoryAnnotation):
470
482
  return self.image.summary.get_sub_category(key)
471
483
  raise AnnotationError(f"Summary does not exist for {self.annotation_id} and key: {key}")
472
484
 
485
+ def get_annotation_map(self) -> defaultdict[str, list[AnnotationMap]]:
486
+ """
487
+ Returns a defaultdict with annotation ids as keys and a list of AnnotationMap instances as values for all sub
488
+ categories, relationships and image summaries.
489
+ :return: defaultdict with annotation ids as keys and a list of AnnotationMap instances as values.
490
+ """
491
+ annotation_id_dict = defaultdict(list)
492
+ annotation_id_dict[self.annotation_id].append(AnnotationMap(image_annotation_id=self.annotation_id))
493
+ for sub_cat_key in self.sub_categories:
494
+ sub_cat = self.get_sub_category(sub_cat_key)
495
+ annotation_id_dict[sub_cat.annotation_id].append(
496
+ AnnotationMap(image_annotation_id=self.annotation_id, sub_category_key=sub_cat_key)
497
+ )
498
+ if self.image is not None:
499
+ for summary_cat_key in self.image.summary.sub_categories:
500
+ summary_cat = self.get_summary(summary_cat_key)
501
+ annotation_id_dict[summary_cat.annotation_id].append(
502
+ AnnotationMap(image_annotation_id=self.annotation_id, summary_key=summary_cat_key)
503
+ )
504
+ for rel_key in self.relationships:
505
+ for rel_ann_ids in self.get_relationship(rel_key):
506
+ annotation_id_dict[rel_ann_ids].append(
507
+ AnnotationMap(image_annotation_id=self.annotation_id, relationship_key=rel_key)
508
+ )
509
+ return annotation_id_dict
510
+
473
511
 
474
512
  @dataclass
475
513
  class ContainerAnnotation(CategoryAnnotation):
@@ -21,10 +21,11 @@ Dataclass Image
21
21
  from __future__ import annotations
22
22
 
23
23
  import json
24
+ from collections import defaultdict
24
25
  from dataclasses import dataclass, field
25
26
  from os import environ
26
27
  from pathlib import Path
27
- from typing import Any, Iterable, Optional, Sequence, Union, no_type_check
28
+ from typing import Any, Optional, Sequence, Union, no_type_check
28
29
 
29
30
  import numpy as np
30
31
  from numpy import uint8
@@ -33,7 +34,7 @@ from ..utils.error import AnnotationError, BoundingBoxError, ImageError, UUIDErr
33
34
  from ..utils.identifier import get_uuid, is_uuid_like
34
35
  from ..utils.settings import ObjectTypes, SummaryType, get_type
35
36
  from ..utils.types import ImageDict, PathLikeOrStr, PixelValues
36
- from .annotation import Annotation, BoundingBox, CategoryAnnotation, ImageAnnotation
37
+ from .annotation import Annotation, AnnotationMap, BoundingBox, CategoryAnnotation, ImageAnnotation
37
38
  from .box import crop_box_from_image, global_to_local_coords, intersection_box
38
39
  from .convert import as_dict, convert_b64_to_np_array, convert_np_array_to_b64, convert_pdf_bytes_to_np_array_v2
39
40
 
@@ -303,6 +304,15 @@ class Image:
303
304
 
304
305
  return self.embeddings[image_id]
305
306
 
307
+ def remove_embedding(self, image_id: str) -> None:
308
+ """
309
+ Remove an embedding from the image.
310
+
311
+ :param image_id: uuid string of the embedding image
312
+ """
313
+ if image_id in self.embeddings:
314
+ self.embeddings.pop(image_id)
315
+
306
316
  def _self_embedding(self) -> None:
307
317
  if self._bbox is not None:
308
318
  self.set_embedding(self.image_id, self._bbox)
@@ -387,39 +397,6 @@ class Image:
387
397
 
388
398
  return list(anns)
389
399
 
390
- def get_annotation_iter(
391
- self,
392
- category_names: Optional[Union[str, ObjectTypes, Sequence[Union[str, ObjectTypes]]]] = None,
393
- annotation_ids: Optional[Union[str, Sequence[str]]] = None,
394
- service_id: Optional[Union[str, Sequence[str]]] = None,
395
- model_id: Optional[Union[str, Sequence[str]]] = None,
396
- session_ids: Optional[Union[str, Sequence[str]]] = None,
397
- ignore_inactive: bool = True,
398
- ) -> Iterable[ImageAnnotation]:
399
- """
400
- Get annotation as an iterator. Same as `get_annotation` but returns an iterator instead of a list.
401
-
402
- :param category_names: A single name or list of names
403
- :param annotation_ids: A single id or list of ids
404
- :param service_id: A single service name or list of service names
405
- :param model_id: A single model name or list of model names
406
- :param session_ids: A single session id or list of session ids
407
- :param ignore_inactive: If set to `True` only active annotations are returned.
408
-
409
- :return: A (possibly empty) list of annotations
410
- """
411
-
412
- return iter(
413
- self.get_annotation(
414
- category_names=category_names,
415
- annotation_ids=annotation_ids,
416
- service_id=service_id,
417
- model_id=model_id,
418
- session_ids=session_ids,
419
- ignore_inactive=ignore_inactive,
420
- )
421
- )
422
-
423
400
  def as_dict(self) -> dict[str, Any]:
424
401
  """
425
402
  Returns the full image dataclass as dict. Uses the custom `convert.as_dict` to disregard attributes
@@ -441,7 +418,7 @@ class Image:
441
418
  A list of attributes to suspend from as_dict creation.
442
419
  """
443
420
 
444
- return ["_image"]
421
+ return ["_image", "_annotation_ids"]
445
422
 
446
423
  def define_annotation_id(self, annotation: Annotation) -> str:
447
424
  """
@@ -456,7 +433,11 @@ class Image:
456
433
  attributes_values = [str(getattr(annotation, attribute)) for attribute in attributes]
457
434
  return get_uuid(*attributes_values, str(self.image_id))
458
435
 
459
- def remove(self, annotation: ImageAnnotation) -> None:
436
+ def remove(
437
+ self,
438
+ annotation_ids: Optional[Union[str, list[str]]] = None,
439
+ service_ids: Optional[Union[str, list[str]]] = None,
440
+ ) -> None:
460
441
  """
461
442
  Instead of removing consider deactivating annotations.
462
443
 
@@ -464,9 +445,66 @@ class Image:
464
445
 
465
446
  :param annotation: The annotation to remove
466
447
  """
448
+ ann_id_to_annotation_maps = self.get_annotation_id_to_annotation_maps()
449
+
450
+ if annotation_ids is not None:
451
+ annotation_ids = [annotation_ids] if isinstance(annotation_ids, str) else annotation_ids
452
+
453
+ for ann_id in annotation_ids:
454
+ if ann_id not in ann_id_to_annotation_maps:
455
+ raise ImageError(f"Annotation with id {ann_id} not found")
456
+ annotation_maps = ann_id_to_annotation_maps[ann_id]
457
+
458
+ for annotation_map in annotation_maps:
459
+ self._remove_by_annotation_id(ann_id, annotation_map)
460
+
461
+ if service_ids is not None:
462
+ service_ids = [service_ids] if isinstance(service_ids, str) else service_ids
463
+ service_id_to_annotation_id = self.get_service_id_to_annotation_id()
464
+
465
+ for service_id in service_ids:
466
+ if service_id not in service_id_to_annotation_id:
467
+ raise ImageError(f"Service id {service_id} not found")
468
+ annotation_ids = service_id_to_annotation_id[service_id]
469
+
470
+ for ann_id in annotation_ids:
471
+ if ann_id not in ann_id_to_annotation_maps:
472
+ raise ImageError(f"Annotation with id {ann_id} not found")
473
+ annotation_maps = ann_id_to_annotation_maps[ann_id]
474
+
475
+ for annotation_map in annotation_maps:
476
+ self._remove_by_annotation_id(ann_id, annotation_map)
477
+
478
+ def _remove_by_annotation_id(self, annotation_id: str, location_dict: AnnotationMap) -> None:
479
+ image_annotation_id = location_dict.image_annotation_id
480
+ annotations = self.get_annotation(annotation_ids=image_annotation_id)
481
+ if not annotations:
482
+ return
483
+ # There can only be one annotation with a given id
484
+ annotation = annotations[0]
485
+
486
+ if (
487
+ location_dict.sub_category_key is None
488
+ and location_dict.relationship_key is None
489
+ and location_dict.summary_key is None
490
+ ):
491
+ self.annotations.remove(annotation)
492
+ self._annotation_ids.remove(annotation.annotation_id)
493
+
494
+ sub_category_key = location_dict.sub_category_key
495
+
496
+ if sub_category_key is not None:
497
+ annotation.remove_sub_category(sub_category_key)
498
+
499
+ relationship_key = location_dict.relationship_key
467
500
 
468
- self.annotations.remove(annotation)
469
- self._annotation_ids.remove(annotation.annotation_id)
501
+ if relationship_key is not None:
502
+ annotation.remove_relationship(relationship_key, annotation_id)
503
+
504
+ summary_key = location_dict.summary_key
505
+ if summary_key is not None:
506
+ if annotation.image is not None:
507
+ annotation.image.summary.remove_sub_category(summary_key)
470
508
 
471
509
  def image_ann_to_image(self, annotation_id: str, crop_image: bool = False) -> None:
472
510
  """
@@ -580,6 +618,7 @@ class Image:
580
618
  if summary_dict := kwargs.get("_summary", kwargs.get("summary")):
581
619
  image.summary = CategoryAnnotation.from_dict(**summary_dict)
582
620
  image.summary.category_name = SummaryType.SUMMARY
621
+
583
622
  return image
584
623
 
585
624
  @classmethod
@@ -645,7 +684,7 @@ class Image:
645
684
  highest_hierarchy_only: bool = False,
646
685
  path: Optional[PathLikeOrStr] = None,
647
686
  dry: bool = False,
648
- ) -> Optional[ImageDict]:
687
+ ) -> Optional[Union[ImageDict, str]]:
649
688
  """
650
689
  Export image as dictionary. As numpy array cannot be serialized `image` values will be converted into
651
690
  base64 encodings.
@@ -677,8 +716,45 @@ class Image:
677
716
  return export_dict
678
717
  with open(path_json, "w", encoding="UTF-8") as file:
679
718
  json.dump(export_dict, file, indent=2)
680
- return None
719
+ return path_json
681
720
 
682
721
  def get_categories_from_current_state(self) -> set[str]:
683
722
  """Returns all active dumped categories"""
684
723
  return {ann.category_name for ann in self.get_annotation()}
724
+
725
+ def get_service_id_to_annotation_id(self) -> defaultdict[str, list[str]]:
726
+ """
727
+ Returns a dictionary with service ids as keys and lists of annotation ids that have been generated by the
728
+ service
729
+ :return: default with service ids as keys and lists of annotation ids as values
730
+ """
731
+ service_id_dict = defaultdict(list)
732
+ for ann in self.get_annotation():
733
+ if ann.service_id:
734
+ service_id_dict[ann.service_id].append(ann.annotation_id)
735
+ for sub_cat_key in ann.sub_categories:
736
+ sub_cat = ann.get_sub_category(sub_cat_key)
737
+ if sub_cat.service_id:
738
+ service_id_dict[sub_cat.service_id].append(sub_cat.annotation_id)
739
+ if ann.image is not None:
740
+ for summary_cat_key in ann.image.summary:
741
+ summary_cat = ann.get_summary(summary_cat_key)
742
+ if summary_cat.service_id:
743
+ service_id_dict[summary_cat.service_id].append(summary_cat.annotation_id)
744
+
745
+ return service_id_dict
746
+
747
+ def get_annotation_id_to_annotation_maps(self) -> defaultdict[str, list[AnnotationMap]]:
748
+ """
749
+ Returns a dictionary with annotation ids as keys and lists of AnnotationMap as values. The range of ids
750
+ is the union of all ImageAnnotation, CategoryAnnotation and ContainerAnnotation of the image.
751
+
752
+ :return: default dict with annotation ids as keys and lists of AnnotationMap as values
753
+ """
754
+ all_ann_id_dict = defaultdict(list)
755
+ for ann in self.get_annotation():
756
+ ann_id_dict = ann.get_annotation_map()
757
+ for key, val in ann_id_dict.items():
758
+ all_ann_id_dict[key].extend(val)
759
+
760
+ return all_ann_id_dict
@@ -971,7 +971,7 @@ class Page(Image):
971
971
  highest_hierarchy_only: bool = False,
972
972
  path: Optional[PathLikeOrStr] = None,
973
973
  dry: bool = False,
974
- ) -> Optional[ImageDict]:
974
+ ) -> Optional[Union[ImageDict, str]]:
975
975
  """
976
976
  Export image as dictionary. As numpy array cannot be serialized `image` values will be converted into
977
977
  base64 encodings.
@@ -451,7 +451,7 @@ class CustomDataset(DatasetBase):
451
451
  return self.dataflow_builder
452
452
 
453
453
  @staticmethod
454
- def from_dataset_card(file_path: str, dataflow_builder: Type[DataFlowBaseBuilder]) -> CustomDataset:
454
+ def from_dataset_card(file_path: PathLikeOrStr, dataflow_builder: Type[DataFlowBaseBuilder]) -> CustomDataset:
455
455
  """
456
456
  This static method creates a CustomDataset instance from a dataset card.
457
457
 
@@ -264,7 +264,7 @@ class FintabnetBuilder(DataFlowBaseBuilder):
264
264
  add_summary=True,
265
265
  ),
266
266
  )
267
- df = MapData(df, lambda dp: [ann.image for ann in dp.get_annotation_iter(category_names=LayoutType.TABLE)])
267
+ df = MapData(df, lambda dp: [ann.image for ann in dp.get_annotation(category_names=LayoutType.TABLE)])
268
268
  df = FlattenData(df)
269
269
  df = MapData(df, lambda dp: dp[0])
270
270
 
@@ -180,13 +180,35 @@ class XfundBuilder(DataFlowBaseBuilder):
180
180
  "answer": TokenClasses.ANSWER,
181
181
  "header": TokenClasses.HEADER,
182
182
  }
183
- ner_token_to_id_mapping = self.categories.get_sub_categories(
184
- categories=LayoutType.WORD,
185
- sub_categories={LayoutType.WORD: [WordType.TOKEN_TAG, WordType.TAG, WordType.TOKEN_CLASS]},
186
- keys=False,
187
- values_as_dict=True,
188
- name_as_key=True,
189
- )
183
+ if LayoutType.WORD in self.categories.get_categories(filtered=True, name_as_key=True):
184
+ ner_token_to_id_mapping = self.categories.get_sub_categories(
185
+ categories=LayoutType.WORD,
186
+ sub_categories={LayoutType.WORD: [WordType.TOKEN_TAG, WordType.TAG, WordType.TOKEN_CLASS]},
187
+ keys=False,
188
+ values_as_dict=True,
189
+ name_as_key=True,
190
+ )
191
+ else:
192
+ ner_token_to_id_mapping = {
193
+ LayoutType.WORD: {
194
+ WordType.TAG: {BioTag.BEGIN: 3, BioTag.INSIDE: 1, BioTag.OUTSIDE: 2},
195
+ WordType.TOKEN_CLASS: {
196
+ TokenClasses.ANSWER: 3,
197
+ TokenClasses.HEADER: 4,
198
+ TokenClasses.OTHER: 1,
199
+ TokenClasses.QUESTION: 2,
200
+ },
201
+ WordType.TOKEN_TAG: {
202
+ TokenClassWithTag.B_ANSWER: 1,
203
+ TokenClassWithTag.B_HEADER: 2,
204
+ TokenClassWithTag.B_QUESTION: 3,
205
+ TokenClassWithTag.I_ANSWER: 4,
206
+ TokenClassWithTag.I_HEADER: 5,
207
+ TokenClassWithTag.I_QUESTION: 6,
208
+ BioTag.OUTSIDE: 7,
209
+ },
210
+ }
211
+ }
190
212
  df = MapData(
191
213
  df,
192
214
  xfund_to_image(
@@ -293,6 +293,8 @@ class Evaluator:
293
293
  show_words = kwargs.pop("show_words", False)
294
294
  show_token_class = kwargs.pop("show_token_class", True)
295
295
  ignore_default_token_class = kwargs.pop("ignore_default_token_class", False)
296
+ floating_text_block_categories = kwargs.pop("floating_text_block_categories", None)
297
+ include_residual_text_containers = kwargs.pop("include_residual_Text_containers", True)
296
298
 
297
299
  df_gt = self.dataset.dataflow.build(**kwargs)
298
300
  df_pr = self.dataset.dataflow.build(**kwargs)
@@ -301,7 +303,11 @@ class Evaluator:
301
303
  df_pr = MapData(df_pr, deepcopy)
302
304
  df_pr = self._clean_up_predict_dataflow_annotations(df_pr)
303
305
 
304
- page_parsing_component = PageParsingService(text_container=LayoutType.WORD)
306
+ page_parsing_component = PageParsingService(
307
+ text_container=LayoutType.WORD,
308
+ floating_text_block_categories=floating_text_block_categories, # type: ignore
309
+ include_residual_text_container=bool(include_residual_text_containers),
310
+ )
305
311
  df_gt = page_parsing_component.predict_dataflow(df_gt)
306
312
 
307
313
  if self.pipe_component:
@@ -1051,7 +1051,8 @@ class ModelCatalog:
1051
1051
  with jsonlines.open(path) as reader:
1052
1052
  for obj in reader:
1053
1053
  if not obj["name"] in ModelCatalog.CATALOG:
1054
- obj["categories"] = {int(key): get_type(val) for key, val in obj["categories"].items()}
1054
+ categories = obj.get("categories") or {}
1055
+ obj["categories"] = {int(key): get_type(val) for key, val in categories.items()}
1055
1056
  ModelCatalog.register(obj["name"], ModelProfile(**obj))
1056
1057
 
1057
1058
  @staticmethod