deepdoctection 0.43.5__tar.gz → 0.44.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (155) hide show
  1. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/PKG-INFO +3 -3
  2. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/README.md +2 -2
  3. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/__init__.py +3 -1
  4. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/analyzer/config.py +1 -1
  5. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/configs/profiles.jsonl +1 -0
  6. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datapoint/__init__.py +1 -1
  7. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datapoint/image.py +49 -1
  8. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datapoint/view.py +27 -13
  9. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/base.py +195 -51
  10. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/model.py +1 -1
  11. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/base.py +29 -25
  12. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/common.py +2 -2
  13. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/concurrency.py +2 -2
  14. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/language.py +2 -2
  15. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/layout.py +2 -2
  16. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/lm.py +13 -3
  17. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/order.py +9 -5
  18. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/refine.py +7 -7
  19. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/segment.py +30 -30
  20. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/sub_layout.py +2 -2
  21. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/text.py +10 -5
  22. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/transform.py +2 -4
  23. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection.egg-info/PKG-INFO +3 -3
  24. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/LICENSE +0 -0
  25. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/analyzer/__init__.py +0 -0
  26. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/analyzer/dd.py +0 -0
  27. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/analyzer/factory.py +0 -0
  28. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/configs/__init__.py +0 -0
  29. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/configs/conf_dd_one.yaml +0 -0
  30. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/configs/conf_tesseract.yaml +0 -0
  31. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/__init__.py +0 -0
  32. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/base.py +0 -0
  33. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/common.py +0 -0
  34. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/custom.py +0 -0
  35. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/custom_serialize.py +0 -0
  36. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/parallel_map.py +0 -0
  37. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/serialize.py +0 -0
  38. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/stats.py +0 -0
  39. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datapoint/annotation.py +0 -0
  40. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datapoint/box.py +0 -0
  41. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datapoint/convert.py +0 -0
  42. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/__init__.py +0 -0
  43. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/adapter.py +0 -0
  44. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/dataflow_builder.py +0 -0
  45. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/info.py +0 -0
  46. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/__init__.py +0 -0
  47. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/doclaynet.py +0 -0
  48. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/fintabnet.py +0 -0
  49. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/funsd.py +0 -0
  50. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/iiitar13k.py +0 -0
  51. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/layouttest.py +0 -0
  52. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/publaynet.py +0 -0
  53. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/pubtables1m.py +0 -0
  54. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/pubtabnet.py +0 -0
  55. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/rvlcdip.py +0 -0
  56. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/xfund.py +0 -0
  57. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
  58. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
  59. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/registry.py +0 -0
  60. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/save.py +0 -0
  61. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/__init__.py +0 -0
  62. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/accmetric.py +0 -0
  63. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/base.py +0 -0
  64. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/cocometric.py +0 -0
  65. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/eval.py +0 -0
  66. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/registry.py +0 -0
  67. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/tedsmetric.py +0 -0
  68. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/tp_eval_callback.py +0 -0
  69. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/__init__.py +0 -0
  70. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/base.py +0 -0
  71. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/d2detect.py +0 -0
  72. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/deskew.py +0 -0
  73. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/doctrocr.py +0 -0
  74. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/fastlang.py +0 -0
  75. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/hfdetr.py +0 -0
  76. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/hflayoutlm.py +0 -0
  77. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/hflm.py +0 -0
  78. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/pdftext.py +0 -0
  79. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/pt/__init__.py +0 -0
  80. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/pt/nms.py +0 -0
  81. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/pt/ptutils.py +0 -0
  82. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tessocr.py +0 -0
  83. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/texocr.py +0 -0
  84. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/__init__.py +0 -0
  85. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tfutils.py +0 -0
  86. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpcompat.py +0 -0
  87. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
  88. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
  89. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
  90. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -0
  91. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
  92. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -0
  93. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
  94. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -0
  95. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
  96. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -0
  97. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -0
  98. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -0
  99. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
  100. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
  101. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -0
  102. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
  103. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
  104. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
  105. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tpdetect.py +0 -0
  106. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/__init__.py +0 -0
  107. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/cats.py +0 -0
  108. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/cocostruct.py +0 -0
  109. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/d2struct.py +0 -0
  110. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/hfstruct.py +0 -0
  111. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/laylmstruct.py +0 -0
  112. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/maputils.py +0 -0
  113. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/match.py +0 -0
  114. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/misc.py +0 -0
  115. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/pascalstruct.py +0 -0
  116. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/prodigystruct.py +0 -0
  117. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/pubstruct.py +0 -0
  118. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/tpstruct.py +0 -0
  119. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/xfundstruct.py +0 -0
  120. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/__init__.py +0 -0
  121. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/anngen.py +0 -0
  122. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/doctectionpipe.py +0 -0
  123. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/registry.py +0 -0
  124. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/py.typed +0 -0
  125. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/train/__init__.py +0 -0
  126. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/train/d2_frcnn_train.py +0 -0
  127. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/train/hf_detr_train.py +0 -0
  128. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/train/hf_layoutlm_train.py +0 -0
  129. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/train/tp_frcnn_train.py +0 -0
  130. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/__init__.py +0 -0
  131. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/concurrency.py +0 -0
  132. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/context.py +0 -0
  133. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/develop.py +0 -0
  134. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/env_info.py +0 -0
  135. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/error.py +0 -0
  136. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/file_utils.py +0 -0
  137. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/fs.py +0 -0
  138. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/identifier.py +0 -0
  139. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/logger.py +0 -0
  140. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/metacfg.py +0 -0
  141. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/mocks.py +0 -0
  142. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/pdf_utils.py +0 -0
  143. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/settings.py +0 -0
  144. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/tqdm.py +0 -0
  145. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/transform.py +0 -0
  146. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/types.py +0 -0
  147. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/utils.py +0 -0
  148. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/viz.py +0 -0
  149. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection.egg-info/SOURCES.txt +0 -0
  150. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection.egg-info/dependency_links.txt +0 -0
  151. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection.egg-info/requires.txt +0 -0
  152. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection.egg-info/top_level.txt +0 -0
  153. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/setup.cfg +0 -0
  154. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/setup.py +0 -0
  155. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deepdoctection
3
- Version: 0.43.5
3
+ Version: 0.44.0
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -321,7 +321,7 @@ For a simple setup which is enough to parse documents with the default setting,
321
321
 
322
322
  ```
323
323
  pip install transformers
324
- pip install python-doctr
324
+ pip install python-doctr==0.9.0
325
325
  pip install deepdoctection
326
326
  ```
327
327
 
@@ -329,7 +329,7 @@ pip install deepdoctection
329
329
 
330
330
  ```
331
331
  pip install tensorpack
332
- pip install python-doctr
332
+ pip install python-doctr==0.9.0
333
333
  pip install deepdoctection
334
334
  ```
335
335
 
@@ -178,7 +178,7 @@ For a simple setup which is enough to parse documents with the default setting,
178
178
 
179
179
  ```
180
180
  pip install transformers
181
- pip install python-doctr
181
+ pip install python-doctr==0.9.0
182
182
  pip install deepdoctection
183
183
  ```
184
184
 
@@ -186,7 +186,7 @@ pip install deepdoctection
186
186
 
187
187
  ```
188
188
  pip install tensorpack
189
- pip install python-doctr
189
+ pip install python-doctr==0.9.0
190
190
  pip install deepdoctection
191
191
  ```
192
192
 
@@ -25,7 +25,7 @@ from .utils.logger import LoggingRecord, logger
25
25
 
26
26
  # pylint: enable=wrong-import-position
27
27
 
28
- __version__ = "0.43.5"
28
+ __version__ = "0.44.0"
29
29
 
30
30
  _IMPORT_STRUCTURE = {
31
31
  "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
@@ -92,6 +92,7 @@ _IMPORT_STRUCTURE = {
92
92
  "convert_pdf_bytes_to_np_array_v2",
93
93
  "as_dict",
94
94
  "ImageAnnotationBaseView",
95
+ "MetaAnnotation",
95
96
  "Image",
96
97
  "Word",
97
98
  "Layout",
@@ -105,6 +106,7 @@ _IMPORT_STRUCTURE = {
105
106
  "DatasetAdapter",
106
107
  "DatasetBase",
107
108
  "MergeDataset",
109
+ "DatasetCard",
108
110
  "CustomDataset",
109
111
  "DataFlowBaseBuilder",
110
112
  "DatasetInfo",
@@ -629,7 +629,7 @@ cfg.PT.ENFORCE_WEIGHTS.ITEM = True
629
629
 
630
630
  # Specifies the PyTorch model weights for item detection.
631
631
  # Use either .pt or .safetensors files.
632
- cfg.PT.ITEM.WEIGHTS = "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin"
632
+ cfg.PT.ITEM.WEIGHTS = "deepdoctection/tatr_tab_struct_v2/model.safetensors"
633
633
 
634
634
  # Specifies the TorchScript model for item detection.
635
635
  # Use .ts files for deployment without model implementation dependencies.
@@ -30,3 +30,4 @@
30
30
  {"name": "Felix92/doctr-torch-parseq-multilingual-v1/pytorch_model.bin", "description": "", "size": [63286381], "tp_model": false, "config": "Felix92/doctr-torch-parseq-multilingual-v1/config.json", "preprocessor_config": null, "hf_repo_id": "Felix92/doctr-torch-parseq-multilingual-v1", "hf_model_name": "pytorch_model.bin", "hf_config_file": ["config.json"], "urls": null, "categories": {}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "DoctrTextRecognizer", "architecture": "parseq", "padding": null}
31
31
  {"name": "doctr/crnn_vgg16_bn/pt/master-fde31e4a.pt", "description": "MASTER", "size": [63286381], "tp_model": false, "config": null, "preprocessor_config": null, "hf_repo_id": null, "hf_model_name": null, "hf_config_file": null, "urls": ["https://doctr-static.mindee.com/models?id=v0.7.0/master-fde31e4a.pt&src=0"], "categories": {}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "DoctrTextRecognizer", "architecture": "master", "padding": null}
32
32
  {"name": "Aryn/deformable-detr-DocLayNet/model.safetensors", "description": "Deformable DEtection TRansformer (DETR), trained on DocLayNet (including 80k annotated pages in 11 classes).", "size": [115511753], "tp_model": false, "config": "Aryn/deformable-detr-DocLayNet/config.json", "preprocessor_config": "Aryn/deformable-detr-DocLayNet/preprocessor_config.json", "hf_repo_id": "Aryn/deformable-detr-DocLayNet", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "default_type", "2": "caption", "11": "text", "12": "title", "3": "footnote", "4": "formula", "5": "list_item", "6": "page_footer", "7": "page_header", "8": "figure", "9": "section_header", "10": "table"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
33
+ {"name": "deepdoctection/tatr_tab_struct_v2/model.safetensors", "description": "Table Transformer (DETR) model trained on PubTables1M. It was introduced in the paper Aligning benchmark datasets for table structure recognition by Smock et al. This model is devoted to table structure recognition and assumes to receive a slightly cropped table as input. It will predict rows, columns and spanning cells. Use a padding of around 5 pixels. This artefact has been converted from deepdoctection/tatr_tab_struct_v2/pytorch_model.bin and should be used to reduce security issues", "size": [115511753], "tp_model": false, "config": "deepdoctection/tatr_tab_struct_v2/config.json", "preprocessor_config": "deepdoctection/tatr_tab_struct_v2/preprocessor_config.json", "hf_repo_id": "deepdoctection/tatr_tab_struct_v2", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "table", "2": "column", "3": "row", "4": "column_header", "5": "projected_row_header", "6": "spanning"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
@@ -34,5 +34,5 @@ After all, the point here is not to provide an optimal processing environment.
34
34
  from .annotation import *
35
35
  from .box import *
36
36
  from .convert import *
37
- from .image import Image
37
+ from .image import Image, MetaAnnotation
38
38
  from .view import *
@@ -25,7 +25,7 @@ from collections import defaultdict
25
25
  from dataclasses import dataclass, field
26
26
  from os import environ, fspath
27
27
  from pathlib import Path
28
- from typing import Any, Optional, Sequence, Union, no_type_check
28
+ from typing import Any, Optional, Sequence, TypedDict, Union, no_type_check
29
29
 
30
30
  import numpy as np
31
31
  from numpy import uint8
@@ -40,6 +40,54 @@ from .box import crop_box_from_image, global_to_local_coords, intersection_box
40
40
  from .convert import as_dict, convert_b64_to_np_array, convert_np_array_to_b64, convert_pdf_bytes_to_np_array_v2
41
41
 
42
42
 
43
+ class MetaAnnotationDict(TypedDict):
44
+ """MetaAnnotationDict"""
45
+ image_annotations: list[str]
46
+ sub_categories: dict[str, dict[str, list[str]]]
47
+ relationships: dict[str, list[str]]
48
+ summaries: list[str]
49
+
50
+
51
+ @dataclass(frozen=True)
52
+ class MetaAnnotation:
53
+ """
54
+ An immutable dataclass that stores information about what `Image` are being
55
+ modified through a pipeline component.
56
+
57
+ Attributes:
58
+ image_annotations: Tuple of `ObjectTypes` representing image annotations.
59
+ sub_categories: Dictionary mapping `ObjectTypes` to dicts of `ObjectTypes` to sets of `ObjectTypes`
60
+ for sub-categories.
61
+ relationships: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for relationships.
62
+ summaries: Tuple of `ObjectTypes` representing summaries.
63
+ """
64
+
65
+ image_annotations: tuple[ObjectTypes, ...] = field(default=())
66
+ sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = field(default_factory=dict)
67
+ relationships: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
68
+ summaries: tuple[ObjectTypes, ...] = field(default=())
69
+
70
+ def as_dict(self) -> MetaAnnotationDict:
71
+ """
72
+ Returns the MetaAnnotation as a dictionary, with all `ObjectTypes` converted to strings.
73
+
74
+ Returns:
75
+ A dictionary representation of the MetaAnnotation where all `ObjectTypes` are converted to strings.
76
+ """
77
+ return {
78
+ "image_annotations": [obj.value for obj in self.image_annotations],
79
+ "sub_categories": {
80
+ outer_key.value: {
81
+ inner_key.value: [val.value for val in inner_values]
82
+ for inner_key, inner_values in outer_value.items()
83
+ }
84
+ for outer_key, outer_value in self.sub_categories.items()
85
+ },
86
+ "relationships": {key.value: [val.value for val in values] for key, values in self.relationships.items()},
87
+ "summaries": [obj.value for obj in self.summaries],
88
+ }
89
+
90
+
43
91
  @dataclass
44
92
  class Image:
45
93
  """
@@ -195,7 +195,9 @@ class Word(ImageAnnotationBaseView):
195
195
  attr_names = (
196
196
  set(WordType)
197
197
  .union(super().get_attribute_names())
198
- .union({Relationships.READING_ORDER, Relationships.LAYOUT_LINK, Relationships.LINK})
198
+ .union(
199
+ {Relationships.READING_ORDER, Relationships.LAYOUT_LINK, Relationships.LINK, Relationships.SUCCESSOR}
200
+ )
199
201
  )
200
202
  return {attr_name.value if isinstance(attr_name, ObjectTypes) else attr_name for attr_name in attr_names}
201
203
 
@@ -384,16 +386,10 @@ class Table(Layout):
384
386
  Returns:
385
387
  A list of a table cells.
386
388
  """
387
- all_relation_ids = self.get_relationship(Relationships.CHILD)
388
- cell_anns: list[Cell] = self.base_page.get_annotation( # type: ignore
389
- annotation_ids=all_relation_ids,
390
- category_names=[
391
- LayoutType.CELL,
392
- CellType.HEADER,
393
- CellType.BODY,
394
- CellType.SPANNING,
395
- ],
396
- )
389
+ cell_anns: list[Cell] = []
390
+ for row_number in range(1, self.number_of_rows + 1): # type: ignore
391
+ cell_anns.extend(self.row(row_number)) # type: ignore
392
+
397
393
  return cell_anns
398
394
 
399
395
  @property
@@ -592,6 +588,18 @@ class Table(Layout):
592
588
  )
593
589
  return table_list
594
590
 
591
+ @property
592
+ def csv_(self) -> list[list[list[Text_]]]:
593
+ """
594
+ Returns:
595
+ A csv-style representation of a table as list of lists of cell.text_.
596
+ """
597
+ cells = self.cells
598
+ table_list = [[[] for _ in range(self.number_of_columns)] for _ in range(self.number_of_rows)] # type: ignore
599
+ for cell in cells:
600
+ table_list[cell.row_number - 1][cell.column_number - 1].append(cell.text_) # type: ignore
601
+ return table_list
602
+
595
603
  def __str__(self) -> str:
596
604
  out = " ".join([" ".join(row + ["\n"]) for row in self.csv])
597
605
  return out
@@ -599,7 +607,13 @@ class Table(Layout):
599
607
  @property
600
608
  def text(self) -> str:
601
609
  try:
602
- return str(self)
610
+ cells = self.cells
611
+ if not cells:
612
+ return super().text
613
+ text_list: list[str] = []
614
+ for cell in cells:
615
+ text_list.append(cell.text)
616
+ return " ".join(text_list)
603
617
  except (TypeError, AnnotationError):
604
618
  return super().text
605
619
 
@@ -616,7 +630,7 @@ class Table(Layout):
616
630
  token_class_ids: list[str] = []
617
631
  token_tag_ids: list[str] = []
618
632
  for cell in cells:
619
- text.extend(cell.text_["text"])
633
+ text.append(cell.text_["text"])
620
634
  words.extend(cell.text_["words"])
621
635
  ann_ids.extend(cell.text_["ann_ids"])
622
636
  token_classes.extend(cell.text_["token_classes"])
@@ -25,14 +25,15 @@ import os
25
25
  import pprint
26
26
  from abc import ABC, abstractmethod
27
27
  from collections import defaultdict
28
+ from dataclasses import dataclass, field
28
29
  from inspect import signature
29
30
  from pathlib import Path
30
- from typing import Any, Mapping, Optional, Sequence, Type, Union
31
+ from typing import Any, Mapping, Optional, Sequence, Type, TypedDict, Union
31
32
 
32
33
  import numpy as np
33
34
 
34
35
  from ..dataflow import CacheData, ConcatData, CustomDataFromList, DataFlow
35
- from ..datapoint.image import Image
36
+ from ..datapoint.image import Image, MetaAnnotation
36
37
  from ..utils.logger import LoggingRecord, logger
37
38
  from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
38
39
  from ..utils.types import PathLikeOrStr
@@ -405,6 +406,193 @@ class MergeDataset(DatasetBase):
405
406
  self._dataflow_builder.categories = self._categories()
406
407
 
407
408
 
409
+ class DatasetCardDict(TypedDict):
410
+ """DatasetCardDict"""
411
+ name: str
412
+ dataset_type: Union[str, Any]
413
+ location: str
414
+ init_categories: Sequence[Any]
415
+ init_sub_categories: dict[Any, dict[Any, list[Any]]]
416
+ annotation_files: Optional[dict[Any, Union[Any, Sequence[Any]]]]
417
+ description: str
418
+ service_id_to_meta_annotation: dict[str, Any]
419
+
420
+
421
+ # Usage:
422
+ # def as_dict(self, ...) -> DatasetCardDict:
423
+
424
+
425
+ @dataclass
426
+ class DatasetCard:
427
+ """
428
+ An immutable dataclass representing the metadata of a dataset, including categories, sub-categories,
429
+ storage location, annotation files, and description. It facilitates management and consistency checks
430
+ for annotations generated by pipeline components.
431
+
432
+ Attributes:
433
+ name: Name of the dataset.
434
+ dataset_type: Type of the dataset as `ObjectTypes`.
435
+ location: Storage location of the dataset as `Path`.
436
+ init_categories: List of all initial categories (`ObjectTypes`) present in the dataset.
437
+ init_sub_categories: Mapping from main categories to sub-categories and their possible values.
438
+ annotation_files: Optional mapping from split names to annotation files.
439
+ description: Description of the dataset.
440
+ service_id_to_meta_annotation: Mapping from service IDs to `MetaAnnotation` objects, storing
441
+ annotation structure for different pipeline components.
442
+ """
443
+
444
+ name: str
445
+ dataset_type: ObjectTypes
446
+ location: Path
447
+ init_categories: list[ObjectTypes] = field(default_factory=list)
448
+ init_sub_categories: dict[ObjectTypes, dict[ObjectTypes, list[ObjectTypes]]] = field(default_factory=dict)
449
+ annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None
450
+ description: str = field(default="")
451
+ service_id_to_meta_annotation: dict[str, MetaAnnotation] = field(default_factory=dict)
452
+
453
+ def save_dataset_card(self, file_path: Union[str, Path]) -> None:
454
+ """Save the DatasetCard instance as a JSON file."""
455
+ with open(file_path, "w", encoding="utf-8") as f:
456
+ json.dump(self.as_dict(), f, indent=4)
457
+
458
+ @staticmethod
459
+ def load_dataset_card(file_path: PathLikeOrStr) -> DatasetCard:
460
+ """Load a DatasetCard instance from a JSON file."""
461
+ with open(file_path, "r", encoding="utf-8") as f:
462
+ data = json.load(f)
463
+ service_id_to_meta_annotation = {}
464
+ if "service_id_to_meta_annotation" in data:
465
+ for service_id, meta_ann_dict in data.pop("service_id_to_meta_annotation").items():
466
+ meta_ann_dict["image_annotations"] = tuple(
467
+ get_type(cat) for cat in meta_ann_dict["image_annotations"]
468
+ )
469
+ meta_ann_dict["sub_categories"] = {
470
+ get_type(cat): {
471
+ get_type(sub_cat): set({get_type(value) for value in values})
472
+ for sub_cat, values in sub_cats.items()
473
+ }
474
+ for cat, sub_cats in meta_ann_dict["sub_categories"].items()
475
+ }
476
+ meta_ann_dict["relationships"] = {
477
+ get_type(key): set({get_type(value) for value in values})
478
+ for key, values in meta_ann_dict["relationships"].items()
479
+ }
480
+ meta_ann_dict["summaries"] = tuple(get_type(val) for val in meta_ann_dict["summaries"])
481
+ service_id_to_meta_annotation[service_id] = MetaAnnotation(**meta_ann_dict)
482
+ data["service_id_to_meta_annotation"] = service_id_to_meta_annotation
483
+ return DatasetCard(**data)
484
+
485
+ def as_dict(self, keep_object_types: bool = False) -> DatasetCardDict:
486
+ """Convert the DatasetCard to a dictionary."""
487
+ if keep_object_types:
488
+ return {
489
+ "name": self.name,
490
+ "dataset_type": self.dataset_type,
491
+ "location": self.location.as_posix(),
492
+ "init_categories": self.init_categories,
493
+ "init_sub_categories": self.init_sub_categories,
494
+ "annotation_files": self.annotation_files, # type: ignore
495
+ "description": self.description,
496
+ "service_id_to_meta_annotation": {
497
+ key: val.as_dict() for key, val in self.service_id_to_meta_annotation.items()
498
+ },
499
+ }
500
+ return {
501
+ "name": self.name,
502
+ "dataset_type": self.dataset_type.value,
503
+ "location": self.location.as_posix(),
504
+ "init_categories": [cat.value for cat in self.init_categories],
505
+ "init_sub_categories": {
506
+ cat.value: {
507
+ sub_cat.value: list({value.value for value in values}) for sub_cat, values in sub_cats.items()
508
+ }
509
+ for cat, sub_cats in self.init_sub_categories.items()
510
+ },
511
+ "annotation_files": self.annotation_files, # type: ignore
512
+ "description": self.description,
513
+ "service_id_to_meta_annotation": {
514
+ key: val.as_dict() for key, val in self.service_id_to_meta_annotation.items()
515
+ },
516
+ }
517
+
518
+ def update_from_pipeline(
519
+ self, meta_annotations: MetaAnnotation, service_id_to_meta_annotation: Mapping[str, MetaAnnotation]
520
+ ) -> None:
521
+ """
522
+ Update the initial categories, sub-categories, and service ID to `MetaAnnotation` mapping
523
+ based on the results from a pipeline.
524
+
525
+ ```python
526
+ analyzer = dd.get_dd_analyzer(config_overwrite=["USE_OCR=True","USE_TABLE_SEGMENTATION=True"])
527
+ meta_annotations = analyzer.get_meta_annotation()
528
+ service_id_to_meta_annotation = analyzer.get_service_id_to_meta_annotation()
529
+ card.update_from_pipeline(meta_annotations, service_id_to_meta_annotation)
530
+ ```
531
+
532
+ Args:
533
+ meta_annotations: A `MetaAnnotation` object containing new or updated categories and sub-categories.
534
+ service_id_to_meta_annotation: A mapping from service IDs to `MetaAnnotation` objects generated by the
535
+ pipeline.
536
+
537
+ Adds any missing categories, sub-categories, and values to the respective attributes of the instance.
538
+ """
539
+ for category in meta_annotations.image_annotations:
540
+ if category not in self.init_categories:
541
+ self.init_categories.append(category)
542
+ for cat, sub_cats in meta_annotations.sub_categories.items():
543
+ if cat not in self.init_sub_categories:
544
+ self.init_sub_categories[cat] = {}
545
+ for sub_cat, values in sub_cats.items():
546
+ if sub_cat not in self.init_sub_categories[cat]:
547
+ self.init_sub_categories[cat][sub_cat] = []
548
+ for value in values:
549
+ if value not in self.init_sub_categories[cat][sub_cat]:
550
+ self.init_sub_categories[cat][sub_cat].append(value)
551
+
552
+ for service_id, meta_annotation in service_id_to_meta_annotation.items():
553
+ if service_id not in self.service_id_to_meta_annotation:
554
+ self.service_id_to_meta_annotation[service_id] = meta_annotation
555
+
556
+ def __post_init__(self) -> None:
557
+ """
558
+ Perform internal consistency checks ensuring `init_categories` and
559
+ `init_sub_categories` align with `service_id_to_meta_annotation`.
560
+ """
561
+ self.dataset_type = get_type(self.dataset_type)
562
+ self.location = Path(self.location)
563
+ self.init_categories = [get_type(cat) for cat in self.init_categories]
564
+ self.init_sub_categories = {
565
+ get_type(outer_key): {
566
+ get_type(inner_key): [get_type(value) for value in inner_values]
567
+ for inner_key, inner_values in outer_value.items()
568
+ }
569
+ for outer_key, outer_value in self.init_sub_categories.items()
570
+ }
571
+
572
+ if self.service_id_to_meta_annotation is None:
573
+ return
574
+
575
+ # Check compatibility of image_annotations with init_categories
576
+ for service_id, meta_annotation in self.service_id_to_meta_annotation.items():
577
+ for annotation in meta_annotation.image_annotations:
578
+ if annotation not in self.init_categories:
579
+ raise ValueError(
580
+ f"Image annotation '{annotation}' in service ID '{service_id}' is not "
581
+ f"present in `init_categories`."
582
+ )
583
+
584
+ # Check compatibility of sub_categories
585
+ for cat, sub_cats in meta_annotation.sub_categories.items():
586
+ if not (
587
+ cat in self.init_sub_categories
588
+ and all(sub_cat in self.init_sub_categories[cat] for sub_cat in sub_cats)
589
+ ):
590
+ raise ValueError(
591
+ f"Sub-categories for category '{cat}' in service ID '{service_id}' "
592
+ f"do not match with `init_sub_categories`."
593
+ )
594
+
595
+
408
596
  class CustomDataset(DatasetBase):
409
597
  """
410
598
  A simple dataset interface that implements the boilerplate code and reduces complexity by merely leaving
@@ -512,53 +700,9 @@ class CustomDataset(DatasetBase):
512
700
  Returns:
513
701
  A CustomDataset instance created from the dataset card.
514
702
  """
515
-
516
- with open(file_path, "r", encoding="UTF-8") as file:
517
- meta_data = json.load(file)
518
- meta_data["dataset_type"] = get_type(meta_data["dataset_type"])
519
- meta_data["location"] = Path(meta_data["location"])
520
- meta_data["init_categories"] = [get_type(cat) for cat in meta_data["init_categories"]]
521
- meta_data["init_sub_categories"] = (
522
- {
523
- get_type(cat): {
524
- get_type(sub_cat_key): [get_type(sub_cat_value) for sub_cat_value in sub_cat_values]
525
- for sub_cat_key, sub_cat_values in sub_cats.items()
526
- }
527
- for cat, sub_cats in meta_data["init_sub_categories"].items()
528
- }
529
- if meta_data["init_sub_categories"] is not None
530
- else None
703
+ dataset_card = DatasetCard.load_dataset_card(file_path)
704
+ dataset_card_as_dict = dataset_card.as_dict(True)
705
+ dataset_card_as_dict.pop("service_id_to_meta_annotation") # type: ignore # pylint: disable=E1123
706
+ return CustomDataset( # pylint: disable=E1123
707
+ **dataset_card_as_dict, dataflow_builder=dataflow_builder # type: ignore
531
708
  )
532
- return CustomDataset(**meta_data, dataflow_builder=dataflow_builder)
533
-
534
- def as_dict(self) -> Mapping[str, Any]:
535
- """
536
- Return:
537
- The meta-data of the dataset as a dictionary.
538
- """
539
- return {
540
- "name": self.name,
541
- "dataset_type": self.type,
542
- "location": str(self.location),
543
- "annotation_files": self.annotation_files,
544
- "init_categories": [cat.value for cat in self.init_categories],
545
- "init_sub_categories": {
546
- cat.value: {
547
- sub_cat_key.value: [sub_cat_value.value for sub_cat_value in sub_cat_values]
548
- for sub_cat_key, sub_cat_values in sub_cats.items()
549
- }
550
- for cat, sub_cats in self.init_sub_categories.items()
551
- }
552
- if self.init_sub_categories is not None
553
- else None,
554
- }
555
-
556
- def save_dataset_card(self, file_path: str) -> None:
557
- """
558
- Save the dataset card to a `JSON` file.
559
-
560
- Args:
561
- file_path: file_path
562
- """
563
- with open(file_path, "w", encoding="UTF-8") as file:
564
- json.dump(self.as_dict(), file, indent=4)
@@ -306,7 +306,7 @@ class ModelCatalog:
306
306
 
307
307
  # Loading default profiles
308
308
  dd_profile_path = maybe_copy_config_to_cache(
309
- get_package_path(), get_cache_dir_path(), "deepdoctection/configs/profiles.jsonl", False
309
+ get_package_path(), get_cache_dir_path(), "deepdoctection/configs/profiles.jsonl", True
310
310
  )
311
311
  ModelCatalog.load_profiles_from_file(dd_profile_path)
312
312
  # Additional profiles can be added
@@ -23,12 +23,11 @@ from __future__ import annotations
23
23
 
24
24
  from abc import ABC, abstractmethod
25
25
  from collections import defaultdict
26
- from dataclasses import dataclass, field
27
26
  from typing import Any, Callable, Mapping, Optional, Union
28
27
  from uuid import uuid1
29
28
 
30
29
  from ..dataflow import DataFlow, MapData
31
- from ..datapoint.image import Image
30
+ from ..datapoint.image import Image, MetaAnnotation
32
31
  from ..mapper.misc import curry
33
32
  from ..utils.context import timed_operation
34
33
  from ..utils.identifier import get_uuid_from_str
@@ -37,25 +36,6 @@ from ..utils.types import DP
37
36
  from .anngen import DatapointManager
38
37
 
39
38
 
40
- @dataclass(frozen=True)
41
- class MetaAnnotation:
42
- """
43
- A immutable dataclass that stores information about what `Image` are being
44
- modified through a pipeline component.
45
-
46
- Attributes:
47
- image_annotations: Tuple of `ObjectTypes` representing image annotations.
48
- sub_categories: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for sub-categories.
49
- relationships: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for relationships.
50
- summaries: Tuple of `ObjectTypes` representing summaries.
51
- """
52
-
53
- image_annotations: tuple[ObjectTypes, ...] = field(default=())
54
- sub_categories: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
55
- relationships: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
56
- summaries: tuple[ObjectTypes, ...] = field(default=())
57
-
58
-
59
39
  class PipelineComponent(ABC):
60
40
  """
61
41
  Base class for pipeline components.
@@ -427,15 +407,24 @@ class Pipeline(ABC):
427
407
  as well as summaries (list with sub categories).
428
408
  """
429
409
  image_annotations: list[ObjectTypes] = []
430
- sub_categories = defaultdict(set)
431
- relationships = defaultdict(set)
410
+ sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = {}
411
+ relationships = defaultdict(set[ObjectTypes]) # type: ignore
432
412
  summaries: list[ObjectTypes] = []
433
413
  for component in self.pipe_component_list:
434
414
  meta_anns = component.get_meta_annotation()
435
415
  image_annotations.extend(meta_anns.image_annotations)
436
416
  for key, value in meta_anns.sub_categories.items():
437
- sub_categories[key].update(value)
438
- for key, value in meta_anns.relationships.items():
417
+ sub_dict = meta_anns.sub_categories[key]
418
+ for sub_cat, sub_cat_value in value.items():
419
+ if sub_cat in sub_dict:
420
+ sub_dict[sub_cat].update(sub_cat_value)
421
+ else:
422
+ sub_dict[sub_cat] = {sub_cat_value} # type: ignore
423
+ if key in sub_categories:
424
+ sub_categories[key].update(sub_dict)
425
+ else:
426
+ sub_categories[key] = sub_dict
427
+ for key, value in meta_anns.relationships.items(): # type: ignore
439
428
  relationships[key].update(value)
440
429
  summaries.extend(meta_anns.summaries)
441
430
  return MetaAnnotation(
@@ -445,6 +434,21 @@ class Pipeline(ABC):
445
434
  summaries=tuple(summaries),
446
435
  )
447
436
 
437
+ def get_service_id_to_meta_annotation(self) -> Mapping[str, MetaAnnotation]:
438
+ """
439
+ Collects meta annotations from all pipeline components and return a dict of service id to its meta annotation.
440
+
441
+ Returns:
442
+ `service_id` to `MetaAnnotation` with information about image annotations (list), sub categories (dict with
443
+ category names and generated sub categories), relationships (dict with category names and generated
444
+ relationships) as well as summaries (list with sub categories).
445
+ """
446
+ service_id_to_meta_annotation = {}
447
+ for component in self.pipe_component_list:
448
+ meta_anns = component.get_meta_annotation()
449
+ service_id_to_meta_annotation[component.service_id] = meta_anns
450
+ return service_id_to_meta_annotation
451
+
448
452
  def get_pipeline_info(
449
453
  self, service_id: Optional[str] = None, name: Optional[str] = None
450
454
  ) -> Union[str, Mapping[str, str]]:
@@ -28,13 +28,13 @@ from typing import Literal, Mapping, Optional, Sequence, Union
28
28
  import numpy as np
29
29
 
30
30
  from ..dataflow import DataFlow, MapData
31
- from ..datapoint.image import Image
31
+ from ..datapoint.image import Image, MetaAnnotation
32
32
  from ..datapoint.view import IMAGE_DEFAULTS, Page
33
33
  from ..extern.base import DetectionResult
34
34
  from ..mapper.match import match_anns_by_distance, match_anns_by_intersection
35
35
  from ..mapper.misc import to_image
36
36
  from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
37
- from .base import MetaAnnotation, PipelineComponent
37
+ from .base import PipelineComponent
38
38
  from .registry import pipeline_component_registry
39
39
 
40
40
  if os.environ.get("DD_USE_TORCH"):
@@ -29,11 +29,11 @@ from typing import Callable, Optional, Sequence, Union
29
29
  import tqdm
30
30
 
31
31
  from ..dataflow import DataFlow, MapData
32
- from ..datapoint.image import Image
32
+ from ..datapoint.image import Image, MetaAnnotation
33
33
  from ..utils.context import timed_operation
34
34
  from ..utils.tqdm import get_tqdm
35
35
  from ..utils.types import QueueType, TqdmType
36
- from .base import MetaAnnotation, PipelineComponent
36
+ from .base import PipelineComponent
37
37
  from .common import ImageParsingService, PageParsingService
38
38
  from .registry import pipeline_component_registry
39
39
 
@@ -20,12 +20,12 @@ Module for language detection pipeline component
20
20
  """
21
21
  from typing import Optional, Sequence
22
22
 
23
- from ..datapoint.image import Image
23
+ from ..datapoint.image import Image, MetaAnnotation
24
24
  from ..datapoint.view import ImageDefaults, Page
25
25
  from ..extern.base import LanguageDetector, ObjectDetector
26
26
  from ..utils.error import ImageError
27
27
  from ..utils.settings import PageType, TypeOrStr, get_type
28
- from .base import MetaAnnotation, PipelineComponent
28
+ from .base import PipelineComponent
29
29
  from .registry import pipeline_component_registry
30
30
 
31
31