deepdoctection 0.43.6__tar.gz → 0.44.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (155)
  1. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/PKG-INFO +4 -4
  2. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/__init__.py +5 -1
  3. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datapoint/__init__.py +1 -1
  4. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datapoint/image.py +50 -1
  5. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datapoint/view.py +149 -54
  6. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/base.py +196 -51
  7. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/fastlang.py +4 -2
  8. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/laylmstruct.py +7 -7
  9. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/base.py +29 -25
  10. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/common.py +2 -2
  11. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/concurrency.py +2 -2
  12. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/language.py +2 -2
  13. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/layout.py +2 -2
  14. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/lm.py +13 -3
  15. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/order.py +9 -5
  16. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/refine.py +7 -7
  17. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/segment.py +30 -30
  18. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/sub_layout.py +2 -2
  19. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/text.py +10 -5
  20. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/transform.py +2 -4
  21. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/file_utils.py +34 -0
  22. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/types.py +0 -1
  23. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection.egg-info/PKG-INFO +4 -4
  24. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection.egg-info/requires.txt +3 -3
  25. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/setup.py +1 -1
  26. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/LICENSE +0 -0
  27. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/README.md +0 -0
  28. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/analyzer/__init__.py +0 -0
  29. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/analyzer/config.py +0 -0
  30. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/analyzer/dd.py +0 -0
  31. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/analyzer/factory.py +0 -0
  32. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/configs/__init__.py +0 -0
  33. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/configs/conf_dd_one.yaml +0 -0
  34. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/configs/conf_tesseract.yaml +0 -0
  35. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/configs/profiles.jsonl +0 -0
  36. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/dataflow/__init__.py +0 -0
  37. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/dataflow/base.py +0 -0
  38. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/dataflow/common.py +0 -0
  39. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/dataflow/custom.py +0 -0
  40. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/dataflow/custom_serialize.py +0 -0
  41. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/dataflow/parallel_map.py +0 -0
  42. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/dataflow/serialize.py +0 -0
  43. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/dataflow/stats.py +0 -0
  44. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datapoint/annotation.py +0 -0
  45. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datapoint/box.py +0 -0
  46. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datapoint/convert.py +0 -0
  47. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/__init__.py +0 -0
  48. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/adapter.py +0 -0
  49. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/dataflow_builder.py +0 -0
  50. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/info.py +0 -0
  51. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/__init__.py +0 -0
  52. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/doclaynet.py +0 -0
  53. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/fintabnet.py +0 -0
  54. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/funsd.py +0 -0
  55. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/iiitar13k.py +0 -0
  56. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/layouttest.py +0 -0
  57. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/publaynet.py +0 -0
  58. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/pubtables1m.py +0 -0
  59. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/pubtabnet.py +0 -0
  60. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/rvlcdip.py +0 -0
  61. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/xfund.py +0 -0
  62. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
  63. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
  64. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/registry.py +0 -0
  65. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/datasets/save.py +0 -0
  66. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/eval/__init__.py +0 -0
  67. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/eval/accmetric.py +0 -0
  68. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/eval/base.py +0 -0
  69. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/eval/cocometric.py +0 -0
  70. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/eval/eval.py +0 -0
  71. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/eval/registry.py +0 -0
  72. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/eval/tedsmetric.py +0 -0
  73. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/eval/tp_eval_callback.py +0 -0
  74. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/__init__.py +0 -0
  75. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/base.py +0 -0
  76. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/d2detect.py +0 -0
  77. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/deskew.py +0 -0
  78. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/doctrocr.py +0 -0
  79. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/hfdetr.py +0 -0
  80. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/hflayoutlm.py +0 -0
  81. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/hflm.py +0 -0
  82. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/model.py +0 -0
  83. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/pdftext.py +0 -0
  84. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/pt/__init__.py +0 -0
  85. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/pt/nms.py +0 -0
  86. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/pt/ptutils.py +0 -0
  87. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tessocr.py +0 -0
  88. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/texocr.py +0 -0
  89. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/__init__.py +0 -0
  90. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tfutils.py +0 -0
  91. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpcompat.py +0 -0
  92. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
  93. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
  94. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
  95. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -0
  96. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
  97. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -0
  98. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
  99. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -0
  100. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
  101. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -0
  102. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -0
  103. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -0
  104. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
  105. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
  106. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -0
  107. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
  108. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
  109. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
  110. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/extern/tpdetect.py +0 -0
  111. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/__init__.py +0 -0
  112. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/cats.py +0 -0
  113. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/cocostruct.py +0 -0
  114. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/d2struct.py +0 -0
  115. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/hfstruct.py +0 -0
  116. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/maputils.py +0 -0
  117. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/match.py +0 -0
  118. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/misc.py +0 -0
  119. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/pascalstruct.py +0 -0
  120. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/prodigystruct.py +0 -0
  121. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/pubstruct.py +0 -0
  122. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/tpstruct.py +0 -0
  123. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/mapper/xfundstruct.py +0 -0
  124. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/__init__.py +0 -0
  125. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/anngen.py +0 -0
  126. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/doctectionpipe.py +0 -0
  127. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/pipe/registry.py +0 -0
  128. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/py.typed +0 -0
  129. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/train/__init__.py +0 -0
  130. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/train/d2_frcnn_train.py +0 -0
  131. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/train/hf_detr_train.py +0 -0
  132. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/train/hf_layoutlm_train.py +0 -0
  133. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/train/tp_frcnn_train.py +0 -0
  134. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/__init__.py +0 -0
  135. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/concurrency.py +0 -0
  136. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/context.py +0 -0
  137. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/develop.py +0 -0
  138. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/env_info.py +0 -0
  139. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/error.py +0 -0
  140. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/fs.py +0 -0
  141. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/identifier.py +0 -0
  142. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/logger.py +0 -0
  143. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/metacfg.py +0 -0
  144. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/mocks.py +0 -0
  145. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/pdf_utils.py +0 -0
  146. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/settings.py +0 -0
  147. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/tqdm.py +0 -0
  148. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/transform.py +0 -0
  149. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/utils.py +0 -0
  150. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection/utils/viz.py +0 -0
  151. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection.egg-info/SOURCES.txt +0 -0
  152. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection.egg-info/dependency_links.txt +0 -0
  153. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/deepdoctection.egg-info/top_level.txt +0 -0
  154. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/setup.cfg +0 -0
  155. {deepdoctection-0.43.6 → deepdoctection-0.44.1}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deepdoctection
3
- Version: 0.43.6
3
+ Version: 0.44.1
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -27,7 +27,7 @@ Requires-Dist: networkx>=2.7.1
27
27
  Requires-Dist: numpy<2.0,>=1.21
28
28
  Requires-Dist: packaging>=20.0
29
29
  Requires-Dist: Pillow>=10.0.0
30
- Requires-Dist: pypdf>=3.16.0
30
+ Requires-Dist: pypdf>=6.0.0
31
31
  Requires-Dist: pypdfium2>=4.30.0
32
32
  Requires-Dist: pyyaml>=6.0.1
33
33
  Requires-Dist: pyzmq>=16
@@ -46,7 +46,7 @@ Requires-Dist: networkx>=2.7.1; extra == "tf"
46
46
  Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
47
47
  Requires-Dist: packaging>=20.0; extra == "tf"
48
48
  Requires-Dist: Pillow>=10.0.0; extra == "tf"
49
- Requires-Dist: pypdf>=3.16.0; extra == "tf"
49
+ Requires-Dist: pypdf>=6.0.0; extra == "tf"
50
50
  Requires-Dist: pypdfium2>=4.30.0; extra == "tf"
51
51
  Requires-Dist: pyyaml>=6.0.1; extra == "tf"
52
52
  Requires-Dist: pyzmq>=16; extra == "tf"
@@ -78,7 +78,7 @@ Requires-Dist: networkx>=2.7.1; extra == "pt"
78
78
  Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
79
79
  Requires-Dist: packaging>=20.0; extra == "pt"
80
80
  Requires-Dist: Pillow>=10.0.0; extra == "pt"
81
- Requires-Dist: pypdf>=3.16.0; extra == "pt"
81
+ Requires-Dist: pypdf>=6.0.0; extra == "pt"
82
82
  Requires-Dist: pypdfium2>=4.30.0; extra == "pt"
83
83
  Requires-Dist: pyyaml>=6.0.1; extra == "pt"
84
84
  Requires-Dist: pyzmq>=16; extra == "pt"
@@ -25,7 +25,7 @@ from .utils.logger import LoggingRecord, logger
25
25
 
26
26
  # pylint: enable=wrong-import-position
27
27
 
28
- __version__ = "0.43.6"
28
+ __version__ = "0.44.1"
29
29
 
30
30
  _IMPORT_STRUCTURE = {
31
31
  "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
@@ -92,6 +92,7 @@ _IMPORT_STRUCTURE = {
92
92
  "convert_pdf_bytes_to_np_array_v2",
93
93
  "as_dict",
94
94
  "ImageAnnotationBaseView",
95
+ "MetaAnnotation",
95
96
  "Image",
96
97
  "Word",
97
98
  "Layout",
@@ -105,6 +106,7 @@ _IMPORT_STRUCTURE = {
105
106
  "DatasetAdapter",
106
107
  "DatasetBase",
107
108
  "MergeDataset",
109
+ "DatasetCard",
108
110
  "CustomDataset",
109
111
  "DataFlowBaseBuilder",
110
112
  "DatasetInfo",
@@ -313,6 +315,8 @@ _IMPORT_STRUCTURE = {
313
315
  "get_apted_requirement",
314
316
  "distance_available",
315
317
  "get_distance_requirement",
318
+ "numpy_v1_available",
319
+ "get_numpy_v1_requirement",
316
320
  "transformers_available",
317
321
  "get_transformers_requirement",
318
322
  "detectron2_available",
@@ -34,5 +34,5 @@ After all, the point here is not to provide an optimal processing environment.
34
34
  from .annotation import *
35
35
  from .box import *
36
36
  from .convert import *
37
- from .image import Image
37
+ from .image import Image, MetaAnnotation
38
38
  from .view import *
@@ -25,7 +25,7 @@ from collections import defaultdict
25
25
  from dataclasses import dataclass, field
26
26
  from os import environ, fspath
27
27
  from pathlib import Path
28
- from typing import Any, Optional, Sequence, Union, no_type_check
28
+ from typing import Any, Optional, Sequence, TypedDict, Union, no_type_check
29
29
 
30
30
  import numpy as np
31
31
  from numpy import uint8
@@ -40,6 +40,55 @@ from .box import crop_box_from_image, global_to_local_coords, intersection_box
40
40
  from .convert import as_dict, convert_b64_to_np_array, convert_np_array_to_b64, convert_pdf_bytes_to_np_array_v2
41
41
 
42
42
 
43
+ class MetaAnnotationDict(TypedDict):
44
+ """MetaAnnotationDict"""
45
+
46
+ image_annotations: list[str]
47
+ sub_categories: dict[str, dict[str, list[str]]]
48
+ relationships: dict[str, list[str]]
49
+ summaries: list[str]
50
+
51
+
52
+ @dataclass(frozen=True)
53
+ class MetaAnnotation:
54
+ """
55
+ An immutable dataclass that stores information about what `Image` are being
56
+ modified through a pipeline component.
57
+
58
+ Attributes:
59
+ image_annotations: Tuple of `ObjectTypes` representing image annotations.
60
+ sub_categories: Dictionary mapping `ObjectTypes` to dicts of `ObjectTypes` to sets of `ObjectTypes`
61
+ for sub-categories.
62
+ relationships: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for relationships.
63
+ summaries: Tuple of `ObjectTypes` representing summaries.
64
+ """
65
+
66
+ image_annotations: tuple[ObjectTypes, ...] = field(default=())
67
+ sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = field(default_factory=dict)
68
+ relationships: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
69
+ summaries: tuple[ObjectTypes, ...] = field(default=())
70
+
71
+ def as_dict(self) -> MetaAnnotationDict:
72
+ """
73
+ Returns the MetaAnnotation as a dictionary, with all `ObjectTypes` converted to strings.
74
+
75
+ Returns:
76
+ A dictionary representation of the MetaAnnotation where all `ObjectTypes` are converted to strings.
77
+ """
78
+ return {
79
+ "image_annotations": [obj.value for obj in self.image_annotations],
80
+ "sub_categories": {
81
+ outer_key.value: {
82
+ inner_key.value: [val.value for val in inner_values]
83
+ for inner_key, inner_values in outer_value.items()
84
+ }
85
+ for outer_key, outer_value in self.sub_categories.items()
86
+ },
87
+ "relationships": {key.value: [val.value for val in values] for key, values in self.relationships.items()},
88
+ "summaries": [obj.value for obj in self.summaries],
89
+ }
90
+
91
+
43
92
  @dataclass
44
93
  class Image:
45
94
  """
@@ -42,13 +42,60 @@ from ..utils.settings import (
42
42
  get_type,
43
43
  )
44
44
  from ..utils.transform import ResizeTransform, box_to_point4, point4_to_box
45
- from ..utils.types import HTML, AnnotationDict, Chunks, ImageDict, PathLikeOrStr, PixelValues, Text_, csv
45
+ from ..utils.types import HTML, AnnotationDict, Chunks, ImageDict, PathLikeOrStr, PixelValues, csv
46
46
  from ..utils.viz import draw_boxes, interactive_imshow, viz_handler
47
47
  from .annotation import CategoryAnnotation, ContainerAnnotation, ImageAnnotation, ann_from_dict
48
48
  from .box import BoundingBox, crop_box_from_image
49
49
  from .image import Image
50
50
 
51
51
 
52
+ @dataclass(frozen=True)
53
+ class Text_:
54
+ """
55
+ Immutable dataclass for storing structured text extraction results.
56
+
57
+ Attributes:
58
+ text: The concatenated text string.
59
+ words: List of word strings.
60
+ ann_ids: List of annotation IDs for each word.
61
+ token_classes: List of token class names for each word.
62
+ token_class_ann_ids: List of annotation IDs for each token class.
63
+ token_tags: List of token tag names for each word.
64
+ token_tag_ann_ids: List of annotation IDs for each token tag.
65
+ token_class_ids: List of token class IDs.
66
+ token_tag_ids: List of token tag IDs.
67
+ """
68
+
69
+ text: str = ""
70
+ words: list[str] = field(default_factory=list)
71
+ ann_ids: list[str] = field(default_factory=list)
72
+ token_classes: list[str] = field(default_factory=list)
73
+ token_class_ann_ids: list[str] = field(default_factory=list)
74
+ token_tags: list[str] = field(default_factory=list)
75
+ token_tag_ann_ids: list[str] = field(default_factory=list)
76
+ token_class_ids: list[str] = field(default_factory=list)
77
+ token_tag_ids: list[str] = field(default_factory=list)
78
+
79
+ def as_dict(self) -> dict[str, Union[list[str], str]]:
80
+ """
81
+ Returns the Text_ as a dictionary.
82
+
83
+ Returns:
84
+ A dictionary representation of the Text_ dataclass.
85
+ """
86
+ return {
87
+ "text": self.text,
88
+ "words": self.words,
89
+ "ann_ids": self.ann_ids,
90
+ "token_classes": self.token_classes,
91
+ "token_class_ann_ids": self.token_class_ann_ids,
92
+ "token_tags": self.token_tags,
93
+ "token_tag_ann_ids": self.token_tag_ann_ids,
94
+ "token_class_ids": self.token_class_ids,
95
+ "token_tag_ids": self.token_tag_ids,
96
+ }
97
+
98
+
52
99
  class ImageAnnotationBaseView(ImageAnnotation):
53
100
  """
54
101
  Consumption class for having easier access to categories added to an `ImageAnnotation`.
@@ -263,13 +310,28 @@ class Layout(ImageAnnotationBaseView):
263
310
  """
264
311
  words = self.get_ordered_words()
265
312
  if words:
266
- characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
313
+ (
314
+ characters,
315
+ ann_ids,
316
+ token_classes,
317
+ token_class_ann_ids,
318
+ token_tags,
319
+ token_tag_ann_ids,
320
+ token_classes_ids,
321
+ token_tag_ids,
322
+ ) = map(list, zip(
267
323
  *[
268
324
  (
269
325
  word.characters,
270
326
  word.annotation_id,
271
327
  word.token_class,
328
+ word.get_sub_category(WordType.TOKEN_CLASS).annotation_id
329
+ if WordType.TOKEN_CLASS in word.sub_categories
330
+ else None,
272
331
  word.token_tag,
332
+ word.get_sub_category(WordType.TOKEN_TAG).annotation_id
333
+ if WordType.TOKEN_TAG in word.sub_categories
334
+ else None,
273
335
  word.get_sub_category(WordType.TOKEN_CLASS).category_id
274
336
  if WordType.TOKEN_CLASS in word.sub_categories
275
337
  else None,
@@ -279,25 +341,40 @@ class Layout(ImageAnnotationBaseView):
279
341
  )
280
342
  for word in words
281
343
  ]
282
- )
344
+ ))
283
345
  else:
284
- characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = (
285
- [], # type: ignore
286
- [], # type: ignore
287
- [], # type: ignore
288
- [], # type: ignore
289
- [], # type: ignore
290
- [], # type: ignore
346
+ (
347
+ characters,
348
+ ann_ids,
349
+ token_classes,
350
+ token_class_ann_ids,
351
+ token_tags,
352
+ token_tag_ann_ids,
353
+ token_classes_ids,
354
+ token_tag_ids,
355
+ ) = (
356
+ [],
357
+ [],
358
+ [],
359
+ [],
360
+ [],
361
+ [],
362
+ [],
363
+ [],
291
364
  )
292
- return {
293
- "text": " ".join(characters),
294
- "words": characters,
295
- "ann_ids": ann_ids,
296
- "token_classes": token_classes,
297
- "token_tags": token_tags,
298
- "token_class_ids": token_classes_ids,
299
- "token_tag_ids": token_tag_ids,
300
- }
365
+
366
+ return Text_(
367
+ text=" ".join(characters), # type: ignore
368
+ words=characters, # type: ignore
369
+ ann_ids=ann_ids, # type: ignore
370
+ token_classes=token_classes, # type: ignore
371
+ token_class_ann_ids=token_class_ann_ids, # type: ignore
372
+ token_tags=token_tags, # type: ignore
373
+ token_tag_ann_ids=token_tag_ann_ids, # type: ignore
374
+ token_class_ids=token_classes_ids, # type: ignore
375
+ token_tag_ids=token_tag_ids, # type: ignore
376
+ )
377
+
301
378
 
302
379
  def get_attribute_names(self) -> set[str]:
303
380
  attr_names = (
@@ -590,14 +667,16 @@ class Table(Layout):
590
667
 
591
668
  @property
592
669
  def csv_(self) -> list[list[list[Text_]]]:
670
+ """
671
+ Returns:
672
+ A csv-style representation of a table as list of lists of cell.text_.
673
+ """
593
674
  cells = self.cells
594
675
  table_list = [[[] for _ in range(self.number_of_columns)] for _ in range(self.number_of_rows)] # type: ignore
595
676
  for cell in cells:
596
677
  table_list[cell.row_number - 1][cell.column_number - 1].append(cell.text_) # type: ignore
597
678
  return table_list
598
679
 
599
-
600
-
601
680
  def __str__(self) -> str:
602
681
  out = " ".join([" ".join(row + ["\n"]) for row in self.csv])
603
682
  return out
@@ -624,26 +703,34 @@ class Table(Layout):
624
703
  words: list[str] = []
625
704
  ann_ids: list[str] = []
626
705
  token_classes: list[str] = []
706
+ token_class_ann_ids: list[str] = []
627
707
  token_tags: list[str] = []
708
+ token_tag_ann_ids: list[str] = []
628
709
  token_class_ids: list[str] = []
629
710
  token_tag_ids: list[str] = []
630
711
  for cell in cells:
631
- text.append(cell.text_["text"])
632
- words.extend(cell.text_["words"])
633
- ann_ids.extend(cell.text_["ann_ids"])
634
- token_classes.extend(cell.text_["token_classes"])
635
- token_tags.extend(cell.text_["token_tags"])
636
- token_class_ids.extend(cell.text_["token_class_ids"])
637
- token_tag_ids.extend(cell.text_["token_tag_ids"])
638
- return {
639
- "text": " ".join(text),
640
- "words": words,
641
- "ann_ids": ann_ids,
642
- "token_classes": token_classes,
643
- "token_tags": token_tags,
644
- "token_class_ids": token_class_ids,
645
- "token_tag_ids": token_tag_ids,
646
- }
712
+ text_ = cell.text_
713
+ text.append(text_.text)
714
+ words.extend(text_.words)
715
+ ann_ids.extend(text_.ann_ids)
716
+ token_classes.extend(text_.token_classes)
717
+ token_class_ann_ids.extend(text_.token_class_ann_ids)
718
+ token_tags.extend(text_.token_tags)
719
+ token_tag_ann_ids.extend(text_.token_tag_ann_ids)
720
+ token_class_ids.extend(text_.token_class_ids)
721
+ token_tag_ids.extend(text_.token_tag_ids)
722
+ return Text_(
723
+ text=" ".join(text),
724
+ words=words,
725
+ ann_ids=ann_ids,
726
+ token_classes=token_classes,
727
+ token_class_ann_ids=token_class_ann_ids,
728
+ token_tags=token_tags,
729
+ token_tag_ann_ids=token_tag_ann_ids,
730
+ token_class_ids=token_class_ids,
731
+ token_tag_ids=token_tag_ids,
732
+ )
733
+
647
734
 
648
735
  @property
649
736
  def words(self) -> list[ImageAnnotationBaseView]:
@@ -1051,7 +1138,7 @@ class Page(Image):
1051
1138
 
1052
1139
  ```python
1053
1140
  {"text": text string,
1054
- "text_list": list of single words,
1141
+ "words": list of single words,
1055
1142
  "annotation_ids": word annotation ids}
1056
1143
  ```
1057
1144
  """
@@ -1060,26 +1147,34 @@ class Page(Image):
1060
1147
  words: list[str] = []
1061
1148
  ann_ids: list[str] = []
1062
1149
  token_classes: list[str] = []
1150
+ token_class_ann_ids: list[str] = []
1063
1151
  token_tags: list[str] = []
1152
+ token_tag_ann_ids: list[str] = []
1064
1153
  token_class_ids: list[str] = []
1065
1154
  token_tag_ids: list[str] = []
1066
1155
  for block in block_with_order:
1067
- text.append(block.text_["text"]) # type: ignore
1068
- words.extend(block.text_["words"]) # type: ignore
1069
- ann_ids.extend(block.text_["ann_ids"]) # type: ignore
1070
- token_classes.extend(block.text_["token_classes"]) # type: ignore
1071
- token_tags.extend(block.text_["token_tags"]) # type: ignore
1072
- token_class_ids.extend(block.text_["token_class_ids"]) # type: ignore
1073
- token_tag_ids.extend(block.text_["token_tag_ids"]) # type: ignore
1074
- return {
1075
- "text": " ".join(text),
1076
- "words": words,
1077
- "ann_ids": ann_ids,
1078
- "token_classes": token_classes,
1079
- "token_tags": token_tags,
1080
- "token_class_ids": token_class_ids,
1081
- "token_tag_ids": token_tag_ids,
1082
- }
1156
+ text_ = block.text_
1157
+ text.append(text_.text) # type: ignore
1158
+ words.extend(text_.words) # type: ignore
1159
+ ann_ids.extend(text_.ann_ids) # type: ignore
1160
+ token_classes.extend(text_.token_classes) # type: ignore
1161
+ token_class_ann_ids.extend(text_.token_class_ann_ids) # type: ignore
1162
+ token_tags.extend(text_.token_tags) # type: ignore
1163
+ token_tag_ann_ids.extend(text_.token_tag_ann_ids) # type: ignore
1164
+ token_class_ids.extend(text_.token_class_ids) # type: ignore
1165
+ token_tag_ids.extend(text_.token_tag_ids) # type: ignore
1166
+ return Text_(
1167
+ text=" ".join(text),
1168
+ words=words,
1169
+ ann_ids=ann_ids,
1170
+ token_classes=token_classes,
1171
+ token_class_ann_ids=token_class_ann_ids,
1172
+ token_tags=token_tags,
1173
+ token_tag_ann_ids=token_tag_ann_ids,
1174
+ token_class_ids=token_class_ids,
1175
+ token_tag_ids=token_tag_ann_ids,
1176
+ )
1177
+
1083
1178
 
1084
1179
  def get_layout_context(self, annotation_id: str, context_size: int = 3) -> list[ImageAnnotationBaseView]:
1085
1180
  """