deepdoctection 0.31__tar.gz → 0.32__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (245) hide show
  1. {deepdoctection-0.31 → deepdoctection-0.32}/PKG-INFO +27 -18
  2. {deepdoctection-0.31 → deepdoctection-0.32}/README.md +14 -7
  3. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/__init__.py +35 -28
  4. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/analyzer/dd.py +30 -24
  5. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/configs/conf_dd_one.yaml +34 -31
  6. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datapoint/annotation.py +2 -1
  7. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datapoint/box.py +2 -1
  8. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datapoint/image.py +13 -7
  9. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datapoint/view.py +95 -24
  10. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/__init__.py +1 -4
  11. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/adapter.py +5 -2
  12. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/base.py +5 -3
  13. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/info.py +2 -2
  14. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/doclaynet.py +3 -2
  15. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/fintabnet.py +2 -1
  16. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/funsd.py +2 -1
  17. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/iiitar13k.py +5 -2
  18. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/layouttest.py +2 -1
  19. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/publaynet.py +2 -2
  20. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/pubtables1m.py +6 -3
  21. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/pubtabnet.py +2 -1
  22. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/rvlcdip.py +2 -1
  23. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/xfund.py +2 -1
  24. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/eval/__init__.py +1 -4
  25. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/eval/cocometric.py +2 -1
  26. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/eval/eval.py +17 -13
  27. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/eval/tedsmetric.py +14 -11
  28. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/eval/tp_eval_callback.py +9 -3
  29. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/__init__.py +2 -7
  30. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/d2detect.py +24 -32
  31. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/deskew.py +4 -2
  32. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/doctrocr.py +75 -81
  33. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/fastlang.py +4 -2
  34. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/hfdetr.py +22 -28
  35. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/hflayoutlm.py +335 -103
  36. deepdoctection-0.32/deepdoctection/extern/hflm.py +225 -0
  37. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/model.py +56 -47
  38. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/pdftext.py +8 -4
  39. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/pt/__init__.py +1 -3
  40. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/pt/nms.py +6 -2
  41. deepdoctection-0.32/deepdoctection/extern/pt/ptutils.py +57 -0
  42. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/texocr.py +4 -2
  43. deepdoctection-0.32/deepdoctection/extern/tp/tfutils.py +91 -0
  44. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpcompat.py +10 -7
  45. deepdoctection-0.32/deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  46. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  47. deepdoctection-0.32/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  48. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  49. deepdoctection-0.32/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  50. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  51. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  52. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  53. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  54. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  55. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  56. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  57. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  58. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  59. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/preproc.py +7 -3
  60. {deepdoctection-0.31/tests/datapoint → deepdoctection-0.32/deepdoctection/extern/tp/tpfrcnn/utils}/__init__.py +4 -0
  61. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  62. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tpdetect.py +5 -8
  63. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/__init__.py +3 -8
  64. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/d2struct.py +8 -6
  65. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/hfstruct.py +6 -1
  66. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/laylmstruct.py +163 -20
  67. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/maputils.py +3 -1
  68. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/misc.py +6 -3
  69. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/tpstruct.py +2 -2
  70. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/__init__.py +1 -1
  71. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/common.py +11 -9
  72. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/concurrency.py +2 -1
  73. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/layout.py +3 -1
  74. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/lm.py +32 -64
  75. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/order.py +142 -35
  76. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/refine.py +8 -14
  77. deepdoctection-0.31/deepdoctection/pipe/cell.py → deepdoctection-0.32/deepdoctection/pipe/sub_layout.py +1 -1
  78. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/train/__init__.py +6 -12
  79. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/train/d2_frcnn_train.py +21 -16
  80. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/train/hf_detr_train.py +18 -11
  81. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/train/hf_layoutlm_train.py +118 -101
  82. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/train/tp_frcnn_train.py +21 -19
  83. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/env_info.py +41 -117
  84. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/logger.py +1 -0
  85. deepdoctection-0.32/deepdoctection/utils/mocks.py +93 -0
  86. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/settings.py +1 -0
  87. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/viz.py +4 -3
  88. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection.egg-info/PKG-INFO +27 -18
  89. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection.egg-info/SOURCES.txt +4 -90
  90. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection.egg-info/requires.txt +11 -8
  91. {deepdoctection-0.31 → deepdoctection-0.32}/setup.cfg +7 -1
  92. {deepdoctection-0.31 → deepdoctection-0.32}/setup.py +9 -8
  93. {deepdoctection-0.31 → deepdoctection-0.32}/tests/test_utils.py +8 -0
  94. deepdoctection-0.31/deepdoctection/extern/pt/ptutils.py +0 -49
  95. deepdoctection-0.31/deepdoctection/extern/tp/tfutils.py +0 -57
  96. deepdoctection-0.31/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
  97. deepdoctection-0.31/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
  98. deepdoctection-0.31/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
  99. deepdoctection-0.31/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
  100. deepdoctection-0.31/tests/__init__.py +0 -22
  101. deepdoctection-0.31/tests/analyzer/__init__.py +0 -16
  102. deepdoctection-0.31/tests/analyzer/test_dd.py +0 -202
  103. deepdoctection-0.31/tests/conftest.py +0 -498
  104. deepdoctection-0.31/tests/data.py +0 -1632
  105. deepdoctection-0.31/tests/dataflow/__init__.py +0 -16
  106. deepdoctection-0.31/tests/dataflow/conftest.py +0 -95
  107. deepdoctection-0.31/tests/dataflow/test_common.py +0 -219
  108. deepdoctection-0.31/tests/dataflow/test_custom.py +0 -60
  109. deepdoctection-0.31/tests/dataflow/test_custom_serialize.py +0 -177
  110. deepdoctection-0.31/tests/dataflow/test_parallel_map.py +0 -66
  111. deepdoctection-0.31/tests/dataflow/test_stats.py +0 -103
  112. deepdoctection-0.31/tests/datapoint/conftest.py +0 -262
  113. deepdoctection-0.31/tests/datapoint/test_annotation.py +0 -170
  114. deepdoctection-0.31/tests/datapoint/test_box.py +0 -416
  115. deepdoctection-0.31/tests/datapoint/test_convert.py +0 -52
  116. deepdoctection-0.31/tests/datapoint/test_image.py +0 -387
  117. deepdoctection-0.31/tests/datapoint/test_view.py +0 -150
  118. deepdoctection-0.31/tests/datasets/__init__.py +0 -16
  119. deepdoctection-0.31/tests/datasets/instances/__init__.py +0 -16
  120. deepdoctection-0.31/tests/datasets/instances/conftest.py +0 -35
  121. deepdoctection-0.31/tests/datasets/instances/test_doclaynet.py +0 -43
  122. deepdoctection-0.31/tests/datasets/instances/test_fintabnet.py +0 -70
  123. deepdoctection-0.31/tests/datasets/instances/test_funsd.py +0 -58
  124. deepdoctection-0.31/tests/datasets/instances/test_iiitar13k.py +0 -42
  125. deepdoctection-0.31/tests/datasets/instances/test_layouttest.py +0 -63
  126. deepdoctection-0.31/tests/datasets/instances/test_publaynet.py +0 -64
  127. deepdoctection-0.31/tests/datasets/instances/test_pubtables1m.py +0 -66
  128. deepdoctection-0.31/tests/datasets/instances/test_pubtabnet.py +0 -65
  129. deepdoctection-0.31/tests/datasets/instances/test_rvlcdip.py +0 -46
  130. deepdoctection-0.31/tests/datasets/test_adapter.py +0 -77
  131. deepdoctection-0.31/tests/datasets/test_info.py +0 -273
  132. deepdoctection-0.31/tests/datasets/test_registry.py +0 -75
  133. deepdoctection-0.31/tests/eval/__init__.py +0 -16
  134. deepdoctection-0.31/tests/eval/conftest.py +0 -107
  135. deepdoctection-0.31/tests/eval/test_accmetric.py +0 -364
  136. deepdoctection-0.31/tests/eval/test_cocometric.py +0 -123
  137. deepdoctection-0.31/tests/eval/test_eval.py +0 -86
  138. deepdoctection-0.31/tests/eval/test_registry.py +0 -84
  139. deepdoctection-0.31/tests/eval/test_tedsmetric.py +0 -40
  140. deepdoctection-0.31/tests/extern/__init__.py +0 -0
  141. deepdoctection-0.31/tests/extern/conftest.py +0 -108
  142. deepdoctection-0.31/tests/extern/data.py +0 -102
  143. deepdoctection-0.31/tests/extern/test_deskew.py +0 -67
  144. deepdoctection-0.31/tests/extern/test_doctrocr.py +0 -190
  145. deepdoctection-0.31/tests/extern/test_fastlang.py +0 -64
  146. deepdoctection-0.31/tests/extern/test_hfdetr.py +0 -116
  147. deepdoctection-0.31/tests/extern/test_hflayoutlm.py +0 -492
  148. deepdoctection-0.31/tests/extern/test_pdftext.py +0 -70
  149. deepdoctection-0.31/tests/extern/test_tessocr.py +0 -164
  150. deepdoctection-0.31/tests/extern/test_texocr.py +0 -52
  151. deepdoctection-0.31/tests/extern/test_tpdetect.py +0 -123
  152. deepdoctection-0.31/tests/mapper/__init__.py +0 -16
  153. deepdoctection-0.31/tests/mapper/conftest.py +0 -297
  154. deepdoctection-0.31/tests/mapper/data.py +0 -2182
  155. deepdoctection-0.31/tests/mapper/test_cats.py +0 -305
  156. deepdoctection-0.31/tests/mapper/test_cocostruct.py +0 -91
  157. deepdoctection-0.31/tests/mapper/test_d2struct.py +0 -56
  158. deepdoctection-0.31/tests/mapper/test_hfstruct.py +0 -59
  159. deepdoctection-0.31/tests/mapper/test_iiitar13k.py +0 -64
  160. deepdoctection-0.31/tests/mapper/test_laylmstruct.py +0 -141
  161. deepdoctection-0.31/tests/mapper/test_misc.py +0 -72
  162. deepdoctection-0.31/tests/mapper/test_prodigystruct.py +0 -78
  163. deepdoctection-0.31/tests/mapper/test_pubstruct.py +0 -170
  164. deepdoctection-0.31/tests/mapper/test_tpstruct.py +0 -51
  165. deepdoctection-0.31/tests/mapper/test_utils.py +0 -83
  166. deepdoctection-0.31/tests/mapper/test_xfundstruct.py +0 -68
  167. deepdoctection-0.31/tests/pipe/__init__.py +0 -16
  168. deepdoctection-0.31/tests/pipe/test_anngen.py +0 -179
  169. deepdoctection-0.31/tests/pipe/test_cell.py +0 -144
  170. deepdoctection-0.31/tests/pipe/test_common.py +0 -107
  171. deepdoctection-0.31/tests/pipe/test_language.py +0 -76
  172. deepdoctection-0.31/tests/pipe/test_layout.py +0 -66
  173. deepdoctection-0.31/tests/pipe/test_lm.py +0 -119
  174. deepdoctection-0.31/tests/pipe/test_order.py +0 -197
  175. deepdoctection-0.31/tests/pipe/test_refine.py +0 -325
  176. deepdoctection-0.31/tests/pipe/test_registry.py +0 -58
  177. deepdoctection-0.31/tests/pipe/test_segment.py +0 -392
  178. deepdoctection-0.31/tests/pipe/test_text.py +0 -208
  179. deepdoctection-0.31/tests/pipe/test_transform.py +0 -65
  180. deepdoctection-0.31/tests/train/__init__.py +0 -16
  181. deepdoctection-0.31/tests/train/conftest.py +0 -118
  182. deepdoctection-0.31/tests/train/test_d2_frcnn_train.py +0 -64
  183. deepdoctection-0.31/tests/train/test_tp_frcnn_train.py +0 -99
  184. deepdoctection-0.31/tests_d2/__init__.py +0 -20
  185. deepdoctection-0.31/tests_d2/conftest.py +0 -56
  186. deepdoctection-0.31/tests_d2/test_d2detect.py +0 -95
  187. {deepdoctection-0.31 → deepdoctection-0.32}/LICENSE +0 -0
  188. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/analyzer/__init__.py +0 -0
  189. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/configs/__init__.py +0 -0
  190. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/configs/conf_tesseract.yaml +0 -0
  191. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/dataflow/__init__.py +0 -0
  192. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/dataflow/base.py +0 -0
  193. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/dataflow/common.py +0 -0
  194. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/dataflow/custom.py +0 -0
  195. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/dataflow/custom_serialize.py +0 -0
  196. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/dataflow/parallel_map.py +0 -0
  197. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/dataflow/serialize.py +0 -0
  198. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/dataflow/stats.py +0 -0
  199. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datapoint/__init__.py +0 -0
  200. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datapoint/convert.py +0 -0
  201. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/dataflow_builder.py +0 -0
  202. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/__init__.py +0 -0
  203. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
  204. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
  205. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/registry.py +0 -0
  206. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/save.py +0 -0
  207. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/eval/accmetric.py +0 -0
  208. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/eval/base.py +0 -0
  209. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/eval/registry.py +0 -0
  210. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/base.py +0 -0
  211. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tessocr.py +0 -0
  212. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/__init__.py +0 -0
  213. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
  214. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/cats.py +0 -0
  215. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/cocostruct.py +0 -0
  216. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/match.py +0 -0
  217. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/pascalstruct.py +0 -0
  218. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/prodigystruct.py +0 -0
  219. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/pubstruct.py +0 -0
  220. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/xfundstruct.py +0 -0
  221. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/anngen.py +0 -0
  222. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/base.py +0 -0
  223. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/doctectionpipe.py +0 -0
  224. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/language.py +0 -0
  225. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/registry.py +0 -0
  226. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/segment.py +0 -0
  227. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/text.py +0 -0
  228. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/transform.py +0 -0
  229. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/py.typed +0 -0
  230. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/__init__.py +0 -0
  231. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/concurrency.py +0 -0
  232. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/context.py +0 -0
  233. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/detection_types.py +0 -0
  234. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/develop.py +0 -0
  235. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/error.py +0 -0
  236. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/file_utils.py +0 -0
  237. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/fs.py +0 -0
  238. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/identifier.py +0 -0
  239. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/metacfg.py +0 -0
  240. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/pdf_utils.py +0 -0
  241. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/tqdm.py +0 -0
  242. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/transform.py +0 -0
  243. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/utils.py +0 -0
  244. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection.egg-info/dependency_links.txt +0 -0
  245. {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepdoctection
3
- Version: 0.31
3
+ Version: 0.32
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -9,18 +9,18 @@ Classifier: Development Status :: 4 - Beta
9
9
  Classifier: License :: OSI Approved :: Apache Software License
10
10
  Classifier: Natural Language :: English
11
11
  Classifier: Operating System :: POSIX :: Linux
12
- Classifier: Programming Language :: Python :: 3.8
13
12
  Classifier: Programming Language :: Python :: 3.9
14
13
  Classifier: Programming Language :: Python :: 3.10
15
14
  Classifier: Programming Language :: Python :: 3.11
16
15
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
- Requires-Python: >=3.8
16
+ Requires-Python: >=3.9
18
17
  Description-Content-Type: text/markdown
19
18
  License-File: LICENSE
20
19
  Requires-Dist: catalogue==2.0.10
21
20
  Requires-Dist: huggingface_hub>=0.12.0
22
21
  Requires-Dist: importlib-metadata>=5.0.0
23
22
  Requires-Dist: jsonlines==3.1.0
23
+ Requires-Dist: lazy-imports==0.3.1
24
24
  Requires-Dist: mock==4.0.3
25
25
  Requires-Dist: networkx>=2.7.1
26
26
  Requires-Dist: numpy>=1.21
@@ -37,6 +37,7 @@ Requires-Dist: catalogue==2.0.10; extra == "tf"
37
37
  Requires-Dist: huggingface_hub>=0.12.0; extra == "tf"
38
38
  Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
39
39
  Requires-Dist: jsonlines==3.1.0; extra == "tf"
40
+ Requires-Dist: lazy-imports==0.3.1; extra == "tf"
40
41
  Requires-Dist: mock==4.0.3; extra == "tf"
41
42
  Requires-Dist: networkx>=2.7.1; extra == "tf"
42
43
  Requires-Dist: numpy>=1.21; extra == "tf"
@@ -52,10 +53,10 @@ Requires-Dist: tensorpack==0.11; extra == "tf"
52
53
  Requires-Dist: protobuf==3.20.1; extra == "tf"
53
54
  Requires-Dist: tensorflow-addons>=0.17.1; extra == "tf"
54
55
  Requires-Dist: tf2onnx>=1.9.2; extra == "tf"
55
- Requires-Dist: python-doctr==0.7.0; extra == "tf"
56
+ Requires-Dist: python-doctr==0.8.1; extra == "tf"
56
57
  Requires-Dist: pycocotools>=2.0.2; extra == "tf"
57
- Requires-Dist: boto3; extra == "tf"
58
- Requires-Dist: pdfplumber>=0.7.1; extra == "tf"
58
+ Requires-Dist: boto3==1.34.102; extra == "tf"
59
+ Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
59
60
  Requires-Dist: fasttext==0.9.2; extra == "tf"
60
61
  Requires-Dist: jdeskew>=0.2.2; extra == "tf"
61
62
  Requires-Dist: apted==1.0.3; extra == "tf"
@@ -66,6 +67,7 @@ Requires-Dist: catalogue==2.0.10; extra == "pt"
66
67
  Requires-Dist: huggingface_hub>=0.12.0; extra == "pt"
67
68
  Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
68
69
  Requires-Dist: jsonlines==3.1.0; extra == "pt"
70
+ Requires-Dist: lazy-imports==0.3.1; extra == "pt"
69
71
  Requires-Dist: mock==4.0.3; extra == "pt"
70
72
  Requires-Dist: networkx>=2.7.1; extra == "pt"
71
73
  Requires-Dist: numpy>=1.21; extra == "pt"
@@ -80,9 +82,9 @@ Requires-Dist: tqdm==4.64.0; extra == "pt"
80
82
  Requires-Dist: timm>=0.9.16; extra == "pt"
81
83
  Requires-Dist: transformers>=4.36.0; extra == "pt"
82
84
  Requires-Dist: accelerate>=0.29.1; extra == "pt"
83
- Requires-Dist: python-doctr==0.7.0; extra == "pt"
84
- Requires-Dist: boto3; extra == "pt"
85
- Requires-Dist: pdfplumber>=0.7.1; extra == "pt"
85
+ Requires-Dist: python-doctr==0.8.1; extra == "pt"
86
+ Requires-Dist: boto3==1.34.102; extra == "pt"
87
+ Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
86
88
  Requires-Dist: fasttext==0.9.2; extra == "pt"
87
89
  Requires-Dist: jdeskew>=0.2.2; extra == "pt"
88
90
  Requires-Dist: apted==1.0.3; extra == "pt"
@@ -90,10 +92,10 @@ Requires-Dist: distance==0.1.3; extra == "pt"
90
92
  Requires-Dist: lxml>=4.9.1; extra == "pt"
91
93
  Provides-Extra: docs
92
94
  Requires-Dist: tensorpack==0.11; extra == "docs"
93
- Requires-Dist: boto3; extra == "docs"
95
+ Requires-Dist: boto3==1.34.102; extra == "docs"
94
96
  Requires-Dist: transformers>=4.36.0; extra == "docs"
95
97
  Requires-Dist: accelerate>=0.29.1; extra == "docs"
96
- Requires-Dist: pdfplumber>=0.7.1; extra == "docs"
98
+ Requires-Dist: pdfplumber>=0.11.0; extra == "docs"
97
99
  Requires-Dist: lxml>=4.9.1; extra == "docs"
98
100
  Requires-Dist: lxml-stubs>=0.5.1; extra == "docs"
99
101
  Requires-Dist: jdeskew>=0.2.2; extra == "docs"
@@ -153,7 +155,8 @@ pipelines. Its core function does not depend on any specific deep learning libra
153
155
  - Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
154
156
  - Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
155
157
  - Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
156
- - Document and token classification with all LayoutLM models provided by the Transformer library.
158
+ - Document and token classification with all LayoutLM models provided by the
159
+ [**Transformer library**](https://github.com/huggingface/transformers).
157
160
  (Yes, you can use any LayoutLM-model with any of the provided OCR or pdfplumber tools straight away!).
158
161
  - Table detection and table structure recognition with
159
162
  [**table-transformer**](https://github.com/microsoft/table-transformer).
@@ -163,10 +166,16 @@ pipelines. Its core function does not depend on any specific deep learning libra
163
166
  - Comprehensive configuration of **analyzer** like choosing different models, output parsing, OCR selection.
164
167
  Check this [notebook](https://github.com/deepdoctection/notebooks/blob/main/Analyzer_Configuration.ipynb) or the
165
168
  [docs](https://deepdoctection.readthedocs.io/en/latest/tutorials/analyzer_configuration_notebook/) for more information.
166
- - Document layout analysis and table recognition now runs with Torchscript (CPU) as well and Detectron2 is
167
- not required anymore for basic inference.
168
- - [**new**] More angle predictors for determining the rotation of a document based on Tesseract and DocTr
169
+ - Document layout analysis and table recognition now run with
170
+ [**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
171
+ anymore for basic inference.
172
+ - [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
169
173
  (not contained in the built-in Analyzer).
174
+ - [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
175
+ [**transformers**](https://github.com/huggingface/transformers).
176
+ We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
177
+ that look promising, especially if you want to train a model on non-English data. The training script for
178
+ LayoutLM can be used for LiLT as well and we will be providing a notebook on how to train a model on a custom dataset soon.
170
179
 
171
180
  **deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
172
181
  post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
@@ -257,9 +266,9 @@ Everything in the overview listed below the **deep**doctection layer are necessa
257
266
  separately.
258
267
 
259
268
  - Linux or macOS. (Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available)
260
- - Python >= 3.8
261
- - 1.12 <= PyTorch < 2.0 **or** Tensorflow >= 2.9 and CUDA. If you want to run the models provided by Tensorpack a GPU is
262
- required. You can run on PyTorch with a CPU only.
269
+ - Python >= 3.9
270
+ - 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
271
+ In general, if you want to train or fine-tune models, a GPU is required.
263
272
  - **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
264
273
  images.
265
274
  - With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
@@ -31,7 +31,8 @@ pipelines. Its core function does not depend on any specific deep learning libra
31
31
  - Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
32
32
  - Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
33
33
  - Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
34
- - Document and token classification with all LayoutLM models provided by the Transformer library.
34
+ - Document and token classification with all LayoutLM models provided by the
35
+ [**Transformer library**](https://github.com/huggingface/transformers).
35
36
  (Yes, you can use any LayoutLM-model with any of the provided OCR or pdfplumber tools straight away!).
36
37
  - Table detection and table structure recognition with
37
38
  [**table-transformer**](https://github.com/microsoft/table-transformer).
@@ -41,10 +42,16 @@ pipelines. Its core function does not depend on any specific deep learning libra
41
42
  - Comprehensive configuration of **analyzer** like choosing different models, output parsing, OCR selection.
42
43
  Check this [notebook](https://github.com/deepdoctection/notebooks/blob/main/Analyzer_Configuration.ipynb) or the
43
44
  [docs](https://deepdoctection.readthedocs.io/en/latest/tutorials/analyzer_configuration_notebook/) for more information.
44
- - Document layout analysis and table recognition now runs with Torchscript (CPU) as well and Detectron2 is
45
- not required anymore for basic inference.
46
- - [**new**] More angle predictors for determining the rotation of a document based on Tesseract and DocTr
45
+ - Document layout analysis and table recognition now run with
46
+ [**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
47
+ anymore for basic inference.
48
+ - [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
47
49
  (not contained in the built-in Analyzer).
50
+ - [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
51
+ [**transformers**](https://github.com/huggingface/transformers).
52
+ We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
53
+ that look promising, especially if you want to train a model on non-English data. The training script for
54
+ LayoutLM can be used for LiLT as well and we will be providing a notebook on how to train a model on a custom dataset soon.
48
55
 
49
56
  **deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
50
57
  post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
@@ -135,9 +142,9 @@ Everything in the overview listed below the **deep**doctection layer are necessa
135
142
  separately.
136
143
 
137
144
  - Linux or macOS. (Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available)
138
- - Python >= 3.8
139
- - 1.12 <= PyTorch < 2.0 **or** Tensorflow >= 2.9 and CUDA. If you want to run the models provided by Tensorpack a GPU is
140
- required. You can run on PyTorch with a CPU only.
145
+ - Python >= 3.9
146
+ - 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
147
+ In general, if you want to train or fine-tune models, a GPU is required.
141
148
  - **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
142
149
  images.
143
150
  - With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
@@ -19,15 +19,13 @@ import os
19
19
  import sys
20
20
  from typing import TYPE_CHECKING
21
21
 
22
- from packaging import version
23
-
24
- from .utils.env_info import auto_select_lib_and_device
22
+ from .utils.env_info import collect_env_info
25
23
  from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
26
- from .utils.logger import logger
24
+ from .utils.logger import LoggingRecord, logger
27
25
 
28
26
  # pylint: enable=wrong-import-position
29
27
 
30
- __version__ = 0.31
28
+ __version__ = 0.32
31
29
 
32
30
  _IMPORT_STRUCTURE = {
33
31
  "analyzer": [
@@ -182,6 +180,7 @@ _IMPORT_STRUCTURE = {
182
180
  "DocTrRotationTransformer",
183
181
  "FasttextLangDetector",
184
182
  "HFDetrDerivedDetector",
183
+ "get_tokenizer_from_architecture",
185
184
  "HFLayoutLmTokenClassifierBase",
186
185
  "HFLayoutLmTokenClassifier",
187
186
  "HFLayoutLmv2TokenClassifier",
@@ -189,6 +188,9 @@ _IMPORT_STRUCTURE = {
189
188
  "HFLayoutLmSequenceClassifier",
190
189
  "HFLayoutLmv2SequenceClassifier",
191
190
  "HFLayoutLmv3SequenceClassifier",
191
+ "HFLiltTokenClassifier",
192
+ "HFLiltSequenceClassifier",
193
+ "HFLmSequenceClassifier",
192
194
  "ModelProfile",
193
195
  "ModelCatalog",
194
196
  "print_model_infos",
@@ -268,11 +270,11 @@ _IMPORT_STRUCTURE = {
268
270
  "DoctectionPipe",
269
271
  "LanguageDetectionService",
270
272
  "ImageLayoutService",
271
- "get_tokenizer_from_architecture",
272
273
  "LMTokenClassifierService",
273
274
  "LMSequenceClassifierService",
274
275
  "OrderGenerator",
275
276
  "TextLineGenerator",
277
+ "TextLineService",
276
278
  "TextOrderService",
277
279
  "TableSegmentationRefinementService",
278
280
  "generate_html_string",
@@ -297,14 +299,13 @@ _IMPORT_STRUCTURE = {
297
299
  "save_tmp_file",
298
300
  "timed_operation",
299
301
  "collect_env_info",
300
- "get_device",
301
- "auto_select_lib_and_device",
302
302
  "auto_select_viz_library",
303
303
  "get_tensorflow_requirement",
304
304
  "tf_addons_available",
305
305
  "get_tf_addons_requirements",
306
306
  "tensorpack_available",
307
307
  "get_tensorpack_requirement",
308
+ "pytorch_available",
308
309
  "get_pytorch_requirement",
309
310
  "lxml_available",
310
311
  "get_lxml_requirement",
@@ -418,25 +419,31 @@ _IMPORT_STRUCTURE = {
418
419
  ],
419
420
  }
420
421
 
422
+ # Setting some environment variables so that standard functions can be invoked with available hardware
423
+ env_info = collect_env_info()
424
+ logger.debug(LoggingRecord(msg=env_info))
421
425
 
422
- # disable TF warnings for versions > 2.4.1
423
- if tf_available():
424
- if version.parse(get_tf_version()) > version.parse("2.4.1"):
425
- os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
426
- try:
427
- import tensorflow.python.util.deprecation as deprecation # type: ignore # pylint: disable=E0401,R0402
428
-
429
- deprecation._PRINT_DEPRECATION_WARNINGS = False # pylint: disable=W0212
430
- except Exception: # pylint: disable=W0703
431
- try:
432
- from tensorflow.python.util import deprecation # type: ignore # pylint: disable=E0401
433
-
434
- deprecation._PRINT_DEPRECATION_WARNINGS = False # pylint: disable=W0212
435
- except Exception: # pylint: disable=W0703
436
- pass
426
+ if os.environ.get("PYTORCH_AVAILABLE") and os.environ.get("DD_USE_TORCH") is None:
427
+ os.environ["DD_USE_TORCH"] = "1"
428
+ os.environ["USE_TORCH"] = "1"
429
+ if os.environ.get("TENSORFLOW_AVAILABLE") and os.environ.get("DD_USE_TF") is None:
430
+ os.environ["DD_USE_TF"] = "1"
431
+ os.environ["USE_TF"] = "1"
432
+ if os.environ.get("DD_USE_TORCH") and os.environ.get("DD_USE_TF"):
433
+ logger.warning(
434
+ "Both DD_USE_TORCH and DD_USE_TF are set. Defaulting to PyTorch. If you want a different "
435
+ "behaviour, set DD_USE_TORCH to None before importing deepdoctection."
436
+ )
437
+ os.environ.pop("DD_USE_TF")
438
+ os.environ.pop("USE_TF")
437
439
 
438
- # Setting some environment variables so that standard functions can be invoked with available hardware
439
- auto_select_lib_and_device()
440
+ if not os.environ.get("PYTORCH_AVAILABLE") and not os.environ.get("TENSORFLOW_AVAILABLE"):
441
+ logger.warning(
442
+ LoggingRecord(
443
+ msg="Neither Tensorflow or Pytorch are available. You will not be able to use any Deep Learning "
444
+ "model from the library."
445
+ )
446
+ )
440
447
 
441
448
 
442
449
  # Direct imports for type-checking
@@ -444,10 +451,10 @@ if TYPE_CHECKING:
444
451
  from .analyzer import *
445
452
  from .dataflow import *
446
453
  from .datapoint import *
447
- from .datasets import *
454
+ from .datasets import * # type: ignore
448
455
  from .eval import *
449
- from .extern import *
450
- from .mapper import *
456
+ from .extern import * # type: ignore
457
+ from .mapper import * # type: ignore
451
458
  from .pipe import *
452
459
  from .train import *
453
460
  from .utils import *
@@ -23,51 +23,43 @@ Module for **deep**doctection analyzer.
23
23
  -user factory with a reduced config setting
24
24
  """
25
25
 
26
- import ast
27
26
  import os
28
27
  from os import environ
29
28
  from shutil import copyfile
30
29
  from typing import List, Optional, Union
31
30
 
31
+ from lazy_imports import try_import
32
+
32
33
  from ..extern.base import ObjectDetector
34
+ from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
33
35
  from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
36
+ from ..extern.hfdetr import HFDetrDerivedDetector
34
37
  from ..extern.model import ModelCatalog, ModelDownloadManager
35
38
  from ..extern.pdftext import PdfPlumberTextDetector
39
+ from ..extern.pt.ptutils import get_torch_device
36
40
  from ..extern.tessocr import TesseractOcrDetector
37
41
  from ..extern.texocr import TextractOcrDetector
42
+ from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
43
+ from ..extern.tpdetect import TPFrcnnDetector
38
44
  from ..pipe.base import PipelineComponent
39
- from ..pipe.cell import DetectResultGenerator, SubImageLayoutService
40
45
  from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
41
46
  from ..pipe.doctectionpipe import DoctectionPipe
42
47
  from ..pipe.layout import ImageLayoutService
43
48
  from ..pipe.order import TextOrderService
44
49
  from ..pipe.refine import TableSegmentationRefinementService
45
50
  from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
51
+ from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
46
52
  from ..pipe.text import TextExtractionService
47
53
  from ..utils.detection_types import Pathlike
48
- from ..utils.env_info import get_device
49
- from ..utils.file_utils import (
50
- boto3_available,
51
- detectron2_available,
52
- pytorch_available,
53
- tensorpack_available,
54
- tf_available,
55
- )
54
+ from ..utils.error import DependencyError
55
+ from ..utils.file_utils import detectron2_available, tensorpack_available
56
56
  from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
57
57
  from ..utils.logger import LoggingRecord, logger
58
58
  from ..utils.metacfg import AttrDict, set_config_by_yaml
59
59
  from ..utils.settings import CellType, LayoutType
60
60
  from ..utils.transform import PadTransform
61
61
 
62
- if tf_available() and tensorpack_available():
63
- from ..extern.tp.tfutils import disable_tp_layer_logging
64
- from ..extern.tpdetect import TPFrcnnDetector
65
-
66
- if pytorch_available():
67
- from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
68
- from ..extern.hfdetr import HFDetrDerivedDetector
69
-
70
- if boto3_available():
62
+ with try_import() as image_guard:
71
63
  from botocore.config import Config # type: ignore
72
64
 
73
65
 
@@ -344,11 +336,20 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
344
336
  pipe_component_list.append(table_segmentation)
345
337
 
346
338
  if cfg.USE_TABLE_REFINEMENT:
347
- table_segmentation_refinement = TableSegmentationRefinementService()
339
+ table_segmentation_refinement = TableSegmentationRefinementService(
340
+ [LayoutType.table, LayoutType.table_rotated],
341
+ [
342
+ LayoutType.cell,
343
+ CellType.column_header,
344
+ CellType.projected_row_header,
345
+ CellType.spanning,
346
+ CellType.row_header,
347
+ ],
348
+ )
348
349
  pipe_component_list.append(table_segmentation_refinement)
349
350
 
350
351
  if cfg.USE_PDF_MINER:
351
- pdf_text = PdfPlumberTextDetector()
352
+ pdf_text = PdfPlumberTextDetector(x_tolerance=cfg.PDF_MINER.X_TOLERANCE, y_tolerance=cfg.PDF_MINER.Y_TOLERANCE)
352
353
  d_text = TextExtractionService(pdf_text)
353
354
  pipe_component_list.append(d_text)
354
355
 
@@ -401,7 +402,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
401
402
 
402
403
 
403
404
  def get_dd_analyzer(
404
- reset_config_file: bool = False,
405
+ reset_config_file: bool = True,
405
406
  config_overwrite: Optional[List[str]] = None,
406
407
  path_config_file: Optional[Pathlike] = None,
407
408
  ) -> DoctectionPipe:
@@ -430,8 +431,13 @@ def get_dd_analyzer(
430
431
  :return: A DoctectionPipe instance with given configs
431
432
  """
432
433
  config_overwrite = [] if config_overwrite is None else config_overwrite
433
- lib = "TF" if ast.literal_eval(os.environ.get("USE_TENSORFLOW", "False")) else "PT"
434
- device = get_device(False)
434
+ lib = "TF" if os.environ.get("DD_USE_TF") else "PT"
435
+ if lib == "TF":
436
+ device = get_tf_device()
437
+ elif lib == "PT":
438
+ device = get_torch_device()
439
+ else:
440
+ raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
435
441
  dd_one_config_path = maybe_copy_config_to_cache(
436
442
  get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
437
443
  )
@@ -1,38 +1,38 @@
1
1
  USE_LAYOUT: True
2
2
  USE_TABLE_SEGMENTATION: True
3
3
  TF:
4
- LAYOUT:
5
- WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
6
- FILTER:
7
- CELL:
8
- WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
9
- FILTER:
10
- ITEM:
11
- WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
12
- FILTER:
4
+ LAYOUT:
5
+ WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
6
+ FILTER:
7
+ CELL:
8
+ WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
9
+ FILTER:
10
+ ITEM:
11
+ WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
12
+ FILTER:
13
13
  PT:
14
- LAYOUT:
15
- WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
16
- WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
17
- FILTER:
18
- PAD:
19
- TOP: 60
20
- RIGHT: 60
21
- BOTTOM: 60
22
- LEFT: 60
23
- ITEM:
24
- WEIGHTS: item/d2_model_1639999_item_inf_only.pt
25
- WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
26
- FILTER:
27
- PAD:
28
- TOP: 60
29
- RIGHT: 60
30
- BOTTOM: 60
31
- LEFT: 60
32
- CELL:
33
- WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
34
- WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
35
- FILTER:
14
+ LAYOUT:
15
+ WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
16
+ WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
17
+ FILTER:
18
+ PAD:
19
+ TOP: 60
20
+ RIGHT: 60
21
+ BOTTOM: 60
22
+ LEFT: 60
23
+ ITEM:
24
+ WEIGHTS: item/d2_model_1639999_item_inf_only.pt
25
+ WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
26
+ FILTER:
27
+ PAD:
28
+ TOP: 60
29
+ RIGHT: 60
30
+ BOTTOM: 60
31
+ LEFT: 60
32
+ CELL:
33
+ WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
34
+ WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
35
+ FILTER:
36
36
  LAYOUT_NMS_PAIRS:
37
37
  COMBINATIONS:
38
38
  THRESHOLDS:
@@ -48,6 +48,9 @@ SEGMENTATION:
48
48
  STRETCH_RULE: equal
49
49
  USE_TABLE_REFINEMENT: True
50
50
  USE_PDF_MINER: False
51
+ PDF_MINER:
52
+ X_TOLERANCE: 3
53
+ Y_TOLERANCE: 3
51
54
  USE_OCR: True
52
55
  OCR:
53
56
  USE_TESSERACT: True
@@ -504,5 +504,6 @@ class ContainerAnnotation(CategoryAnnotation):
504
504
  @classmethod
505
505
  def from_dict(cls, **kwargs: JsonDict) -> "SummaryAnnotation":
506
506
  container_ann = ann_from_dict(cls, **kwargs)
507
- container_ann.value = kwargs.get("value")
507
+ value = kwargs.get("value", "")
508
+ container_ann.value = value if isinstance(value, str) else list(value)
508
509
  return container_ann
@@ -25,6 +25,7 @@ from typing import List, Optional, Sequence, no_type_check
25
25
 
26
26
  import numpy as np
27
27
  import numpy.typing as npt
28
+ from lazy_imports import try_import
28
29
  from numpy import float32
29
30
 
30
31
  from ..utils.detection_types import ImageType
@@ -32,7 +33,7 @@ from ..utils.error import BoundingBoxError
32
33
  from ..utils.file_utils import cocotools_available
33
34
  from ..utils.logger import LoggingRecord, logger
34
35
 
35
- if cocotools_available():
36
+ with try_import() as import_guard:
36
37
  import pycocotools.mask as coco_mask
37
38
 
38
39
 
@@ -18,6 +18,8 @@
18
18
  """
19
19
  Dataclass Image
20
20
  """
21
+ from __future__ import annotations
22
+
21
23
  import json
22
24
  from dataclasses import dataclass, field
23
25
  from os import environ
@@ -202,7 +204,7 @@ class Image:
202
204
  self._bbox = None
203
205
  self.embeddings.pop(self.image_id)
204
206
 
205
- def get_image(self) -> "_Img": # type: ignore
207
+ def get_image(self) -> _Img: # type: ignore # pylint: disable=E0602
206
208
  """
207
209
  Get the image either in base64 string representation or as np.array.
208
210
 
@@ -531,16 +533,20 @@ class Image:
531
533
  )
532
534
  ann.image.dump(sub_image)
533
535
 
534
- def remove_image_from_lower_hierachy(self) -> None:
536
+ def remove_image_from_lower_hierachy(self, pixel_values_only: bool = False) -> None:
535
537
  """Will remove all images from image annotations."""
536
538
  for ann in self.annotations:
537
- absolute_bounding_box = ann.get_bounding_box(self.image_id)
538
- ann.bounding_box = absolute_bounding_box
539
- ann.image = None
539
+ if pixel_values_only:
540
+ if ann.image is not None:
541
+ ann.image.clear_image()
542
+ else:
543
+ absolute_bounding_box = ann.get_bounding_box(self.image_id)
544
+ ann.bounding_box = absolute_bounding_box
545
+ ann.image = None
540
546
 
541
547
  @classmethod
542
548
  @no_type_check
543
- def from_dict(cls, **kwargs) -> "Image":
549
+ def from_dict(cls, **kwargs) -> Image:
544
550
  """
545
551
  Create `Image` instance from dict.
546
552
 
@@ -571,7 +577,7 @@ class Image:
571
577
 
572
578
  @classmethod
573
579
  @no_type_check
574
- def from_file(cls, file_path: str) -> "Image":
580
+ def from_file(cls, file_path: str) -> Image:
575
581
  """
576
582
  Create `Image` instance from .json file.
577
583