deepdoctection 0.30__tar.gz → 0.32__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (247) hide show
  1. {deepdoctection-0.30 → deepdoctection-0.32}/PKG-INFO +57 -73
  2. {deepdoctection-0.30 → deepdoctection-0.32}/README.md +15 -6
  3. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/__init__.py +38 -29
  4. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/analyzer/dd.py +36 -29
  5. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/configs/conf_dd_one.yaml +34 -31
  6. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/dataflow/base.py +0 -19
  7. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/dataflow/custom.py +4 -3
  8. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/dataflow/custom_serialize.py +14 -5
  9. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/dataflow/parallel_map.py +12 -11
  10. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/dataflow/serialize.py +5 -4
  11. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datapoint/annotation.py +35 -13
  12. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datapoint/box.py +3 -5
  13. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datapoint/convert.py +3 -1
  14. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datapoint/image.py +79 -36
  15. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datapoint/view.py +152 -49
  16. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/__init__.py +1 -4
  17. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/adapter.py +6 -3
  18. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/base.py +86 -11
  19. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/dataflow_builder.py +1 -1
  20. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/info.py +4 -4
  21. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/doclaynet.py +3 -2
  22. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/fintabnet.py +2 -1
  23. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/funsd.py +2 -1
  24. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/iiitar13k.py +5 -2
  25. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/layouttest.py +4 -8
  26. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/publaynet.py +2 -2
  27. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/pubtables1m.py +6 -3
  28. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/pubtabnet.py +2 -1
  29. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/rvlcdip.py +2 -1
  30. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/xfund.py +2 -1
  31. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/eval/__init__.py +1 -4
  32. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/eval/accmetric.py +1 -1
  33. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/eval/base.py +5 -4
  34. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/eval/cocometric.py +2 -1
  35. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/eval/eval.py +19 -15
  36. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/eval/tedsmetric.py +14 -11
  37. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/eval/tp_eval_callback.py +14 -7
  38. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/__init__.py +2 -7
  39. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/base.py +39 -13
  40. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/d2detect.py +182 -90
  41. deepdoctection-0.32/deepdoctection/extern/deskew.py +82 -0
  42. deepdoctection-0.32/deepdoctection/extern/doctrocr.py +526 -0
  43. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/fastlang.py +49 -9
  44. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/hfdetr.py +106 -55
  45. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/hflayoutlm.py +441 -122
  46. deepdoctection-0.32/deepdoctection/extern/hflm.py +225 -0
  47. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/model.py +56 -47
  48. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/pdftext.py +10 -5
  49. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/pt/__init__.py +1 -3
  50. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/pt/nms.py +6 -2
  51. deepdoctection-0.32/deepdoctection/extern/pt/ptutils.py +57 -0
  52. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tessocr.py +134 -22
  53. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/texocr.py +6 -2
  54. deepdoctection-0.32/deepdoctection/extern/tp/tfutils.py +91 -0
  55. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpcompat.py +14 -11
  56. deepdoctection-0.32/deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  57. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  58. deepdoctection-0.32/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  59. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  60. deepdoctection-0.32/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  61. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  62. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  63. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  64. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  65. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  66. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  67. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  68. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  69. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  70. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
  71. {deepdoctection-0.30/tests/datapoint → deepdoctection-0.32/deepdoctection/extern/tp/tpfrcnn/utils}/__init__.py +4 -0
  72. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  73. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tpdetect.py +54 -30
  74. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/__init__.py +3 -8
  75. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/d2struct.py +9 -7
  76. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/hfstruct.py +7 -2
  77. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/laylmstruct.py +164 -21
  78. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/maputils.py +16 -3
  79. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/misc.py +6 -3
  80. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/prodigystruct.py +1 -1
  81. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/pubstruct.py +10 -10
  82. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/tpstruct.py +3 -3
  83. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/__init__.py +1 -1
  84. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/anngen.py +35 -8
  85. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/base.py +53 -19
  86. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/common.py +23 -13
  87. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/concurrency.py +2 -1
  88. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/doctectionpipe.py +2 -2
  89. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/language.py +3 -2
  90. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/layout.py +6 -3
  91. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/lm.py +34 -66
  92. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/order.py +142 -35
  93. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/refine.py +26 -24
  94. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/segment.py +21 -16
  95. deepdoctection-0.30/deepdoctection/pipe/cell.py → deepdoctection-0.32/deepdoctection/pipe/sub_layout.py +30 -9
  96. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/text.py +14 -8
  97. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/transform.py +16 -9
  98. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/train/__init__.py +6 -12
  99. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/train/d2_frcnn_train.py +36 -28
  100. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/train/hf_detr_train.py +26 -17
  101. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/train/hf_layoutlm_train.py +133 -111
  102. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/train/tp_frcnn_train.py +21 -19
  103. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/__init__.py +3 -0
  104. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/concurrency.py +1 -1
  105. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/context.py +2 -2
  106. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/env_info.py +41 -84
  107. deepdoctection-0.32/deepdoctection/utils/error.py +84 -0
  108. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/file_utils.py +4 -15
  109. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/fs.py +7 -7
  110. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/logger.py +1 -0
  111. deepdoctection-0.32/deepdoctection/utils/mocks.py +93 -0
  112. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/pdf_utils.py +5 -4
  113. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/settings.py +6 -1
  114. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/transform.py +1 -1
  115. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/utils.py +0 -6
  116. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/viz.py +48 -5
  117. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection.egg-info/PKG-INFO +57 -73
  118. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection.egg-info/SOURCES.txt +5 -90
  119. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection.egg-info/requires.txt +40 -65
  120. {deepdoctection-0.30 → deepdoctection-0.32}/setup.cfg +7 -1
  121. {deepdoctection-0.30 → deepdoctection-0.32}/setup.py +23 -23
  122. {deepdoctection-0.30 → deepdoctection-0.32}/tests/test_utils.py +8 -0
  123. deepdoctection-0.30/deepdoctection/extern/deskew.py +0 -55
  124. deepdoctection-0.30/deepdoctection/extern/doctrocr.py +0 -344
  125. deepdoctection-0.30/deepdoctection/extern/pt/ptutils.py +0 -48
  126. deepdoctection-0.30/deepdoctection/extern/tp/tfutils.py +0 -57
  127. deepdoctection-0.30/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
  128. deepdoctection-0.30/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
  129. deepdoctection-0.30/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
  130. deepdoctection-0.30/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
  131. deepdoctection-0.30/tests/__init__.py +0 -22
  132. deepdoctection-0.30/tests/analyzer/__init__.py +0 -16
  133. deepdoctection-0.30/tests/analyzer/test_dd.py +0 -202
  134. deepdoctection-0.30/tests/conftest.py +0 -499
  135. deepdoctection-0.30/tests/data.py +0 -1592
  136. deepdoctection-0.30/tests/dataflow/__init__.py +0 -16
  137. deepdoctection-0.30/tests/dataflow/conftest.py +0 -95
  138. deepdoctection-0.30/tests/dataflow/test_common.py +0 -219
  139. deepdoctection-0.30/tests/dataflow/test_custom.py +0 -60
  140. deepdoctection-0.30/tests/dataflow/test_custom_serialize.py +0 -177
  141. deepdoctection-0.30/tests/dataflow/test_parallel_map.py +0 -66
  142. deepdoctection-0.30/tests/dataflow/test_stats.py +0 -103
  143. deepdoctection-0.30/tests/datapoint/conftest.py +0 -262
  144. deepdoctection-0.30/tests/datapoint/test_annotation.py +0 -170
  145. deepdoctection-0.30/tests/datapoint/test_box.py +0 -416
  146. deepdoctection-0.30/tests/datapoint/test_convert.py +0 -52
  147. deepdoctection-0.30/tests/datapoint/test_image.py +0 -341
  148. deepdoctection-0.30/tests/datapoint/test_view.py +0 -150
  149. deepdoctection-0.30/tests/datasets/__init__.py +0 -16
  150. deepdoctection-0.30/tests/datasets/instances/__init__.py +0 -16
  151. deepdoctection-0.30/tests/datasets/instances/conftest.py +0 -35
  152. deepdoctection-0.30/tests/datasets/instances/test_doclaynet.py +0 -43
  153. deepdoctection-0.30/tests/datasets/instances/test_fintabnet.py +0 -70
  154. deepdoctection-0.30/tests/datasets/instances/test_funsd.py +0 -58
  155. deepdoctection-0.30/tests/datasets/instances/test_iiitar13k.py +0 -42
  156. deepdoctection-0.30/tests/datasets/instances/test_layouttest.py +0 -63
  157. deepdoctection-0.30/tests/datasets/instances/test_publaynet.py +0 -64
  158. deepdoctection-0.30/tests/datasets/instances/test_pubtables1m.py +0 -66
  159. deepdoctection-0.30/tests/datasets/instances/test_pubtabnet.py +0 -65
  160. deepdoctection-0.30/tests/datasets/instances/test_rvlcdip.py +0 -46
  161. deepdoctection-0.30/tests/datasets/test_adapter.py +0 -77
  162. deepdoctection-0.30/tests/datasets/test_info.py +0 -273
  163. deepdoctection-0.30/tests/datasets/test_registry.py +0 -75
  164. deepdoctection-0.30/tests/eval/__init__.py +0 -16
  165. deepdoctection-0.30/tests/eval/conftest.py +0 -107
  166. deepdoctection-0.30/tests/eval/test_accmetric.py +0 -364
  167. deepdoctection-0.30/tests/eval/test_cocometric.py +0 -123
  168. deepdoctection-0.30/tests/eval/test_eval.py +0 -86
  169. deepdoctection-0.30/tests/eval/test_registry.py +0 -84
  170. deepdoctection-0.30/tests/eval/test_tedsmetric.py +0 -40
  171. deepdoctection-0.30/tests/extern/__init__.py +0 -0
  172. deepdoctection-0.30/tests/extern/conftest.py +0 -99
  173. deepdoctection-0.30/tests/extern/data.py +0 -100
  174. deepdoctection-0.30/tests/extern/test_deskew.py +0 -57
  175. deepdoctection-0.30/tests/extern/test_doctrocr.py +0 -146
  176. deepdoctection-0.30/tests/extern/test_fastlang.py +0 -64
  177. deepdoctection-0.30/tests/extern/test_hfdetr.py +0 -116
  178. deepdoctection-0.30/tests/extern/test_hflayoutlm.py +0 -492
  179. deepdoctection-0.30/tests/extern/test_pdftext.py +0 -70
  180. deepdoctection-0.30/tests/extern/test_tessocr.py +0 -105
  181. deepdoctection-0.30/tests/extern/test_texocr.py +0 -52
  182. deepdoctection-0.30/tests/extern/test_tpdetect.py +0 -123
  183. deepdoctection-0.30/tests/mapper/__init__.py +0 -16
  184. deepdoctection-0.30/tests/mapper/conftest.py +0 -297
  185. deepdoctection-0.30/tests/mapper/data.py +0 -2182
  186. deepdoctection-0.30/tests/mapper/test_cats.py +0 -305
  187. deepdoctection-0.30/tests/mapper/test_cocostruct.py +0 -91
  188. deepdoctection-0.30/tests/mapper/test_d2struct.py +0 -56
  189. deepdoctection-0.30/tests/mapper/test_hfstruct.py +0 -59
  190. deepdoctection-0.30/tests/mapper/test_iiitar13k.py +0 -64
  191. deepdoctection-0.30/tests/mapper/test_laylmstruct.py +0 -141
  192. deepdoctection-0.30/tests/mapper/test_misc.py +0 -72
  193. deepdoctection-0.30/tests/mapper/test_prodigystruct.py +0 -78
  194. deepdoctection-0.30/tests/mapper/test_pubstruct.py +0 -170
  195. deepdoctection-0.30/tests/mapper/test_tpstruct.py +0 -51
  196. deepdoctection-0.30/tests/mapper/test_utils.py +0 -83
  197. deepdoctection-0.30/tests/mapper/test_xfundstruct.py +0 -68
  198. deepdoctection-0.30/tests/pipe/__init__.py +0 -16
  199. deepdoctection-0.30/tests/pipe/test_anngen.py +0 -179
  200. deepdoctection-0.30/tests/pipe/test_cell.py +0 -123
  201. deepdoctection-0.30/tests/pipe/test_common.py +0 -107
  202. deepdoctection-0.30/tests/pipe/test_language.py +0 -76
  203. deepdoctection-0.30/tests/pipe/test_layout.py +0 -65
  204. deepdoctection-0.30/tests/pipe/test_lm.py +0 -119
  205. deepdoctection-0.30/tests/pipe/test_order.py +0 -197
  206. deepdoctection-0.30/tests/pipe/test_refine.py +0 -325
  207. deepdoctection-0.30/tests/pipe/test_registry.py +0 -58
  208. deepdoctection-0.30/tests/pipe/test_segment.py +0 -392
  209. deepdoctection-0.30/tests/pipe/test_text.py +0 -204
  210. deepdoctection-0.30/tests/pipe/test_transform.py +0 -63
  211. deepdoctection-0.30/tests/train/__init__.py +0 -16
  212. deepdoctection-0.30/tests/train/conftest.py +0 -118
  213. deepdoctection-0.30/tests/train/test_d2_frcnn_train.py +0 -64
  214. deepdoctection-0.30/tests/train/test_tp_frcnn_train.py +0 -99
  215. deepdoctection-0.30/tests_d2/__init__.py +0 -20
  216. deepdoctection-0.30/tests_d2/conftest.py +0 -56
  217. deepdoctection-0.30/tests_d2/test_d2detect.py +0 -95
  218. {deepdoctection-0.30 → deepdoctection-0.32}/LICENSE +0 -0
  219. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/analyzer/__init__.py +0 -0
  220. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/configs/__init__.py +0 -0
  221. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/configs/conf_tesseract.yaml +0 -0
  222. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/dataflow/__init__.py +0 -0
  223. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/dataflow/common.py +0 -0
  224. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/dataflow/stats.py +0 -0
  225. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datapoint/__init__.py +0 -0
  226. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/__init__.py +0 -0
  227. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
  228. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
  229. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/registry.py +0 -0
  230. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/save.py +0 -0
  231. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/eval/registry.py +0 -0
  232. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/__init__.py +0 -0
  233. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
  234. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/cats.py +0 -0
  235. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/cocostruct.py +0 -0
  236. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/match.py +0 -0
  237. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/pascalstruct.py +0 -0
  238. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/xfundstruct.py +0 -0
  239. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/registry.py +0 -0
  240. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/py.typed +0 -0
  241. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/detection_types.py +0 -0
  242. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/develop.py +0 -0
  243. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/identifier.py +0 -0
  244. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/metacfg.py +0 -0
  245. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/tqdm.py +0 -0
  246. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection.egg-info/dependency_links.txt +0 -0
  247. {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepdoctection
3
- Version: 0.30
3
+ Version: 0.32
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -9,94 +9,96 @@ Classifier: Development Status :: 4 - Beta
9
9
  Classifier: License :: OSI Approved :: Apache Software License
10
10
  Classifier: Natural Language :: English
11
11
  Classifier: Operating System :: POSIX :: Linux
12
- Classifier: Programming Language :: Python :: 3.8
13
12
  Classifier: Programming Language :: Python :: 3.9
14
13
  Classifier: Programming Language :: Python :: 3.10
15
14
  Classifier: Programming Language :: Python :: 3.11
16
15
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
- Requires-Python: >=3.8
16
+ Requires-Python: >=3.9
18
17
  Description-Content-Type: text/markdown
19
18
  License-File: LICENSE
20
- Requires-Dist: catalogue==2.0.7
19
+ Requires-Dist: catalogue==2.0.10
21
20
  Requires-Dist: huggingface_hub>=0.12.0
22
- Requires-Dist: importlib-metadata>=4.11.2
21
+ Requires-Dist: importlib-metadata>=5.0.0
23
22
  Requires-Dist: jsonlines==3.1.0
23
+ Requires-Dist: lazy-imports==0.3.1
24
24
  Requires-Dist: mock==4.0.3
25
25
  Requires-Dist: networkx>=2.7.1
26
26
  Requires-Dist: numpy>=1.21
27
27
  Requires-Dist: packaging>=20.0
28
28
  Requires-Dist: Pillow>=10.0.0
29
29
  Requires-Dist: pypdf>=3.16.0
30
- Requires-Dist: pyyaml==6.0
30
+ Requires-Dist: pyyaml>=6.0.1
31
31
  Requires-Dist: pyzmq>=16
32
32
  Requires-Dist: termcolor>=1.1
33
33
  Requires-Dist: tabulate>=0.7.7
34
34
  Requires-Dist: tqdm==4.64.0
35
35
  Provides-Extra: tf
36
- Requires-Dist: catalogue==2.0.7; extra == "tf"
36
+ Requires-Dist: catalogue==2.0.10; extra == "tf"
37
37
  Requires-Dist: huggingface_hub>=0.12.0; extra == "tf"
38
- Requires-Dist: importlib-metadata>=4.11.2; extra == "tf"
38
+ Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
39
39
  Requires-Dist: jsonlines==3.1.0; extra == "tf"
40
+ Requires-Dist: lazy-imports==0.3.1; extra == "tf"
40
41
  Requires-Dist: mock==4.0.3; extra == "tf"
41
42
  Requires-Dist: networkx>=2.7.1; extra == "tf"
42
43
  Requires-Dist: numpy>=1.21; extra == "tf"
43
44
  Requires-Dist: packaging>=20.0; extra == "tf"
44
45
  Requires-Dist: Pillow>=10.0.0; extra == "tf"
45
46
  Requires-Dist: pypdf>=3.16.0; extra == "tf"
46
- Requires-Dist: pyyaml==6.0; extra == "tf"
47
+ Requires-Dist: pyyaml>=6.0.1; extra == "tf"
47
48
  Requires-Dist: pyzmq>=16; extra == "tf"
48
49
  Requires-Dist: termcolor>=1.1; extra == "tf"
49
50
  Requires-Dist: tabulate>=0.7.7; extra == "tf"
50
51
  Requires-Dist: tqdm==4.64.0; extra == "tf"
51
- Requires-Dist: tensorpack; extra == "tf"
52
+ Requires-Dist: tensorpack==0.11; extra == "tf"
52
53
  Requires-Dist: protobuf==3.20.1; extra == "tf"
53
54
  Requires-Dist: tensorflow-addons>=0.17.1; extra == "tf"
54
55
  Requires-Dist: tf2onnx>=1.9.2; extra == "tf"
55
- Requires-Dist: python-doctr==0.7.0; extra == "tf"
56
+ Requires-Dist: python-doctr==0.8.1; extra == "tf"
56
57
  Requires-Dist: pycocotools>=2.0.2; extra == "tf"
57
- Requires-Dist: boto3; extra == "tf"
58
- Requires-Dist: pdfplumber>=0.7.1; extra == "tf"
59
- Requires-Dist: fasttext; extra == "tf"
60
- Requires-Dist: jdeskew; extra == "tf"
58
+ Requires-Dist: boto3==1.34.102; extra == "tf"
59
+ Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
60
+ Requires-Dist: fasttext==0.9.2; extra == "tf"
61
+ Requires-Dist: jdeskew>=0.2.2; extra == "tf"
61
62
  Requires-Dist: apted==1.0.3; extra == "tf"
62
63
  Requires-Dist: distance==0.1.3; extra == "tf"
63
64
  Requires-Dist: lxml>=4.9.1; extra == "tf"
64
65
  Provides-Extra: pt
65
- Requires-Dist: catalogue==2.0.7; extra == "pt"
66
+ Requires-Dist: catalogue==2.0.10; extra == "pt"
66
67
  Requires-Dist: huggingface_hub>=0.12.0; extra == "pt"
67
- Requires-Dist: importlib-metadata>=4.11.2; extra == "pt"
68
+ Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
68
69
  Requires-Dist: jsonlines==3.1.0; extra == "pt"
70
+ Requires-Dist: lazy-imports==0.3.1; extra == "pt"
69
71
  Requires-Dist: mock==4.0.3; extra == "pt"
70
72
  Requires-Dist: networkx>=2.7.1; extra == "pt"
71
73
  Requires-Dist: numpy>=1.21; extra == "pt"
72
74
  Requires-Dist: packaging>=20.0; extra == "pt"
73
75
  Requires-Dist: Pillow>=10.0.0; extra == "pt"
74
76
  Requires-Dist: pypdf>=3.16.0; extra == "pt"
75
- Requires-Dist: pyyaml==6.0; extra == "pt"
77
+ Requires-Dist: pyyaml>=6.0.1; extra == "pt"
76
78
  Requires-Dist: pyzmq>=16; extra == "pt"
77
79
  Requires-Dist: termcolor>=1.1; extra == "pt"
78
80
  Requires-Dist: tabulate>=0.7.7; extra == "pt"
79
81
  Requires-Dist: tqdm==4.64.0; extra == "pt"
80
- Requires-Dist: timm; extra == "pt"
82
+ Requires-Dist: timm>=0.9.16; extra == "pt"
81
83
  Requires-Dist: transformers>=4.36.0; extra == "pt"
82
- Requires-Dist: accelerate; extra == "pt"
83
- Requires-Dist: python-doctr==0.7.0; extra == "pt"
84
- Requires-Dist: boto3; extra == "pt"
85
- Requires-Dist: pdfplumber>=0.7.1; extra == "pt"
86
- Requires-Dist: fasttext; extra == "pt"
87
- Requires-Dist: jdeskew; extra == "pt"
84
+ Requires-Dist: accelerate>=0.29.1; extra == "pt"
85
+ Requires-Dist: python-doctr==0.8.1; extra == "pt"
86
+ Requires-Dist: boto3==1.34.102; extra == "pt"
87
+ Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
88
+ Requires-Dist: fasttext==0.9.2; extra == "pt"
89
+ Requires-Dist: jdeskew>=0.2.2; extra == "pt"
88
90
  Requires-Dist: apted==1.0.3; extra == "pt"
89
91
  Requires-Dist: distance==0.1.3; extra == "pt"
90
92
  Requires-Dist: lxml>=4.9.1; extra == "pt"
91
93
  Provides-Extra: docs
92
- Requires-Dist: tensorpack; extra == "docs"
93
- Requires-Dist: boto3; extra == "docs"
94
+ Requires-Dist: tensorpack==0.11; extra == "docs"
95
+ Requires-Dist: boto3==1.34.102; extra == "docs"
94
96
  Requires-Dist: transformers>=4.36.0; extra == "docs"
95
- Requires-Dist: accelerate; extra == "docs"
96
- Requires-Dist: pdfplumber>=0.7.1; extra == "docs"
97
+ Requires-Dist: accelerate>=0.29.1; extra == "docs"
98
+ Requires-Dist: pdfplumber>=0.11.0; extra == "docs"
97
99
  Requires-Dist: lxml>=4.9.1; extra == "docs"
98
- Requires-Dist: lxml-stubs; extra == "docs"
99
- Requires-Dist: jdeskew; extra == "docs"
100
+ Requires-Dist: lxml-stubs>=0.5.1; extra == "docs"
101
+ Requires-Dist: jdeskew>=0.2.2; extra == "docs"
100
102
  Requires-Dist: jinja2==3.0.3; extra == "docs"
101
103
  Requires-Dist: mkdocs-material; extra == "docs"
102
104
  Requires-Dist: mkdocstrings-python; extra == "docs"
@@ -105,47 +107,20 @@ Provides-Extra: dev
105
107
  Requires-Dist: python-dotenv==1.0.0; extra == "dev"
106
108
  Requires-Dist: click; extra == "dev"
107
109
  Requires-Dist: black==23.7.0; extra == "dev"
108
- Requires-Dist: isort; extra == "dev"
110
+ Requires-Dist: isort==5.13.2; extra == "dev"
109
111
  Requires-Dist: pylint==2.17.4; extra == "dev"
110
112
  Requires-Dist: mypy==1.4.1; extra == "dev"
111
113
  Requires-Dist: wandb; extra == "dev"
112
- Requires-Dist: types-PyYAML; extra == "dev"
113
- Requires-Dist: types-termcolor==1.1.3; extra == "dev"
114
- Requires-Dist: types-tabulate; extra == "dev"
115
- Requires-Dist: types-tqdm; extra == "dev"
116
- Requires-Dist: lxml-stubs; extra == "dev"
117
- Requires-Dist: types-Pillow; extra == "dev"
118
- Requires-Dist: types-urllib3; extra == "dev"
114
+ Requires-Dist: types-PyYAML>=6.0.12.12; extra == "dev"
115
+ Requires-Dist: types-termcolor>=1.1.3; extra == "dev"
116
+ Requires-Dist: types-tabulate>=0.9.0.3; extra == "dev"
117
+ Requires-Dist: types-tqdm>=4.66.0.5; extra == "dev"
118
+ Requires-Dist: lxml-stubs>=0.5.1; extra == "dev"
119
+ Requires-Dist: types-Pillow>=10.2.0.20240406; extra == "dev"
120
+ Requires-Dist: types-urllib3>=1.26.25.14; extra == "dev"
119
121
  Provides-Extra: test
120
- Requires-Dist: pytest; extra == "test"
122
+ Requires-Dist: pytest==8.0.2; extra == "test"
121
123
  Requires-Dist: pytest-cov; extra == "test"
122
- Provides-Extra: hf
123
- Requires-Dist: catalogue==2.0.7; extra == "hf"
124
- Requires-Dist: huggingface_hub>=0.12.0; extra == "hf"
125
- Requires-Dist: importlib-metadata>=4.11.2; extra == "hf"
126
- Requires-Dist: jsonlines==3.1.0; extra == "hf"
127
- Requires-Dist: mock==4.0.3; extra == "hf"
128
- Requires-Dist: networkx>=2.7.1; extra == "hf"
129
- Requires-Dist: numpy>=1.21; extra == "hf"
130
- Requires-Dist: packaging>=20.0; extra == "hf"
131
- Requires-Dist: Pillow>=10.0.0; extra == "hf"
132
- Requires-Dist: pypdf>=3.16.0; extra == "hf"
133
- Requires-Dist: pyyaml==6.0; extra == "hf"
134
- Requires-Dist: pyzmq>=16; extra == "hf"
135
- Requires-Dist: termcolor>=1.1; extra == "hf"
136
- Requires-Dist: tabulate>=0.7.7; extra == "hf"
137
- Requires-Dist: tqdm==4.64.0; extra == "hf"
138
- Requires-Dist: timm; extra == "hf"
139
- Requires-Dist: transformers>=4.36.0; extra == "hf"
140
- Requires-Dist: accelerate; extra == "hf"
141
- Requires-Dist: python-doctr==0.7.0; extra == "hf"
142
- Requires-Dist: boto3; extra == "hf"
143
- Requires-Dist: pdfplumber>=0.7.1; extra == "hf"
144
- Requires-Dist: fasttext; extra == "hf"
145
- Requires-Dist: jdeskew; extra == "hf"
146
- Requires-Dist: apted==1.0.3; extra == "hf"
147
- Requires-Dist: distance==0.1.3; extra == "hf"
148
- Requires-Dist: lxml>=4.9.1; extra == "hf"
149
124
 
150
125
 
151
126
  <p align="center">
@@ -180,7 +155,8 @@ pipelines. Its core function does not depend on any specific deep learning libra
180
155
  - Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
181
156
  - Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
182
157
  - Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
183
- - Document and token classification with all LayoutLM models provided by the Transformer library.
158
+ - Document and token classification with all LayoutLM models provided by the
159
+ [**Transformer library**](https://github.com/huggingface/transformers).
184
160
  (Yes, you can use any LayoutLM-model with any of the provided OCR or pdfplumber tools straight away!).
185
161
  - Table detection and table structure recognition with
186
162
  [**table-transformer**](https://github.com/microsoft/table-transformer).
@@ -190,8 +166,16 @@ pipelines. Its core function does not depend on any specific deep learning libra
190
166
  - Comprehensive configuration of **analyzer** like choosing different models, output parsing, OCR selection.
191
167
  Check this [notebook](https://github.com/deepdoctection/notebooks/blob/main/Analyzer_Configuration.ipynb) or the
192
168
  [docs](https://deepdoctection.readthedocs.io/en/latest/tutorials/analyzer_configuration_notebook/) for more information.
193
- - Document layout analysis and table recognition now runs with Torchscript (CPU) as well and Detectron2 is
194
- not required anymore for basic inference.
169
+ - Document layout analysis and table recognition now runs with
170
+ [**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
171
+ anymore for basic inference.
172
+ - [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
173
+ (not contained in the built-in Analyzer).
174
+ - [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
175
+ [**transformers**](https://github.com/huggingface/transformers).
176
  We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
177
  that seem to look promising, especially if you want to train a model on non-English data. The training script for
178
+ LayoutLM can be used for LiLT as well and we will be providing a notebook on how to train a model on a custom dataset soon.
195
179
 
196
180
  **deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
197
181
  post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
@@ -282,9 +266,9 @@ Everything in the overview listed below the **deep**doctection layer are necessa
282
266
  separately.
283
267
 
284
268
  - Linux or macOS. (Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available)
285
- - Python >= 3.8
286
- - 1.12 <= PyTorch < 2.0 **or** Tensorflow >= 2.9 and CUDA. If you want to run the models provided by Tensorpack a GPU is
287
- required. You can run on PyTorch with a CPU only.
269
+ - Python >= 3.9
270
+ - 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
271
+ In general, if you want to train or fine-tune models, a GPU is required.
288
272
  - **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
289
273
  images.
290
274
  - With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
@@ -31,7 +31,8 @@ pipelines. Its core function does not depend on any specific deep learning libra
31
31
  - Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
32
32
  - Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
33
33
  - Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
34
- - Document and token classification with all LayoutLM models provided by the Transformer library.
34
+ - Document and token classification with all LayoutLM models provided by the
35
+ [**Transformer library**](https://github.com/huggingface/transformers).
35
36
  (Yes, you can use any LayoutLM-model with any of the provided OCR-or pdfplumber tools straight away!).
36
37
  - Table detection and table structure recognition with
37
38
  [**table-transformer**](https://github.com/microsoft/table-transformer).
@@ -41,8 +42,16 @@ pipelines. Its core function does not depend on any specific deep learning libra
41
42
  - Comprehensive configuration of **analyzer** like choosing different models, output parsing, OCR selection.
42
43
  Check this [notebook](https://github.com/deepdoctection/notebooks/blob/main/Analyzer_Configuration.ipynb) or the
43
44
  [docs](https://deepdoctection.readthedocs.io/en/latest/tutorials/analyzer_configuration_notebook/) for more information.
44
- - Document layout analysis and table recognition now runs with Torchscript (CPU) as well and Detectron2 is
45
- not required anymore for basic inference.
45
+ - Document layout analysis and table recognition now runs with
46
+ [**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
47
+ anymore for basic inference.
48
+ - [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
49
+ (not contained in the built-in Analyzer).
50
+ - [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
51
+ [**transformers**](https://github.com/huggingface/transformers).
52
+ We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
53
+ that seem to look promising, especially if you want to train a model on non-English data. The training script for
54
+ LayoutLM can be used for LiLT as well and we will be providing a notebook on how to train a model on a custom dataset soon.
46
55
 
47
56
  **deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
48
57
  post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
@@ -133,9 +142,9 @@ Everything in the overview listed below the **deep**doctection layer are necessa
133
142
  separately.
134
143
 
135
144
  - Linux or macOS. (Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available)
136
- - Python >= 3.8
137
- - 1.12 <= PyTorch < 2.0 **or** Tensorflow >= 2.9 and CUDA. If you want to run the models provided by Tensorpack a GPU is
138
- required. You can run on PyTorch with a CPU only.
145
+ - Python >= 3.9
146
+ - 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
147
+ In general, if you want to train or fine-tune models, a GPU is required.
139
148
  - **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
140
149
  images.
141
150
  - With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
@@ -19,15 +19,13 @@ import os
19
19
  import sys
20
20
  from typing import TYPE_CHECKING
21
21
 
22
- from packaging import version
23
-
24
- from .utils.env_info import auto_select_lib_and_device
22
+ from .utils.env_info import collect_env_info
25
23
  from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
26
- from .utils.logger import logger
24
+ from .utils.logger import LoggingRecord, logger
27
25
 
28
26
  # pylint: enable=wrong-import-position
29
27
 
30
- __version__ = 0.30
28
+ __version__ = 0.32
31
29
 
32
30
  _IMPORT_STRUCTURE = {
33
31
  "analyzer": [
@@ -179,8 +177,10 @@ _IMPORT_STRUCTURE = {
179
177
  "Jdeskewer",
180
178
  "DoctrTextlineDetector",
181
179
  "DoctrTextRecognizer",
180
+ "DocTrRotationTransformer",
182
181
  "FasttextLangDetector",
183
182
  "HFDetrDerivedDetector",
183
+ "get_tokenizer_from_architecture",
184
184
  "HFLayoutLmTokenClassifierBase",
185
185
  "HFLayoutLmTokenClassifier",
186
186
  "HFLayoutLmv2TokenClassifier",
@@ -188,12 +188,16 @@ _IMPORT_STRUCTURE = {
188
188
  "HFLayoutLmSequenceClassifier",
189
189
  "HFLayoutLmv2SequenceClassifier",
190
190
  "HFLayoutLmv3SequenceClassifier",
191
+ "HFLiltTokenClassifier",
192
+ "HFLiltSequenceClassifier",
193
+ "HFLmSequenceClassifier",
191
194
  "ModelProfile",
192
195
  "ModelCatalog",
193
196
  "print_model_infos",
194
197
  "ModelDownloadManager",
195
198
  "PdfPlumberTextDetector",
196
199
  "TesseractOcrDetector",
200
+ "TesseractRotationTransformer",
197
201
  "TextractOcrDetector",
198
202
  "TPFrcnnDetector",
199
203
  ],
@@ -266,11 +270,11 @@ _IMPORT_STRUCTURE = {
266
270
  "DoctectionPipe",
267
271
  "LanguageDetectionService",
268
272
  "ImageLayoutService",
269
- "get_tokenizer_from_architecture",
270
273
  "LMTokenClassifierService",
271
274
  "LMSequenceClassifierService",
272
275
  "OrderGenerator",
273
276
  "TextLineGenerator",
277
+ "TextLineService",
274
278
  "TextOrderService",
275
279
  "TableSegmentationRefinementService",
276
280
  "generate_html_string",
@@ -279,7 +283,7 @@ _IMPORT_STRUCTURE = {
279
283
  "PubtablesSegmentationService",
280
284
  "SegmentationResult",
281
285
  "TextExtractionService",
282
- "SimpleTransformPipelineComponent",
286
+ "SimpleTransformService",
283
287
  ],
284
288
  "train": [
285
289
  "D2Trainer",
@@ -295,14 +299,13 @@ _IMPORT_STRUCTURE = {
295
299
  "save_tmp_file",
296
300
  "timed_operation",
297
301
  "collect_env_info",
298
- "get_device",
299
- "auto_select_lib_and_device",
300
302
  "auto_select_viz_library",
301
303
  "get_tensorflow_requirement",
302
304
  "tf_addons_available",
303
305
  "get_tf_addons_requirements",
304
306
  "tensorpack_available",
305
307
  "get_tensorpack_requirement",
308
+ "pytorch_available",
306
309
  "get_pytorch_requirement",
307
310
  "lxml_available",
308
311
  "get_lxml_requirement",
@@ -416,25 +419,31 @@ _IMPORT_STRUCTURE = {
416
419
  ],
417
420
  }
418
421
 
422
+ # Setting some environment variables so that standard functions can be invoked with available hardware
423
+ env_info = collect_env_info()
424
+ logger.debug(LoggingRecord(msg=env_info))
419
425
 
420
- # disable TF warnings for versions > 2.4.1
421
- if tf_available():
422
- if version.parse(get_tf_version()) > version.parse("2.4.1"):
423
- os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
424
- try:
425
- import tensorflow.python.util.deprecation as deprecation # type: ignore # pylint: disable=E0401,R0402
426
-
427
- deprecation._PRINT_DEPRECATION_WARNINGS = False # pylint: disable=W0212
428
- except Exception: # pylint: disable=W0703
429
- try:
430
- from tensorflow.python.util import deprecation # type: ignore # pylint: disable=E0401
431
-
432
- deprecation._PRINT_DEPRECATION_WARNINGS = False # pylint: disable=W0212
433
- except Exception: # pylint: disable=W0703
434
- pass
426
+ if os.environ.get("PYTORCH_AVAILABLE") and os.environ.get("DD_USE_TORCH") is None:
427
+ os.environ["DD_USE_TORCH"] = "1"
428
+ os.environ["USE_TORCH"] = "1"
429
+ if os.environ.get("TENSORFLOW_AVAILABLE") and os.environ.get("DD_USE_TF") is None:
430
+ os.environ["DD_USE_TF"] = "1"
431
+ os.environ["USE_TF"] = "1"
432
+ if os.environ.get("DD_USE_TORCH") and os.environ.get("DD_USE_TF"):
433
+ logger.warning(
434
+ "Both DD_USE_TORCH and DD_USE_TF are set. Defaulting to PyTorch. If you want a different "
435
+ "behaviour, set DD_USE_TORCH to None before importing deepdoctection."
436
+ )
437
+ os.environ.pop("DD_USE_TF")
438
+ os.environ.pop("USE_TF")
435
439
 
436
- # Setting some environment variables so that standard functions can be invoked with available hardware
437
- auto_select_lib_and_device()
440
+ if not os.environ.get("PYTORCH_AVAILABLE") and not os.environ.get("TENSORFLOW_AVAILABLE"):
441
+ logger.warning(
442
+ LoggingRecord(
443
+ msg="Neither Tensorflow or Pytorch are available. You will not be able to use any Deep Learning "
444
+ "model from the library."
445
+ )
446
+ )
438
447
 
439
448
 
440
449
  # Direct imports for type-checking
@@ -442,10 +451,10 @@ if TYPE_CHECKING:
442
451
  from .analyzer import *
443
452
  from .dataflow import *
444
453
  from .datapoint import *
445
- from .datasets import *
454
+ from .datasets import * # type: ignore
446
455
  from .eval import *
447
- from .extern import *
448
- from .mapper import *
456
+ from .extern import * # type: ignore
457
+ from .mapper import * # type: ignore
449
458
  from .pipe import *
450
459
  from .train import *
451
460
  from .utils import *
@@ -23,51 +23,43 @@ Module for **deep**doctection analyzer.
23
23
  -user factory with a reduced config setting
24
24
  """
25
25
 
26
- import ast
27
26
  import os
28
27
  from os import environ
29
28
  from shutil import copyfile
30
29
  from typing import List, Optional, Union
31
30
 
31
+ from lazy_imports import try_import
32
+
32
33
  from ..extern.base import ObjectDetector
34
+ from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
33
35
  from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
36
+ from ..extern.hfdetr import HFDetrDerivedDetector
34
37
  from ..extern.model import ModelCatalog, ModelDownloadManager
35
38
  from ..extern.pdftext import PdfPlumberTextDetector
39
+ from ..extern.pt.ptutils import get_torch_device
36
40
  from ..extern.tessocr import TesseractOcrDetector
37
41
  from ..extern.texocr import TextractOcrDetector
42
+ from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
43
+ from ..extern.tpdetect import TPFrcnnDetector
38
44
  from ..pipe.base import PipelineComponent
39
- from ..pipe.cell import DetectResultGenerator, SubImageLayoutService
40
45
  from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
41
46
  from ..pipe.doctectionpipe import DoctectionPipe
42
47
  from ..pipe.layout import ImageLayoutService
43
48
  from ..pipe.order import TextOrderService
44
49
  from ..pipe.refine import TableSegmentationRefinementService
45
50
  from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
51
+ from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
46
52
  from ..pipe.text import TextExtractionService
47
53
  from ..utils.detection_types import Pathlike
48
- from ..utils.env_info import get_device
49
- from ..utils.file_utils import (
50
- boto3_available,
51
- detectron2_available,
52
- pytorch_available,
53
- tensorpack_available,
54
- tf_available,
55
- )
54
+ from ..utils.error import DependencyError
55
+ from ..utils.file_utils import detectron2_available, tensorpack_available
56
56
  from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
57
57
  from ..utils.logger import LoggingRecord, logger
58
58
  from ..utils.metacfg import AttrDict, set_config_by_yaml
59
59
  from ..utils.settings import CellType, LayoutType
60
60
  from ..utils.transform import PadTransform
61
61
 
62
- if tf_available() and tensorpack_available():
63
- from ..extern.tp.tfutils import disable_tp_layer_logging
64
- from ..extern.tpdetect import TPFrcnnDetector
65
-
66
- if pytorch_available():
67
- from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
68
- from ..extern.hfdetr import HFDetrDerivedDetector
69
-
70
- if boto3_available():
62
+ with try_import() as image_guard:
71
63
  from botocore.config import Config # type: ignore
72
64
 
73
65
 
@@ -113,11 +105,12 @@ def config_sanity_checks(cfg: AttrDict) -> None:
113
105
  """Some config sanity checks"""
114
106
  if cfg.USE_PDF_MINER and cfg.USE_OCR and cfg.OCR.USE_DOCTR:
115
107
  raise ValueError("Configuration USE_PDF_MINER= True and USE_OCR=True and USE_DOCTR=True is not allowed")
116
- if cfg.OCR.USE_TESSERACT + cfg.OCR.USE_DOCTR + cfg.OCR.USE_TEXTRACT != 1:
117
- raise ValueError(
118
- "Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True and set the other two "
119
- "to False. Only one OCR system can be activated."
120
- )
108
+ if cfg.USE_OCR:
109
+ if cfg.OCR.USE_TESSERACT + cfg.OCR.USE_DOCTR + cfg.OCR.USE_TEXTRACT != 1:
110
+ raise ValueError(
111
+ "Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True "
112
+ "and set the other two to False. Only one OCR system can be activated."
113
+ )
121
114
 
122
115
 
123
116
  def build_detector(
@@ -343,11 +336,20 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
343
336
  pipe_component_list.append(table_segmentation)
344
337
 
345
338
  if cfg.USE_TABLE_REFINEMENT:
346
- table_segmentation_refinement = TableSegmentationRefinementService()
339
+ table_segmentation_refinement = TableSegmentationRefinementService(
340
+ [LayoutType.table, LayoutType.table_rotated],
341
+ [
342
+ LayoutType.cell,
343
+ CellType.column_header,
344
+ CellType.projected_row_header,
345
+ CellType.spanning,
346
+ CellType.row_header,
347
+ ],
348
+ )
347
349
  pipe_component_list.append(table_segmentation_refinement)
348
350
 
349
351
  if cfg.USE_PDF_MINER:
350
- pdf_text = PdfPlumberTextDetector()
352
+ pdf_text = PdfPlumberTextDetector(x_tolerance=cfg.PDF_MINER.X_TOLERANCE, y_tolerance=cfg.PDF_MINER.Y_TOLERANCE)
351
353
  d_text = TextExtractionService(pdf_text)
352
354
  pipe_component_list.append(d_text)
353
355
 
@@ -400,7 +402,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
400
402
 
401
403
 
402
404
  def get_dd_analyzer(
403
- reset_config_file: bool = False,
405
+ reset_config_file: bool = True,
404
406
  config_overwrite: Optional[List[str]] = None,
405
407
  path_config_file: Optional[Pathlike] = None,
406
408
  ) -> DoctectionPipe:
@@ -429,8 +431,13 @@ def get_dd_analyzer(
429
431
  :return: A DoctectionPipe instance with given configs
430
432
  """
431
433
  config_overwrite = [] if config_overwrite is None else config_overwrite
432
- lib = "TF" if ast.literal_eval(os.environ.get("USE_TENSORFLOW", "False")) else "PT"
433
- device = get_device(False)
434
+ lib = "TF" if os.environ.get("DD_USE_TF") else "PT"
435
+ if lib == "TF":
436
+ device = get_tf_device()
437
+ elif lib == "PT":
438
+ device = get_torch_device()
439
+ else:
440
+ raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
434
441
  dd_one_config_path = maybe_copy_config_to_cache(
435
442
  get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
436
443
  )
@@ -1,38 +1,38 @@
1
1
  USE_LAYOUT: True
2
2
  USE_TABLE_SEGMENTATION: True
3
3
  TF:
4
- LAYOUT:
5
- WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
6
- FILTER:
7
- CELL:
8
- WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
9
- FILTER:
10
- ITEM:
11
- WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
12
- FILTER:
4
+ LAYOUT:
5
+ WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
6
+ FILTER:
7
+ CELL:
8
+ WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
9
+ FILTER:
10
+ ITEM:
11
+ WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
12
+ FILTER:
13
13
  PT:
14
- LAYOUT:
15
- WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
16
- WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
17
- FILTER:
18
- PAD:
19
- TOP: 60
20
- RIGHT: 60
21
- BOTTOM: 60
22
- LEFT: 60
23
- ITEM:
24
- WEIGHTS: item/d2_model_1639999_item_inf_only.pt
25
- WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
26
- FILTER:
27
- PAD:
28
- TOP: 60
29
- RIGHT: 60
30
- BOTTOM: 60
31
- LEFT: 60
32
- CELL:
33
- WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
34
- WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
35
- FILTER:
14
+ LAYOUT:
15
+ WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
16
+ WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
17
+ FILTER:
18
+ PAD:
19
+ TOP: 60
20
+ RIGHT: 60
21
+ BOTTOM: 60
22
+ LEFT: 60
23
+ ITEM:
24
+ WEIGHTS: item/d2_model_1639999_item_inf_only.pt
25
+ WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
26
+ FILTER:
27
+ PAD:
28
+ TOP: 60
29
+ RIGHT: 60
30
+ BOTTOM: 60
31
+ LEFT: 60
32
+ CELL:
33
+ WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
34
+ WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
35
+ FILTER:
36
36
  LAYOUT_NMS_PAIRS:
37
37
  COMBINATIONS:
38
38
  THRESHOLDS:
@@ -48,6 +48,9 @@ SEGMENTATION:
48
48
  STRETCH_RULE: equal
49
49
  USE_TABLE_REFINEMENT: True
50
50
  USE_PDF_MINER: False
51
+ PDF_MINER:
52
+ X_TOLERANCE: 3
53
+ Y_TOLERANCE: 3
51
54
  USE_OCR: True
52
55
  OCR:
53
56
  USE_TESSERACT: True
@@ -17,25 +17,6 @@ from typing import Any, Iterator, no_type_check
17
17
  from ..utils.utils import get_rng
18
18
 
19
19
 
20
- class DataFlowTerminated(BaseException):
21
- """
22
- An exception indicating that the DataFlow is unable to produce any more
23
- data, i.e. something wrong happened so that calling `__iter__`
24
- cannot give a valid iterator anymore.
25
- In most DataFlow this will never be raised.
26
- """
27
-
28
-
29
- class DataFlowResetStateNotCalled(BaseException):
30
- """
31
- An exception indicating that `reset_state()` has not been called before starting
32
- iteration.
33
- """
34
-
35
- def __init__(self) -> None:
36
- super().__init__("Iterating a dataflow requires .reset_state() to be called first")
37
-
38
-
39
20
  class DataFlowReentrantGuard:
40
21
  """
41
22
  A tool to enforce non-reentrancy.