deepdoctection 0.31__tar.gz → 0.33__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (247) hide show
  1. {deepdoctection-0.31 → deepdoctection-0.33}/PKG-INFO +30 -21
  2. {deepdoctection-0.31 → deepdoctection-0.33}/README.md +14 -7
  3. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/__init__.py +16 -29
  4. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/analyzer/dd.py +70 -59
  5. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/configs/conf_dd_one.yaml +34 -31
  6. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/dataflow/common.py +9 -5
  7. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/dataflow/custom.py +5 -5
  8. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/dataflow/custom_serialize.py +75 -18
  9. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/dataflow/parallel_map.py +3 -3
  10. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/dataflow/serialize.py +4 -4
  11. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/dataflow/stats.py +3 -3
  12. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datapoint/annotation.py +41 -56
  13. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datapoint/box.py +9 -8
  14. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datapoint/convert.py +6 -6
  15. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datapoint/image.py +56 -44
  16. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datapoint/view.py +245 -150
  17. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/__init__.py +1 -4
  18. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/adapter.py +35 -26
  19. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/base.py +14 -12
  20. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/dataflow_builder.py +3 -3
  21. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/info.py +24 -26
  22. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/doclaynet.py +51 -51
  23. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/fintabnet.py +46 -46
  24. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/funsd.py +25 -24
  25. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/iiitar13k.py +13 -10
  26. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/layouttest.py +4 -3
  27. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/publaynet.py +5 -5
  28. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/pubtables1m.py +24 -21
  29. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/pubtabnet.py +32 -30
  30. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/rvlcdip.py +30 -30
  31. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/xfund.py +26 -26
  32. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/save.py +6 -6
  33. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/eval/__init__.py +1 -4
  34. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/eval/accmetric.py +32 -33
  35. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/eval/base.py +8 -9
  36. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/eval/cocometric.py +15 -13
  37. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/eval/eval.py +41 -37
  38. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/eval/tedsmetric.py +30 -23
  39. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/eval/tp_eval_callback.py +16 -19
  40. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/__init__.py +2 -7
  41. deepdoctection-0.33/deepdoctection/extern/base.py +644 -0
  42. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/d2detect.py +85 -113
  43. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/deskew.py +14 -11
  44. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/doctrocr.py +141 -130
  45. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/fastlang.py +27 -18
  46. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/hfdetr.py +71 -62
  47. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/hflayoutlm.py +504 -211
  48. deepdoctection-0.33/deepdoctection/extern/hflm.py +230 -0
  49. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/model.py +488 -302
  50. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/pdftext.py +23 -19
  51. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/pt/__init__.py +1 -3
  52. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/pt/nms.py +6 -2
  53. deepdoctection-0.33/deepdoctection/extern/pt/ptutils.py +59 -0
  54. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tessocr.py +39 -38
  55. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/texocr.py +18 -18
  56. deepdoctection-0.33/deepdoctection/extern/tp/tfutils.py +105 -0
  57. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpcompat.py +21 -14
  58. deepdoctection-0.33/deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  59. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  60. deepdoctection-0.33/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  61. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
  62. deepdoctection-0.33/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  63. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
  64. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  65. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
  66. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  67. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
  68. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
  69. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
  70. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  71. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  72. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
  73. {deepdoctection-0.31/tests/datapoint → deepdoctection-0.33/deepdoctection/extern/tp/tpfrcnn/utils}/__init__.py +4 -0
  74. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  75. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tpdetect.py +45 -53
  76. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/__init__.py +3 -8
  77. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/cats.py +27 -29
  78. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/cocostruct.py +10 -10
  79. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/d2struct.py +27 -26
  80. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/hfstruct.py +13 -8
  81. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/laylmstruct.py +178 -37
  82. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/maputils.py +12 -11
  83. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/match.py +2 -2
  84. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/misc.py +11 -9
  85. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/pascalstruct.py +4 -4
  86. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/prodigystruct.py +5 -5
  87. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/pubstruct.py +84 -92
  88. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/tpstruct.py +5 -5
  89. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/xfundstruct.py +33 -33
  90. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/__init__.py +1 -1
  91. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/anngen.py +12 -14
  92. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/base.py +52 -106
  93. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/common.py +72 -59
  94. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/concurrency.py +16 -11
  95. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/doctectionpipe.py +24 -21
  96. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/language.py +20 -25
  97. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/layout.py +20 -16
  98. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/lm.py +75 -105
  99. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/order.py +194 -89
  100. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/refine.py +111 -124
  101. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/segment.py +156 -161
  102. deepdoctection-0.31/deepdoctection/pipe/cell.py → deepdoctection-0.33/deepdoctection/pipe/sub_layout.py +50 -40
  103. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/text.py +37 -36
  104. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/transform.py +19 -16
  105. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/train/__init__.py +6 -12
  106. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/train/d2_frcnn_train.py +48 -41
  107. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/train/hf_detr_train.py +41 -30
  108. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/train/hf_layoutlm_train.py +153 -135
  109. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/train/tp_frcnn_train.py +32 -31
  110. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/concurrency.py +1 -1
  111. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/context.py +13 -6
  112. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/develop.py +4 -4
  113. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/env_info.py +87 -125
  114. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/file_utils.py +6 -11
  115. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/fs.py +22 -18
  116. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/identifier.py +2 -2
  117. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/logger.py +16 -15
  118. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/metacfg.py +7 -7
  119. deepdoctection-0.33/deepdoctection/utils/mocks.py +93 -0
  120. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/pdf_utils.py +11 -11
  121. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/settings.py +185 -181
  122. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/tqdm.py +1 -1
  123. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/transform.py +14 -9
  124. deepdoctection-0.33/deepdoctection/utils/types.py +104 -0
  125. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/utils.py +7 -7
  126. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/viz.py +74 -72
  127. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection.egg-info/PKG-INFO +30 -21
  128. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection.egg-info/SOURCES.txt +5 -91
  129. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection.egg-info/requires.txt +14 -11
  130. {deepdoctection-0.31 → deepdoctection-0.33}/setup.cfg +10 -1
  131. {deepdoctection-0.31 → deepdoctection-0.33}/setup.py +10 -9
  132. {deepdoctection-0.31 → deepdoctection-0.33}/tests/test_utils.py +8 -0
  133. deepdoctection-0.31/deepdoctection/extern/base.py +0 -439
  134. deepdoctection-0.31/deepdoctection/extern/pt/ptutils.py +0 -49
  135. deepdoctection-0.31/deepdoctection/extern/tp/tfutils.py +0 -57
  136. deepdoctection-0.31/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
  137. deepdoctection-0.31/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
  138. deepdoctection-0.31/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
  139. deepdoctection-0.31/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
  140. deepdoctection-0.31/deepdoctection/utils/detection_types.py +0 -68
  141. deepdoctection-0.31/tests/__init__.py +0 -22
  142. deepdoctection-0.31/tests/analyzer/__init__.py +0 -16
  143. deepdoctection-0.31/tests/analyzer/test_dd.py +0 -202
  144. deepdoctection-0.31/tests/conftest.py +0 -498
  145. deepdoctection-0.31/tests/data.py +0 -1632
  146. deepdoctection-0.31/tests/dataflow/__init__.py +0 -16
  147. deepdoctection-0.31/tests/dataflow/conftest.py +0 -95
  148. deepdoctection-0.31/tests/dataflow/test_common.py +0 -219
  149. deepdoctection-0.31/tests/dataflow/test_custom.py +0 -60
  150. deepdoctection-0.31/tests/dataflow/test_custom_serialize.py +0 -177
  151. deepdoctection-0.31/tests/dataflow/test_parallel_map.py +0 -66
  152. deepdoctection-0.31/tests/dataflow/test_stats.py +0 -103
  153. deepdoctection-0.31/tests/datapoint/conftest.py +0 -262
  154. deepdoctection-0.31/tests/datapoint/test_annotation.py +0 -170
  155. deepdoctection-0.31/tests/datapoint/test_box.py +0 -416
  156. deepdoctection-0.31/tests/datapoint/test_convert.py +0 -52
  157. deepdoctection-0.31/tests/datapoint/test_image.py +0 -387
  158. deepdoctection-0.31/tests/datapoint/test_view.py +0 -150
  159. deepdoctection-0.31/tests/datasets/__init__.py +0 -16
  160. deepdoctection-0.31/tests/datasets/instances/__init__.py +0 -16
  161. deepdoctection-0.31/tests/datasets/instances/conftest.py +0 -35
  162. deepdoctection-0.31/tests/datasets/instances/test_doclaynet.py +0 -43
  163. deepdoctection-0.31/tests/datasets/instances/test_fintabnet.py +0 -70
  164. deepdoctection-0.31/tests/datasets/instances/test_funsd.py +0 -58
  165. deepdoctection-0.31/tests/datasets/instances/test_iiitar13k.py +0 -42
  166. deepdoctection-0.31/tests/datasets/instances/test_layouttest.py +0 -63
  167. deepdoctection-0.31/tests/datasets/instances/test_publaynet.py +0 -64
  168. deepdoctection-0.31/tests/datasets/instances/test_pubtables1m.py +0 -66
  169. deepdoctection-0.31/tests/datasets/instances/test_pubtabnet.py +0 -65
  170. deepdoctection-0.31/tests/datasets/instances/test_rvlcdip.py +0 -46
  171. deepdoctection-0.31/tests/datasets/test_adapter.py +0 -77
  172. deepdoctection-0.31/tests/datasets/test_info.py +0 -273
  173. deepdoctection-0.31/tests/datasets/test_registry.py +0 -75
  174. deepdoctection-0.31/tests/eval/__init__.py +0 -16
  175. deepdoctection-0.31/tests/eval/conftest.py +0 -107
  176. deepdoctection-0.31/tests/eval/test_accmetric.py +0 -364
  177. deepdoctection-0.31/tests/eval/test_cocometric.py +0 -123
  178. deepdoctection-0.31/tests/eval/test_eval.py +0 -86
  179. deepdoctection-0.31/tests/eval/test_registry.py +0 -84
  180. deepdoctection-0.31/tests/eval/test_tedsmetric.py +0 -40
  181. deepdoctection-0.31/tests/extern/__init__.py +0 -0
  182. deepdoctection-0.31/tests/extern/conftest.py +0 -108
  183. deepdoctection-0.31/tests/extern/data.py +0 -102
  184. deepdoctection-0.31/tests/extern/test_deskew.py +0 -67
  185. deepdoctection-0.31/tests/extern/test_doctrocr.py +0 -190
  186. deepdoctection-0.31/tests/extern/test_fastlang.py +0 -64
  187. deepdoctection-0.31/tests/extern/test_hfdetr.py +0 -116
  188. deepdoctection-0.31/tests/extern/test_hflayoutlm.py +0 -492
  189. deepdoctection-0.31/tests/extern/test_pdftext.py +0 -70
  190. deepdoctection-0.31/tests/extern/test_tessocr.py +0 -164
  191. deepdoctection-0.31/tests/extern/test_texocr.py +0 -52
  192. deepdoctection-0.31/tests/extern/test_tpdetect.py +0 -123
  193. deepdoctection-0.31/tests/mapper/__init__.py +0 -16
  194. deepdoctection-0.31/tests/mapper/conftest.py +0 -297
  195. deepdoctection-0.31/tests/mapper/data.py +0 -2182
  196. deepdoctection-0.31/tests/mapper/test_cats.py +0 -305
  197. deepdoctection-0.31/tests/mapper/test_cocostruct.py +0 -91
  198. deepdoctection-0.31/tests/mapper/test_d2struct.py +0 -56
  199. deepdoctection-0.31/tests/mapper/test_hfstruct.py +0 -59
  200. deepdoctection-0.31/tests/mapper/test_iiitar13k.py +0 -64
  201. deepdoctection-0.31/tests/mapper/test_laylmstruct.py +0 -141
  202. deepdoctection-0.31/tests/mapper/test_misc.py +0 -72
  203. deepdoctection-0.31/tests/mapper/test_prodigystruct.py +0 -78
  204. deepdoctection-0.31/tests/mapper/test_pubstruct.py +0 -170
  205. deepdoctection-0.31/tests/mapper/test_tpstruct.py +0 -51
  206. deepdoctection-0.31/tests/mapper/test_utils.py +0 -83
  207. deepdoctection-0.31/tests/mapper/test_xfundstruct.py +0 -68
  208. deepdoctection-0.31/tests/pipe/__init__.py +0 -16
  209. deepdoctection-0.31/tests/pipe/test_anngen.py +0 -179
  210. deepdoctection-0.31/tests/pipe/test_cell.py +0 -144
  211. deepdoctection-0.31/tests/pipe/test_common.py +0 -107
  212. deepdoctection-0.31/tests/pipe/test_language.py +0 -76
  213. deepdoctection-0.31/tests/pipe/test_layout.py +0 -66
  214. deepdoctection-0.31/tests/pipe/test_lm.py +0 -119
  215. deepdoctection-0.31/tests/pipe/test_order.py +0 -197
  216. deepdoctection-0.31/tests/pipe/test_refine.py +0 -325
  217. deepdoctection-0.31/tests/pipe/test_registry.py +0 -58
  218. deepdoctection-0.31/tests/pipe/test_segment.py +0 -392
  219. deepdoctection-0.31/tests/pipe/test_text.py +0 -208
  220. deepdoctection-0.31/tests/pipe/test_transform.py +0 -65
  221. deepdoctection-0.31/tests/train/__init__.py +0 -16
  222. deepdoctection-0.31/tests/train/conftest.py +0 -118
  223. deepdoctection-0.31/tests/train/test_d2_frcnn_train.py +0 -64
  224. deepdoctection-0.31/tests/train/test_tp_frcnn_train.py +0 -99
  225. deepdoctection-0.31/tests_d2/__init__.py +0 -20
  226. deepdoctection-0.31/tests_d2/conftest.py +0 -56
  227. deepdoctection-0.31/tests_d2/test_d2detect.py +0 -95
  228. {deepdoctection-0.31 → deepdoctection-0.33}/LICENSE +0 -0
  229. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/analyzer/__init__.py +0 -0
  230. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/configs/__init__.py +0 -0
  231. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/configs/conf_tesseract.yaml +0 -0
  232. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/dataflow/__init__.py +0 -0
  233. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/dataflow/base.py +0 -0
  234. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datapoint/__init__.py +0 -0
  235. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/__init__.py +0 -0
  236. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
  237. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
  238. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/registry.py +0 -0
  239. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/eval/registry.py +0 -0
  240. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/__init__.py +0 -0
  241. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
  242. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/registry.py +0 -0
  243. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/py.typed +0 -0
  244. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/__init__.py +0 -0
  245. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/error.py +0 -0
  246. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection.egg-info/dependency_links.txt +0 -0
  247. {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepdoctection
3
- Version: 0.31
3
+ Version: 0.33
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -9,21 +9,21 @@ Classifier: Development Status :: 4 - Beta
9
9
  Classifier: License :: OSI Approved :: Apache Software License
10
10
  Classifier: Natural Language :: English
11
11
  Classifier: Operating System :: POSIX :: Linux
12
- Classifier: Programming Language :: Python :: 3.8
13
12
  Classifier: Programming Language :: Python :: 3.9
14
13
  Classifier: Programming Language :: Python :: 3.10
15
14
  Classifier: Programming Language :: Python :: 3.11
16
15
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
- Requires-Python: >=3.8
16
+ Requires-Python: >=3.9
18
17
  Description-Content-Type: text/markdown
19
18
  License-File: LICENSE
20
19
  Requires-Dist: catalogue==2.0.10
21
20
  Requires-Dist: huggingface_hub>=0.12.0
22
21
  Requires-Dist: importlib-metadata>=5.0.0
23
22
  Requires-Dist: jsonlines==3.1.0
23
+ Requires-Dist: lazy-imports==0.3.1
24
24
  Requires-Dist: mock==4.0.3
25
25
  Requires-Dist: networkx>=2.7.1
26
- Requires-Dist: numpy>=1.21
26
+ Requires-Dist: numpy<2.0,>=1.21
27
27
  Requires-Dist: packaging>=20.0
28
28
  Requires-Dist: Pillow>=10.0.0
29
29
  Requires-Dist: pypdf>=3.16.0
@@ -37,9 +37,10 @@ Requires-Dist: catalogue==2.0.10; extra == "tf"
37
37
  Requires-Dist: huggingface_hub>=0.12.0; extra == "tf"
38
38
  Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
39
39
  Requires-Dist: jsonlines==3.1.0; extra == "tf"
40
+ Requires-Dist: lazy-imports==0.3.1; extra == "tf"
40
41
  Requires-Dist: mock==4.0.3; extra == "tf"
41
42
  Requires-Dist: networkx>=2.7.1; extra == "tf"
42
- Requires-Dist: numpy>=1.21; extra == "tf"
43
+ Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
43
44
  Requires-Dist: packaging>=20.0; extra == "tf"
44
45
  Requires-Dist: Pillow>=10.0.0; extra == "tf"
45
46
  Requires-Dist: pypdf>=3.16.0; extra == "tf"
@@ -52,10 +53,10 @@ Requires-Dist: tensorpack==0.11; extra == "tf"
52
53
  Requires-Dist: protobuf==3.20.1; extra == "tf"
53
54
  Requires-Dist: tensorflow-addons>=0.17.1; extra == "tf"
54
55
  Requires-Dist: tf2onnx>=1.9.2; extra == "tf"
55
- Requires-Dist: python-doctr==0.7.0; extra == "tf"
56
+ Requires-Dist: python-doctr==0.8.1; extra == "tf"
56
57
  Requires-Dist: pycocotools>=2.0.2; extra == "tf"
57
- Requires-Dist: boto3; extra == "tf"
58
- Requires-Dist: pdfplumber>=0.7.1; extra == "tf"
58
+ Requires-Dist: boto3==1.34.102; extra == "tf"
59
+ Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
59
60
  Requires-Dist: fasttext==0.9.2; extra == "tf"
60
61
  Requires-Dist: jdeskew>=0.2.2; extra == "tf"
61
62
  Requires-Dist: apted==1.0.3; extra == "tf"
@@ -66,9 +67,10 @@ Requires-Dist: catalogue==2.0.10; extra == "pt"
66
67
  Requires-Dist: huggingface_hub>=0.12.0; extra == "pt"
67
68
  Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
68
69
  Requires-Dist: jsonlines==3.1.0; extra == "pt"
70
+ Requires-Dist: lazy-imports==0.3.1; extra == "pt"
69
71
  Requires-Dist: mock==4.0.3; extra == "pt"
70
72
  Requires-Dist: networkx>=2.7.1; extra == "pt"
71
- Requires-Dist: numpy>=1.21; extra == "pt"
73
+ Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
72
74
  Requires-Dist: packaging>=20.0; extra == "pt"
73
75
  Requires-Dist: Pillow>=10.0.0; extra == "pt"
74
76
  Requires-Dist: pypdf>=3.16.0; extra == "pt"
@@ -80,9 +82,9 @@ Requires-Dist: tqdm==4.64.0; extra == "pt"
80
82
  Requires-Dist: timm>=0.9.16; extra == "pt"
81
83
  Requires-Dist: transformers>=4.36.0; extra == "pt"
82
84
  Requires-Dist: accelerate>=0.29.1; extra == "pt"
83
- Requires-Dist: python-doctr==0.7.0; extra == "pt"
84
- Requires-Dist: boto3; extra == "pt"
85
- Requires-Dist: pdfplumber>=0.7.1; extra == "pt"
85
+ Requires-Dist: python-doctr==0.8.1; extra == "pt"
86
+ Requires-Dist: boto3==1.34.102; extra == "pt"
87
+ Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
86
88
  Requires-Dist: fasttext==0.9.2; extra == "pt"
87
89
  Requires-Dist: jdeskew>=0.2.2; extra == "pt"
88
90
  Requires-Dist: apted==1.0.3; extra == "pt"
@@ -90,10 +92,10 @@ Requires-Dist: distance==0.1.3; extra == "pt"
90
92
  Requires-Dist: lxml>=4.9.1; extra == "pt"
91
93
  Provides-Extra: docs
92
94
  Requires-Dist: tensorpack==0.11; extra == "docs"
93
- Requires-Dist: boto3; extra == "docs"
95
+ Requires-Dist: boto3==1.34.102; extra == "docs"
94
96
  Requires-Dist: transformers>=4.36.0; extra == "docs"
95
97
  Requires-Dist: accelerate>=0.29.1; extra == "docs"
96
- Requires-Dist: pdfplumber>=0.7.1; extra == "docs"
98
+ Requires-Dist: pdfplumber>=0.11.0; extra == "docs"
97
99
  Requires-Dist: lxml>=4.9.1; extra == "docs"
98
100
  Requires-Dist: lxml-stubs>=0.5.1; extra == "docs"
99
101
  Requires-Dist: jdeskew>=0.2.2; extra == "docs"
@@ -153,7 +155,8 @@ pipelines. Its core function does not depend on any specific deep learning libra
153
155
  - Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
154
156
  - Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
155
157
  - Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
156
- - Document and token classification with all LayoutLM models provided by the Transformer library.
158
+ - Document and token classification with all LayoutLM models provided by the
159
+ [**Transformer library**](https://github.com/huggingface/transformers).
157
160
  (Yes, you can use any LayoutLM-model with any of the provided OCR-or pdfplumber tools straight away!).
158
161
  - Table detection and table structure recognition with
159
162
  [**table-transformer**](https://github.com/microsoft/table-transformer).
@@ -163,10 +166,16 @@ pipelines. Its core function does not depend on any specific deep learning libra
163
166
  - Comprehensive configuration of **analyzer** like choosing different models, output parsing, OCR selection.
164
167
  Check this [notebook](https://github.com/deepdoctection/notebooks/blob/main/Analyzer_Configuration.ipynb) or the
165
168
  [docs](https://deepdoctection.readthedocs.io/en/latest/tutorials/analyzer_configuration_notebook/) for more infos.
166
- - Document layout analysis and table recognition now runs with Torchscript (CPU) as well and Detectron2 is
167
- not required anymore for basic inference.
168
- - [**new**] More angle predictors for determining the rotation of a document based on Tesseract and DocTr
169
+ - Document layout analysis and table recognition now runs with
170
+ [**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
171
+ anymore for basic inference.
172
+ - [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
169
173
  (not contained in the built-in Analyzer).
174
+ - [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
175
+ [**transformers**](https://github.com/huggingface/transformers).
176
+ We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
177
+ that seem to look promising, especially if you want to train a model on non-English data. The training script for
178
+ LayoutLM can be used for LiLT as well and we will be providing a notebook on how to train a model on a custom dataset soon.
170
179
 
171
180
  **deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
172
181
  post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
@@ -257,9 +266,9 @@ Everything in the overview listed below the **deep**doctection layer are necessa
257
266
  separately.
258
267
 
259
268
  - Linux or macOS. (Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available)
260
- - Python >= 3.8
261
- - 1.12 <= PyTorch < 2.0 **or** Tensorflow >= 2.9 and CUDA. If you want to run the models provided by Tensorpack a GPU is
262
- required. You can run on PyTorch with a CPU only.
269
+ - Python >= 3.9
270
+ - 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
271
+ In general, if you want to train or fine-tune models, a GPU is required.
263
272
  - **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
264
273
  images.
265
274
  - With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
@@ -31,7 +31,8 @@ pipelines. Its core function does not depend on any specific deep learning libra
31
31
  - Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
32
32
  - Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
33
33
  - Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
34
- - Document and token classification with all LayoutLM models provided by the Transformer library.
34
+ - Document and token classification with all LayoutLM models provided by the
35
+ [**Transformer library**](https://github.com/huggingface/transformers).
35
36
  (Yes, you can use any LayoutLM-model with any of the provided OCR-or pdfplumber tools straight away!).
36
37
  - Table detection and table structure recognition with
37
38
  [**table-transformer**](https://github.com/microsoft/table-transformer).
@@ -41,10 +42,16 @@ pipelines. Its core function does not depend on any specific deep learning libra
41
42
  - Comprehensive configuration of **analyzer** like choosing different models, output parsing, OCR selection.
42
43
  Check this [notebook](https://github.com/deepdoctection/notebooks/blob/main/Analyzer_Configuration.ipynb) or the
43
44
  [docs](https://deepdoctection.readthedocs.io/en/latest/tutorials/analyzer_configuration_notebook/) for more infos.
44
- - Document layout analysis and table recognition now runs with Torchscript (CPU) as well and Detectron2 is
45
- not required anymore for basic inference.
46
- - [**new**] More angle predictors for determining the rotation of a document based on Tesseract and DocTr
45
+ - Document layout analysis and table recognition now runs with
46
+ [**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
47
+ anymore for basic inference.
48
+ - [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
47
49
  (not contained in the built-in Analyzer).
50
+ - [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
51
+ [**transformers**](https://github.com/huggingface/transformers).
52
+ We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
53
+ that seem to look promising, especially if you want to train a model on non-English data. The training script for
54
+ LayoutLM can be used for LiLT as well and we will be providing a notebook on how to train a model on a custom dataset soon.
48
55
 
49
56
  **deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
50
57
  post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
@@ -135,9 +142,9 @@ Everything in the overview listed below the **deep**doctection layer are necessa
135
142
  separately.
136
143
 
137
144
  - Linux or macOS. (Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available)
138
- - Python >= 3.8
139
- - 1.12 <= PyTorch < 2.0 **or** Tensorflow >= 2.9 and CUDA. If you want to run the models provided by Tensorpack a GPU is
140
- required. You can run on PyTorch with a CPU only.
145
+ - Python >= 3.9
146
+ - 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
147
+ In general, if you want to train or fine-tune models, a GPU is required.
141
148
  - **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
142
149
  images.
143
150
  - With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
@@ -19,15 +19,13 @@ import os
19
19
  import sys
20
20
  from typing import TYPE_CHECKING
21
21
 
22
- from packaging import version
23
-
24
- from .utils.env_info import auto_select_lib_and_device
22
+ from .utils.env_info import collect_env_info
25
23
  from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
26
- from .utils.logger import logger
24
+ from .utils.logger import LoggingRecord, logger
27
25
 
28
26
  # pylint: enable=wrong-import-position
29
27
 
30
- __version__ = 0.31
28
+ __version__ = 0.33
31
29
 
32
30
  _IMPORT_STRUCTURE = {
33
31
  "analyzer": [
@@ -162,6 +160,8 @@ _IMPORT_STRUCTURE = {
162
160
  "EvalCallback",
163
161
  ],
164
162
  "extern": [
163
+ "ModelCategories",
164
+ "NerModelCategories",
165
165
  "PredictorBase",
166
166
  "DetectionResult",
167
167
  "ObjectDetector",
@@ -182,6 +182,7 @@ _IMPORT_STRUCTURE = {
182
182
  "DocTrRotationTransformer",
183
183
  "FasttextLangDetector",
184
184
  "HFDetrDerivedDetector",
185
+ "get_tokenizer_from_architecture",
185
186
  "HFLayoutLmTokenClassifierBase",
186
187
  "HFLayoutLmTokenClassifier",
187
188
  "HFLayoutLmv2TokenClassifier",
@@ -189,6 +190,9 @@ _IMPORT_STRUCTURE = {
189
190
  "HFLayoutLmSequenceClassifier",
190
191
  "HFLayoutLmv2SequenceClassifier",
191
192
  "HFLayoutLmv3SequenceClassifier",
193
+ "HFLiltTokenClassifier",
194
+ "HFLiltSequenceClassifier",
195
+ "HFLmSequenceClassifier",
192
196
  "ModelProfile",
193
197
  "ModelCatalog",
194
198
  "print_model_infos",
@@ -268,11 +272,11 @@ _IMPORT_STRUCTURE = {
268
272
  "DoctectionPipe",
269
273
  "LanguageDetectionService",
270
274
  "ImageLayoutService",
271
- "get_tokenizer_from_architecture",
272
275
  "LMTokenClassifierService",
273
276
  "LMSequenceClassifierService",
274
277
  "OrderGenerator",
275
278
  "TextLineGenerator",
279
+ "TextLineService",
276
280
  "TextOrderService",
277
281
  "TableSegmentationRefinementService",
278
282
  "generate_html_string",
@@ -297,14 +301,13 @@ _IMPORT_STRUCTURE = {
297
301
  "save_tmp_file",
298
302
  "timed_operation",
299
303
  "collect_env_info",
300
- "get_device",
301
- "auto_select_lib_and_device",
302
304
  "auto_select_viz_library",
303
305
  "get_tensorflow_requirement",
304
306
  "tf_addons_available",
305
307
  "get_tf_addons_requirements",
306
308
  "tensorpack_available",
307
309
  "get_tensorpack_requirement",
310
+ "pytorch_available",
308
311
  "get_pytorch_requirement",
309
312
  "lxml_available",
310
313
  "get_lxml_requirement",
@@ -418,25 +421,9 @@ _IMPORT_STRUCTURE = {
418
421
  ],
419
422
  }
420
423
 
421
-
422
- # disable TF warnings for versions > 2.4.1
423
- if tf_available():
424
- if version.parse(get_tf_version()) > version.parse("2.4.1"):
425
- os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
426
- try:
427
- import tensorflow.python.util.deprecation as deprecation # type: ignore # pylint: disable=E0401,R0402
428
-
429
- deprecation._PRINT_DEPRECATION_WARNINGS = False # pylint: disable=W0212
430
- except Exception: # pylint: disable=W0703
431
- try:
432
- from tensorflow.python.util import deprecation # type: ignore # pylint: disable=E0401
433
-
434
- deprecation._PRINT_DEPRECATION_WARNINGS = False # pylint: disable=W0212
435
- except Exception: # pylint: disable=W0703
436
- pass
437
-
438
424
  # Setting some environment variables so that standard functions can be invoked with available hardware
439
- auto_select_lib_and_device()
425
+ env_info = collect_env_info()
426
+ logger.debug(LoggingRecord(msg=env_info))
440
427
 
441
428
 
442
429
  # Direct imports for type-checking
@@ -444,10 +431,10 @@ if TYPE_CHECKING:
444
431
  from .analyzer import *
445
432
  from .dataflow import *
446
433
  from .datapoint import *
447
- from .datasets import *
434
+ from .datasets import * # type: ignore
448
435
  from .eval import *
449
- from .extern import *
450
- from .mapper import *
436
+ from .extern import * # type: ignore
437
+ from .mapper import * # type: ignore
451
438
  from .pipe import *
452
439
  from .train import *
453
440
  from .utils import *
@@ -23,51 +23,46 @@ Module for **deep**doctection analyzer.
23
23
  -user factory with a reduced config setting
24
24
  """
25
25
 
26
- import ast
26
+ from __future__ import annotations
27
+
27
28
  import os
28
29
  from os import environ
29
30
  from shutil import copyfile
30
- from typing import List, Optional, Union
31
+ from typing import Optional, Union
32
+
33
+ from lazy_imports import try_import
31
34
 
32
35
  from ..extern.base import ObjectDetector
36
+ from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
33
37
  from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
38
+ from ..extern.hfdetr import HFDetrDerivedDetector
34
39
  from ..extern.model import ModelCatalog, ModelDownloadManager
35
40
  from ..extern.pdftext import PdfPlumberTextDetector
41
+ from ..extern.pt.ptutils import get_torch_device
36
42
  from ..extern.tessocr import TesseractOcrDetector
37
43
  from ..extern.texocr import TextractOcrDetector
44
+ from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
45
+ from ..extern.tpdetect import TPFrcnnDetector
38
46
  from ..pipe.base import PipelineComponent
39
- from ..pipe.cell import DetectResultGenerator, SubImageLayoutService
40
47
  from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
41
48
  from ..pipe.doctectionpipe import DoctectionPipe
42
49
  from ..pipe.layout import ImageLayoutService
43
50
  from ..pipe.order import TextOrderService
44
51
  from ..pipe.refine import TableSegmentationRefinementService
45
52
  from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
53
+ from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
46
54
  from ..pipe.text import TextExtractionService
47
- from ..utils.detection_types import Pathlike
48
- from ..utils.env_info import get_device
49
- from ..utils.file_utils import (
50
- boto3_available,
51
- detectron2_available,
52
- pytorch_available,
53
- tensorpack_available,
54
- tf_available,
55
- )
55
+ from ..utils.env_info import ENV_VARS_TRUE
56
+ from ..utils.error import DependencyError
57
+ from ..utils.file_utils import detectron2_available, tensorpack_available
56
58
  from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
57
59
  from ..utils.logger import LoggingRecord, logger
58
60
  from ..utils.metacfg import AttrDict, set_config_by_yaml
59
61
  from ..utils.settings import CellType, LayoutType
60
62
  from ..utils.transform import PadTransform
63
+ from ..utils.types import PathLikeOrStr
61
64
 
62
- if tf_available() and tensorpack_available():
63
- from ..extern.tp.tfutils import disable_tp_layer_logging
64
- from ..extern.tpdetect import TPFrcnnDetector
65
-
66
- if pytorch_available():
67
- from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
68
- from ..extern.hfdetr import HFDetrDerivedDetector
69
-
70
- if boto3_available():
65
+ with try_import() as image_guard:
71
66
  from botocore.config import Config # type: ignore
72
67
 
73
68
 
@@ -89,7 +84,7 @@ _TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"
89
84
 
90
85
 
91
86
  def maybe_copy_config_to_cache(
92
- package_path: Pathlike, configs_dir_path: Pathlike, file_name: str, force_copy: bool = True
87
+ package_path: PathLikeOrStr, configs_dir_path: PathLikeOrStr, file_name: str, force_copy: bool = True
93
88
  ) -> str:
94
89
  """
95
90
  Initial copying of various files
@@ -123,7 +118,7 @@ def config_sanity_checks(cfg: AttrDict) -> None:
123
118
 
124
119
  def build_detector(
125
120
  cfg: AttrDict, mode: str
126
- ) -> Union["D2FrcnnDetector", "TPFrcnnDetector", "HFDetrDerivedDetector", "D2FrcnnTracingDetector"]:
121
+ ) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
127
122
  """Building a D2-Detector, a TP-Detector, a Detr-Detector or a D2-Torch Tracing Detector according to
128
123
  the config
129
124
 
@@ -141,8 +136,8 @@ def build_detector(
141
136
  config_path = ModelCatalog.get_full_path_configs(weights)
142
137
  weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
143
138
  profile = ModelCatalog.get_profile(weights)
144
- categories = profile.categories
145
- assert categories is not None
139
+ categories = profile.categories if profile.categories is not None else {}
140
+
146
141
  if profile.model_wrapper in ("TPFrcnnDetector",):
147
142
  return TPFrcnnDetector(config_path, weights_path, categories, filter_categories=filter_categories)
148
143
  if profile.model_wrapper in ("D2FrcnnDetector",):
@@ -210,11 +205,13 @@ def build_sub_image_service(detector: ObjectDetector, cfg: AttrDict, mode: str)
210
205
  padder = None
211
206
  if mode == "ITEM":
212
207
  if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
213
- exclude_category_ids.extend(["1", "3", "4", "5", "6"])
208
+ exclude_category_ids.extend([1, 3, 4, 5, 6])
214
209
  padder = build_padder(cfg, mode)
215
- detect_result_generator = DetectResultGenerator(detector.categories, exclude_category_ids=exclude_category_ids)
210
+ detect_result_generator = DetectResultGenerator(
211
+ categories=detector.categories.categories, exclude_category_ids=exclude_category_ids
212
+ )
216
213
  return SubImageLayoutService(
217
- detector, [LayoutType.table, LayoutType.table_rotated], None, detect_result_generator, padder
214
+ detector, [LayoutType.TABLE, LayoutType.TABLE_ROTATED], None, detect_result_generator, padder
218
215
  )
219
216
 
220
217
 
@@ -241,9 +238,9 @@ def build_ocr(cfg: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer,
241
238
  )
242
239
  if cfg.OCR.USE_TEXTRACT:
243
240
  credentials_kwargs = {
244
- "aws_access_key_id": environ.get("ACCESS_KEY"),
245
- "aws_secret_access_key": environ.get("SECRET_KEY"),
246
- "config": Config(region_name=environ.get("REGION")),
241
+ "aws_access_key_id": environ.get("ACCESS_KEY", None),
242
+ "aws_secret_access_key": environ.get("SECRET_KEY", None),
243
+ "config": Config(region_name=environ.get("REGION", None)),
247
244
  }
248
245
  return TextractOcrDetector(**credentials_kwargs)
249
246
  raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")
@@ -268,7 +265,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
268
265
  :param cfg: A configuration
269
266
  :return: Analyzer pipeline
270
267
  """
271
- pipe_component_list: List[PipelineComponent] = []
268
+ pipe_component_list: list[PipelineComponent] = []
272
269
 
273
270
  if cfg.USE_LAYOUT:
274
271
  d_layout = build_detector(cfg, "LAYOUT")
@@ -308,22 +305,22 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
308
305
  cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
309
306
  cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
310
307
  cfg.SEGMENTATION.CELL_CATEGORY_ID,
311
- LayoutType.table,
308
+ LayoutType.TABLE,
312
309
  [
313
- CellType.spanning,
314
- CellType.row_header,
315
- CellType.column_header,
316
- CellType.projected_row_header,
317
- LayoutType.cell,
310
+ CellType.SPANNING,
311
+ CellType.ROW_HEADER,
312
+ CellType.COLUMN_HEADER,
313
+ CellType.PROJECTED_ROW_HEADER,
314
+ LayoutType.CELL,
318
315
  ],
319
316
  [
320
- CellType.spanning,
321
- CellType.row_header,
322
- CellType.column_header,
323
- CellType.projected_row_header,
317
+ CellType.SPANNING,
318
+ CellType.ROW_HEADER,
319
+ CellType.COLUMN_HEADER,
320
+ CellType.PROJECTED_ROW_HEADER,
324
321
  ],
325
- [LayoutType.row, LayoutType.column],
326
- [CellType.row_number, CellType.column_number],
322
+ [LayoutType.ROW, LayoutType.COLUMN],
323
+ [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
327
324
  stretch_rule=cfg.SEGMENTATION.STRETCH_RULE,
328
325
  )
329
326
  pipe_component_list.append(pubtables)
@@ -335,20 +332,29 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
335
332
  cfg.SEGMENTATION.FULL_TABLE_TILING,
336
333
  cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
337
334
  cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
338
- LayoutType.table,
339
- [CellType.header, CellType.body, LayoutType.cell],
340
- [LayoutType.row, LayoutType.column],
341
- [CellType.row_number, CellType.column_number],
335
+ LayoutType.TABLE,
336
+ [CellType.HEADER, CellType.BODY, LayoutType.CELL],
337
+ [LayoutType.ROW, LayoutType.COLUMN],
338
+ [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
342
339
  cfg.SEGMENTATION.STRETCH_RULE,
343
340
  )
344
341
  pipe_component_list.append(table_segmentation)
345
342
 
346
343
  if cfg.USE_TABLE_REFINEMENT:
347
- table_segmentation_refinement = TableSegmentationRefinementService()
344
+ table_segmentation_refinement = TableSegmentationRefinementService(
345
+ [LayoutType.TABLE, LayoutType.TABLE_ROTATED],
346
+ [
347
+ LayoutType.CELL,
348
+ CellType.COLUMN_HEADER,
349
+ CellType.PROJECTED_ROW_HEADER,
350
+ CellType.SPANNING,
351
+ CellType.ROW_HEADER,
352
+ ],
353
+ )
348
354
  pipe_component_list.append(table_segmentation_refinement)
349
355
 
350
356
  if cfg.USE_PDF_MINER:
351
- pdf_text = PdfPlumberTextDetector()
357
+ pdf_text = PdfPlumberTextDetector(x_tolerance=cfg.PDF_MINER.X_TOLERANCE, y_tolerance=cfg.PDF_MINER.Y_TOLERANCE)
352
358
  d_text = TextExtractionService(pdf_text)
353
359
  pipe_component_list.append(d_text)
354
360
 
@@ -362,7 +368,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
362
368
 
363
369
  ocr = build_ocr(cfg)
364
370
  skip_if_text_extracted = cfg.USE_PDF_MINER
365
- extract_from_roi = LayoutType.word if cfg.OCR.USE_DOCTR else None
371
+ extract_from_roi = LayoutType.WORD if cfg.OCR.USE_DOCTR else None
366
372
  text = TextExtractionService(
367
373
  ocr, skip_if_text_extracted=skip_if_text_extracted, extract_from_roi=extract_from_roi
368
374
  )
@@ -371,7 +377,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
371
377
  if cfg.USE_PDF_MINER or cfg.USE_OCR:
372
378
  match = MatchingService(
373
379
  parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
374
- child_categories=LayoutType.word,
380
+ child_categories=LayoutType.WORD,
375
381
  matching_rule=cfg.WORD_MATCHING.RULE,
376
382
  threshold=cfg.WORD_MATCHING.THRESHOLD,
377
383
  max_parent_only=cfg.WORD_MATCHING.MAX_PARENT_ONLY,
@@ -379,7 +385,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
379
385
  pipe_component_list.append(match)
380
386
 
381
387
  order = TextOrderService(
382
- text_container=LayoutType.word,
388
+ text_container=LayoutType.WORD,
383
389
  text_block_categories=cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES,
384
390
  floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
385
391
  include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
@@ -391,7 +397,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
391
397
  pipe_component_list.append(order)
392
398
 
393
399
  page_parsing_service = PageParsingService(
394
- text_container=LayoutType.word,
400
+ text_container=LayoutType.WORD,
395
401
  floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
396
402
  include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
397
403
  )
@@ -401,9 +407,9 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
401
407
 
402
408
 
403
409
  def get_dd_analyzer(
404
- reset_config_file: bool = False,
405
- config_overwrite: Optional[List[str]] = None,
406
- path_config_file: Optional[Pathlike] = None,
410
+ reset_config_file: bool = True,
411
+ config_overwrite: Optional[list[str]] = None,
412
+ path_config_file: Optional[PathLikeOrStr] = None,
407
413
  ) -> DoctectionPipe:
408
414
  """
409
415
  Factory function for creating the built-in **deep**doctection analyzer.
@@ -430,8 +436,13 @@ def get_dd_analyzer(
430
436
  :return: A DoctectionPipe instance with given configs
431
437
  """
432
438
  config_overwrite = [] if config_overwrite is None else config_overwrite
433
- lib = "TF" if ast.literal_eval(os.environ.get("USE_TENSORFLOW", "False")) else "PT"
434
- device = get_device(False)
439
+ lib = "TF" if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE else "PT"
440
+ if lib == "TF":
441
+ device = get_tf_device()
442
+ elif lib == "PT":
443
+ device = get_torch_device()
444
+ else:
445
+ raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
435
446
  dd_one_config_path = maybe_copy_config_to_cache(
436
447
  get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
437
448
  )
@@ -1,38 +1,38 @@
1
1
  USE_LAYOUT: True
2
2
  USE_TABLE_SEGMENTATION: True
3
3
  TF:
4
- LAYOUT:
5
- WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
6
- FILTER:
7
- CELL:
8
- WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
9
- FILTER:
10
- ITEM:
11
- WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
12
- FILTER:
4
+ LAYOUT:
5
+ WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
6
+ FILTER:
7
+ CELL:
8
+ WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
9
+ FILTER:
10
+ ITEM:
11
+ WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
12
+ FILTER:
13
13
  PT:
14
- LAYOUT:
15
- WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
16
- WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
17
- FILTER:
18
- PAD:
19
- TOP: 60
20
- RIGHT: 60
21
- BOTTOM: 60
22
- LEFT: 60
23
- ITEM:
24
- WEIGHTS: item/d2_model_1639999_item_inf_only.pt
25
- WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
26
- FILTER:
27
- PAD:
28
- TOP: 60
29
- RIGHT: 60
30
- BOTTOM: 60
31
- LEFT: 60
32
- CELL:
33
- WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
34
- WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
35
- FILTER:
14
+ LAYOUT:
15
+ WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
16
+ WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
17
+ FILTER:
18
+ PAD:
19
+ TOP: 60
20
+ RIGHT: 60
21
+ BOTTOM: 60
22
+ LEFT: 60
23
+ ITEM:
24
+ WEIGHTS: item/d2_model_1639999_item_inf_only.pt
25
+ WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
26
+ FILTER:
27
+ PAD:
28
+ TOP: 60
29
+ RIGHT: 60
30
+ BOTTOM: 60
31
+ LEFT: 60
32
+ CELL:
33
+ WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
34
+ WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
35
+ FILTER:
36
36
  LAYOUT_NMS_PAIRS:
37
37
  COMBINATIONS:
38
38
  THRESHOLDS:
@@ -48,6 +48,9 @@ SEGMENTATION:
48
48
  STRETCH_RULE: equal
49
49
  USE_TABLE_REFINEMENT: True
50
50
  USE_PDF_MINER: False
51
+ PDF_MINER:
52
+ X_TOLERANCE: 3
53
+ Y_TOLERANCE: 3
51
54
  USE_OCR: True
52
55
  OCR:
53
56
  USE_TESSERACT: True