deepdoctection 0.44.0__tar.gz → 0.45.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic.

Files changed (156)
  1. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/PKG-INFO +15 -15
  2. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/README.md +6 -4
  3. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/__init__.py +6 -3
  4. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/analyzer/config.py +41 -0
  5. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/analyzer/factory.py +249 -1
  6. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/configs/profiles.jsonl +2 -1
  7. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datapoint/image.py +1 -0
  8. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datapoint/view.py +162 -69
  9. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/base.py +1 -0
  10. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/__init__.py +1 -0
  11. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/d2detect.py +1 -1
  12. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/fastlang.py +6 -4
  13. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/hflayoutlm.py +23 -10
  14. deepdoctection-0.45.0/deepdoctection/extern/hflm.py +689 -0
  15. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/mapper/laylmstruct.py +7 -7
  16. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/pipe/language.py +4 -4
  17. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/pipe/lm.py +7 -3
  18. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/file_utils.py +34 -0
  19. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/settings.py +2 -0
  20. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/types.py +0 -1
  21. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/viz.py +3 -3
  22. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection.egg-info/PKG-INFO +15 -15
  23. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection.egg-info/requires.txt +8 -10
  24. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/setup.py +5 -5
  25. deepdoctection-0.44.0/deepdoctection/extern/hflm.py +0 -264
  26. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/LICENSE +0 -0
  27. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/analyzer/__init__.py +0 -0
  28. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/analyzer/dd.py +0 -0
  29. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/configs/__init__.py +0 -0
  30. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/configs/conf_dd_one.yaml +0 -0
  31. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/configs/conf_tesseract.yaml +0 -0
  32. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/dataflow/__init__.py +0 -0
  33. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/dataflow/base.py +0 -0
  34. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/dataflow/common.py +0 -0
  35. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/dataflow/custom.py +0 -0
  36. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/dataflow/custom_serialize.py +0 -0
  37. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/dataflow/parallel_map.py +0 -0
  38. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/dataflow/serialize.py +0 -0
  39. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/dataflow/stats.py +0 -0
  40. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datapoint/__init__.py +0 -0
  41. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datapoint/annotation.py +0 -0
  42. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datapoint/box.py +0 -0
  43. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datapoint/convert.py +0 -0
  44. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/__init__.py +0 -0
  45. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/adapter.py +0 -0
  46. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/dataflow_builder.py +0 -0
  47. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/info.py +0 -0
  48. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/instances/__init__.py +0 -0
  49. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/instances/doclaynet.py +0 -0
  50. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/instances/fintabnet.py +0 -0
  51. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/instances/funsd.py +0 -0
  52. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/instances/iiitar13k.py +0 -0
  53. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/instances/layouttest.py +0 -0
  54. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/instances/publaynet.py +0 -0
  55. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/instances/pubtables1m.py +0 -0
  56. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/instances/pubtabnet.py +0 -0
  57. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/instances/rvlcdip.py +0 -0
  58. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/instances/xfund.py +0 -0
  59. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
  60. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
  61. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/registry.py +0 -0
  62. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/datasets/save.py +0 -0
  63. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/eval/__init__.py +0 -0
  64. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/eval/accmetric.py +0 -0
  65. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/eval/base.py +0 -0
  66. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/eval/cocometric.py +0 -0
  67. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/eval/eval.py +0 -0
  68. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/eval/registry.py +0 -0
  69. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/eval/tedsmetric.py +0 -0
  70. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/eval/tp_eval_callback.py +0 -0
  71. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/base.py +0 -0
  72. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/deskew.py +0 -0
  73. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/doctrocr.py +0 -0
  74. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/hfdetr.py +0 -0
  75. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/model.py +0 -0
  76. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/pdftext.py +0 -0
  77. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/pt/__init__.py +0 -0
  78. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/pt/nms.py +0 -0
  79. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/pt/ptutils.py +0 -0
  80. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tessocr.py +0 -0
  81. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/texocr.py +0 -0
  82. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/__init__.py +0 -0
  83. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tfutils.py +0 -0
  84. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpcompat.py +0 -0
  85. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
  86. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
  87. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
  88. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -0
  89. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
  90. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -0
  91. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
  92. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -0
  93. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
  94. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -0
  95. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -0
  96. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -0
  97. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
  98. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
  99. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -0
  100. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
  101. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
  102. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
  103. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/extern/tpdetect.py +0 -0
  104. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/mapper/__init__.py +0 -0
  105. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/mapper/cats.py +0 -0
  106. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/mapper/cocostruct.py +0 -0
  107. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/mapper/d2struct.py +0 -0
  108. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/mapper/hfstruct.py +0 -0
  109. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/mapper/maputils.py +0 -0
  110. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/mapper/match.py +0 -0
  111. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/mapper/misc.py +0 -0
  112. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/mapper/pascalstruct.py +0 -0
  113. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/mapper/prodigystruct.py +0 -0
  114. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/mapper/pubstruct.py +0 -0
  115. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/mapper/tpstruct.py +0 -0
  116. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/mapper/xfundstruct.py +0 -0
  117. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/pipe/__init__.py +0 -0
  118. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/pipe/anngen.py +0 -0
  119. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/pipe/base.py +0 -0
  120. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/pipe/common.py +0 -0
  121. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/pipe/concurrency.py +0 -0
  122. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/pipe/doctectionpipe.py +0 -0
  123. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/pipe/layout.py +0 -0
  124. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/pipe/order.py +0 -0
  125. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/pipe/refine.py +0 -0
  126. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/pipe/registry.py +0 -0
  127. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/pipe/segment.py +0 -0
  128. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/pipe/sub_layout.py +0 -0
  129. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/pipe/text.py +0 -0
  130. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/pipe/transform.py +0 -0
  131. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/py.typed +0 -0
  132. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/train/__init__.py +0 -0
  133. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/train/d2_frcnn_train.py +0 -0
  134. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/train/hf_detr_train.py +0 -0
  135. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/train/hf_layoutlm_train.py +0 -0
  136. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/train/tp_frcnn_train.py +0 -0
  137. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/__init__.py +0 -0
  138. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/concurrency.py +0 -0
  139. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/context.py +0 -0
  140. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/develop.py +0 -0
  141. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/env_info.py +0 -0
  142. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/error.py +0 -0
  143. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/fs.py +0 -0
  144. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/identifier.py +0 -0
  145. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/logger.py +0 -0
  146. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/metacfg.py +0 -0
  147. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/mocks.py +0 -0
  148. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/pdf_utils.py +0 -0
  149. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/tqdm.py +0 -0
  150. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/transform.py +0 -0
  151. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection/utils/utils.py +0 -0
  152. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection.egg-info/SOURCES.txt +0 -0
  153. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection.egg-info/dependency_links.txt +0 -0
  154. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/deepdoctection.egg-info/top_level.txt +0 -0
  155. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/setup.cfg +0 -0
  156. {deepdoctection-0.44.0 → deepdoctection-0.45.0}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: deepdoctection
- Version: 0.44.0
+ Version: 0.45.0
  Summary: Repository for Document AI
  Home-page: https://github.com/deepdoctection/deepdoctection
  Author: Dr. Janis Meyer
@@ -24,10 +24,10 @@ Requires-Dist: jsonlines==3.1.0
  Requires-Dist: lazy-imports==0.3.1
  Requires-Dist: mock==4.0.3
  Requires-Dist: networkx>=2.7.1
- Requires-Dist: numpy<2.0,>=1.21
+ Requires-Dist: numpy>2.0
  Requires-Dist: packaging>=20.0
  Requires-Dist: Pillow>=10.0.0
- Requires-Dist: pypdf>=3.16.0
+ Requires-Dist: pypdf>=6.0.0
  Requires-Dist: pypdfium2>=4.30.0
  Requires-Dist: pyyaml>=6.0.1
  Requires-Dist: pyzmq>=16
@@ -43,10 +43,10 @@ Requires-Dist: jsonlines==3.1.0; extra == "tf"
  Requires-Dist: lazy-imports==0.3.1; extra == "tf"
  Requires-Dist: mock==4.0.3; extra == "tf"
  Requires-Dist: networkx>=2.7.1; extra == "tf"
- Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
+ Requires-Dist: numpy>2.0; extra == "tf"
  Requires-Dist: packaging>=20.0; extra == "tf"
  Requires-Dist: Pillow>=10.0.0; extra == "tf"
- Requires-Dist: pypdf>=3.16.0; extra == "tf"
+ Requires-Dist: pypdf>=6.0.0; extra == "tf"
  Requires-Dist: pypdfium2>=4.30.0; extra == "tf"
  Requires-Dist: pyyaml>=6.0.1; extra == "tf"
  Requires-Dist: pyzmq>=16; extra == "tf"
@@ -58,11 +58,10 @@ Requires-Dist: tensorpack==0.11; extra == "tf"
  Requires-Dist: protobuf==3.20.1; extra == "tf"
  Requires-Dist: tensorflow-addons>=0.17.1; extra == "tf"
  Requires-Dist: tf2onnx>=1.9.2; extra == "tf"
- Requires-Dist: python-doctr==0.9.0; extra == "tf"
+ Requires-Dist: python-doctr==0.10.0; extra == "tf"
  Requires-Dist: pycocotools>=2.0.2; extra == "tf"
  Requires-Dist: boto3==1.34.102; extra == "tf"
  Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
- Requires-Dist: fasttext-wheel; extra == "tf"
  Requires-Dist: jdeskew>=0.2.2; extra == "tf"
  Requires-Dist: apted==1.0.3; extra == "tf"
  Requires-Dist: distance==0.1.3; extra == "tf"
@@ -75,10 +74,10 @@ Requires-Dist: jsonlines==3.1.0; extra == "pt"
  Requires-Dist: lazy-imports==0.3.1; extra == "pt"
  Requires-Dist: mock==4.0.3; extra == "pt"
  Requires-Dist: networkx>=2.7.1; extra == "pt"
- Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
+ Requires-Dist: numpy>2.0; extra == "pt"
  Requires-Dist: packaging>=20.0; extra == "pt"
  Requires-Dist: Pillow>=10.0.0; extra == "pt"
- Requires-Dist: pypdf>=3.16.0; extra == "pt"
+ Requires-Dist: pypdf>=6.0.0; extra == "pt"
  Requires-Dist: pypdfium2>=4.30.0; extra == "pt"
  Requires-Dist: pyyaml>=6.0.1; extra == "pt"
  Requires-Dist: pyzmq>=16; extra == "pt"
@@ -89,11 +88,10 @@ Requires-Dist: tqdm>=4.64.0; extra == "pt"
  Requires-Dist: timm>=0.9.16; extra == "pt"
  Requires-Dist: transformers>=4.48.0; extra == "pt"
  Requires-Dist: accelerate>=0.29.1; extra == "pt"
- Requires-Dist: python-doctr==0.9.0; extra == "pt"
+ Requires-Dist: python-doctr==0.10.0; extra == "pt"
  Requires-Dist: pycocotools>=2.0.2; extra == "pt"
  Requires-Dist: boto3==1.34.102; extra == "pt"
  Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
- Requires-Dist: fasttext-wheel; extra == "pt"
  Requires-Dist: jdeskew>=0.2.2; extra == "pt"
  Requires-Dist: apted==1.0.3; extra == "pt"
  Requires-Dist: distance==0.1.3; extra == "pt"
@@ -183,7 +181,8 @@ It also provides a framework for training, evaluating and inferencing Document A
  [**LiLT**](https://github.com/jpWang/LiLT) and selected
  [**Bert**](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)-style including features like sliding windows.
  - Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
- - Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
+ - Language detection with `papluca/xlm-roberta-base-language-detection`. [**fastText**](https://github.com/facebookresearch/fastText) is still available but
+   but will be removed in a future version.
  - Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
  - Fine-tuning and evaluation tools.
  - Lot's of [tutorials](https://github.com/deepdoctection/notebooks)
@@ -294,7 +293,7 @@ alt="text" width="40%">

  - Linux or macOS. Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available.
  - Python >= 3.9
- - 2.2 \<= PyTorch **or** 2.11 \<= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
+ - 2.6 \<= PyTorch **or** 2.11 \<= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
    Tensorflow support will be stopped from Python 3.11 onwards.
  - To fine-tune models, a GPU is recommended.

@@ -321,7 +320,7 @@ For a simple setup which is enough to parse documents with the default setting,

  ```
  pip install transformers
- pip install python-doctr==0.9.0
+ pip install python-doctr==0.10.0 # If you use Python 3.10 or higher you can use the latest version.
  pip install deepdoctection
  ```

@@ -329,8 +328,9 @@ pip install deepdoctection

  ```
  pip install tensorpack
- pip install python-doctr==0.9.0
  pip install deepdoctection
+ pip install "numpy>=1.21,<2.0" --upgrade --force-reinstall # because TF 2.11 does not support numpy 2.0
+ pip install "python-doctr==0.9.0"
  ```

  Both setups are sufficient to run the [**introduction notebook**](https://github.com/deepdoctection/notebooks/blob/main/Get_Started.ipynb).
@@ -40,7 +40,8 @@ It also provides a framework for training, evaluating and inferencing Document A
  [**LiLT**](https://github.com/jpWang/LiLT) and selected
  [**Bert**](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)-style including features like sliding windows.
  - Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
- - Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
+ - Language detection with `papluca/xlm-roberta-base-language-detection`. [**fastText**](https://github.com/facebookresearch/fastText) is still available but
+   but will be removed in a future version.
  - Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
  - Fine-tuning and evaluation tools.
  - Lot's of [tutorials](https://github.com/deepdoctection/notebooks)
@@ -151,7 +152,7 @@ alt="text" width="40%">

  - Linux or macOS. Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available.
  - Python >= 3.9
- - 2.2 \<= PyTorch **or** 2.11 \<= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
+ - 2.6 \<= PyTorch **or** 2.11 \<= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
    Tensorflow support will be stopped from Python 3.11 onwards.
  - To fine-tune models, a GPU is recommended.

@@ -178,7 +179,7 @@ For a simple setup which is enough to parse documents with the default setting,

  ```
  pip install transformers
- pip install python-doctr==0.9.0
+ pip install python-doctr==0.10.0 # If you use Python 3.10 or higher you can use the latest version.
  pip install deepdoctection
  ```

@@ -186,8 +187,9 @@ pip install deepdoctection

  ```
  pip install tensorpack
- pip install python-doctr==0.9.0
  pip install deepdoctection
+ pip install "numpy>=1.21,<2.0" --upgrade --force-reinstall # because TF 2.11 does not support numpy 2.0
+ pip install "python-doctr==0.9.0"
  ```

  Both setups are sufficient to run the [**introduction notebook**](https://github.com/deepdoctection/notebooks/blob/main/Get_Started.ipynb).
@@ -25,11 +25,10 @@ from .utils.logger import LoggingRecord, logger

  # pylint: enable=wrong-import-position

- __version__ = "0.44.0"
+ __version__ = "0.45.0"

  _IMPORT_STRUCTURE = {
-     "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
-     "configs": ["update_cfg_from_defaults"],
+     "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory", "update_cfg_from_defaults"],
      "dataflow": [
          "DataFlowTerminated",
          "DataFlowResetStateNotCalled",
@@ -186,7 +185,9 @@ _IMPORT_STRUCTURE = {
          "HFLayoutLmv3SequenceClassifier",
          "HFLiltTokenClassifier",
          "HFLiltSequenceClassifier",
+         "HFLmTokenClassifier",
          "HFLmSequenceClassifier",
+         "HFLmLanguageDetector",
          "ModelProfile",
          "ModelCatalog",
          "print_model_infos",
@@ -315,6 +316,8 @@ _IMPORT_STRUCTURE = {
          "get_apted_requirement",
          "distance_available",
          "get_distance_requirement",
+         "numpy_v1_available",
+         "get_numpy_v1_requirement",
          "transformers_available",
          "get_transformers_requirement",
          "detectron2_available",
@@ -520,6 +520,13 @@ cfg.USE_LAYOUT_LINK = False
  # (e.g., by grouping orphan text containers). Only applicable if list items were previously grouped.
  cfg.USE_LINE_MATCHER = False

+ # Enables a sequence classification pipeline component, e.g. a LayoutLM or a Bert-like model.
+ cfg.USE_LM_SEQUENCE_CLASS = False
+
+ # Enables a token classification pipeline component, e.g. a LayoutLM or Bert-like model
+ cfg.USE_LM_TOKEN_CLASS = False
+
+
  # Relevant when LIB = TF. Specifies the layout detection model.
  # This model should detect multiple or single objects across an entire page.
  # Currently, only one default model is supported.
@@ -899,6 +906,40 @@ cfg.LAYOUT_LINK.PARENTAL_CATEGORIES = [LayoutType.FIGURE, LayoutType.TABLE]
  # These are typically smaller or subordinate elements (e.g., captions).
  cfg.LAYOUT_LINK.CHILD_CATEGORIES = [LayoutType.CAPTION]

+
+ # Weights configuration for sequence classifier. This will be a fine-tuned version of a LayoutLM, LayoutLMv2,
+ # LayoutXLM, LayoutLMv3, LiLT or Roberta base model for sequence classification.
+ cfg.LM_SEQUENCE_CLASS.WEIGHTS = None
+
+ # When predicting document classes, it might be possible that some pages are empty or do not contain any text, in
+ # which case the model will be unable to predict anything. If set to `True` it will
+ # assign images with no features the category `TokenClasses.OTHER`.
+ cfg.LM_SEQUENCE_CLASS.USE_OTHER_AS_DEFAULT_CATEGORY = False
+
+ # Weights configuration for sequence classifier. This will be a fine-tuned version of a LayoutLM, LayoutLMv2,
+ # LayoutXLM, LayoutLMv3, LiLT or Roberta base model for token classification.
+ cfg.LM_TOKEN_CLASS.WEIGHTS = None
+
+ # When predicting token classes, it might be possible that some words might not get sent to the model because they are
+ # categorized as not eligible token (e.g. empty string). If set to `True` it will assign all words without token
+ # as `TokenClasses.OTHER`.
+ cfg.LM_TOKEN_CLASS.USE_OTHER_AS_DEFAULT_CATEGORY = False
+
+ # Using bounding boxes of segments instead of words might improve model accuracy
+ # for models that have been trained on segments rather than words (e.g. LiLT, LayoutLMv3).
+ # Choose a single or a sequence of layout segments to use their bounding boxes. Note,
+ # that the layout segments need to have a child-relationship with words. If a word
+ # does not appear as child, it will use the word bounding box.
+ cfg.LM_TOKEN_CLASS.SEGMENT_POSITIONS = None
+
+ # If the output of the `tokenizer` exceeds the `max_length` sequence length, a
+ # sliding window will be created with each window having `max_length` sequence
+ # input. When using `SLIDING_WINDOW_STRIDE=0` no strides will be created,
+ # otherwise it will create slides with windows shifted `SLIDING_WINDOW_STRIDE` to
+ # the right.
+ cfg.LM_TOKEN_CLASS.SLIDING_WINDOW_STRIDE = 0
+
+
  # Freezes the configuration to make it immutable.
  # This prevents accidental modification at runtime.
  cfg.freeze()
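
Note: the new `USE_LM_SEQUENCE_CLASS`/`USE_LM_TOKEN_CLASS` switches and the `LM_SEQUENCE_CLASS`/`LM_TOKEN_CLASS` blocks plug into the analyzer configuration. A minimal sketch of enabling the sequence classifier, assuming `get_dd_analyzer` still accepts `config_overwrite` strings as in earlier releases and that the weights name below (a placeholder) is registered in the `ModelCatalog`:

```python
# Sketch only: `config_overwrite` and the weights name are assumptions, not taken from this diff.
import deepdoctection as dd

analyzer = dd.get_dd_analyzer(
    config_overwrite=[
        "USE_LM_SEQUENCE_CLASS=True",  # new switch added in 0.45.0
        # hypothetical fine-tuned sequence-classification artifact registered in the ModelCatalog
        "LM_SEQUENCE_CLASS.WEIGHTS=my_org/my_doc_classifier/model.safetensors",
        "LM_SEQUENCE_CLASS.USE_OTHER_AS_DEFAULT_CATEGORY=True",
    ]
)

df = analyzer.analyze(path="/path/to/scans")
df.reset_state()
for page in df:
    print(page.document_type)  # predicted sequence class; attribute name assumed from earlier releases
```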
@@ -19,9 +19,10 @@
  `ServiceFactory` for building analyzers
  """

+ from __future__ import annotations

  from os import environ
- from typing import Union
+ from typing import TYPE_CHECKING, Union

  from lazy_imports import try_import

@@ -29,6 +30,18 @@ from ..extern.base import ImageTransformer, ObjectDetector, PdfMiner
  from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
  from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
  from ..extern.hfdetr import HFDetrDerivedDetector
+ from ..extern.hflayoutlm import (
+     HFLayoutLmSequenceClassifier,
+     HFLayoutLmTokenClassifier,
+     HFLayoutLmv2SequenceClassifier,
+     HFLayoutLmv2TokenClassifier,
+     HFLayoutLmv3SequenceClassifier,
+     HFLayoutLmv3TokenClassifier,
+     HFLiltSequenceClassifier,
+     HFLiltTokenClassifier,
+     get_tokenizer_from_model_class,
+ )
+ from ..extern.hflm import HFLmSequenceClassifier, HFLmTokenClassifier
  from ..extern.model import ModelCatalog, ModelDownloadManager
  from ..extern.pdftext import PdfPlumberTextDetector
  from ..extern.tessocr import TesseractOcrDetector, TesseractRotationTransformer
@@ -45,6 +58,7 @@ from ..pipe.common import (
  )
  from ..pipe.doctectionpipe import DoctectionPipe
  from ..pipe.layout import ImageLayoutService, skip_if_category_or_service_extracted
+ from ..pipe.lm import LMSequenceClassifierService, LMTokenClassifierService
  from ..pipe.order import TextOrderService
  from ..pipe.refine import TableSegmentationRefinementService
  from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
@@ -60,6 +74,10 @@ from ..utils.transform import PadTransform
  with try_import() as image_guard:
      from botocore.config import Config # type: ignore

+ if TYPE_CHECKING:
+     from ..extern.hflayoutlm import LayoutSequenceModels, LayoutTokenModels
+     from ..extern.hflm import LmSequenceModels, LmTokenModels
+

  __all__ = [
      "ServiceFactory",
@@ -841,6 +859,226 @@ class ServiceFactory:
          """
          return ServiceFactory._build_text_order_service(config)

+     @staticmethod
+     def _build_sequence_classifier(config: AttrDict) -> Union[LayoutSequenceModels, LmSequenceModels]:
+         """
+         Builds and returns a sequence classifier instance.
+
+         Args:
+             config: Configuration object that determines the type of sequence classifier to construct.
+
+         Returns:
+             A sequence classifier instance constructed according to the specified configuration.
+         """
+         config_path = ModelCatalog.get_full_path_configs(config.LM_SEQUENCE_CLASS.WEIGHTS)
+         weights_path = ModelDownloadManager.maybe_download_weights_and_configs(config.LM_SEQUENCE_CLASS.WEIGHTS)
+         profile = ModelCatalog.get_profile(config.LM_SEQUENCE_CLASS.WEIGHTS)
+         categories = profile.categories if profile.categories is not None else {}
+         use_xlm_tokenizer = "xlm_tokenizer" == profile.architecture
+
+         if profile.model_wrapper in ("HFLayoutLmSequenceClassifier",):
+             return HFLayoutLmSequenceClassifier(
+                 path_config_json=config_path,
+                 path_weights=weights_path,
+                 categories=categories,
+                 device=config.DEVICE,
+                 use_xlm_tokenizer=use_xlm_tokenizer,
+             )
+         if profile.model_wrapper in ("HFLayoutLmv2SequenceClassifier",):
+             return HFLayoutLmv2SequenceClassifier(
+                 path_config_json=config_path,
+                 path_weights=weights_path,
+                 categories=categories,
+                 device=config.DEVICE,
+                 use_xlm_tokenizer=use_xlm_tokenizer,
+             )
+         if profile.model_wrapper in ("HFLayoutLmv3SequenceClassifier",):
+             return HFLayoutLmv3SequenceClassifier(
+                 path_config_json=config_path,
+                 path_weights=weights_path,
+                 categories=categories,
+                 device=config.DEVICE,
+                 use_xlm_tokenizer=use_xlm_tokenizer,
+             )
+         if profile.model_wrapper in ("HFLiltSequenceClassifier",):
+             return HFLiltSequenceClassifier(
+                 path_config_json=config_path,
+                 path_weights=weights_path,
+                 categories=categories,
+                 device=config.DEVICE,
+                 use_xlm_tokenizer=use_xlm_tokenizer,
+             )
+         if profile.model_wrapper in ("HFLmSequenceClassifier",):
+             return HFLmSequenceClassifier(
+                 path_config_json=config_path,
+                 path_weights=weights_path,
+                 categories=categories,
+                 device=config.DEVICE,
+                 use_xlm_tokenizer=use_xlm_tokenizer,
+             )
+         raise ValueError(f"Unsupported model wrapper: {profile.model_wrapper}")
+
+     @staticmethod
+     def build_sequence_classifier(config: AttrDict) -> Union[LayoutSequenceModels, LmSequenceModels]:
+         """
+         Builds and returns a sequence classifier instance.
+
+         Args:
+             config: Configuration object that determines the type of sequence classifier to construct.
+
+         Returns:
+             A sequence classifier instance constructed according to the specified configuration.
+         """
+         return ServiceFactory._build_sequence_classifier(config)
+
+     @staticmethod
+     def _build_sequence_classifier_service(
+         config: AttrDict, sequence_classifier: Union[LayoutSequenceModels, LmSequenceModels]
+     ) -> LMSequenceClassifierService:
+         """
+         Building a sequence classifier service.
+
+         Args:
+             config: Configuration object.
+             sequence_classifier: Sequence classifier instance.
+
+         Returns:
+             LMSequenceClassifierService: Text order service instance.
+         """
+         tokenizer_fast = get_tokenizer_from_model_class(
+             sequence_classifier.model.__class__.__name__, sequence_classifier.use_xlm_tokenizer
+         )
+
+         return LMSequenceClassifierService(
+             tokenizer=tokenizer_fast,
+             language_model=sequence_classifier,
+             use_other_as_default_category=config.LM_SEQUENCE_CLASS.USE_OTHER_AS_DEFAULT_CATEGORY,
+         )
+
+     @staticmethod
+     def build_sequence_classifier_service(
+         config: AttrDict, sequence_classifier: Union[LayoutSequenceModels, LmSequenceModels]
+     ) -> LMSequenceClassifierService:
+         """
+         Building a sequence classifier service.
+
+         Args:
+             config: Configuration object.
+             sequence_classifier: Sequence classifier instance.
+
+         Returns:
+             LMSequenceClassifierService: Text order service instance.
+         """
+         return ServiceFactory._build_sequence_classifier_service(config, sequence_classifier)
+
+     @staticmethod
+     def _build_token_classifier(config: AttrDict) -> Union[LayoutTokenModels, LmTokenModels]:
+         """
+         Builds and returns a token classifier model.
+
+         Args:
+             config: Configuration object.
+
+         Returns:
+             The instantiated token classifier model.
+         """
+         config_path = ModelCatalog.get_full_path_configs(config.LM_TOKEN_CLASS.WEIGHTS)
+         weights_path = ModelDownloadManager.maybe_download_weights_and_configs(config.LM_TOKEN_CLASS.WEIGHTS)
+         profile = ModelCatalog.get_profile(config.LM_TOKEN_CLASS.WEIGHTS)
+         categories = profile.categories if profile.categories is not None else {}
+         use_xlm_tokenizer = "xlm_tokenizer" == profile.architecture
+         if profile.model_wrapper in ("HFLayoutLmTokenClassifier",):
+             return HFLayoutLmTokenClassifier(
+                 path_config_json=config_path,
+                 path_weights=weights_path,
+                 categories=categories,
+                 device=config.DEVICE,
+                 use_xlm_tokenizer=use_xlm_tokenizer,
+             )
+         if profile.model_wrapper in ("HFLayoutLmv2TokenClassifier",):
+             return HFLayoutLmv2TokenClassifier(
+                 path_config_json=config_path,
+                 path_weights=weights_path,
+                 categories=categories,
+                 device=config.DEVICE,
+             )
+         if profile.model_wrapper in ("HFLayoutLmv3TokenClassifier",):
+             return HFLayoutLmv3TokenClassifier(
+                 path_config_json=config_path,
+                 path_weights=weights_path,
+                 categories=categories,
+                 device=config.DEVICE,
+             )
+         if profile.model_wrapper in ("HFLiltTokenClassifier",):
+             return HFLiltTokenClassifier(
+                 path_config_json=config_path,
+                 path_weights=weights_path,
+                 categories=categories,
+                 device=config.DEVICE,
+             )
+         if profile.model_wrapper in ("HFLmTokenClassifier",):
+             return HFLmTokenClassifier(
+                 path_config_json=config_path,
+                 path_weights=weights_path,
+                 categories=categories,
+             )
+         raise ValueError(f"Unsupported model wrapper: {profile.model_wrapper}")
+
+     @staticmethod
+     def build_token_classifier(config: AttrDict) -> Union[LayoutTokenModels, LmTokenModels]:
+         """
+         Builds and returns a token classifier model.
+
+         Args:
+             config: Configuration object.
+
+         Returns:
+             The instantiated token classifier model.
+         """
+         return ServiceFactory._build_token_classifier(config)
+
+     @staticmethod
+     def _build_token_classifier_service(
+         config: AttrDict, token_classifier: Union[LayoutTokenModels, LmTokenModels]
+     ) -> LMTokenClassifierService:
+         """
+         Building a token classifier service.
+
+         Args:
+             config: Configuration object.
+             token_classifier: Token classifier instance.
+
+         Returns:
+             A LMTokenClassifierService instance.
+         """
+         tokenizer_fast = get_tokenizer_from_model_class(
+             token_classifier.model.__class__.__name__, token_classifier.use_xlm_tokenizer
+         )
+
+         return LMTokenClassifierService(
+             tokenizer=tokenizer_fast,
+             language_model=token_classifier,
+             use_other_as_default_category=config.LM_TOKEN_CLASS.USE_OTHER_AS_DEFAULT_CATEGORY,
+             segment_positions=config.LM_TOKEN_CLASS.SEGMENT_POSITIONS,
+             sliding_window_stride=config.LM_TOKEN_CLASS.SLIDING_WINDOW_STRIDE,
+         )
+
+     @staticmethod
+     def build_token_classifier_service(
+         config: AttrDict, token_classifier: Union[LayoutTokenModels, LmTokenModels]
+     ) -> LMTokenClassifierService:
+         """
+         Building a token classifier service.
+
+         Args:
+             config: Configuration object.
+             token_classifier: Token classifier instance.
+
+         Returns:
+             A LMTokenClassifierService instance.
+         """
+         return ServiceFactory._build_token_classifier_service(config, token_classifier)
+
      @staticmethod
      def _build_page_parsing_service(config: AttrDict) -> PageParsingService:
          """
@@ -955,6 +1193,16 @@ class ServiceFactory:
          line_list_matching_service = ServiceFactory.build_line_matching_service(config)
          pipe_component_list.append(line_list_matching_service)

+         if config.USE_LM_SEQUENCE_CLASS:
+             sequence_classifier = ServiceFactory.build_sequence_classifier(config)
+             sequence_classifier_service = ServiceFactory.build_sequence_classifier_service(config, sequence_classifier)
+             pipe_component_list.append(sequence_classifier_service)
+
+         if config.USE_LM_TOKEN_CLASS:
+             token_classifier = ServiceFactory.build_token_classifier(config)
+             token_classifier_service = ServiceFactory.build_token_classifier_service(config, token_classifier)
+             pipe_component_list.append(token_classifier_service)
+
          page_parsing_service = ServiceFactory.build_page_parsing_service(config)

          return DoctectionPipe(pipeline_component_list=pipe_component_list, page_parsing_service=page_parsing_service)
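
Note: besides being wired into `build_analyzer` via the `USE_LM_SEQUENCE_CLASS`/`USE_LM_TOKEN_CLASS` branches just above, the new builders are public static methods and can be used stand-alone. A minimal sketch, assuming `AttrDict` lives in `deepdoctection.utils.metacfg` and auto-creates nested nodes, and using a hypothetical `ModelCatalog` entry for the weights:

```python
# Sketch only: the AttrDict import path/behaviour and the WEIGHTS value are assumptions,
# not taken from this diff.
from deepdoctection import ServiceFactory
from deepdoctection.utils.metacfg import AttrDict

cfg = AttrDict()
cfg.DEVICE = "cpu"
cfg.LM_TOKEN_CLASS.WEIGHTS = "my_org/my_ner_layoutlm/model.safetensors"  # hypothetical catalog entry
cfg.LM_TOKEN_CLASS.USE_OTHER_AS_DEFAULT_CATEGORY = False
cfg.LM_TOKEN_CLASS.SEGMENT_POSITIONS = None
cfg.LM_TOKEN_CLASS.SLIDING_WINDOW_STRIDE = 0

# Two-step pattern used by build_analyzer: build the model, then wrap it in a pipeline component.
token_classifier = ServiceFactory.build_token_classifier(cfg)
token_service = ServiceFactory.build_token_classifier_service(cfg, token_classifier)
# `token_service` can now be appended to a custom DoctectionPipe component list.
```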
@@ -30,4 +30,5 @@
  {"name": "Felix92/doctr-torch-parseq-multilingual-v1/pytorch_model.bin", "description": "", "size": [63286381], "tp_model": false, "config": "Felix92/doctr-torch-parseq-multilingual-v1/config.json", "preprocessor_config": null, "hf_repo_id": "Felix92/doctr-torch-parseq-multilingual-v1", "hf_model_name": "pytorch_model.bin", "hf_config_file": ["config.json"], "urls": null, "categories": {}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "DoctrTextRecognizer", "architecture": "parseq", "padding": null}
  {"name": "doctr/crnn_vgg16_bn/pt/master-fde31e4a.pt", "description": "MASTER", "size": [63286381], "tp_model": false, "config": null, "preprocessor_config": null, "hf_repo_id": null, "hf_model_name": null, "hf_config_file": null, "urls": ["https://doctr-static.mindee.com/models?id=v0.7.0/master-fde31e4a.pt&src=0"], "categories": {}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "DoctrTextRecognizer", "architecture": "master", "padding": null}
  {"name": "Aryn/deformable-detr-DocLayNet/model.safetensors", "description": "Deformable DEtection TRansformer (DETR), trained on DocLayNet (including 80k annotated pages in 11 classes).", "size": [115511753], "tp_model": false, "config": "Aryn/deformable-detr-DocLayNet/config.json", "preprocessor_config": "Aryn/deformable-detr-DocLayNet/preprocessor_config.json", "hf_repo_id": "Aryn/deformable-detr-DocLayNet", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "default_type", "2": "caption", "11": "text", "12": "title", "3": "footnote", "4": "formula", "5": "list_item", "6": "page_footer", "7": "page_header", "8": "figure", "9": "section_header", "10": "table"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
- {"name": "deepdoctection/tatr_tab_struct_v2/model.safetensors", "description": "Table Transformer (DETR) model trained on PubTables1M. It was introduced in the paper Aligning benchmark datasets for table structure recognition by Smock et al. This model is devoted to table structure recognition and assumes to receive a slightly croppedtable as input. It will predict rows, column and spanning cells. Use a padding of around 5 pixels. This artefact has been converted from deepdoctection/tatr_tab_struct_v2/pytorch_model.bin and should be used to reduce security issues", "size": [115511753], "tp_model": false, "config": "deepdoctection/tatr_tab_struct_v2/config.json", "preprocessor_config": "deepdoctection/tatr_tab_struct_v2/preprocessor_config.json", "hf_repo_id": "deepdoctection/tatr_tab_struct_v2", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "table", "2": "column", "3": "row", "4": "column_header", "5": "projected_row_header", "6": "spanning"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
+ {"name": "deepdoctection/tatr_tab_struct_v2/model.safetensors", "description": "Table Transformer (DETR) model trained on PubTables1M. It was introduced in the paper Aligning benchmark datasets for table structure recognition by Smock et al. This model is devoted to table structure recognition and assumes to receive a slightly croppedtable as input. It will predict rows, column and spanning cells. Use a padding of around 5 pixels. This artefact has been converted from deepdoctection/tatr_tab_struct_v2/pytorch_model.bin and should be used to reduce security issues", "size": [115511753], "tp_model": false, "config": "deepdoctection/tatr_tab_struct_v2/config.json", "preprocessor_config": "deepdoctection/tatr_tab_struct_v2/preprocessor_config.json", "hf_repo_id": "deepdoctection/tatr_tab_struct_v2", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "table", "2": "column", "3": "row", "4": "column_header", "5": "projected_row_header", "6": "spanning"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
+ {"name": "papluca/xlm-roberta-base-language-detection/model.safetensors", "description": "This model is an XLM-RoBERTa transformer model with a classification head on top (i.e. a linear layer on top of the pooled output). For additional information please refer to the xlm-roberta-base model card or to the paper Unsupervised Cross-lingual Representation Learning at Scale by Conneau et al.", "size": [101971449], "tp_model": false, "config": "papluca/xlm-roberta-base-language-detection/config.json", "preprocessor_config": null, "hf_repo_id": "papluca/xlm-roberta-base-language-detection", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json"], "urls": null, "categories": {"1": "jpn", "2": "dut", "3": "ara", "4": "pol", "5": "deu", "6": "ita", "7": "por", "8": "tur", "9": "spa", "10": "hin", "11": "gre", "12": "urd", "13": "bul", "14": "eng", "15": "fre", "16": "chi", "17": "rus", "18": "tha", "19": "swa", "20": "vie"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFLmLanguageDetector", "architecture": null, "padding": null}
@@ -42,6 +42,7 @@ from .convert import as_dict, convert_b64_to_np_array, convert_np_array_to_b64,

  class MetaAnnotationDict(TypedDict):
      """MetaAnnotationDict"""
+
      image_annotations: list[str]
      sub_categories: dict[str, dict[str, list[str]]]
      relationships: dict[str, list[str]]