deepdoctection-0.42.0-py3-none-any.whl → deepdoctection-0.43-py3-none-any.whl

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries and as they appear in that public registry. It is provided for informational purposes only.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (124)
  1. deepdoctection/__init__.py +2 -1
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +904 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +157 -106
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +196 -113
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +25 -17
  104. deepdoctection/utils/env_info.py +85 -36
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -62
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.dist-info/RECORD +149 -0
  119. {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/WHEEL +1 -1
  120. deepdoctection/analyzer/_config.py +0 -146
  121. deepdoctection-0.42.0.dist-info/METADATA +0 -431
  122. deepdoctection-0.42.0.dist-info/RECORD +0 -148
  123. {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
@@ -15,12 +15,10 @@
15
15
  # See the License for the specific language governing permissions and
16
16
  # limitations under the License.
17
17
 
18
- """
19
- Module for **deep**doctection analyzer.
20
-
21
- -factory build_analyzer for a given config
22
18
 
23
- -user factory with a reduced config setting
19
+ """
20
+ - factory `build_analyzer` for a given config
21
+ - user factory with a reduced config setting
24
22
  """
25
23
 
26
24
  from __future__ import annotations
@@ -32,12 +30,12 @@ from ..extern.pt.ptutils import get_torch_device
32
30
  from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
33
31
  from ..pipe.doctectionpipe import DoctectionPipe
34
32
  from ..utils.env_info import ENV_VARS_TRUE
35
- from ..utils.file_utils import tensorpack_available, detectron2_available
33
+ from ..utils.file_utils import detectron2_available, tensorpack_available
36
34
  from ..utils.fs import get_configs_dir_path, get_package_path, maybe_copy_config_to_cache
37
35
  from ..utils.logger import LoggingRecord, logger
38
36
  from ..utils.metacfg import set_config_by_yaml
39
37
  from ..utils.types import PathLikeOrStr
40
- from ._config import cfg
38
+ from .config import cfg
41
39
  from .factory import ServiceFactory
42
40
 
43
41
  __all__ = [
@@ -47,32 +45,6 @@ __all__ = [
47
45
 
48
46
  _DD_ONE = "deepdoctection/configs/conf_dd_one.yaml"
49
47
  _TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"
50
- _MODEL_CHOICES = {
51
- "layout": [
52
- "layout/d2_model_0829999_layout_inf_only.pt",
53
- "xrf_layout/model_final_inf_only.pt",
54
- "microsoft/table-transformer-detection/pytorch_model.bin",
55
- ],
56
- "segmentation": [
57
- "item/model-1620000_inf_only.data-00000-of-00001",
58
- "xrf_item/model_final_inf_only.pt",
59
- "microsoft/table-transformer-structure-recognition/pytorch_model.bin",
60
- "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin",
61
- ],
62
- "ocr": ["Tesseract", "DocTr", "Textract"],
63
- "doctr_word": ["doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"],
64
- "doctr_recognition": [
65
- "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt",
66
- "doctr/crnn_vgg16_bn/pt/pytorch_model.bin",
67
- ],
68
- "llm": ["gpt-3.5-turbo", "gpt-4"],
69
- "segmentation_choices": {
70
- "item/model-1620000_inf_only.data-00000-of-00001": "cell/model-1800000_inf_only.data-00000-of-00001",
71
- "xrf_item/model_final_inf_only.pt": "xrf_cell/model_final_inf_only.pt",
72
- "microsoft/table-transformer-structure-recognition/pytorch_model.bin": None,
73
- "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin": None,
74
- },
75
- }
76
48
 
77
49
 
78
50
  def config_sanity_checks() -> None:
@@ -89,32 +61,35 @@ def config_sanity_checks() -> None:
89
61
 
90
62
  def get_dd_analyzer(
91
63
  reset_config_file: bool = True,
64
+ load_default_config_file: bool = False,
92
65
  config_overwrite: Optional[list[str]] = None,
93
66
  path_config_file: Optional[PathLikeOrStr] = None,
94
67
  ) -> DoctectionPipe:
95
68
  """
96
69
  Factory function for creating the built-in **deep**doctection analyzer.
97
70
 
98
- The Standard Analyzer is a pipeline that comprises the following analysis components:
99
-
100
- - Document layout analysis
101
-
102
- - Table segmentation
103
-
104
- - Text extraction/OCR
105
-
106
- - Reading order
107
-
108
- We refer to the various notebooks and docs for running an analyzer and changing the configs.
109
-
110
- :param reset_config_file: This will copy the `.yaml` file with default variables to the `.cache` and therefore
111
- resetting all configurations if set to `True`.
112
- :param config_overwrite: Passing a list of string arguments and values to overwrite the `.yaml` configuration with
113
- highest priority, e.g. ["USE_TABLE_SEGMENTATION=False",
114
- "USE_OCR=False",
115
- "TF.LAYOUT.WEIGHTS=my_fancy_pytorch_model"]
116
- :param path_config_file: Path to a custom config file. Can be outside of the .cache directory.
117
- :return: A DoctectionPipe instance with given configs
71
+ Info:
72
+ The Standard Analyzer is a pipeline that comprises the following analysis components:
73
+
74
+ - Rotation
75
+ - Document layout analysis
76
+ - Table segmentation
77
+ - Text extraction/OCR
78
+ - Reading order
79
+ - Layout linking
80
+
81
+ Args:
82
+ reset_config_file: This will copy the `.yaml` file with default variables to the `.cache` and therefore
83
+ resetting all configurations if set to `True`.
84
+ load_default_config_file: This will load the default config file from the `.cache` directory if set to `True`.
85
+ If set to `False`, the config file will be ignored.
86
+ config_overwrite: Passing a list of string arguments and values to overwrite the `.yaml`
87
+ configuration with highest priority, e.g. `["USE_TABLE_SEGMENTATION=False", "USE_OCR=False",
88
+ "TF.LAYOUT.WEIGHTS=my_fancy_pytorch_model"]`.
89
+ path_config_file: Path to a custom config file. Can be outside of the `.cache` directory.
90
+
91
+ Returns:
92
+ DoctectionPipe: A `DoctectionPipe` instance with given configs.
118
93
  """
119
94
  config_overwrite = [] if config_overwrite is None else config_overwrite
120
95
  if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE:
@@ -131,25 +106,24 @@ def get_dd_analyzer(
131
106
  )
132
107
  maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path() / "dd", _TESSERACT)
133
108
 
134
- # Set up of the configuration and logging
135
- file_cfg = set_config_by_yaml(dd_one_config_path if not path_config_file else path_config_file)
136
- cfg.freeze(freezed=False)
137
- cfg.overwrite_config(file_cfg)
138
-
139
109
  cfg.freeze(freezed=False)
110
+ if load_default_config_file:
111
+ # Set up of the configuration and logging
112
+ file_cfg = set_config_by_yaml(dd_one_config_path if not path_config_file else path_config_file)
113
+ cfg.overwrite_config(file_cfg)
140
114
  cfg.LANGUAGE = None
141
115
  cfg.LIB = lib
142
116
  cfg.DEVICE = device
143
117
  if not detectron2_available() or cfg.PT.LAYOUT.WEIGHTS is None:
144
- cfg.PT.ENFORCE_WEIGHTS.LAYOUT=False
118
+ cfg.PT.ENFORCE_WEIGHTS.LAYOUT = False
145
119
  if not detectron2_available() or cfg.PT.ITEM.WEIGHTS is None:
146
- cfg.PT.ENFORCE_WEIGHTS.ITEM=False
120
+ cfg.PT.ENFORCE_WEIGHTS.ITEM = False
147
121
  if not detectron2_available() or cfg.PT.CELL.WEIGHTS is None:
148
- cfg.PT.ENFORCE_WEIGHTS.CELL=False
149
- cfg.freeze()
122
+ cfg.PT.ENFORCE_WEIGHTS.CELL = False
150
123
 
151
124
  if config_overwrite:
152
125
  cfg.update_args(config_overwrite)
126
+ cfg.freeze()
153
127
 
154
128
  config_sanity_checks()
155
129
  logger.info(LoggingRecord(f"Config: \n {str(cfg)}", cfg.to_dict())) # type: ignore