deepdoctection 0.42.1__py3-none-any.whl → 0.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (124)
  1. deepdoctection/__init__.py +2 -1
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +904 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +157 -106
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +196 -113
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +25 -17
  104. deepdoctection/utils/env_info.py +85 -36
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -62
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.dist-info/RECORD +149 -0
  119. deepdoctection/analyzer/_config.py +0 -146
  120. deepdoctection-0.42.1.dist-info/METADATA +0 -431
  121. deepdoctection-0.42.1.dist-info/RECORD +0 -148
  122. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/WHEEL +0 -0
  123. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
deepdoctection/analyzer/config.py (new file)
@@ -0,0 +1,904 @@
+ # -*- coding: utf-8 -*-
+ # File: config.py
+
+ # Copyright 2024 Dr. Janis Meyer. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ """
+ This module defines all configuration options for the deepdoctection analyzer pipeline using an `AttrDict`.
+ The configuration controls component activation, model selection, thresholds, and processing behavior.
+
+ General note: All models used in `*.WEIGHTS` must be registered in the `ModelCatalog`.
+ Registered models are listed in `deepdoctection/profiles.jsonl`. To add new models,
+ either extend this file with additional JSON objects or provide a separate `JSONL` file
+ and reference it via the `MODEL_CATALOG` environment variable.
+
+ Relevant only for Tesseract OCR. Specifies the language model to use.
+ Supported language codes are listed at:
+ <https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html>.
+
+ Note:
+ Models must be downloaded in advance.
+
+ Attributes:
+ cfg: The main configuration object that stores all settings as a hierarchical AttrDict.
+
+ ---
+
+ ## General Configuration
+
+ LANGUAGE:
+ Relevant only for Tesseract OCR. Specifies the OCR model to use.
+ Supported language codes are listed at:
+ <https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html>.
+ Note: models must be downloaded in advance.
+
+ LIB:
+ Deep learning framework. Choose either 'TF' (TensorFlow) or 'PT' (PyTorch).
+ Selection is made via environment variables: DD_USE_TF or DD_USE_PT.
+
+ DEVICE:
+ Device configuration.
+ For PyTorch: torch.device("cpu"), torch.device("mps"), or torch.device("cuda")
+ For TensorFlow: tf.device("/cpu:0") or tf.device("/gpu:0")
+
+ ---
+
+ ## Pipeline Component Activation
+
+ USE_ROTATOR:
+ Enables the initial pipeline component using TesseractRotationTransformer to auto-rotate pages
+ by 90-degree increments. All subsequent components process the rotated page.
+
+ USE_LAYOUT:
+ Enables layout analysis component (second in the pipeline) for either full document layout analysis
+ (DLA) or single-object detection. Additional configurations via PT.LAYOUT.*, TF.LAYOUT.*, and
+ PT.ENFORCE_WEIGHTS.LAYOUT.
+
+ USE_LAYOUT_NMS:
+ Enables optional fine-grained Non-Maximum Suppression (NMS) after layout detection.
+ Configure via LAYOUT_NMS_PAIRS.* settings.
+
+ USE_TABLE_SEGMENTATION:
+ Enables table segmentation (third and later pipeline components).
+ Applies row/column detection, optional cell detection, and segmentation services.
+ Configure sub-services via PT.ITEM.*, TF.ITEM.*, PT.CELL.*, TF.CELL.*, and SEGMENTATION.*
+
+ USE_TABLE_REFINEMENT:
+ Enables optional refinement of table structure to ensure valid HTML generation.
+ Should be set to False when using the Table Transformer approach.
+
+ USE_PDF_MINER:
+ Enables text extraction using PDFPlumber. Only works on PDFs with embedded text layers.
+ Configure additional behavior using PDF_MINER.*
+
+ USE_OCR:
+ Enables OCR functionality using Tesseract, DocTr, or Textract.
+ Also activates MatchingService and TextOrderingService to associate text with layout elements.
+ Further configurations via OCR.*, WORD_MATCHING.*, TEXT_CONTAINER, and TEXT_ORDERING.*
+
+ USE_LAYOUT_LINK:
+ Enables MatchingService to associate nearby layout elements (e.g., figures and captions).
+
+ USE_LINE_MATCHER:
+ Enables line matching in post-processing. Useful when synthetic line elements are created
+ (e.g., by grouping orphan text containers). Only applicable if list items were previously grouped.
+
+ ---
+ ## Layout Detection Models
+
+ ### TensorFlow Layout Configuration
+
+ TF.LAYOUT.WEIGHTS:
+ Relevant when LIB = TF. Specifies the layout detection model.
+ This model should detect multiple or single objects across an entire page.
+ Currently, only one default model is supported.
+
+ TF.LAYOUT.FILTER:
+ Filters out unnecessary categories from the layout detection model output.
+ Accepts either a list of strings (e.g., ['list', 'figure']) or a list of ObjectTypes
+ (e.g., [LayoutType.LIST, LayoutType.FIGURE]).
+
+ ### PyTorch Layout Configuration
+
+ PT.ENFORCE_WEIGHTS.LAYOUT:
+ Relevant when LIB = PT. Allows selection between two model formats:
+ 1. Standard PyTorch weights (.pt or .safetensors), or
+ 2. TorchScript weights (.ts), which require only the Torch runtime and not the model implementation.
+ If PT.ENFORCE_WEIGHTS.LAYOUT is set to True, PT.LAYOUT.WEIGHTS will take precedence.
+ The get_dd_analyzer() function will set PT.ENFORCE_WEIGHTS.LAYOUT = False automatically
+ if Detectron2 is not installed or PT.LAYOUT.WEIGHTS is None.
+
+ PT.LAYOUT.WEIGHTS:
+ Specifies the PyTorch layout detection model (standard weights).
+ Must detect single or multiple objects across the full page.
+ Acceptable formats: .pt or .safetensors (e.g.,
+ layout/d2_model_0829999_layout_inf_only.pt,
+ microsoft/table-transformer-detection/pytorch_model.bin,
+ Aryn/deformable-detr-DocLayNet/model.safetensors).
+
+ PT.LAYOUT.WEIGHTS_TS:
+ Specifies the TorchScript version of the layout model.
+ Must detect single or multiple objects across the full page.
+ Acceptable format: .ts files (e.g., layout/d2_model_0829999_layout_inf_only.ts).
+
+ PT.LAYOUT.FILTER:
+ Filters out unwanted categories from the model's predictions.
+ Accepts either string values (e.g., ['list', 'figure']) or ObjectTypes
+ (e.g., [LayoutType.LIST, LayoutType.FIGURE]).
+
+ PT.LAYOUT.PADDING:
+ Adds padding to the image, which may be required for some models such as
+ microsoft/table-transformer-detection/pytorch_model.bin to improve detection accuracy.
+ Padding values should not be manually set; they are defined in the ModelProfile inside ServiceFactory.
+ If PT.LAYOUT.PADDING is True, you must also set the values for PT.LAYOUT.PAD.TOP, .RIGHT, .BOTTOM, and .LEFT.
+
+ PT.LAYOUT.PAD.*:
+ Padding values for each edge of the image (TOP, RIGHT, BOTTOM, LEFT).
+
+ ---
+ ## Layout NMS Configuration
+
+ LAYOUT_NMS_PAIRS.*:
+ Non-Maximum Suppression (NMS) configuration for overlapping layout elements.
+ For each element pair, define:
+ 1. COMBINATIONS: the combination of element types
+ 2. THRESHOLDS: the IoU threshold
+ 3. PRIORITY: which element has priority (or None)
+
+ Example:
+ LAYOUT_NMS_PAIRS.COMBINATIONS = [['table', 'title'], ['table', 'text']]
+ LAYOUT_NMS_PAIRS.THRESHOLDS = [0.001, 0.01]
+ LAYOUT_NMS_PAIRS.PRIORITY = ['table', None]
+
+ ---
+ ## Table Components Configuration
+
+ ### TensorFlow Item (Row/Column) Detection
+
+ TF.ITEM.WEIGHTS:
+ Relevant when LIB = TF. Specifies the item detection model (for rows and columns).
+ Currently, only the default model is supported.
+
+ TF.ITEM.FILTER:
+ Filters out unnecessary categories from the item detection model.
+ Accepts either a list of strings (e.g., ['row', 'column']) or ObjectTypes.
+
+ ### PyTorch Item (Row/Column) Detection
+
+ PT.ENFORCE_WEIGHTS.ITEM:
+ Relevant when LIB = PT. Use either TorchScript weights via PT.ITEM.WEIGHTS_TS
+ or standard PyTorch weights via PT.ITEM.WEIGHTS (.pt or .safetensors).
+ If PT.ENFORCE_WEIGHTS.ITEM = True, PT.ITEM.WEIGHTS will take precedence over TorchScript.
+
+ PT.ITEM.WEIGHTS:
+ Specifies the PyTorch model weights for item detection.
+ Use either .pt or .safetensors files.
+
+ PT.ITEM.WEIGHTS_TS:
+ Specifies the TorchScript model for item detection.
+ Use .ts files for deployment without model implementation dependencies.
+
+ PT.ITEM.FILTER:
+ Filters out unnecessary categories from the item detection model.
+ For example, the model microsoft/table-transformer-structure-recognition/pytorch_model.bin
+ predicts not only rows and columns, but also tables. To prevent redundant outputs, use:
+ PT.ITEM.FILTER = ['table']
+
+ PT.ITEM.PADDING:
+ Enables image padding for item detection. Required for models such as
+ microsoft/table-transformer-structure-recognition/pytorch_model.bin to optimize accuracy.
+ Padding values are derived from the ModelProfile within the ServiceFactory and should not be manually set.
+ If PT.ITEM.PADDING = True, you must define all edge values: TOP, RIGHT, BOTTOM, and LEFT.
+
+ PT.ITEM.PAD.*:
+ Padding values for each edge of the sub-image (TOP, RIGHT, BOTTOM, LEFT).
+
+ ### Cell Detection Configuration
+
+ Configuration for the second SubImagePipelineComponent.
+ This is only used in the original Deepdoctection table recognition approach,
+ not with the Table Transformer method.
+
+ ### TensorFlow Cell Detection
+
+ TF.CELL.WEIGHTS:
+ Configuration for the second SubImagePipelineComponent.
+ This is only used in the original Deepdoctection table recognition approach,
+ not with the Table Transformer method.
+ The CELL configuration structure mirrors that of the ITEM component.
+
+ TF.CELL.FILTER:
+ Filters out unnecessary categories from the cell detection model output.
+
+ ### PyTorch Cell Detection
+
+ PT.ENFORCE_WEIGHTS.CELL:
+ Determines whether PT.CELL.WEIGHTS should take priority over PT.CELL.WEIGHTS_TS.
+ If set to True, standard PyTorch weights are enforced.
+
+ PT.CELL.WEIGHTS:
+ Specifies the PyTorch model weights for cell detection using standard formats (.pt or
+ .safetensors).
+
+ PT.CELL.WEIGHTS_TS:
+ Specifies the TorchScript model for cell detection (.ts format).
+
+ PT.CELL.FILTER:
+ Filters out unwanted categories from the cell detection model.
+
+ PT.CELL.PADDING:
+ Enables padding for the sub-image used in cell detection.
+ Required for certain models to enhance prediction quality.
+ If set to True, padding values for all four edges must be defined.
+
+ PT.CELL.PAD.*:
+ Padding values for each edge of the sub-image used in cell detection (TOP, RIGHT, BOTTOM, LEFT).
+
+ ---
+ ## Table Segmentation Configuration
+
+ SEGMENTATION.ASSIGNMENT_RULE:
+ Specifies the rule used to assign detected cells to rows and columns.
+ Can be either 'iou' (Intersection over Union) or 'ioa' (Intersection over Area).
+ In the Table Transformer approach, this also applies to special cell types like spanning or header cells.
+
+ SEGMENTATION.THRESHOLD_ROWS:
+ Threshold for assigning a (special) cell to a row based on the chosen rule (IOU or
+ IOA). The row assignment is based on the highest-overlapping row.
+ Multiple overlaps can lead to increased rowspan.
+
+ SEGMENTATION.THRESHOLD_COLS:
+ Threshold for assigning a (special) cell to a column based on the chosen rule (IOU or
+ IOA). The column assignment is based on the highest-overlapping column.
+
+ SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS:
+ Removes overlapping rows based on an IoU threshold.
+ Helps prevent multiple row spans caused by overlapping detections.
+ Note: for better alignment, SEGMENTATION.FULL_TABLE_TILING can be enabled.
+ Using a low threshold here may result in a very coarse grid.
+
+ SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS:
+ Same as above, but applied to columns.
+
+ SEGMENTATION.FULL_TABLE_TILING:
+ Ensures that predicted rows and columns fully cover the table region.
+ When enabled, rows will be stretched horizontally and vertically to fit the full region.
+ For rows, the first row will be stretched to the top, and the space to the second row is used to estimate the
+ bottom edge. This rule applies similarly to columns.
+
+ SEGMENTATION.STRETCH_RULE:
+ Defines how row and column boundaries are stretched when tiling is enabled.
+ Options:
+ - "left": lower edge equals the upper edge of the next row
+ - "equal": lower edge is halfway between two adjacent rows
+
+ SEGMENTATION.TABLE_NAME:
+ Specifies the layout category used to identify tables.
+ Used in both Deepdoctection and Table Transformer approaches.
+
+ SEGMENTATION.CELL_NAMES:
+ Lists the layout or cell types used in the original Deepdoctection approach.
+ Used by TableSegmentationService for cell assignments.
+
+ SEGMENTATION.PUBTABLES_CELL_NAMES:
+ Lists all cell types used by the Table Transformer approach
+ (PubtablesSegmentationService). LayoutType.CELL is synthetically generated and not predicted by the structure
+ recognition model.
+
+ SEGMENTATION.PUBTABLES_SPANNING_CELL_NAMES:
+ Subset of PUBTABLES_CELL_NAMES that represent spanning/header cells.
+ These need to be matched with row or column elements.
+
+ SEGMENTATION.ITEM_NAMES:
+ Lists the layout categories used to identify row and column elements.
+ Used by TableSegmentationService.
+
+ SEGMENTATION.PUBTABLES_ITEM_NAMES:
+ Equivalent to ITEM_NAMES but used in the Table Transformer approach.
+
+ SEGMENTATION.SUB_ITEM_NAMES:
+ Used in TableSegmentationService to specify sub-category annotations for row and
+ column numbers.
+
+ SEGMENTATION.PUBTABLES_SUB_ITEM_NAMES:
+ Equivalent to SUB_ITEM_NAMES, but used with the Table Transformer approach.
+
+ SEGMENTATION.PUBTABLES_ITEM_HEADER_CELL_NAMES:
+ Used in PubtablesSegmentationService.
+ Specifies which cells should be treated as header cells that need to be linked to row/column elements.
+
+ SEGMENTATION.PUBTABLES_ITEM_HEADER_THRESHOLDS:
+ Defines the threshold values for matching column/row header cells to
+ their respective rows/columns in the Table Transformer approach. The matching rule is defined in
+ SEGMENTATION.ASSIGNMENT_RULE.
+
+ ---
+ ## Text Extraction Configuration
+
+ Configuration options for PDF text extraction using PDFPlumber.
+ These values are passed directly to pdfplumber.utils.extract_words().
+ For reference, see:
+ https://github.com/jsvine/pdfplumber/blob/main/pdfplumber/utils/text.py
+
+ PDF_MINER.X_TOLERANCE:
+ Horizontal tolerance when merging characters into words.
+ Characters that are horizontally closer than this value will be grouped into a single word.
+
+ PDF_MINER.Y_TOLERANCE:
+ Vertical tolerance when grouping characters into lines.
+ Characters within this vertical range will be considered part of the same line.
+
+ ## OCR Configuration
+
+ OCR engine selection.
+ If `cfg.USE_OCR = True`, then one of the following must be set to `True`:
+ - `cfg.OCR.USE_TESSERACT`
+ - `cfg.OCR.USE_DOCTR`
+ - `cfg.OCR.USE_TEXTRACT`
+ All other engines must be set to False.
+
+ OCR.USE_TESSERACT:
+ Enables Tesseract as the OCR engine.
+ Note: Tesseract must be installed separately. This integration does not use pytesseract.
+ Configuration options are defined in a separate file: conf_tesseract.yaml.
+
+ OCR.CONFIG.TESSERACT:
+ Path to the Tesseract configuration file.
+
+ OCR.USE_DOCTR:
+ Enables DocTR as the OCR engine.
+ DocTR provides flexible and lightweight OCR models with strong accuracy and versatility.
+
+ OCR.USE_TEXTRACT:
+ Enables AWS Textract as the OCR engine.
+ Requires the following environment variables to be set:
+ AWS_ACCESS_KEY, AWS_SECRET_KEY, and AWS_REGION.
+ Alternatively, AWS credentials can be configured via the AWS CLI.
+
+ DocTR OCR uses a two-stage process: word detection followed by text recognition.
+ The following weights configure each stage for TensorFlow and PyTorch.
+
+ OCR.WEIGHTS.DOCTR_WORD.TF:
+ TensorFlow weights for the word detection model used by DocTR.
+
+ OCR.WEIGHTS.DOCTR_WORD.PT:
+ PyTorch weights for the word detection model used by DocTR.
+
+ OCR.WEIGHTS.DOCTR_RECOGNITION.TF:
+ TensorFlow weights for the text recognition model used by DocTR.
+
+ OCR.WEIGHTS.DOCTR_RECOGNITION.PT:
+ PyTorch weights for the text recognition model used by DocTR.
+
+ ---
+ ## Text Processing Configuration
+
+ TEXT_CONTAINER:
+ Specifies the annotation type used as a text container.
+ A text container is typically an ImageAnnotation generated by the OCR engine or PDF mining tool.
+ It contains a sub-annotation of type WordType.CHARACTERS.
+ Most commonly, text containers are of type LayoutType.WORD, but LayoutType.LINE may also be used.
+ It is recommended to align this value with IMAGE_DEFAULTS.TEXT_CONTAINER
+ rather than modifying it directly in the config.
+
+ WORD_MATCHING.PARENTAL_CATEGORIES:
+ Specifies the layout categories considered as potential parents of text
+ containers.
+
+ WORD_MATCHING.RULE:
+ Rule used for matching: either 'iou' (intersection over union) or 'ioa' (intersection over
+ area).
+
+ WORD_MATCHING.THRESHOLD:
+ Threshold for the selected matching rule (IOU or IOA).
+ Text containers must exceed this threshold to be assigned to a layout section.
+
+ WORD_MATCHING.MAX_PARENT_ONLY:
+ If a text container overlaps with multiple layout sections,
+ setting this to True will assign it only to the best-matching (i.e., highest-overlapping) section.
+ Prevents duplication of text in the output.
+
+ TEXT_ORDERING.TEXT_BLOCK_CATEGORIES:
+ Specifies which layout categories must be ordered (e.g., paragraphs, list items). These are layout blocks
+ that will be processed by the TextOrderingService.
+
+ TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES:
+ Specifies which text blocks are considered floating (not aligned with
+ strict columns or grids). These will be linked with a subcategory of type Relationships.READING_ORDER.
+
+ TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER:
+ Determines whether residual (unmatched) text containers should be included in the ordering process.
+ If set to True, orphaned text containers are grouped into lines and added to the layout ordering.
+ If set to False, unmatched text containers will not appear in the output.
+
+ TEXT_ORDERING.STARTING_POINT_TOLERANCE:
+ Tolerance used to determine whether a text block's left/right coordinate lies within a column's boundary.
+ Helps with assigning text blocks to columns based on horizontal alignment.
+
+ TEXT_ORDERING.BROKEN_LINE_TOLERANCE:
+ Horizontal distance threshold for grouping words into the same line.
+ If the gap between words exceeds this value, they will be treated as belonging to separate lines or columns.
+
+ TEXT_ORDERING.HEIGHT_TOLERANCE:
+ Used for ordering vertically broken floating text blocks into coherent columns.
+ Defines vertical alignment tolerance between adjacent text blocks.
+
+ TEXT_ORDERING.PARAGRAPH_BREAK:
+ Defines the spacing threshold that indicates a paragraph break in vertically
+ arranged text blocks. Helps determine reading order in multi-column, broken layouts.
+
+ ---
+ ## Layout Linking Configuration
+
+ Configuration for linking spatially related layout sections
+ (e.g., associating figures with their captions) based on proximity.
+ The distance is calculated using the center points of the layout elements.
+
+ LAYOUT_LINK.PARENTAL_CATEGORIES:
+ Specifies the parent layout categories in the link relationship.
+ These are the elements to which related components (e.g., captions) should be linked.
+
+ LAYOUT_LINK.CHILD_CATEGORIES:
+ Specifies the child layout categories in the link relationship.
+ These are typically smaller or subordinate elements (e.g., captions).
+
+ """
+
+ from ..datapoint.view import IMAGE_DEFAULTS
+ from ..utils.metacfg import AttrDict
+ from ..utils.settings import CellType, LayoutType
+
+ cfg = AttrDict()
+
+ # General note: All models used in *.WEIGHTS must be registered in the ModelCatalog.
+ # Registered models are listed in deepdoctection/profiles.jsonl. To add new models,
+ # either extend this file with additional JSON objects or provide a separate JSONL file
+ # and reference it via the MODEL_CATALOG environment variable.
+
+ # Relevant only for Tesseract OCR. Specifies the language model to use.
+ # Supported language codes are listed at:
+ # https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html.
+ # Note: models must be downloaded in advance.
+ cfg.LANGUAGE = None
+
+ # Deep learning framework. Choose either 'TF' (TensorFlow) or 'PT' (PyTorch).
+ # Selection is made via environment variables: DD_USE_TF or DD_USE_PT.
+ cfg.LIB = None
+
+ # Device configuration.
+ # For PyTorch: torch.device("cpu"), torch.device("mps"), or torch.device("cuda")
+ # For TensorFlow: tf.device("/cpu:0") or tf.device("/gpu:0")
+ cfg.DEVICE = None
+
+ # Enables the initial pipeline component using TesseractRotationTransformer to auto-rotate pages
+ # by 90-degree increments. All subsequent components process the rotated page.
+ cfg.USE_ROTATOR = False
+
+ # Enables layout analysis component (second in the pipeline) for either full document layout analysis (DLA)
+ # or single-object detection. Additional configurations via PT.LAYOUT.*, TF.LAYOUT.*, and PT.ENFORCE_WEIGHTS.LAYOUT.
+ cfg.USE_LAYOUT = True
+
+ # Enables optional fine-grained Non-Maximum Suppression (NMS) after layout detection.
+ # Configure via LAYOUT_NMS_PAIRS.* settings.
+ cfg.USE_LAYOUT_NMS = True
+
+ # Enables table segmentation (third and later pipeline components).
+ # Applies row/column detection, optional cell detection, and segmentation services.
+ # Configure sub-services via PT.ITEM.*, TF.ITEM.*, PT.CELL.*, TF.CELL.*, and SEGMENTATION.*
+ cfg.USE_TABLE_SEGMENTATION = True
+
+ # Enables optional refinement of table structure to ensure valid HTML generation.
+ # Should be set to False when using the Table Transformer approach.
+ cfg.USE_TABLE_REFINEMENT = False
+
+ # Enables text extraction using PDFPlumber. Only works on PDFs with embedded text layers.
+ # Configure additional behavior using PDF_MINER.*
+ cfg.USE_PDF_MINER = False
+
+ # Enables OCR functionality using Tesseract, DocTr, or Textract.
+ # Also activates MatchingService and TextOrderingService to associate text with layout elements.
+ # Further configurations via OCR.*, WORD_MATCHING.*, TEXT_CONTAINER, and TEXT_ORDERING.*
+ cfg.USE_OCR = True
+
+ # Enables MatchingService to associate nearby layout elements (e.g., figures and captions).
+ cfg.USE_LAYOUT_LINK = False
+
+ # Enables line matching in post-processing. Useful when synthetic line elements are created
+ # (e.g., by grouping orphan text containers). Only applicable if list items were previously grouped.
+ cfg.USE_LINE_MATCHER = False
+
+ # Relevant when LIB = TF. Specifies the layout detection model.
+ # This model should detect multiple or single objects across an entire page.
+ # Currently, only one default model is supported.
+ cfg.TF.LAYOUT.WEIGHTS = "layout/model-800000_inf_only.data-00000-of-00001"
+
+ # Filters out unnecessary categories from the layout detection model output.
+ # Accepts either a list of strings (e.g., ['list', 'figure']) or a list of ObjectTypes
+ # (e.g., [LayoutType.LIST, LayoutType.FIGURE]).
+ cfg.TF.LAYOUT.FILTER = None
+
+ # Relevant when LIB = PT. Allows selection between two model formats:
+ # 1. Standard PyTorch weights (.pt or .safetensors), or
+ # 2. TorchScript weights (.ts), which require only the Torch runtime and not the model implementation.
+ # If PT.ENFORCE_WEIGHTS.LAYOUT is set to True, PT.LAYOUT.WEIGHTS will take precedence.
+ # The get_dd_analyzer() function will set PT.ENFORCE_WEIGHTS.LAYOUT = False automatically
+ # if Detectron2 is not installed or PT.LAYOUT.WEIGHTS is None.
+ cfg.PT.ENFORCE_WEIGHTS.LAYOUT = True
+
+ # Specifies the PyTorch layout detection model (standard weights).
+ # Must detect single or multiple objects across the full page.
+ # Acceptable formats: .pt or .safetensors (e.g.,
+ # layout/d2_model_0829999_layout_inf_only.pt,
+ # microsoft/table-transformer-detection/pytorch_model.bin,
+ # Aryn/deformable-detr-DocLayNet/model.safetensors).
+ cfg.PT.LAYOUT.WEIGHTS = "Aryn/deformable-detr-DocLayNet/model.safetensors"
+
+ # Specifies the TorchScript version of the layout model.
+ # Must detect single or multiple objects across the full page.
+ # Acceptable format: .ts files (e.g., layout/d2_model_0829999_layout_inf_only.ts).
+ cfg.PT.LAYOUT.WEIGHTS_TS = "layout/d2_model_0829999_layout_inf_only.ts"
+
+ # Filters out unwanted categories from the model’s predictions.
+ # Accepts either string values (e.g., ['list', 'figure']) or ObjectTypes
+ # (e.g., [LayoutType.LIST, LayoutType.FIGURE]).
+ cfg.PT.LAYOUT.FILTER = None
+
+ # Adds padding to the image, which may be required for some models such as
+ # microsoft/table-transformer-detection/pytorch_model.bin to improve detection accuracy.
+ # Padding values should not be manually set; they are defined in the ModelProfile inside ServiceFactory.
+ # If PT.LAYOUT.PADDING is True, you must also set the values for PT.LAYOUT.PAD.TOP, .RIGHT, .BOTTOM, and .LEFT.
+ cfg.PT.LAYOUT.PADDING = False
+
+ # Padding value for the top edge of the image. Required by some layout detection models.
+ cfg.PT.LAYOUT.PAD.TOP = 0
+
+ # Padding value for the right edge of the image. Required by some layout detection models.
+ cfg.PT.LAYOUT.PAD.RIGHT = 0
+
+ # Padding value for the bottom edge of the image. Required by some layout detection models.
+ cfg.PT.LAYOUT.PAD.BOTTOM = 0
+
+ # Padding value for the left edge of the image. Required by some layout detection models.
+ cfg.PT.LAYOUT.PAD.LEFT = 0
+
+ # Non-Maximum Suppression (NMS) configuration for overlapping layout elements.
+ # For each element pair, define:
+ # 1. the combination of element types,
+ # 2. the IoU threshold, and
+ # 3. which element has priority (or None).
+ #
+ # Example:
+ # LAYOUT_NMS_PAIRS.COMBINATIONS = [['table', 'title'], ['table', 'text']]
+ # LAYOUT_NMS_PAIRS.THRESHOLDS = [0.001, 0.01]
+ # LAYOUT_NMS_PAIRS.PRIORITY = ['table', None]
+ cfg.LAYOUT_NMS_PAIRS.COMBINATIONS = [
+ [LayoutType.TABLE, LayoutType.TITLE],
+ [LayoutType.TABLE, LayoutType.TEXT],
+ [LayoutType.TABLE, LayoutType.KEY_VALUE_AREA],
+ [LayoutType.TABLE, LayoutType.LIST_ITEM],
+ [LayoutType.TABLE, LayoutType.LIST],
+ [LayoutType.TABLE, LayoutType.FIGURE],
+ [LayoutType.TITLE, LayoutType.TEXT],
+ [LayoutType.TEXT, LayoutType.KEY_VALUE_AREA],
+ [LayoutType.TEXT, LayoutType.LIST_ITEM],
+ [LayoutType.TEXT, LayoutType.CAPTION],
+ [LayoutType.KEY_VALUE_AREA, LayoutType.LIST_ITEM],
+ [LayoutType.FIGURE, LayoutType.CAPTION],
+ ]
+ cfg.LAYOUT_NMS_PAIRS.THRESHOLDS = [0.001, 0.01, 0.01, 0.001, 0.01, 0.01, 0.05, 0.01, 0.01, 0.01, 0.01, 0.001]
+ cfg.LAYOUT_NMS_PAIRS.PRIORITY = [
+ LayoutType.TABLE,
+ LayoutType.TABLE,
+ LayoutType.TABLE,
+ LayoutType.TABLE,
+ LayoutType.TABLE,
+ LayoutType.TABLE,
+ LayoutType.TEXT,
+ LayoutType.TEXT,
+ None,
+ LayoutType.CAPTION,
+ LayoutType.KEY_VALUE_AREA,
+ LayoutType.FIGURE,
+ ]
+
+ # Relevant when LIB = TF. Specifies the item detection model (for rows and columns).
+ # Currently, only the default model is supported.
+ cfg.TF.ITEM.WEIGHTS = "item/model-1620000_inf_only.data-00000-of-00001"
+
+ # Filters out unnecessary categories from the item detection model.
+ # Accepts either a list of strings (e.g., ['row', 'column']) or ObjectTypes.
+ cfg.TF.ITEM.FILTER = None
+
+ # Relevant when LIB = PT. Use either TorchScript weights via PT.ITEM.WEIGHTS_TS
+ # or standard PyTorch weights via PT.ITEM.WEIGHTS (.pt or .safetensors).
+ # If PT.ENFORCE_WEIGHTS.ITEM = True, PT.ITEM.WEIGHTS will take precedence over TorchScript.
+ cfg.PT.ENFORCE_WEIGHTS.ITEM = True
+
+ # Specifies the PyTorch model weights for item detection.
+ # Use either .pt or .safetensors files.
+ cfg.PT.ITEM.WEIGHTS = "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin"
+
+ # Specifies the TorchScript model for item detection.
+ # Use .ts files for deployment without model implementation dependencies.
+ cfg.PT.ITEM.WEIGHTS_TS = "item/d2_model_1639999_item_inf_only.ts"
+
+ # Filters out unnecessary categories from the item detection model.
+ # For example, the model microsoft/table-transformer-structure-recognition/pytorch_model.bin
+ # predicts not only rows and columns, but also tables. To prevent redundant outputs, use:
+ # PT.ITEM.FILTER = ['table']
+ cfg.PT.ITEM.FILTER = ["table"]
+
+ # Enables image padding for item detection. Required for models such as
+ # microsoft/table-transformer-structure-recognition/pytorch_model.bin to optimize accuracy.
+ # Padding values are derived from the ModelProfile within the ServiceFactory and should not be manually set.
+ # If PT.ITEM.PADDING = True, you must define all edge values: TOP, RIGHT, BOTTOM, and LEFT.
+ cfg.PT.ITEM.PADDING = False
+
+ # Padding value for the top edge of the sub-image used in item detection.
+ cfg.PT.ITEM.PAD.TOP = 60
+
+ # Padding value for the right edge of the sub-image used in item detection.
+ cfg.PT.ITEM.PAD.RIGHT = 60
+
+ # Padding value for the bottom edge of the sub-image used in item detection.
+ cfg.PT.ITEM.PAD.BOTTOM = 60
+
+ # Padding value for the left edge of the sub-image used in item detection.
+ cfg.PT.ITEM.PAD.LEFT = 60
+
+ # Configuration for the second SubImagePipelineComponent.
+ # This is only used in the original Deepdoctection table recognition approach,
+ # not with the Table Transformer method.
+ # The CELL configuration structure mirrors that of the ITEM component.
+ cfg.TF.CELL.WEIGHTS = "cell/model-1800000_inf_only.data-00000-of-00001"
+
+ # Filters out unnecessary categories from the cell detection model output.
+ cfg.TF.CELL.FILTER = None
+
+ # Determines whether PT.CELL.WEIGHTS should take priority over PT.CELL.WEIGHTS_TS.
+ # If set to True, standard PyTorch weights are enforced.
+ cfg.PT.ENFORCE_WEIGHTS.CELL = True
+
+ # Specifies the PyTorch model weights for cell detection using standard formats (.pt or .safetensors).
+ cfg.PT.CELL.WEIGHTS = "cell/d2_model_1849999_cell_inf_only.pt"
+
+ # Specifies the TorchScript model for cell detection (.ts format).
+ cfg.PT.CELL.WEIGHTS_TS = "cell/d2_model_1849999_cell_inf_only.ts"
+
+ # Filters out unwanted categories from the cell detection model.
+ cfg.PT.CELL.FILTER = None
+
+ # Enables padding for the sub-image used in cell detection.
+ # Required for certain models to enhance prediction quality.
+ # If set to True, padding values for all four edges must be defined.
+ cfg.PT.CELL.PADDING = False
+
+ # Padding value for the top edge of the sub-image used in cell detection.
+ cfg.PT.CELL.PAD.TOP = 60
+
+ # Padding value for the right edge of the sub-image used in cell detection.
+ cfg.PT.CELL.PAD.RIGHT = 60
+
+ # Padding value for the bottom edge of the sub-image used in cell detection.
+ cfg.PT.CELL.PAD.BOTTOM = 60
+
+ # Padding value for the left edge of the sub-image used in cell detection.
+ cfg.PT.CELL.PAD.LEFT = 60
+
+ # Specifies the rule used to assign detected cells to rows and columns.
+ # Can be either 'iou' (Intersection over Union) or 'ioa' (Intersection over Area).
+ # In the Table Transformer approach, this also applies to special cell types like spanning or header cells.
+ cfg.SEGMENTATION.ASSIGNMENT_RULE = "ioa"
+
+ # Threshold for assigning a (special) cell to a row based on the chosen rule (IOU or IOA).
+ # The row assignment is based on the highest-overlapping row.
+ # Multiple overlaps can lead to increased rowspan.
+ cfg.SEGMENTATION.THRESHOLD_ROWS = 0.4
+
+ # Threshold for assigning a (special) cell to a column based on the chosen rule (IOU or IOA).
+ # The column assignment is based on the highest-overlapping column.
+ cfg.SEGMENTATION.THRESHOLD_COLS = 0.4
+
+ # Removes overlapping rows based on an IoU threshold.
+ # Helps prevent multiple row spans caused by overlapping detections.
+ # Note: for better alignment, SEGMENTATION.FULL_TABLE_TILING can be enabled.
+ # Using a low threshold here may result in a very coarse grid.
+ cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS = 0.2
+
+ # Same as above, but applied to columns.
+ cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS = 0.2
+
+ # Ensures that predicted rows and columns fully cover the table region.
+ # When enabled, rows will be stretched horizontally and vertically to fit the full region.
+ # For rows, the first row will be stretched to the top, and the space to the second row is used to estimate the
+ # bottom edge. This rule applies similarly to columns.
+ cfg.SEGMENTATION.FULL_TABLE_TILING = True
+
+ # Defines how row and column boundaries are stretched when tiling is enabled.
+ # Options:
+ # - "left": lower edge equals the upper edge of the next row
+ # - "equal": lower edge is halfway between two adjacent rows
+ cfg.SEGMENTATION.STRETCH_RULE = "equal"
+
+ # Specifies the layout category used to identify tables.
+ # Used in both Deepdoctection and Table Transformer approaches.
+ cfg.SEGMENTATION.TABLE_NAME = LayoutType.TABLE
+
+ # Lists the layout or cell types used in the original Deepdoctection approach.
+ # Used by TableSegmentationService for cell assignments.
+ cfg.SEGMENTATION.CELL_NAMES = [CellType.HEADER, CellType.BODY, LayoutType.CELL]
+
+ # Lists all cell types used by the Table Transformer approach (PubtablesSegmentationService).
+ # LayoutType.CELL is synthetically generated and not predicted by the structure recognition model.
+ cfg.SEGMENTATION.PUBTABLES_CELL_NAMES = [
+ LayoutType.CELL,
+ ]
+
+ # Subset of PUBTABLES_CELL_NAMES that represent spanning/header cells.
+ # These need to be matched with row or column elements.
+ cfg.SEGMENTATION.PUBTABLES_SPANNING_CELL_NAMES = [
+ CellType.SPANNING,
+ ]
+
+ # Lists the layout categories used to identify row and column elements.
+ # Used by TableSegmentationService.
+ cfg.SEGMENTATION.ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
+
+ # Equivalent to ITEM_NAMES but used in the Table Transformer approach.
+ cfg.SEGMENTATION.PUBTABLES_ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
+
+ # Used in TableSegmentationService to specify sub-category annotations for row and column numbers.
+ cfg.SEGMENTATION.SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
+
+ # Equivalent to SUB_ITEM_NAMES, but used with the Table Transformer approach.
+ cfg.SEGMENTATION.PUBTABLES_SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
+
+ # Used in PubtablesSegmentationService.
+ # Specifies which cells should be treated as header cells that need to be linked to row/column elements.
+ cfg.SEGMENTATION.PUBTABLES_ITEM_HEADER_CELL_NAMES = [
+ CellType.COLUMN_HEADER,
+ CellType.ROW_HEADER,
+ CellType.PROJECTED_ROW_HEADER,
+ ]
+
+ # Defines the threshold values for matching column/row header cells to their respective rows/columns
+ # in the Table Transformer approach. The matching rule is defined in SEGMENTATION.ASSIGNMENT_RULE.
+ cfg.SEGMENTATION.PUBTABLES_ITEM_HEADER_THRESHOLDS = [0.6, 0.0001]
+
+ # Configuration options for PDF text extraction using PDFPlumber.
+ # These values are passed directly to pdfplumber.utils.extract_words().
+ # For reference, see:
+ # https://github.com/jsvine/pdfplumber/blob/main/pdfplumber/utils/text.py
+
+ # Horizontal tolerance when merging characters into words.
+ # Characters that are horizontally closer than this value will be grouped into a single word.
+ cfg.PDF_MINER.X_TOLERANCE = 3
+
+ # Vertical tolerance when grouping characters into lines.
+ # Characters within this vertical range will be considered part of the same line.
+ cfg.PDF_MINER.Y_TOLERANCE = 3
+
+ # OCR engine selection.
+ # If cfg.USE_OCR = True, then one of the following must be set to True:
+ # - cfg.OCR.USE_TESSERACT
+ # - cfg.OCR.USE_DOCTR
+ # - cfg.OCR.USE_TEXTRACT
+ # All other engines must be set to False.
+
+ # Enables Tesseract as the OCR engine.
+ # Note: Tesseract must be installed separately. This integration does not use pytesseract.
+ # Configuration options are defined in a separate file: conf_tesseract.yaml.
+ cfg.OCR.USE_TESSERACT = False
+
+ # Path to the Tesseract configuration file.
+ cfg.OCR.CONFIG.TESSERACT = "dd/conf_tesseract.yaml"
+
+ # Enables DocTR as the OCR engine.
+ # DocTR provides flexible and lightweight OCR models with strong accuracy and versatility.
+ cfg.OCR.USE_DOCTR = True
+
+ # Enables AWS Textract as the OCR engine.
+ # Requires the following environment variables to be set:
+ # AWS_ACCESS_KEY, AWS_SECRET_KEY, and AWS_REGION.
+ # Alternatively, AWS credentials can be configured via the AWS CLI.
+ cfg.OCR.USE_TEXTRACT = False
+
+ # DocTR OCR uses a two-stage process: word detection followed by text recognition.
+ # The following weights configure each stage for TensorFlow and PyTorch.
+
+ # TensorFlow weights for the word detection model used by DocTR.
+ cfg.OCR.WEIGHTS.DOCTR_WORD.TF = "doctr/db_resnet50/tf/db_resnet50-adcafc63.zip"
+
+ # PyTorch weights for the word detection model used by DocTR.
+ cfg.OCR.WEIGHTS.DOCTR_WORD.PT = "doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"
+
+ # TensorFlow weights for the text recognition model used by DocTR.
+ cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.TF = "doctr/crnn_vgg16_bn/tf/crnn_vgg16_bn-76b7f2c6.zip"
+
+ # PyTorch weights for the text recognition model used by DocTR.
+ cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.PT = "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt"
+
+ # Specifies the annotation type used as a text container.
+ # A text container is typically an ImageAnnotation generated by the OCR engine or PDF mining tool.
+ # It contains a sub-annotation of type WordType.CHARACTERS.
+ # Most commonly, text containers are of type LayoutType.WORD, but LayoutType.LINE may also be used.
+ # It is recommended to align this value with IMAGE_DEFAULTS.TEXT_CONTAINER
+ # rather than modifying it directly in the config.
+ cfg.TEXT_CONTAINER = IMAGE_DEFAULTS.TEXT_CONTAINER
+
+ # Configuration for matching text containers (e.g., words or lines) to layout elements
+ # such as titles, paragraphs, tables, etc., using spatial overlap.
+ # When a match occurs, a parent-child relationship (Relationships.CHILD) is assigned.
+
+ # Specifies the layout categories considered as potential parents of text containers.
+ cfg.WORD_MATCHING.PARENTAL_CATEGORIES = IMAGE_DEFAULTS.TEXT_BLOCK_CATEGORIES
+
+ # Rule used for matching: either 'iou' (intersection over union) or 'ioa' (intersection over area).
+ cfg.WORD_MATCHING.RULE = "ioa"
+
+ # Threshold for the selected matching rule (IOU or IOA).
+ # Text containers must exceed this threshold to be assigned to a layout section.
+ cfg.WORD_MATCHING.THRESHOLD = 0.3
+
+ # If a text container overlaps with multiple layout sections,
+ # setting this to True will assign it only to the best-matching (i.e., highest-overlapping) section.
+ # Prevents duplication of text in the output.
+ cfg.WORD_MATCHING.MAX_PARENT_ONLY = True
+
+ # Specifies which layout categories must be ordered (e.g., paragraphs, list items).
+ # These are layout blocks that will be processed by the TextOrderingService.
+ cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES = IMAGE_DEFAULTS.TEXT_BLOCK_CATEGORIES
+
+ # Specifies which text blocks are considered floating (not aligned with strict columns or grids).
+ # These will be linked with a subcategory of type Relationships.READING_ORDER.
+ cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES = IMAGE_DEFAULTS.FLOATING_TEXT_BLOCK_CATEGORIES
+
+ # Determines whether residual (unmatched) text containers should be included in the ordering process.
+ # If set to True, orphaned text containers are grouped into lines and added to the layout ordering.
+ # If set to False, unmatched text containers will not appear in the output.
+ cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER = True
+
+ # Tolerance used to determine whether a text block's left/right coordinate lies within a column’s boundary.
+ # Helps with assigning text blocks to columns based on horizontal alignment.
+ cfg.TEXT_ORDERING.STARTING_POINT_TOLERANCE = 0.005
+
+ # Horizontal distance threshold for grouping words into the same line.
+ # If the gap between words exceeds this value, they will be treated as belonging to separate lines or columns.
+ cfg.TEXT_ORDERING.BROKEN_LINE_TOLERANCE = 0.003
+
+ # Used for ordering vertically broken floating text blocks into coherent columns.
+ # Defines vertical alignment tolerance between adjacent text blocks.
+ cfg.TEXT_ORDERING.HEIGHT_TOLERANCE = 2.0
+
+ # Defines the spacing threshold that indicates a paragraph break in vertically arranged text blocks.
+ # Helps determine reading order in multi-column, broken layouts.
+ cfg.TEXT_ORDERING.PARAGRAPH_BREAK = 0.035
+
+ # Configuration for linking spatially related layout sections
+ # (e.g., associating figures with their captions) based on proximity.
+ # The distance is calculated using the center points of the layout elements.
+
+ # Specifies the parent layout categories in the link relationship.
+ # These are the elements to which related components (e.g., captions) should be linked.
+ cfg.LAYOUT_LINK.PARENTAL_CATEGORIES = [LayoutType.FIGURE, LayoutType.TABLE]
+
+ # Specifies the child layout categories in the link relationship.
+ # These are typically smaller or subordinate elements (e.g., captions).
+ cfg.LAYOUT_LINK.CHILD_CATEGORIES = [LayoutType.CAPTION]
+
+ # Freezes the configuration to make it immutable.
+ # This prevents accidental modification at runtime.
+ cfg.freeze()
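
For orientation, the defaults in the new analyzer/config.py are consumed by get_dd_analyzer(), which the module docstring references. A minimal usage sketch, assuming the config_overwrite keyword documented for deepdoctection's get_dd_analyzer(); the concrete option values below are illustrative only, taken from the defaults shown above:

    # Minimal sketch: override selected analyzer/config.py defaults at analyzer construction time.
    # The option strings mirror the cfg attributes defined above; the values are examples only.
    import deepdoctection as dd

    analyzer = dd.get_dd_analyzer(
        config_overwrite=[
            "LANGUAGE='deu'",                # Tesseract language model (cfg.LANGUAGE)
            "USE_TABLE_SEGMENTATION=False",  # skip the table segmentation components
        ]
    )

    df = analyzer.analyze(path="path/to/document.pdf")  # returns a dataflow of Page objects
    df.reset_state()                                     # required before iterating the dataflow
    for page in df:
        print(page.text)

Options not passed via config_overwrite keep the defaults defined in this file.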