deepdoctection-0.42.0-py3-none-any.whl → deepdoctection-0.43-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (124)
  1. deepdoctection/__init__.py +2 -1
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +904 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +157 -106
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +196 -113
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +25 -17
  104. deepdoctection/utils/env_info.py +85 -36
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -62
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.dist-info/RECORD +149 -0
  119. {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/WHEEL +1 -1
  120. deepdoctection/analyzer/_config.py +0 -146
  121. deepdoctection-0.42.0.dist-info/METADATA +0 -431
  122. deepdoctection-0.42.0.dist-info/RECORD +0 -148
  123. {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
@@ -16,8 +16,8 @@
16
16
  # limitations under the License.
17
17
 
18
18
  """
19
- Module for refining methods of table segmentation. The refining methods lead ultimately to a table structure which
20
- enables html table representations
19
+ Refining methods for table segmentation. The refining methods lead ultimately to a table structure which enables
20
+ HTML table representations.
21
21
  """
22
22
  from __future__ import annotations
23
23
 
@@ -44,14 +44,16 @@ __all__ = ["TableSegmentationRefinementService", "generate_html_string"]
44
44
 
45
45
  def tiles_to_cells(dp: Image, table: ImageAnnotation) -> list[tuple[tuple[int, int], str]]:
46
46
  """
47
- Creation of a table parquet: A table is divided into a tile parquet with the (number of rows) x
48
- (the number of columns) tiles.
47
+ Creates a table parquet by dividing a table into a tile parquet with the number of rows x number of columns tiles.
49
48
  Each tile is assigned a list of cell ids that are occupied by the cell. No cells but one or more cells can be
50
49
  assigned per tile.
51
50
 
52
- :param dp: Image
53
- :param table: Table image annotation
54
- :return: Image
51
+ Args:
52
+ dp: `Image`
53
+ table: `ImageAnnotation`
54
+
55
+ Returns:
56
+ A list of tuples with tile positions and cell annotation ids.
55
57
  """
56
58
 
57
59
  cell_ann_ids = table.get_relationship(Relationships.CHILD)
@@ -77,13 +79,16 @@ def connected_component_tiles(
77
79
  tile_to_cell_list: list[tuple[tuple[int, int], str]]
78
80
  ) -> tuple[list[set[tuple[int, int]]], DefaultDict[tuple[int, int], list[str]]]:
79
81
  """
80
- The assignment of bricks to their cell occupancy induces a graph, with bricks as corners and cell edges. Cells that
81
- lie on top of several bricks connect the underlying bricks. The graph generated according to this procedure is
82
- usually multiple connected. The related components and the tile/cell ids assignment are determined.
82
+ Assigns bricks to their cell occupancy, inducing a graph with bricks as nodes and cell edges. Cells that lie on
83
+ top of several bricks connect the underlying bricks. The graph generated is usually multiple connected. Determines
84
+ the related components and the tile/cell ids assignment.
85
+
86
+ Args:
87
+ tile_to_cell_list: List of tuples with tile position and cell ids.
83
88
 
84
- :param tile_to_cell_list: list of tuples with tile position and cell ids
85
- :return: list of set with tiles that belong to the same connected component and a dict with tiles as keys and
86
- assigned list of cell ids as values.
89
+ Returns:
90
+ A tuple containing a list of sets with tiles that belong to the same connected component and a dict with tiles
91
+ as keys and assigned list of cell ids as values.
87
92
  """
88
93
  cell_to_tile_list = [(cell_position[1], cell_position[0]) for cell_position in tile_to_cell_list]
89
94
  cells = set(tup[0] for tup in cell_to_tile_list)
@@ -164,13 +169,14 @@ def _merge_components(reduced_connected_tiles: list[set[tuple[int, int]]]) -> li
164
169
 
165
170
  def generate_rectangle_tiling(connected_components_tiles: list[set[tuple[int, int]]]) -> list[set[tuple[int, int]]]:
166
171
  """
167
- The determined connected components imply that all cells have to be combined which are above a connected component.
168
- In addition, however, it must also be taken into account that cells must be rectangular. This means that related
169
- components have to be combined whose combined cells above do not create a rectangular tiling. All tiles are combined
170
- in such a way that all cells above them combine to form a rectangular scheme.
172
+ Combines connected components so that all cells above them form a rectangular scheme. Ensures that all tiles are
173
+ combined in such a way that all cells above them combine to form a rectangular tiling.
174
+
175
+ Args:
176
+ connected_components_tiles: List of sets with tiles that belong to the same connected component.
171
177
 
172
- :param connected_components_tiles: list of set with tiles that belong to the same connected component
173
- :return: list of sets with tiles, the cells on top of which together form a rectangular scheme
178
+ Returns:
179
+ List of sets with tiles, the cells on top of which together form a rectangular scheme.
174
180
  """
175
181
  rectangle_tiling: list[set[tuple[int, int]]] = []
176
182
  inputs = connected_components_tiles
@@ -187,11 +193,14 @@ def rectangle_cells(
187
193
  rectangle_tiling: list[set[tuple[int, int]]], tile_to_cell_dict: DefaultDict[tuple[int, int], list[str]]
188
194
  ) -> list[set[str]]:
189
195
  """
190
- All cells are determined that are located above combined connected components and form a rectangular scheme.
196
+ Determines all cells that are located above combined connected components and form a rectangular scheme.
191
197
 
192
- :param rectangle_tiling: list of sets with tiles, the cells on top of which together form a rectangular scheme
193
- :param tile_to_cell_dict: Dict with tiles as keys and assigned list of cell ids as values.
194
- :return: list of set of cell ids that form a rectangular scheme
198
+ Args:
199
+ rectangle_tiling: List of sets with tiles, the cells on top of which together form a rectangular scheme.
200
+ tile_to_cell_dict: Dict with tiles as keys and assigned list of cell ids as values.
201
+
202
+ Returns:
203
+ List of sets of cell ids that form a rectangular scheme.
195
204
  """
196
205
  rectangle_tiling_cells: list[set[str]] = []
197
206
  for rect_tiling_component in rectangle_tiling:
@@ -214,7 +223,14 @@ def _html_cell(
214
223
  cell_position: Union[tuple[int, int, int, int], tuple[()]], position_filled_list: list[tuple[int, int]]
215
224
  ) -> list[str]:
216
225
  """
217
- Html table cell string generation
226
+ Generates an HTML table cell string.
227
+
228
+ Args:
229
+ cell_position: Cell position tuple or empty tuple.
230
+ position_filled_list: List of filled positions.
231
+
232
+ Returns:
233
+ List of HTML strings representing the cell.
218
234
  """
219
235
  html = ["<td"]
220
236
  if not cell_position:
@@ -246,7 +262,17 @@ def _html_row(
246
262
  row_ann_id_list: list[str],
247
263
  ) -> list[str]:
248
264
  """
249
- Html table row string generation
265
+ Generates an HTML table row string.
266
+
267
+ Args:
268
+ row_list: List of cell position tuples for the row.
269
+ position_filled_list: List of filled positions.
270
+ this_row: The current row number.
271
+ number_of_cols: The total number of columns.
272
+ row_ann_id_list: List of annotation ids for the row.
273
+
274
+ Returns:
275
+ List of HTML strings representing the row.
250
276
  """
251
277
  html = ["<tr>"]
252
278
  for idx in range(1, number_of_cols + 1):
@@ -282,7 +308,16 @@ def _html_table(
282
308
  number_of_cols: int,
283
309
  ) -> list[str]:
284
310
  """
285
- Html table string generation
311
+ Generates an HTML table string.
312
+
313
+ Args:
314
+ table_list: List of tuples with row number and list of cell position tuples.
315
+ cells_ann_list: List of tuples with row number and list of annotation ids.
316
+ number_of_rows: The total number of rows.
317
+ number_of_cols: The total number of columns.
318
+
319
+ Returns:
320
+ List of HTML strings representing the table.
286
321
  """
287
322
  html = ["<table>"]
288
323
  position_filled: list[tuple[int, int]] = []
@@ -297,14 +332,21 @@ def _html_table(
297
332
 
298
333
  def generate_html_string(table: ImageAnnotation, cell_names: Sequence[ObjectTypes]) -> list[str]:
299
334
  """
300
- Takes the table segmentation by using table cells row number, column numbers etc. and generates a html
301
- representation.
302
-
303
- :param table: An annotation that has a not None image and fully segmented cell annotation.
304
- :param cell_names: List of cell names that are used for the table segmentation. Note: It must be ensured that
305
- that all cells have a row number, column number, row span and column span and that the dissection
306
- by rows and columns is completely covered by cells.
307
- :return: HTML representation of the table
335
+ Generates an HTML representation of a table using table segmentation by row number, column number, etc.
336
+
337
+ Note:
338
+ It must be ensured that all cells have a row number, column number, row span, and column span, and that the
339
+ dissection by rows and columns is completely covered by cells.
340
+
341
+ Args:
342
+ table: An annotation that has a not None image and fully segmented cell annotation.
343
+ cell_names: List of cell names that are used for the table segmentation.
344
+
345
+ Returns:
346
+ HTML representation of the table.
347
+
348
+ Raises:
349
+ `ImageError`: If `table.image` is None.
308
350
  """
309
351
  if table.image is None:
310
352
  raise ImageError("table.image cannot be None")
@@ -355,12 +397,11 @@ class TableSegmentationRefinementService(PipelineComponent):
355
397
  | C3 C3 |
356
398
  +----------+
357
399
 
358
- The first two cells have the same column assignment via the segmentation and must therefore be merged.
359
- Note that the number of rows and columns does not change in the refinement process. What changes is just the number
360
- of cells.
400
+ The first two cells have the same column assignment via the segmentation and must therefore be merged. Note that
401
+ the number of rows and columns does not change in the refinement process. What changes is just the number of cells.
361
402
 
362
- Furthermore, when merging, it must be ensured that the combined cells still have a rectangular shape.
363
- This is also guaranteed in the refining process.
403
+ Furthermore, when merging, it must be ensured that the combined cells still have a rectangular shape. This is also
404
+ guaranteed in the refining process.
364
405
 
365
406
  +----------+
366
407
  | C1 | C2 |
@@ -368,32 +409,39 @@ class TableSegmentationRefinementService(PipelineComponent):
368
409
  | C3 | C3 |
369
410
  +----------+
370
411
 
371
- The table consists of one row and two columns. The upper cells belong together with the lower cell.
372
- However, this means that all cells must be merged with one another so that the table only consists of one cell
373
- after the refinement process.
374
-
375
- **Example**
376
-
377
- layout = ImageLayoutService(layout_detector, to_image=True, crop_image=True)
378
- cell = SubImageLayoutService(cell_detector, "TABLE")
379
- row_col = SubImageLayoutService(row_col_detector, "TABLE")
380
- table_segmentation = TableSegmentationService("ioa",0.9,0.8,True,0.0001,0.0001)
381
- table_segmentation_refinement = TableSegmentationRefinementService()
382
-
383
- table_recognition_pipe = DoctectionPipe([layout,
384
- cell,
385
- row_col,
386
- table_segmentation,
387
- table_segmentation_refinement])
388
- df = pipe.analyze(path="path/to/document.pdf")
389
-
390
- for dp in df:
391
- ...
392
-
412
+ The table consists of one row and two columns. The upper cells belong together with the lower cell. However, this
413
+ means that all cells must be merged with one another so that the table only consists of one cell after the
414
+ refinement process.
415
+
416
+ Example:
417
+ ```python
418
+ layout = ImageLayoutService(layout_detector, to_image=True, crop_image=True)
419
+ cell = SubImageLayoutService(cell_detector, "TABLE")
420
+ row_col = SubImageLayoutService(row_col_detector, "TABLE")
421
+ table_segmentation = TableSegmentationService("ioa",0.9,0.8,True,0.0001,0.0001)
422
+ table_segmentation_refinement = TableSegmentationRefinementService()
423
+
424
+ table_recognition_pipe = DoctectionPipe([layout,
425
+ cell,
426
+ row_col,
427
+ table_segmentation,
428
+ table_segmentation_refinement])
429
+ df = pipe.analyze(path="path/to/document.pdf")
430
+
431
+ for dp in df:
432
+ ...
433
+ ```
393
434
  """
394
435
 
395
- def __init__(self, table_name: Sequence[ObjectTypes], cell_names: Sequence[ObjectTypes]) -> None:
396
- self.table_name = table_name
436
+ def __init__(self, table_names: Sequence[ObjectTypes], cell_names: Sequence[ObjectTypes]) -> None:
437
+ """
438
+ Initializes the `TableSegmentationRefinementService`.
439
+
440
+ Args:
441
+ table_names: Sequence of table object types.
442
+ cell_names: Sequence of cell object types.
443
+ """
444
+ self.table_name = table_names
397
445
  self.cell_names = cell_names
398
446
  super().__init__("table_segment_refine")
399
447
 
@@ -16,7 +16,7 @@
16
16
  # limitations under the License.
17
17
 
18
18
  """
19
- Module for PipeRegistry
19
+ Pipeline component registry
20
20
  """
21
21
 
22
22
  import catalogue # type: ignore