deepdoctection 0.42.1__py3-none-any.whl → 0.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +2 -1
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +904 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +157 -106
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +196 -113
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +25 -17
- deepdoctection/utils/env_info.py +85 -36
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -62
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.dist-info/METADATA +376 -0
- deepdoctection-0.43.dist-info/RECORD +149 -0
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.1.dist-info/METADATA +0 -431
- deepdoctection-0.42.1.dist-info/RECORD +0 -148
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/WHEEL +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
deepdoctection/pipe/refine.py
CHANGED
|
@@ -16,8 +16,8 @@
|
|
|
16
16
|
# limitations under the License.
|
|
17
17
|
|
|
18
18
|
"""
|
|
19
|
-
|
|
20
|
-
|
|
19
|
+
Refining methods for table segmentation. The refining methods lead ultimately to a table structure which enables
|
|
20
|
+
HTML table representations.
|
|
21
21
|
"""
|
|
22
22
|
from __future__ import annotations
|
|
23
23
|
|
|
@@ -44,14 +44,16 @@ __all__ = ["TableSegmentationRefinementService", "generate_html_string"]
|
|
|
44
44
|
|
|
45
45
|
def tiles_to_cells(dp: Image, table: ImageAnnotation) -> list[tuple[tuple[int, int], str]]:
|
|
46
46
|
"""
|
|
47
|
-
|
|
48
|
-
(the number of columns) tiles.
|
|
47
|
+
Creates a table tiling by dividing the table into a grid of (number of rows) x (number of columns) tiles.
|
|
49
48
|
Each tile is assigned the list of ids of the cells that occupy it. Zero, one, or several cells can be
|
|
50
49
|
assigned per tile.
|
|
51
50
|
|
|
52
|
-
:
|
|
53
|
-
|
|
54
|
-
|
|
51
|
+
Args:
|
|
52
|
+
dp: `Image`
|
|
53
|
+
table: `ImageAnnotation`
|
|
54
|
+
|
|
55
|
+
Returns:
|
|
56
|
+
A list of tuples with tile positions and cell annotation ids.
|
|
55
57
|
"""
|
|
56
58
|
|
|
57
59
|
cell_ann_ids = table.get_relationship(Relationships.CHILD)
|
|
@@ -77,13 +79,16 @@ def connected_component_tiles(
|
|
|
77
79
|
tile_to_cell_list: list[tuple[tuple[int, int], str]]
|
|
78
80
|
) -> tuple[list[set[tuple[int, int]]], DefaultDict[tuple[int, int], list[str]]]:
|
|
79
81
|
"""
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
82
|
+
Assigns bricks to their cell occupancy, inducing a graph with bricks as nodes and cell edges. Cells that lie on
|
|
83
|
+
top of several bricks connect the underlying bricks. The resulting graph usually has several connected components. Determines
|
|
84
|
+
the connected components and the tile-to-cell-id assignment.
|
|
85
|
+
|
|
86
|
+
Args:
|
|
87
|
+
tile_to_cell_list: List of tuples with tile position and cell ids.
|
|
83
88
|
|
|
84
|
-
:
|
|
85
|
-
|
|
86
|
-
|
|
89
|
+
Returns:
|
|
90
|
+
A tuple containing a list of sets with tiles that belong to the same connected component and a dict with tiles
|
|
91
|
+
as keys and assigned list of cell ids as values.
|
|
87
92
|
"""
|
|
88
93
|
cell_to_tile_list = [(cell_position[1], cell_position[0]) for cell_position in tile_to_cell_list]
|
|
89
94
|
cells = set(tup[0] for tup in cell_to_tile_list)
|
|
@@ -164,13 +169,14 @@ def _merge_components(reduced_connected_tiles: list[set[tuple[int, int]]]) -> li
|
|
|
164
169
|
|
|
165
170
|
def generate_rectangle_tiling(connected_components_tiles: list[set[tuple[int, int]]]) -> list[set[tuple[int, int]]]:
|
|
166
171
|
"""
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
172
|
+
Combines connected components so that all cells above them form a rectangular scheme. Ensures that all tiles are
|
|
173
|
+
combined in such a way that all cells above them combine to form a rectangular tiling.
|
|
174
|
+
|
|
175
|
+
Args:
|
|
176
|
+
connected_components_tiles: List of sets with tiles that belong to the same connected component.
|
|
171
177
|
|
|
172
|
-
:
|
|
173
|
-
|
|
178
|
+
Returns:
|
|
179
|
+
List of sets with tiles, the cells on top of which together form a rectangular scheme.
|
|
174
180
|
"""
|
|
175
181
|
rectangle_tiling: list[set[tuple[int, int]]] = []
|
|
176
182
|
inputs = connected_components_tiles
|
|
@@ -187,11 +193,14 @@ def rectangle_cells(
|
|
|
187
193
|
rectangle_tiling: list[set[tuple[int, int]]], tile_to_cell_dict: DefaultDict[tuple[int, int], list[str]]
|
|
188
194
|
) -> list[set[str]]:
|
|
189
195
|
"""
|
|
190
|
-
|
|
196
|
+
Determines all cells that are located above combined connected components and form a rectangular scheme.
|
|
191
197
|
|
|
192
|
-
:
|
|
193
|
-
|
|
194
|
-
|
|
198
|
+
Args:
|
|
199
|
+
rectangle_tiling: List of sets with tiles, the cells on top of which together form a rectangular scheme.
|
|
200
|
+
tile_to_cell_dict: Dict with tiles as keys and assigned list of cell ids as values.
|
|
201
|
+
|
|
202
|
+
Returns:
|
|
203
|
+
List of sets of cell ids that form a rectangular scheme.
|
|
195
204
|
"""
|
|
196
205
|
rectangle_tiling_cells: list[set[str]] = []
|
|
197
206
|
for rect_tiling_component in rectangle_tiling:
|
|
@@ -214,7 +223,14 @@ def _html_cell(
|
|
|
214
223
|
cell_position: Union[tuple[int, int, int, int], tuple[()]], position_filled_list: list[tuple[int, int]]
|
|
215
224
|
) -> list[str]:
|
|
216
225
|
"""
|
|
217
|
-
|
|
226
|
+
Generates an HTML table cell string.
|
|
227
|
+
|
|
228
|
+
Args:
|
|
229
|
+
cell_position: Cell position tuple or empty tuple.
|
|
230
|
+
position_filled_list: List of filled positions.
|
|
231
|
+
|
|
232
|
+
Returns:
|
|
233
|
+
List of HTML strings representing the cell.
|
|
218
234
|
"""
|
|
219
235
|
html = ["<td"]
|
|
220
236
|
if not cell_position:
|
|
@@ -246,7 +262,17 @@ def _html_row(
|
|
|
246
262
|
row_ann_id_list: list[str],
|
|
247
263
|
) -> list[str]:
|
|
248
264
|
"""
|
|
249
|
-
|
|
265
|
+
Generates an HTML table row string.
|
|
266
|
+
|
|
267
|
+
Args:
|
|
268
|
+
row_list: List of cell position tuples for the row.
|
|
269
|
+
position_filled_list: List of filled positions.
|
|
270
|
+
this_row: The current row number.
|
|
271
|
+
number_of_cols: The total number of columns.
|
|
272
|
+
row_ann_id_list: List of annotation ids for the row.
|
|
273
|
+
|
|
274
|
+
Returns:
|
|
275
|
+
List of HTML strings representing the row.
|
|
250
276
|
"""
|
|
251
277
|
html = ["<tr>"]
|
|
252
278
|
for idx in range(1, number_of_cols + 1):
|
|
@@ -282,7 +308,16 @@ def _html_table(
|
|
|
282
308
|
number_of_cols: int,
|
|
283
309
|
) -> list[str]:
|
|
284
310
|
"""
|
|
285
|
-
|
|
311
|
+
Generates an HTML table string.
|
|
312
|
+
|
|
313
|
+
Args:
|
|
314
|
+
table_list: List of tuples with row number and list of cell position tuples.
|
|
315
|
+
cells_ann_list: List of tuples with row number and list of annotation ids.
|
|
316
|
+
number_of_rows: The total number of rows.
|
|
317
|
+
number_of_cols: The total number of columns.
|
|
318
|
+
|
|
319
|
+
Returns:
|
|
320
|
+
List of HTML strings representing the table.
|
|
286
321
|
"""
|
|
287
322
|
html = ["<table>"]
|
|
288
323
|
position_filled: list[tuple[int, int]] = []
|
|
@@ -297,14 +332,21 @@ def _html_table(
|
|
|
297
332
|
|
|
298
333
|
def generate_html_string(table: ImageAnnotation, cell_names: Sequence[ObjectTypes]) -> list[str]:
|
|
299
334
|
"""
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
335
|
+
Generates an HTML representation of a table using table segmentation by row number, column number, etc.
|
|
336
|
+
|
|
337
|
+
Note:
|
|
338
|
+
It must be ensured that all cells have a row number, column number, row span, and column span, and that the
|
|
339
|
+
dissection by rows and columns is completely covered by cells.
|
|
340
|
+
|
|
341
|
+
Args:
|
|
342
|
+
table: An annotation whose image is not None and whose cell annotations are fully segmented.
|
|
343
|
+
cell_names: List of cell names that are used for the table segmentation.
|
|
344
|
+
|
|
345
|
+
Returns:
|
|
346
|
+
HTML representation of the table.
|
|
347
|
+
|
|
348
|
+
Raises:
|
|
349
|
+
`ImageError`: If `table.image` is None.
|
|
308
350
|
"""
|
|
309
351
|
if table.image is None:
|
|
310
352
|
raise ImageError("table.image cannot be None")
|
|
@@ -355,12 +397,11 @@ class TableSegmentationRefinementService(PipelineComponent):
|
|
|
355
397
|
| C3 C3 |
|
|
356
398
|
+----------+
|
|
357
399
|
|
|
358
|
-
The first two cells have the same column assignment via the segmentation and must therefore be merged.
|
|
359
|
-
|
|
360
|
-
of cells.
|
|
400
|
+
The first two cells have the same column assignment via the segmentation and must therefore be merged. Note that
|
|
401
|
+
the number of rows and columns does not change in the refinement process. What changes is just the number of cells.
|
|
361
402
|
|
|
362
|
-
Furthermore, when merging, it must be ensured that the combined cells still have a rectangular shape.
|
|
363
|
-
|
|
403
|
+
Furthermore, when merging, it must be ensured that the combined cells still have a rectangular shape. This is also
|
|
404
|
+
guaranteed in the refining process.
|
|
364
405
|
|
|
365
406
|
+----------+
|
|
366
407
|
| C1 | C2 |
|
|
@@ -368,32 +409,39 @@ class TableSegmentationRefinementService(PipelineComponent):
|
|
|
368
409
|
| C3 | C3 |
|
|
369
410
|
+----------+
|
|
370
411
|
|
|
371
|
-
The table consists of one row and two columns. The upper cells belong together with the lower cell.
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
412
|
+
The table consists of one row and two columns. The upper cells belong together with the lower cell. However, this
|
|
413
|
+
means that all cells must be merged with one another so that the table only consists of one cell after the
|
|
414
|
+
refinement process.
|
|
415
|
+
|
|
416
|
+
Example:
|
|
417
|
+
```python
|
|
418
|
+
layout = ImageLayoutService(layout_detector, to_image=True, crop_image=True)
|
|
419
|
+
cell = SubImageLayoutService(cell_detector, "TABLE")
|
|
420
|
+
row_col = SubImageLayoutService(row_col_detector, "TABLE")
|
|
421
|
+
table_segmentation = TableSegmentationService("ioa",0.9,0.8,True,0.0001,0.0001)
|
|
422
|
+
table_segmentation_refinement = TableSegmentationRefinementService(table_names, cell_names)
|
|
423
|
+
|
|
424
|
+
table_recognition_pipe = DoctectionPipe([layout,
|
|
425
|
+
cell,
|
|
426
|
+
row_col,
|
|
427
|
+
table_segmentation,
|
|
428
|
+
table_segmentation_refinement])
|
|
429
|
+
df = table_recognition_pipe.analyze(path="path/to/document.pdf")
|
|
430
|
+
|
|
431
|
+
for dp in df:
|
|
432
|
+
...
|
|
433
|
+
```
|
|
393
434
|
"""
|
|
394
435
|
|
|
395
|
-
def __init__(self,
|
|
396
|
-
|
|
436
|
+
def __init__(self, table_names: Sequence[ObjectTypes], cell_names: Sequence[ObjectTypes]) -> None:
|
|
437
|
+
"""
|
|
438
|
+
Initializes the `TableSegmentationRefinementService`.
|
|
439
|
+
|
|
440
|
+
Args:
|
|
441
|
+
table_names: Sequence of table object types.
|
|
442
|
+
cell_names: Sequence of cell object types.
|
|
443
|
+
"""
|
|
444
|
+
self.table_name = table_names
|
|
397
445
|
self.cell_names = cell_names
|
|
398
446
|
super().__init__("table_segment_refine")
|
|
399
447
|
|