deepdoctection-0.37.3-py3-none-any.whl → deepdoctection-0.39-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +5 -1
- deepdoctection/analyzer/_config.py +2 -1
- deepdoctection/analyzer/dd.py +6 -5
- deepdoctection/analyzer/factory.py +16 -6
- deepdoctection/configs/conf_dd_one.yaml +126 -85
- deepdoctection/datapoint/box.py +2 -4
- deepdoctection/datapoint/convert.py +14 -8
- deepdoctection/datapoint/image.py +12 -5
- deepdoctection/datapoint/view.py +151 -53
- deepdoctection/extern/hfdetr.py +4 -3
- deepdoctection/extern/model.py +6 -97
- deepdoctection/mapper/cats.py +21 -10
- deepdoctection/mapper/match.py +0 -22
- deepdoctection/mapper/misc.py +12 -2
- deepdoctection/mapper/pubstruct.py +1 -1
- deepdoctection/pipe/doctectionpipe.py +20 -3
- deepdoctection/pipe/lm.py +20 -5
- deepdoctection/pipe/refine.py +6 -13
- deepdoctection/pipe/segment.py +225 -46
- deepdoctection/pipe/sub_layout.py +40 -22
- deepdoctection/train/hf_layoutlm_train.py +3 -1
- deepdoctection/utils/pdf_utils.py +17 -9
- {deepdoctection-0.37.3.dist-info → deepdoctection-0.39.dist-info}/METADATA +15 -5
- {deepdoctection-0.37.3.dist-info → deepdoctection-0.39.dist-info}/RECORD +27 -27
- {deepdoctection-0.37.3.dist-info → deepdoctection-0.39.dist-info}/WHEEL +1 -1
- {deepdoctection-0.37.3.dist-info → deepdoctection-0.39.dist-info}/LICENSE +0 -0
- {deepdoctection-0.37.3.dist-info → deepdoctection-0.39.dist-info}/top_level.txt +0 -0
deepdoctection/datapoint/view.py
CHANGED
|
@@ -28,7 +28,7 @@ import numpy as np
|
|
|
28
28
|
from typing_extensions import LiteralString
|
|
29
29
|
|
|
30
30
|
from ..utils.error import AnnotationError, ImageError
|
|
31
|
-
from ..utils.logger import LoggingRecord, logger
|
|
31
|
+
from ..utils.logger import LoggingRecord, log_once, logger
|
|
32
32
|
from ..utils.settings import (
|
|
33
33
|
CellType,
|
|
34
34
|
LayoutType,
|
|
@@ -228,23 +228,33 @@ class Layout(ImageAnnotationBaseView):
|
|
|
228
228
|
|
|
229
229
|
"""
|
|
230
230
|
words = self.get_ordered_words()
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
231
|
+
if words:
|
|
232
|
+
characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
|
|
233
|
+
*[
|
|
234
|
+
(
|
|
235
|
+
word.characters,
|
|
236
|
+
word.annotation_id,
|
|
237
|
+
word.token_class,
|
|
238
|
+
word.token_tag,
|
|
239
|
+
word.get_sub_category(WordType.TOKEN_CLASS).category_id
|
|
240
|
+
if WordType.TOKEN_CLASS in word.sub_categories
|
|
241
|
+
else None,
|
|
242
|
+
word.get_sub_category(WordType.TOKEN_TAG).category_id
|
|
243
|
+
if WordType.TOKEN_TAG in word.sub_categories
|
|
244
|
+
else None,
|
|
245
|
+
)
|
|
246
|
+
for word in words
|
|
247
|
+
]
|
|
248
|
+
)
|
|
249
|
+
else:
|
|
250
|
+
characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = (
|
|
251
|
+
[], # type: ignore
|
|
252
|
+
[], # type: ignore
|
|
253
|
+
[], # type: ignore
|
|
254
|
+
[], # type: ignore
|
|
255
|
+
[], # type: ignore
|
|
256
|
+
[], # type: ignore
|
|
257
|
+
)
|
|
248
258
|
return {
|
|
249
259
|
"text": " ".join(characters),
|
|
250
260
|
"words": characters,
|
|
@@ -282,25 +292,103 @@ class Table(Layout):
|
|
|
282
292
|
"""
|
|
283
293
|
|
|
284
294
|
@property
|
|
285
|
-
def cells(self) -> list[
|
|
295
|
+
def cells(self) -> list[Cell]:
|
|
286
296
|
"""
|
|
287
297
|
A list of the table's cells.
|
|
288
298
|
"""
|
|
289
299
|
all_relation_ids = self.get_relationship(Relationships.CHILD)
|
|
290
|
-
cell_anns = self.base_page.get_annotation(
|
|
300
|
+
cell_anns: list[Cell] = self.base_page.get_annotation( # type: ignore
|
|
291
301
|
annotation_ids=all_relation_ids,
|
|
292
302
|
category_names=[
|
|
293
303
|
LayoutType.CELL,
|
|
294
304
|
CellType.HEADER,
|
|
295
305
|
CellType.BODY,
|
|
296
|
-
CellType.PROJECTED_ROW_HEADER,
|
|
297
306
|
CellType.SPANNING,
|
|
298
|
-
CellType.ROW_HEADER,
|
|
299
|
-
CellType.COLUMN_HEADER,
|
|
300
307
|
],
|
|
301
308
|
)
|
|
302
309
|
return cell_anns
|
|
303
310
|
|
|
311
|
+
@property
|
|
312
|
+
def column_header_cells(self) -> list[Cell]:
|
|
313
|
+
"""
|
|
314
|
+
Retrieve a list of cells that are column headers in the table.
|
|
315
|
+
|
|
316
|
+
This property filters and sorts the cells in the table to return only those that are column headers.
|
|
317
|
+
The cells are sorted by their column number.
|
|
318
|
+
|
|
319
|
+
:return: A list of `Cell` objects that are column headers.
|
|
320
|
+
"""
|
|
321
|
+
all_relation_ids = self.get_relationship(Relationships.CHILD)
|
|
322
|
+
all_cells: list[Cell] = self.base_page.get_annotation( # type: ignore
|
|
323
|
+
category_names=[LayoutType.CELL, CellType.SPANNING], annotation_ids=all_relation_ids
|
|
324
|
+
)
|
|
325
|
+
headers = list(filter(lambda cell: CellType.COLUMN_HEADER in cell.sub_categories, all_cells))
|
|
326
|
+
headers.sort(key=lambda x: x.column_number) # type: ignore
|
|
327
|
+
return headers
|
|
328
|
+
|
|
329
|
+
@property
|
|
330
|
+
def row_header_cells(self) -> list[Cell]:
|
|
331
|
+
"""
|
|
332
|
+
Retrieve a list of cells that are row headers in the table.
|
|
333
|
+
|
|
334
|
+
This property filters and sorts the cells in the table to return only those that are row headers.
|
|
335
|
+
The cells are sorted by their column number.
|
|
336
|
+
|
|
337
|
+
:return: A list of `Cell` objects that are row headers.
|
|
338
|
+
"""
|
|
339
|
+
all_relation_ids = self.get_relationship(Relationships.CHILD)
|
|
340
|
+
all_cells: list[Cell] = self.base_page.get_annotation( # type: ignore
|
|
341
|
+
category_names=[LayoutType.CELL, CellType.SPANNING], annotation_ids=all_relation_ids
|
|
342
|
+
)
|
|
343
|
+
row_header_cells = list(filter(lambda cell: CellType.ROW_HEADER in cell.sub_categories, all_cells))
|
|
344
|
+
row_header_cells.sort(key=lambda x: x.column_number) # type: ignore
|
|
345
|
+
return row_header_cells
|
|
346
|
+
|
|
347
|
+
def kv_header_rows(self, row_number: int) -> Mapping[str, str]:
|
|
348
|
+
"""
|
|
349
|
+
For a given row number, returns a dictionary mapping column headers to cell values in that row.
|
|
350
|
+
|
|
351
|
+
This method retrieves all cells in the specified row and matches them with their corresponding column headers.
|
|
352
|
+
It then creates a key-value pair where the key is a tuple containing the column number and header text,
|
|
353
|
+
and the value is the cell text.
|
|
354
|
+
|
|
355
|
+
:param row_number: The row number for which to retrieve the key-value pairs.
|
|
356
|
+
:return: A dictionary where keys are tuples of (column number, header text) and values are cell texts.
|
|
357
|
+
|
|
358
|
+
Example:
|
|
359
|
+
If the table has the following structure:
|
|
360
|
+
| Header1 | Header2 |
|
|
361
|
+
|---------|---------|
|
|
362
|
+
| Value1 | Value2 |
|
|
363
|
+
| Value3 | Value4 |
|
|
364
|
+
|
|
365
|
+
Calling kv_header_rows(1) would return:
|
|
366
|
+
{
|
|
367
|
+
(1, 'Header1'): 'Value1',
|
|
368
|
+
(2, 'Header2'): 'Value2'
|
|
369
|
+
}
|
|
370
|
+
"""
|
|
371
|
+
all_relation_ids = self.get_relationship(Relationships.CHILD)
|
|
372
|
+
all_cells = self.base_page.get_annotation(
|
|
373
|
+
category_names=[LayoutType.CELL, CellType.SPANNING], annotation_ids=all_relation_ids
|
|
374
|
+
)
|
|
375
|
+
row_cells = list(
|
|
376
|
+
filter(lambda c: row_number in (c.row_number, c.row_number + c.row_span), all_cells) # type: ignore
|
|
377
|
+
)
|
|
378
|
+
row_cells.sort(key=lambda c: c.column_number) # type: ignore
|
|
379
|
+
column_header_cells = self.column_header_cells
|
|
380
|
+
|
|
381
|
+
kv_dict: Mapping[str, str] = {}
|
|
382
|
+
for cell in row_cells:
|
|
383
|
+
for header in column_header_cells:
|
|
384
|
+
if (
|
|
385
|
+
cell.column_number == header.column_number # type: ignore
|
|
386
|
+
and cell.annotation_id != header.annotation_id # type: ignore
|
|
387
|
+
):
|
|
388
|
+
kv_dict[(header.column_number, header.text)] = cell.text # type: ignore
|
|
389
|
+
break
|
|
390
|
+
return kv_dict
|
|
391
|
+
|
|
304
392
|
@property
|
|
305
393
|
def rows(self) -> list[ImageAnnotationBaseView]:
|
|
306
394
|
"""
|
|
@@ -335,7 +423,7 @@ class Table(Layout):
|
|
|
335
423
|
try:
|
|
336
424
|
html_index = html_list.index(cell.annotation_id)
|
|
337
425
|
html_list.pop(html_index)
|
|
338
|
-
html_list.insert(html_index, cell.text)
|
|
426
|
+
html_list.insert(html_index, cell.text)
|
|
339
427
|
except ValueError:
|
|
340
428
|
logger.warning(LoggingRecord("html construction not possible", {"annotation_id": cell.annotation_id}))
|
|
341
429
|
|
|
@@ -357,6 +445,12 @@ class Table(Layout):
|
|
|
357
445
|
cells = self.cells
|
|
358
446
|
table_list = [["" for _ in range(self.number_of_columns)] for _ in range(self.number_of_rows)] # type: ignore
|
|
359
447
|
for cell in cells:
|
|
448
|
+
if cell.category_name == CellType.SPANNING:
|
|
449
|
+
log_once(
|
|
450
|
+
"Table has spanning cells. This implies, that the .csv output will not be correct."
|
|
451
|
+
"To prevent spanning cell table creation set PT.ITEM.FILTER=['table','spanning'] ",
|
|
452
|
+
"error",
|
|
453
|
+
)
|
|
360
454
|
table_list[cell.row_number - 1][cell.column_number - 1] = ( # type: ignore
|
|
361
455
|
table_list[cell.row_number - 1][cell.column_number - 1] + cell.text + " " # type: ignore
|
|
362
456
|
)
|
|
@@ -386,13 +480,13 @@ class Table(Layout):
|
|
|
386
480
|
token_class_ids: list[str] = []
|
|
387
481
|
token_tag_ids: list[str] = []
|
|
388
482
|
for cell in cells:
|
|
389
|
-
text.extend(cell.text_["text"])
|
|
390
|
-
words.extend(cell.text_["words"])
|
|
391
|
-
ann_ids.extend(cell.text_["ann_ids"])
|
|
392
|
-
token_classes.extend(cell.text_["token_classes"])
|
|
393
|
-
token_tags.extend(cell.text_["token_tags"])
|
|
394
|
-
token_class_ids.extend(cell.text_["token_class_ids"])
|
|
395
|
-
token_tag_ids.extend(cell.text_["token_tag_ids"])
|
|
483
|
+
text.extend(cell.text_["text"])
|
|
484
|
+
words.extend(cell.text_["words"])
|
|
485
|
+
ann_ids.extend(cell.text_["ann_ids"])
|
|
486
|
+
token_classes.extend(cell.text_["token_classes"])
|
|
487
|
+
token_tags.extend(cell.text_["token_tags"])
|
|
488
|
+
token_class_ids.extend(cell.text_["token_class_ids"])
|
|
489
|
+
token_tag_ids.extend(cell.text_["token_tag_ids"])
|
|
396
490
|
return {
|
|
397
491
|
"text": " ".join(text),
|
|
398
492
|
"words": words,
|
|
@@ -414,7 +508,7 @@ class Table(Layout):
|
|
|
414
508
|
if not cells:
|
|
415
509
|
return super().words
|
|
416
510
|
for cell in cells:
|
|
417
|
-
all_words.extend(cell.words)
|
|
511
|
+
all_words.extend(cell.words)
|
|
418
512
|
return all_words
|
|
419
513
|
|
|
420
514
|
def get_ordered_words(self) -> list[ImageAnnotationBaseView]:
|
|
@@ -424,7 +518,7 @@ class Table(Layout):
|
|
|
424
518
|
all_words = []
|
|
425
519
|
cells.sort(key=lambda x: (x.ROW_NUMBER, x.COLUMN_NUMBER))
|
|
426
520
|
for cell in cells:
|
|
427
|
-
all_words.extend(cell.get_ordered_words())
|
|
521
|
+
all_words.extend(cell.get_ordered_words())
|
|
428
522
|
return all_words
|
|
429
523
|
except (TypeError, AnnotationError):
|
|
430
524
|
return super().get_ordered_words()
|
|
@@ -436,10 +530,10 @@ IMAGE_ANNOTATION_TO_LAYOUTS: dict[ObjectTypes, Type[Union[Layout, Table, Word]]]
|
|
|
436
530
|
LayoutType.TABLE_ROTATED: Table,
|
|
437
531
|
LayoutType.WORD: Word,
|
|
438
532
|
LayoutType.CELL: Cell,
|
|
439
|
-
CellType.PROJECTED_ROW_HEADER: Cell,
|
|
440
533
|
CellType.SPANNING: Cell,
|
|
441
534
|
CellType.ROW_HEADER: Cell,
|
|
442
535
|
CellType.COLUMN_HEADER: Cell,
|
|
536
|
+
CellType.PROJECTED_ROW_HEADER: Cell,
|
|
443
537
|
}
|
|
444
538
|
|
|
445
539
|
|
|
@@ -465,10 +559,7 @@ IMAGE_DEFAULTS: ImageDefaults = {
|
|
|
465
559
|
LayoutType.LIST,
|
|
466
560
|
LayoutType.CELL,
|
|
467
561
|
LayoutType.FIGURE,
|
|
468
|
-
CellType.COLUMN_HEADER,
|
|
469
|
-
CellType.PROJECTED_ROW_HEADER,
|
|
470
562
|
CellType.SPANNING,
|
|
471
|
-
CellType.ROW_HEADER,
|
|
472
563
|
),
|
|
473
564
|
}
|
|
474
565
|
|
|
@@ -851,6 +942,16 @@ class Page(Image):
|
|
|
851
942
|
"""
|
|
852
943
|
return self._make_text(False)
|
|
853
944
|
|
|
945
|
+
def _ann_viz_bbox(self, ann: ImageAnnotationBaseView) -> list[float]:
|
|
946
|
+
"""
|
|
947
|
+
Get the bounding box of the annotation as a list, in absolute coordinates of the base page.
|
|
948
|
+
"""
|
|
949
|
+
bounding_box = ann.get_bounding_box(self.image_id)
|
|
950
|
+
|
|
951
|
+
if not bounding_box.absolute_coords:
|
|
952
|
+
bounding_box = bounding_box.transform(self.width, self.height, absolute_coords=True)
|
|
953
|
+
return bounding_box.to_list(mode="xyxy")
|
|
954
|
+
|
|
854
955
|
@no_type_check
|
|
855
956
|
def viz(
|
|
856
957
|
self,
|
|
@@ -886,6 +987,7 @@ class Page(Image):
|
|
|
886
987
|
:param show_tables: Will display all tables boxes as well as cells, rows and columns
|
|
887
988
|
:param show_layouts: Will display all other layout components.
|
|
888
989
|
:param show_figures: Will display all figures
|
|
990
|
+
:param show_residual_layouts: Will display all residual layouts
|
|
889
991
|
:param show_cells: Will display cells within tables. (Only available if `show_tables=True`)
|
|
890
992
|
:param show_table_structure: Will display rows and columns
|
|
891
993
|
:param show_words: Will display bounding boxes around words labeled with token class and bio tag (experimental)
|
|
@@ -910,50 +1012,46 @@ class Page(Image):
|
|
|
910
1012
|
if debug_kwargs:
|
|
911
1013
|
anns = self.get_annotation(category_names=list(debug_kwargs.keys()))
|
|
912
1014
|
for ann in anns:
|
|
913
|
-
box_stack.append(ann
|
|
1015
|
+
box_stack.append(self._ann_viz_bbox(ann))
|
|
914
1016
|
category_names_list.append(str(getattr(ann, debug_kwargs[ann.category_name])))
|
|
915
1017
|
|
|
916
1018
|
if show_layouts and not debug_kwargs:
|
|
917
1019
|
for item in self.layouts:
|
|
918
|
-
box_stack.append(item
|
|
1020
|
+
box_stack.append(self._ann_viz_bbox(item))
|
|
919
1021
|
category_names_list.append(item.category_name.value)
|
|
920
1022
|
|
|
921
1023
|
if show_figures and not debug_kwargs:
|
|
922
1024
|
for item in self.figures:
|
|
923
|
-
box_stack.append(item
|
|
1025
|
+
box_stack.append(self._ann_viz_bbox(item))
|
|
924
1026
|
category_names_list.append(item.category_name.value)
|
|
925
1027
|
|
|
926
1028
|
if show_tables and not debug_kwargs:
|
|
927
1029
|
for table in self.tables:
|
|
928
|
-
box_stack.append(table
|
|
1030
|
+
box_stack.append(self._ann_viz_bbox(table))
|
|
929
1031
|
category_names_list.append(LayoutType.TABLE.value)
|
|
930
1032
|
if show_cells:
|
|
931
1033
|
for cell in table.cells:
|
|
932
1034
|
if cell.category_name in {
|
|
933
1035
|
LayoutType.CELL,
|
|
934
|
-
CellType.PROJECTED_ROW_HEADER,
|
|
935
1036
|
CellType.SPANNING,
|
|
936
|
-
CellType.ROW_HEADER,
|
|
937
|
-
CellType.COLUMN_HEADER,
|
|
938
1037
|
}:
|
|
939
1038
|
cells_found = True
|
|
940
|
-
box_stack.append(cell
|
|
1039
|
+
box_stack.append(self._ann_viz_bbox(cell))
|
|
941
1040
|
category_names_list.append(None)
|
|
942
1041
|
if show_table_structure:
|
|
943
1042
|
rows = table.rows
|
|
944
1043
|
cols = table.columns
|
|
945
1044
|
for row in rows:
|
|
946
|
-
box_stack.append(row
|
|
1045
|
+
box_stack.append(self._ann_viz_bbox(row))
|
|
947
1046
|
category_names_list.append(None)
|
|
948
1047
|
for col in cols:
|
|
949
|
-
box_stack.append(col
|
|
1048
|
+
box_stack.append(self._ann_viz_bbox(col))
|
|
950
1049
|
category_names_list.append(None)
|
|
951
1050
|
|
|
952
1051
|
if show_cells and not cells_found and not debug_kwargs:
|
|
953
|
-
for ann in self.
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
category_names_list.append(None)
|
|
1052
|
+
for ann in self.get_annotation(category_names=[LayoutType.CELL, CellType.SPANNING]):
|
|
1053
|
+
box_stack.append(self._ann_viz_bbox(ann))
|
|
1054
|
+
category_names_list.append(None)
|
|
957
1055
|
|
|
958
1056
|
if show_words and not debug_kwargs:
|
|
959
1057
|
all_words = []
|
|
@@ -965,7 +1063,7 @@ class Page(Image):
|
|
|
965
1063
|
all_words = self.get_annotation(category_names=LayoutType.WORD)
|
|
966
1064
|
if not ignore_default_token_class:
|
|
967
1065
|
for word in all_words:
|
|
968
|
-
box_stack.append(word
|
|
1066
|
+
box_stack.append(self._ann_viz_bbox(word))
|
|
969
1067
|
if show_token_class:
|
|
970
1068
|
category_names_list.append(word.token_class.value if word.token_class is not None else None)
|
|
971
1069
|
else:
|
|
@@ -973,7 +1071,7 @@ class Page(Image):
|
|
|
973
1071
|
else:
|
|
974
1072
|
for word in all_words:
|
|
975
1073
|
if word.token_class is not None and word.token_class != TokenClasses.OTHER:
|
|
976
|
-
box_stack.append(word
|
|
1074
|
+
box_stack.append(self._ann_viz_bbox(word))
|
|
977
1075
|
if show_token_class:
|
|
978
1076
|
category_names_list.append(word.token_class.value if word.token_class is not None else None)
|
|
979
1077
|
else:
|
deepdoctection/extern/hfdetr.py
CHANGED
|
@@ -41,6 +41,7 @@ with try_import() as tr_import_guard:
|
|
|
41
41
|
from transformers import ( # pylint: disable=W0611
|
|
42
42
|
AutoFeatureExtractor,
|
|
43
43
|
DetrFeatureExtractor,
|
|
44
|
+
DetrImageProcessor,
|
|
44
45
|
PretrainedConfig,
|
|
45
46
|
TableTransformerForObjectDetection,
|
|
46
47
|
)
|
|
@@ -55,7 +56,7 @@ def _detr_post_processing(
|
|
|
55
56
|
def detr_predict_image(
|
|
56
57
|
np_img: PixelValues,
|
|
57
58
|
predictor: TableTransformerForObjectDetection,
|
|
58
|
-
feature_extractor:
|
|
59
|
+
feature_extractor: DetrImageProcessor,
|
|
59
60
|
device: torch.device,
|
|
60
61
|
threshold: float,
|
|
61
62
|
nms_threshold: float,
|
|
@@ -224,13 +225,13 @@ class HFDetrDerivedDetector(HFDetrDerivedDetectorMixin):
|
|
|
224
225
|
)
|
|
225
226
|
|
|
226
227
|
@staticmethod
|
|
227
|
-
def get_pre_processor(path_feature_extractor_config: PathLikeOrStr) ->
|
|
228
|
+
def get_pre_processor(path_feature_extractor_config: PathLikeOrStr) -> DetrImageProcessor:
|
|
228
229
|
"""
|
|
229
230
|
Builds the feature extractor
|
|
230
231
|
|
|
231
232
|
:return: DetrImageProcessor
|
|
232
233
|
"""
|
|
233
|
-
return
|
|
234
|
+
return DetrImageProcessor.from_pretrained(
|
|
234
235
|
pretrained_model_name_or_path=os.fspath(path_feature_extractor_config)
|
|
235
236
|
)
|
|
236
237
|
|
deepdoctection/extern/model.py
CHANGED
|
@@ -24,7 +24,7 @@ from dataclasses import asdict, dataclass, field
|
|
|
24
24
|
from typing import Any, Mapping, Optional, Union
|
|
25
25
|
|
|
26
26
|
import jsonlines
|
|
27
|
-
from huggingface_hub import
|
|
27
|
+
from huggingface_hub import hf_hub_download
|
|
28
28
|
from tabulate import tabulate
|
|
29
29
|
from termcolor import colored
|
|
30
30
|
|
|
@@ -136,51 +136,6 @@ class ModelCatalog:
|
|
|
136
136
|
dl_library="TF",
|
|
137
137
|
model_wrapper="TPFrcnnDetector",
|
|
138
138
|
),
|
|
139
|
-
"item/model-1620000.data-00000-of-00001": ModelProfile(
|
|
140
|
-
name="item/model-1620000.data-00000-of-00001",
|
|
141
|
-
description="Tensorpack row/column detection model trained on Pubtabnet",
|
|
142
|
-
config="dd/tp/conf_frcnn_rows.yaml",
|
|
143
|
-
size=[823546048, 25787],
|
|
144
|
-
tp_model=True,
|
|
145
|
-
hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc",
|
|
146
|
-
hf_model_name="model-1620000",
|
|
147
|
-
hf_config_file=["conf_frcnn_rows.yaml"],
|
|
148
|
-
categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
|
|
149
|
-
dl_library="TF",
|
|
150
|
-
model_wrapper="TPFrcnnDetector",
|
|
151
|
-
),
|
|
152
|
-
"layout/model-800000.data-00000-of-00001": ModelProfile(
|
|
153
|
-
name="layout/model-800000.data-00000-of-00001",
|
|
154
|
-
description="Tensorpack layout detection model trained on Publaynet",
|
|
155
|
-
config="dd/tp/conf_frcnn_layout.yaml",
|
|
156
|
-
size=[823656748, 25796],
|
|
157
|
-
tp_model=True,
|
|
158
|
-
hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_publaynet",
|
|
159
|
-
hf_model_name="model-800000",
|
|
160
|
-
hf_config_file=["conf_frcnn_layout.yaml"],
|
|
161
|
-
dl_library="TF",
|
|
162
|
-
categories={
|
|
163
|
-
1: LayoutType.TEXT,
|
|
164
|
-
2: LayoutType.TITLE,
|
|
165
|
-
3: LayoutType.LIST,
|
|
166
|
-
4: LayoutType.TABLE,
|
|
167
|
-
5: LayoutType.FIGURE,
|
|
168
|
-
},
|
|
169
|
-
model_wrapper="TPFrcnnDetector",
|
|
170
|
-
),
|
|
171
|
-
"cell/model-1800000.data-00000-of-00001": ModelProfile(
|
|
172
|
-
name="cell/model-1800000.data-00000-of-00001",
|
|
173
|
-
description="Tensorpack cell detection model trained on Pubtabnet",
|
|
174
|
-
config="dd/tp/conf_frcnn_cell.yaml",
|
|
175
|
-
size=[823509160, 25905],
|
|
176
|
-
tp_model=True,
|
|
177
|
-
hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c",
|
|
178
|
-
hf_model_name="model-1800000",
|
|
179
|
-
hf_config_file=["conf_frcnn_cell.yaml"],
|
|
180
|
-
categories={1: LayoutType.CELL},
|
|
181
|
-
dl_library="TF",
|
|
182
|
-
model_wrapper="TPFrcnnDetector",
|
|
183
|
-
),
|
|
184
139
|
"layout/d2_model_0829999_layout_inf_only.pt": ModelProfile(
|
|
185
140
|
name="layout/d2_model_0829999_layout_inf_only.pt",
|
|
186
141
|
description="Detectron2 layout detection model trained on Publaynet",
|
|
@@ -200,25 +155,6 @@ class ModelCatalog:
|
|
|
200
155
|
dl_library="PT",
|
|
201
156
|
model_wrapper="D2FrcnnDetector",
|
|
202
157
|
),
|
|
203
|
-
"layout/d2_model_0829999_layout.pth": ModelProfile(
|
|
204
|
-
name="layout/d2_model_0829999_layout.pth",
|
|
205
|
-
description="Detectron2 layout detection model trained on Publaynet. Checkpoint for resuming training",
|
|
206
|
-
config="dd/d2/layout/CASCADE_RCNN_R_50_FPN_GN.yaml",
|
|
207
|
-
size=[548377327],
|
|
208
|
-
tp_model=False,
|
|
209
|
-
hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_publaynet_inference_only",
|
|
210
|
-
hf_model_name="d2_model_0829999_layout.pth",
|
|
211
|
-
hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
|
|
212
|
-
categories={
|
|
213
|
-
1: LayoutType.TEXT,
|
|
214
|
-
2: LayoutType.TITLE,
|
|
215
|
-
3: LayoutType.LIST,
|
|
216
|
-
4: LayoutType.TABLE,
|
|
217
|
-
5: LayoutType.FIGURE,
|
|
218
|
-
},
|
|
219
|
-
dl_library="PT",
|
|
220
|
-
model_wrapper="D2FrcnnDetector",
|
|
221
|
-
),
|
|
222
158
|
"layout/d2_model_0829999_layout_inf_only.ts": ModelProfile(
|
|
223
159
|
name="layout/d2_model_0829999_layout_inf_only.ts",
|
|
224
160
|
description="Detectron2 layout detection model trained on Publaynet. Torchscript export",
|
|
@@ -264,32 +200,6 @@ class ModelCatalog:
|
|
|
264
200
|
dl_library="PT",
|
|
265
201
|
model_wrapper="D2FrcnnTracingDetector",
|
|
266
202
|
),
|
|
267
|
-
"cell/d2_model_1849999_cell.pth": ModelProfile(
|
|
268
|
-
name="cell/d2_model_1849999_cell.pth",
|
|
269
|
-
description="Detectron2 cell detection inference only model trained on Pubtabnet",
|
|
270
|
-
config="dd/d2/cell/CASCADE_RCNN_R_50_FPN_GN.yaml",
|
|
271
|
-
size=[548279023],
|
|
272
|
-
tp_model=False,
|
|
273
|
-
hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only",
|
|
274
|
-
hf_model_name="cell/d2_model_1849999_cell.pth",
|
|
275
|
-
hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
|
|
276
|
-
categories={1: LayoutType.CELL},
|
|
277
|
-
dl_library="PT",
|
|
278
|
-
model_wrapper="D2FrcnnDetector",
|
|
279
|
-
),
|
|
280
|
-
"item/d2_model_1639999_item.pth": ModelProfile(
|
|
281
|
-
name="item/d2_model_1639999_item.pth",
|
|
282
|
-
description="Detectron2 item detection model trained on Pubtabnet",
|
|
283
|
-
config="dd/d2/item/CASCADE_RCNN_R_50_FPN_GN.yaml",
|
|
284
|
-
size=[548303599],
|
|
285
|
-
tp_model=False,
|
|
286
|
-
hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only",
|
|
287
|
-
hf_model_name="d2_model_1639999_item.pth",
|
|
288
|
-
hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
|
|
289
|
-
categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
|
|
290
|
-
dl_library="PT",
|
|
291
|
-
model_wrapper="D2FrcnnDetector",
|
|
292
|
-
),
|
|
293
203
|
"item/d2_model_1639999_item_inf_only.pt": ModelProfile(
|
|
294
204
|
name="item/d2_model_1639999_item_inf_only.pt",
|
|
295
205
|
description="Detectron2 item detection model inference only trained on Pubtabnet",
|
|
@@ -1232,20 +1142,19 @@ class ModelDownloadManager:
|
|
|
1232
1142
|
def _load_from_hf_hub(
|
|
1233
1143
|
repo_id: str, file_name: str, cache_directory: PathLikeOrStr, force_download: bool = False
|
|
1234
1144
|
) -> int:
|
|
1235
|
-
url = hf_hub_url(repo_id=repo_id, filename=file_name)
|
|
1236
1145
|
token = os.environ.get("HF_CREDENTIALS", None)
|
|
1237
|
-
f_path =
|
|
1238
|
-
|
|
1239
|
-
|
|
1146
|
+
f_path = hf_hub_download(
|
|
1147
|
+
repo_id,
|
|
1148
|
+
file_name,
|
|
1149
|
+
local_dir=cache_directory, # type: ignore
|
|
1240
1150
|
force_filename=file_name,
|
|
1241
1151
|
force_download=force_download,
|
|
1242
1152
|
token=token,
|
|
1243
|
-
legacy_cache_layout=True,
|
|
1244
1153
|
)
|
|
1245
1154
|
if f_path:
|
|
1246
1155
|
stat_info = os.stat(f_path)
|
|
1247
1156
|
size = stat_info.st_size
|
|
1248
1157
|
|
|
1249
|
-
assert size > 0, f"Downloaded an empty file from {
|
|
1158
|
+
assert size > 0, f"Downloaded an empty file from {f_path}!"
|
|
1250
1159
|
return size
|
|
1251
1160
|
raise TypeError("Returned value from cached_download cannot be Null")
|
deepdoctection/mapper/cats.py
CHANGED
|
@@ -73,18 +73,21 @@ def re_assign_cat_ids(
|
|
|
73
73
|
Annotations that are not in the dictionary provided will be removed.
|
|
74
74
|
|
|
75
75
|
:param dp: Image
|
|
76
|
-
:param categories_dict_name_as_key: e.g. `{LayoutType.word:
|
|
76
|
+
:param categories_dict_name_as_key: e.g. `{LayoutType.word: 1}`
|
|
77
77
|
:param cat_to_sub_cat_mapping: e.g. `{<LayoutType.word>:
|
|
78
78
|
{<WordType.token_class>:
|
|
79
|
-
{<FundsFirstPage.
|
|
80
|
-
<FundsFirstPage.
|
|
81
|
-
<FundsFirstPage.
|
|
82
|
-
<FundsFirstPage.
|
|
83
|
-
<TokenClasses.
|
|
84
|
-
<WordType.
|
|
85
|
-
{<BioTag.
|
|
86
|
-
<BioTag.
|
|
87
|
-
<BioTag.
|
|
79
|
+
{<FundsFirstPage.REPORT_DATE>: 1,
|
|
80
|
+
<FundsFirstPage.REPORT_TYPE>: 2,
|
|
81
|
+
<FundsFirstPage.UMBRELLA>: 3,
|
|
82
|
+
<FundsFirstPage.FUND_NAME>: 4,
|
|
83
|
+
<TokenClasses.OTHER>: 5},
|
|
84
|
+
<WordType.TAG>:
|
|
85
|
+
{<BioTag.INSIDE>: 1,
|
|
86
|
+
<BioTag.OUTSIDE>: 2,
|
|
87
|
+
<BioTag.BEGIN>: 3}}}`
|
|
88
|
+
To re-assign the category ids of an image summary, use the key 'default_type' for the default category, e.g.
|
|
89
|
+
`{DefaultType.DEFAULT_TYPE: {<PageType.DOCUMENT_TYPE>: {<DocumentType.INVOICE>:1,
|
|
90
|
+
<DocumentType.BANK_STATEMENT>:2}}}`
|
|
88
91
|
:return: Image
|
|
89
92
|
"""
|
|
90
93
|
|
|
@@ -104,6 +107,14 @@ def re_assign_cat_ids(
|
|
|
104
107
|
sub_category = ann.get_sub_category(key)
|
|
105
108
|
sub_category.category_id = sub_cat_values_dict.get(sub_category.category_name, DEFAULT_CATEGORY_ID)
|
|
106
109
|
|
|
110
|
+
if cat_to_sub_cat_mapping:
|
|
111
|
+
if "default_type" in cat_to_sub_cat_mapping:
|
|
112
|
+
sub_cat_keys_to_sub_cat_values = cat_to_sub_cat_mapping[get_type("default_type")]
|
|
113
|
+
for key in sub_cat_keys_to_sub_cat_values:
|
|
114
|
+
sub_cat_values_dict = sub_cat_keys_to_sub_cat_values[key]
|
|
115
|
+
sub_category = dp.summary.get_sub_category(key)
|
|
116
|
+
sub_category.category_id = sub_cat_values_dict.get(sub_category.category_name, DEFAULT_CATEGORY_ID)
|
|
117
|
+
|
|
107
118
|
dp.remove(annotation_ids=ann_ids_to_remove)
|
|
108
119
|
|
|
109
120
|
return dp
|
deepdoctection/mapper/match.py
CHANGED
|
@@ -101,17 +101,6 @@ def match_anns_by_intersection(
|
|
|
101
101
|
]
|
|
102
102
|
)
|
|
103
103
|
|
|
104
|
-
# second try, if ann has empty image
|
|
105
|
-
n_dim = child_ann_boxes.ndim
|
|
106
|
-
if n_dim != 2:
|
|
107
|
-
child_ann_boxes = np.array(
|
|
108
|
-
[
|
|
109
|
-
ann.bounding_box.transform(dp.width, dp.height, absolute_coords=True).to_list(mode="xyxy")
|
|
110
|
-
for ann in child_anns
|
|
111
|
-
if ann.bounding_box is not None
|
|
112
|
-
]
|
|
113
|
-
)
|
|
114
|
-
|
|
115
104
|
parent_anns = dp.get_annotation(annotation_ids=parent_ann_ids, category_names=parent_ann_category_names)
|
|
116
105
|
parent_ann_boxes = np.array(
|
|
117
106
|
[
|
|
@@ -120,17 +109,6 @@ def match_anns_by_intersection(
|
|
|
120
109
|
]
|
|
121
110
|
)
|
|
122
111
|
|
|
123
|
-
# same for parent
|
|
124
|
-
n_dim = parent_ann_boxes.ndim
|
|
125
|
-
if n_dim != 2:
|
|
126
|
-
parent_ann_boxes = np.array(
|
|
127
|
-
[
|
|
128
|
-
ann.bounding_box.transform(dp.width, dp.height, absolute_coords=True).to_list(mode="xyxy")
|
|
129
|
-
for ann in parent_anns
|
|
130
|
-
if ann.bounding_box is not None
|
|
131
|
-
]
|
|
132
|
-
)
|
|
133
|
-
|
|
134
112
|
if matching_rule in ["iou"] and parent_anns and child_anns:
|
|
135
113
|
iou_matrix = iou(child_ann_boxes, parent_ann_boxes)
|
|
136
114
|
output = iou_matrix > threshold
|
deepdoctection/mapper/misc.py
CHANGED
|
@@ -38,12 +38,20 @@ with try_import() as import_guard:
|
|
|
38
38
|
from lxml import etree # pylint: disable=W0611
|
|
39
39
|
|
|
40
40
|
|
|
41
|
-
def to_image(
|
|
41
|
+
def to_image(
|
|
42
|
+
dp: Union[str, Mapping[str, Union[str, bytes]]],
|
|
43
|
+
dpi: Optional[int] = None,
|
|
44
|
+
width: Optional[int] = None,
|
|
45
|
+
height: Optional[int] = None,
|
|
46
|
+
) -> Optional[Image]:
|
|
42
47
|
"""
|
|
43
48
|
Mapping an input from `dataflow.SerializerFiles` or similar to an Image
|
|
44
49
|
|
|
45
50
|
:param dp: Image
|
|
46
51
|
:param dpi: dot per inch definition for pdf resolution when converting to numpy array
|
|
52
|
+
:param width: target width of the image. This option only works when using Poppler as PDF renderer
|
|
53
|
+
:param height: target height of the image. This option only works when using Poppler as PDF renderer
|
|
54
|
+
:param height: target height of the image
|
|
47
55
|
:return: Image
|
|
48
56
|
"""
|
|
49
57
|
|
|
@@ -77,7 +85,9 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
|
|
|
77
85
|
dp_image.pdf_bytes = dp.get("pdf_bytes")
|
|
78
86
|
if dp_image.pdf_bytes is not None:
|
|
79
87
|
if isinstance(dp_image.pdf_bytes, bytes):
|
|
80
|
-
dp_image.image = convert_pdf_bytes_to_np_array_v2(
|
|
88
|
+
dp_image.image = convert_pdf_bytes_to_np_array_v2(
|
|
89
|
+
dp_image.pdf_bytes, dpi=dpi, width=width, height=height
|
|
90
|
+
)
|
|
81
91
|
elif image_bytes is not None:
|
|
82
92
|
dp_image.image = convert_bytes_to_np_array(image_bytes)
|
|
83
93
|
else:
|
|
@@ -393,7 +393,7 @@ def pub_to_image_uncur( # pylint: disable=R0914
|
|
|
393
393
|
np_image = load_image_from_file(dp["filename"])
|
|
394
394
|
if is_file_extension(dp["filename"], ".pdf"):
|
|
395
395
|
pdf_bytes = load_bytes_from_pdf_file(dp["filename"])
|
|
396
|
-
np_image = convert_pdf_bytes_to_np_array_v2(pdf_bytes)
|
|
396
|
+
np_image = convert_pdf_bytes_to_np_array_v2(pdf_bytes, dpi=200)
|
|
397
397
|
dp = _convert_boxes(dp, np_image.shape[0])
|
|
398
398
|
|
|
399
399
|
if load_image and np_image is not None:
|