deepdoctection-0.30-py3-none-any.whl → deepdoctection-0.32-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- deepdoctection/__init__.py +38 -29
- deepdoctection/analyzer/dd.py +36 -29
- deepdoctection/configs/conf_dd_one.yaml +34 -31
- deepdoctection/dataflow/base.py +0 -19
- deepdoctection/dataflow/custom.py +4 -3
- deepdoctection/dataflow/custom_serialize.py +14 -5
- deepdoctection/dataflow/parallel_map.py +12 -11
- deepdoctection/dataflow/serialize.py +5 -4
- deepdoctection/datapoint/annotation.py +35 -13
- deepdoctection/datapoint/box.py +3 -5
- deepdoctection/datapoint/convert.py +3 -1
- deepdoctection/datapoint/image.py +79 -36
- deepdoctection/datapoint/view.py +152 -49
- deepdoctection/datasets/__init__.py +1 -4
- deepdoctection/datasets/adapter.py +6 -3
- deepdoctection/datasets/base.py +86 -11
- deepdoctection/datasets/dataflow_builder.py +1 -1
- deepdoctection/datasets/info.py +4 -4
- deepdoctection/datasets/instances/doclaynet.py +3 -2
- deepdoctection/datasets/instances/fintabnet.py +2 -1
- deepdoctection/datasets/instances/funsd.py +2 -1
- deepdoctection/datasets/instances/iiitar13k.py +5 -2
- deepdoctection/datasets/instances/layouttest.py +4 -8
- deepdoctection/datasets/instances/publaynet.py +2 -2
- deepdoctection/datasets/instances/pubtables1m.py +6 -3
- deepdoctection/datasets/instances/pubtabnet.py +2 -1
- deepdoctection/datasets/instances/rvlcdip.py +2 -1
- deepdoctection/datasets/instances/xfund.py +2 -1
- deepdoctection/eval/__init__.py +1 -4
- deepdoctection/eval/accmetric.py +1 -1
- deepdoctection/eval/base.py +5 -4
- deepdoctection/eval/cocometric.py +2 -1
- deepdoctection/eval/eval.py +19 -15
- deepdoctection/eval/tedsmetric.py +14 -11
- deepdoctection/eval/tp_eval_callback.py +14 -7
- deepdoctection/extern/__init__.py +2 -7
- deepdoctection/extern/base.py +39 -13
- deepdoctection/extern/d2detect.py +182 -90
- deepdoctection/extern/deskew.py +36 -9
- deepdoctection/extern/doctrocr.py +265 -83
- deepdoctection/extern/fastlang.py +49 -9
- deepdoctection/extern/hfdetr.py +106 -55
- deepdoctection/extern/hflayoutlm.py +441 -122
- deepdoctection/extern/hflm.py +225 -0
- deepdoctection/extern/model.py +56 -47
- deepdoctection/extern/pdftext.py +10 -5
- deepdoctection/extern/pt/__init__.py +1 -3
- deepdoctection/extern/pt/nms.py +6 -2
- deepdoctection/extern/pt/ptutils.py +27 -18
- deepdoctection/extern/tessocr.py +134 -22
- deepdoctection/extern/texocr.py +6 -2
- deepdoctection/extern/tp/tfutils.py +43 -9
- deepdoctection/extern/tp/tpcompat.py +14 -11
- deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
- deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
- deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
- deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
- deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
- deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
- deepdoctection/extern/tpdetect.py +54 -30
- deepdoctection/mapper/__init__.py +3 -8
- deepdoctection/mapper/d2struct.py +9 -7
- deepdoctection/mapper/hfstruct.py +7 -2
- deepdoctection/mapper/laylmstruct.py +164 -21
- deepdoctection/mapper/maputils.py +16 -3
- deepdoctection/mapper/misc.py +6 -3
- deepdoctection/mapper/prodigystruct.py +1 -1
- deepdoctection/mapper/pubstruct.py +10 -10
- deepdoctection/mapper/tpstruct.py +3 -3
- deepdoctection/pipe/__init__.py +1 -1
- deepdoctection/pipe/anngen.py +35 -8
- deepdoctection/pipe/base.py +53 -19
- deepdoctection/pipe/common.py +23 -13
- deepdoctection/pipe/concurrency.py +2 -1
- deepdoctection/pipe/doctectionpipe.py +2 -2
- deepdoctection/pipe/language.py +3 -2
- deepdoctection/pipe/layout.py +6 -3
- deepdoctection/pipe/lm.py +34 -66
- deepdoctection/pipe/order.py +142 -35
- deepdoctection/pipe/refine.py +26 -24
- deepdoctection/pipe/segment.py +21 -16
- deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
- deepdoctection/pipe/text.py +14 -8
- deepdoctection/pipe/transform.py +16 -9
- deepdoctection/train/__init__.py +6 -12
- deepdoctection/train/d2_frcnn_train.py +36 -28
- deepdoctection/train/hf_detr_train.py +26 -17
- deepdoctection/train/hf_layoutlm_train.py +133 -111
- deepdoctection/train/tp_frcnn_train.py +21 -19
- deepdoctection/utils/__init__.py +3 -0
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +2 -2
- deepdoctection/utils/env_info.py +41 -84
- deepdoctection/utils/error.py +84 -0
- deepdoctection/utils/file_utils.py +4 -15
- deepdoctection/utils/fs.py +7 -7
- deepdoctection/utils/logger.py +1 -0
- deepdoctection/utils/mocks.py +93 -0
- deepdoctection/utils/pdf_utils.py +5 -4
- deepdoctection/utils/settings.py +6 -1
- deepdoctection/utils/transform.py +1 -1
- deepdoctection/utils/utils.py +0 -6
- deepdoctection/utils/viz.py +48 -5
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
- deepdoctection-0.32.dist-info/RECORD +146 -0
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
- deepdoctection-0.30.dist-info/RECORD +0 -143
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
deepdoctection/datapoint/view.py
CHANGED
@@ -19,6 +19,7 @@
 Subclasses for ImageAnnotation and Image objects with various properties. These classes
 simplify consumption
 """
+from __future__ import annotations

 from copy import copy
 from typing import Any, Dict, List, Mapping, Optional, Sequence, Set, Tuple, Type, Union, no_type_check
@@ -26,6 +27,7 @@ from typing import Any, Dict, List, Mapping, Optional, Sequence, Set, Tuple, Type, Union, no_type_check
 import numpy as np

 from ..utils.detection_types import ImageType, JsonDict, Pathlike
+from ..utils.error import AnnotationError, ImageError
 from ..utils.logger import LoggingRecord, logger
 from ..utils.settings import (
     CellType,
@@ -63,7 +65,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
    base_page: `Page` class instantiated by the lowest hierarchy `Image`
    """

-    base_page:
+    base_page: Page

    @property
    def bbox(self) -> List[float]:
@@ -96,7 +98,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
                interactive_imshow(np_image)
                return None
            return np_image
-        raise
+        raise AnnotationError(f"base_page.image is None for {self.annotation_id}")

    def __getattr__(self, item: str) -> Optional[Union[str, int, List[str]]]:
        """
@@ -115,7 +117,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
        :return: value according to the logic described above
        """
        if item not in self.get_attribute_names():
-            raise
+            raise AnnotationError(f"Attribute {item} is not supported for {type(self)}")
        if item in self.sub_categories:
            sub_cat = self.get_sub_category(get_type(item))
            if item != sub_cat.category_name:
@@ -147,7 +149,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
        return attribute_names

    @classmethod
-    def from_dict(cls, **kwargs: JsonDict) ->
+    def from_dict(cls, **kwargs: JsonDict) -> ImageAnnotationBaseView:
        """
        Identical to its base class method for having correct return types. If the base class changes, please
        change this method as well.
@@ -204,15 +206,38 @@ class Layout(ImageAnnotationBaseView):
        return words_with_reading_order

    @property
-    def text_(self) ->
+    def text_(self) -> JsonDict:
        """Returns a dict `{"text": text string,
        "text_list": list of single words,
        "annotation_ids": word annotation ids`"""
        words = self.get_ordered_words()
+        characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
+            *[
+                (
+                    word.characters,
+                    word.annotation_id,
+                    word.token_class,
+                    word.token_tag,
+                    (
+                        word.get_sub_category(WordType.token_class).category_id
+                        if WordType.token_class in word.sub_categories
+                        else None
+                    ),
+                    (word.get_sub_category(WordType.token_tag).category_id)
+                    if WordType.token_tag in word.sub_categories
+                    else None,
+                )
+                for word in words
+            ]
+        )
        return {
-            "text": " ".join(
-            "
-            "
+            "text": " ".join(characters),
+            "words": characters,
+            "ann_ids": ann_ids,
+            "token_classes": token_classes,
+            "token_tags": token_tags,
+            "token_class_ids": token_classes_ids,
+            "token_tag_ids": token_tag_ids,
        }

    def get_attribute_names(self) -> Set[str]:
@@ -326,23 +351,37 @@ class Table(Layout):
    def text(self) -> str:
        try:
            return str(self)
-        except TypeError:
+        except (TypeError, AnnotationError):
            return super().text

    @property
-    def text_(self) ->
+    def text_(self) -> JsonDict:
        cells = self.cells
        if not cells:
            return super().text_
-
-
+        text: List[str] = []
+        words: List[str] = []
+        ann_ids: List[str] = []
+        token_classes: List[str] = []
+        token_tags: List[str] = []
+        token_class_ids: List[str] = []
+        token_tag_ids: List[str] = []
        for cell in cells:
-
-
+            text.extend(cell.text_["text"])  # type: ignore
+            words.extend(cell.text_["words"])  # type: ignore
+            ann_ids.extend(cell.text_["ann_ids"])  # type: ignore
+            token_classes.extend(cell.text_["token_classes"])  # type: ignore
+            token_tags.extend(cell.text_["token_tags"])  # type: ignore
+            token_class_ids.extend(cell.text_["token_class_ids"])  # type: ignore
+            token_tag_ids.extend(cell.text_["token_tag_ids"])  # type: ignore
        return {
-            "text": " ".join(
-            "
-            "
+            "text": " ".join(text),
+            "words": words,
+            "ann_ids": ann_ids,
+            "token_classes": token_classes,
+            "token_tags": token_tags,
+            "token_class_ids": token_class_ids,
+            "token_tag_ids": token_tag_ids,
        }

    @property
@@ -368,7 +407,7 @@ class Table(Layout):
            for cell in cells:
                all_words.extend(cell.get_ordered_words())  # type: ignore
            return all_words
-        except TypeError:
+        except (TypeError, AnnotationError):
            return super().get_ordered_words()


@@ -451,41 +490,73 @@ class Page(Image):
        "document_id",
        "page_number",
    }
+    include_residual_text_container: bool = True

-
-    def get_annotation(
+    def get_annotation(  # type: ignore
        self,
        category_names: Optional[Union[str, ObjectTypes, Sequence[Union[str, ObjectTypes]]]] = None,
        annotation_ids: Optional[Union[str, Sequence[str]]] = None,
-
+        service_id: Optional[Union[str, Sequence[str]]] = None,
+        model_id: Optional[Union[str, Sequence[str]]] = None,
+        session_ids: Optional[Union[str, Sequence[str]]] = None,
+        ignore_inactive: bool = True,
    ) -> List[ImageAnnotationBaseView]:
        """
+        Selection of annotations from the annotation container. Filter conditions can be defined by specifying
+        the annotation_id or the category name. (Since only image annotations are currently allowed in the container,
+        annotation_type is a redundant filter condition.) Only annotations that have active = 'True' are
+        returned. If more than one condition is provided, only annotations will be returned that satisfy all conditions.
+        If no condition is provided, it will return all active annotations.
+
        Identical to its base class method for having correct return types. If the base class changes, please
        change this method as well.
+
+        :param category_names: A single name or list of names
+        :param annotation_ids: A single id or list of ids
+        :param service_id: A single service name or list of service names
+        :param model_id: A single model name or list of model names
+        :param session_ids: A single session id or list of session ids
+        :param ignore_inactive: If set to `True` only active annotations are returned.
+
+        :return: A (possibly empty) list of Annotations
        """
-        cat_names = [category_names] if isinstance(category_names, (ObjectTypes, str)) else category_names
-        if cat_names is not None:
-            cat_names = [get_type(cat_name) for cat_name in cat_names]
-        ann_ids = [annotation_ids] if isinstance(annotation_ids, str) else annotation_ids
-        ann_types = [annotation_types] if isinstance(annotation_types, str) else annotation_types

-
+        if category_names is not None:
+            category_names = (
+                [get_type(cat_name) for cat_name in category_names]
+                if isinstance(category_names, list)
+                else [get_type(category_names)]  # type:ignore
+            )
+        ann_ids = [annotation_ids] if isinstance(annotation_ids, str) else annotation_ids
+        service_id = [service_id] if isinstance(service_id, str) else service_id
+        model_id = [model_id] if isinstance(model_id, str) else model_id
+        session_id = [session_ids] if isinstance(session_ids, str) else session_ids

-        if
-
-
+        if ignore_inactive:
+            anns = filter(lambda x: x.active, self.annotations)
+        else:
+            anns = self.annotations  # type:ignore

-        if
-            anns = filter(lambda x: x.category_name in
+        if category_names is not None:
+            anns = filter(lambda x: x.category_name in category_names, anns)  # type:ignore

        if ann_ids is not None:
-            anns = filter(lambda x: x.annotation_id in ann_ids, anns)
+            anns = filter(lambda x: x.annotation_id in ann_ids, anns)  # type:ignore

-
+        if service_id is not None:
+            anns = filter(lambda x: x.generating_service in service_id, anns)  # type:ignore
+
+        if model_id is not None:
+            anns = filter(lambda x: x.generating_model in model_id, anns)  # type:ignore
+
+        if session_id is not None:
+            anns = filter(lambda x: x.session_id in session_id, anns)  # type:ignore
+
+        return list(anns)  # type:ignore

    def __getattr__(self, item: str) -> Any:
        if item not in self.get_attribute_names():
-            raise
+            raise ImageError(f"Attribute {item} is not supported for {type(self)}")
        if self.summary is not None:
            if item in self.summary.sub_categories:
                sub_cat = self.summary.get_sub_category(get_type(item))
@@ -524,8 +595,8 @@ class Page(Image):
        text_container: Optional[ObjectTypes] = None,
        floating_text_block_categories: Optional[Sequence[ObjectTypes]] = None,
        include_residual_text_container: bool = True,
-        base_page: Optional[
-    ) ->
+        base_page: Optional[Page] = None,
+    ) -> Page:
        """
        Factory function for generating a `Page` instance from `image_orig` .

@@ -583,6 +654,7 @@ class Page(Image):
            page.summary = SummaryAnnotation.from_dict(**summary_dict)
        page.floating_text_block_categories = floating_text_block_categories  # type: ignore
        page.text_container = text_container  # type: ignore
+        page.include_residual_text_container = include_residual_text_container
        return page

    def _order(self, block: str) -> List[ImageAnnotationBaseView]:
@@ -596,7 +668,7 @@ class Page(Image):
        break_str = "\n" if line_break else " "
        for block in block_with_order:
            text += f"{block.text}{break_str}"
-        return text
+        return text[:-1]

    @property
    def text(self) -> str:
@@ -606,17 +678,35 @@ class Page(Image):
        return self._make_text()

    @property
-    def text_(self) ->
+    def text_(self) -> JsonDict:
        """Returns a dict `{"text": text string,
        "text_list": list of single words,
        "annotation_ids": word annotation ids`"""
        block_with_order = self._order("layouts")
-
-
+        text: List[str] = []
+        words: List[str] = []
+        ann_ids: List[str] = []
+        token_classes: List[str] = []
+        token_tags: List[str] = []
+        token_class_ids: List[str] = []
+        token_tag_ids: List[str] = []
        for block in block_with_order:
-
-
-
+            text.append(block.text_["text"])  # type: ignore
+            words.extend(block.text_["words"])  # type: ignore
+            ann_ids.extend(block.text_["ann_ids"])  # type: ignore
+            token_classes.extend(block.text_["token_classes"])  # type: ignore
+            token_tags.extend(block.text_["token_tags"])  # type: ignore
+            token_class_ids.extend(block.text_["token_class_ids"])  # type: ignore
+            token_tag_ids.extend(block.text_["token_tag_ids"])  # type: ignore
+        return {
+            "text": " ".join(text),
+            "words": words,
+            "ann_ids": ann_ids,
+            "token_classes": token_classes,
+            "token_tags": token_tags,
+            "token_class_ids": token_class_ids,
+            "token_tag_ids": token_tag_ids,
+        }

    def get_layout_context(self, annotation_id: str, context_size: int = 3) -> List[ImageAnnotationBaseView]:
        """For a given `annotation_id` get a list of `ImageAnnotation` that are nearby in terms of reading order.
@@ -629,10 +719,10 @@ class Page(Image):
        """
        ann = self.get_annotation(annotation_ids=annotation_id)[0]
        if ann.category_name not in self.floating_text_block_categories:
-            raise
-                f"
-                f"
-                f"
+            raise ImageError(
+                f"Cannot get context. Make sure to parametrize this category to a floating text: "
+                f"annotation_id: {annotation_id},"
+                f"category_name: {ann.category_name}"
            )
        block_with_order = self._order("layouts")
        position = block_with_order.index(ann)
@@ -727,6 +817,11 @@ class Page(Image):
        box_stack = []
        cells_found = False

+        if self.image is None and interactive:
+            logger.warning(
+                LoggingRecord("No image provided. Cannot display image in interactive mode", {"page_id": self.image_id})
+            )
+
        if debug_kwargs:
            anns = self.get_annotation(category_names=list(debug_kwargs.keys()))
            for ann in anns:
@@ -874,7 +969,7 @@ class Page(Image):
        text_container: Optional[ObjectTypes] = None,
        floating_text_block_categories: Optional[List[ObjectTypes]] = None,
        include_residual_text_container: bool = True,
-    ) ->
+    ) -> Page:
        """Reading JSON file and building a `Page` object with given config.
        :param file_path: Path to file
        :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
@@ -897,3 +992,11 @@ class Page(Image):
            for word in all_words
            if word.token_tag not in (TokenClasses.other, None)
        ]
+
+    def __copy__(self) -> Page:
+        return self.__class__.from_image(
+            self.image_orig,
+            self.text_container,
+            self.floating_text_block_categories,
+            self.include_residual_text_container,
+        )
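The most visible API changes in this file are the new filter parameters on `Page.get_annotation` and the enriched `text_` payload. A minimal usage sketch, assuming a `Page` object obtained from an analyzer run; the category name and service id below are placeholders, not values taken from this diff:

from deepdoctection.datapoint.view import Page

def inspect_page(page: Page) -> None:
    # Filter by category, by the service that produced the annotation, or keep inactive ones
    tables = page.get_annotation(category_names="table")          # placeholder category
    ocr_anns = page.get_annotation(service_id="my_ocr_service")   # placeholder service id
    everything = page.get_annotation(ignore_inactive=False)
    # text_ now exposes parallel lists alongside the plain text string
    payload = page.text_
    print(payload["text"], payload["words"], payload["ann_ids"], payload["token_classes"])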
deepdoctection/datasets/__init__.py
CHANGED

@@ -26,13 +26,10 @@ Create an info card, a DataFlowBaseBuilder derived instance, possibly a category
 DatasetBase derived instance to create a data set.
 """

-from
+from .adapter import *
 from .base import *
 from .dataflow_builder import DataFlowBaseBuilder
 from .info import *
 from .instances import *
 from .registry import *
 from .save import *
-
-if pytorch_available():
-    from .adapter import *
deepdoctection/datasets/adapter.py
CHANGED

@@ -22,19 +22,22 @@ Module for wrapping datasets into a pytorch dataset framework.

 from typing import Any, Callable, Iterator, Mapping, Optional, Union

+from lazy_imports import try_import
+
 from ..dataflow import CacheData, CustomDataFromList, MapData, RepeatedData
 from ..datapoint.image import Image
 from ..datasets.base import DatasetBase
 from ..mapper.maputils import LabelSummarizer
 from ..utils.detection_types import DP, JsonDict
-from ..utils.file_utils import pytorch_available
 from ..utils.logger import LoggingRecord, log_once, logger
 from ..utils.settings import DatasetType, LayoutType, ObjectTypes, PageType, WordType
 from ..utils.tqdm import get_tqdm
 from .registry import get_dataset

-
+with try_import() as import_guard:
     from torch.utils.data import IterableDataset
+if not import_guard.is_successful():
+    from ..utils.mocks import IterableDataset  # type: ignore


 class DatasetAdapter(IterableDataset):  # type: ignore
@@ -165,4 +168,4 @@ class DatasetAdapter(IterableDataset):  # type: ignore
        return len(self.df)

    def __getitem__(self, item: Any) -> None:
-        raise NotImplementedError
+        raise NotImplementedError()
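The import-guard hunk above swaps the old `pytorch_available()` branch for the `lazy-imports` package: the torch import runs inside a `try_import` context, and a mock base class is substituted when it fails, so the module stays importable without PyTorch. A standalone sketch of the same pattern; the fallback class here is a stand-in for `deepdoctection.utils.mocks.IterableDataset`:

from lazy_imports import try_import

with try_import() as import_guard:
    from torch.utils.data import IterableDataset  # a failing import is swallowed by the guard

if not import_guard.is_successful():
    class IterableDataset:  # type: ignore  # mock stand-in when torch is missing
        pass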
deepdoctection/datasets/base.py
CHANGED
@@ -18,20 +18,24 @@
 """
 Module for the base class of datasets.
 """
+from __future__ import annotations

+import json
 import os
 import pprint
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from
+from inspect import signature
+from pathlib import Path
+from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union

 import numpy as np

 from ..dataflow import CacheData, ConcatData, CustomDataFromList, DataFlow
-from ..datapoint import Image
+from ..datapoint.image import Image
 from ..utils.detection_types import Pathlike
 from ..utils.logger import LoggingRecord, logger
-from ..utils.settings import ObjectTypes, TypeOrStr, get_type
+from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
 from .dataflow_builder import DataFlowBaseBuilder
 from .info import DatasetCategories, DatasetInfo, get_merged_categories

@@ -51,9 +55,11 @@ class DatasetBase(ABC):
            self._dataflow_builder.splits = self._dataset_info.splits

        if not self.dataset_available() and self.is_built_in():
-
-
-
+            logger.warning(
+                LoggingRecord(
+                    f"Dataset {self._dataset_info.name} not locally found. Please download at {self._dataset_info.url}"
+                    f" and place under {self._dataflow_builder.get_workdir()}"
+                )
            )

    @property
@@ -76,7 +82,7 @@ class DatasetBase(ABC):
        Construct the DatasetCategory object.
        """

-        raise NotImplementedError
+        raise NotImplementedError()

    @classmethod
    @abstractmethod
@@ -85,7 +91,7 @@ class DatasetBase(ABC):
        Construct the DatasetInfo object.
        """

-        raise NotImplementedError
+        raise NotImplementedError()

    @abstractmethod
    def _builder(self) -> DataFlowBaseBuilder:
@@ -93,7 +99,7 @@ class DatasetBase(ABC):
        Construct the DataFlowBaseBuilder object. It needs to be implemented in the derived class.
        """

-        raise NotImplementedError
+        raise NotImplementedError()

    def dataset_available(self) -> bool:
        """
@@ -114,7 +120,7 @@ class DatasetBase(ABC):

 class _BuiltInDataset(DatasetBase, ABC):
    """
-    Dataclass for built-in dataset. Do not use this
+    Dataclass for built-in dataset. Do not use this
    """

    _name: Optional[str] = None
@@ -419,7 +425,7 @@ class CustomDataset(DatasetBase):
        """

        self.name = name
-        self.type = get_type(dataset_type)
+        self.type: DatasetType = get_type(dataset_type)  # type: ignore
        self.location = location
        self.init_categories = init_categories
        if init_sub_categories is None:
@@ -427,6 +433,11 @@ class CustomDataset(DatasetBase):
        else:
            self.init_sub_categories = init_sub_categories
        self.annotation_files = annotation_files
+        if signature(dataflow_builder.__init__).parameters.keys() != {"self", "location", "annotation_files"}:
+            raise TypeError(
+                "Dataflow builder must have the signature `def __init__(self, location: Pathlike, "
+                "annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None):`"
+            )
        self.dataflow_builder = dataflow_builder(self.location, self.annotation_files)
        super().__init__()

@@ -438,3 +449,67 @@ class CustomDataset(DatasetBase):

    def _builder(self) -> DataFlowBaseBuilder:
        return self.dataflow_builder
+
+    @staticmethod
+    def from_dataset_card(file_path: str, dataflow_builder: Type[DataFlowBaseBuilder]) -> CustomDataset:
+        """
+        This static method creates a CustomDataset instance from a dataset card.
+
+        A dataset card is a JSON file that contains metadata about the dataset such as its name, type, location,
+        initial categories, initial sub categories, and annotation files. The dataflow_builder parameter is a class
+        that inherits from DataFlowBaseBuilder and is used to build the dataflow for the dataset.
+
+        :param file_path: The path to the dataset card (JSON file).
+        :param dataflow_builder: The class used to build the dataflow for the dataset.
+        :return: A CustomDataset instance created from the dataset card.
+        """
+
+        with open(file_path, "r", encoding="UTF-8") as file:
+            meta_data = json.load(file)
+        meta_data["dataset_type"] = get_type(meta_data["dataset_type"])
+        meta_data["location"] = Path(meta_data["location"])
+        meta_data["init_categories"] = [get_type(cat) for cat in meta_data["init_categories"]]
+        meta_data["init_sub_categories"] = (
+            {
+                get_type(cat): {
+                    get_type(sub_cat_key): [get_type(sub_cat_value) for sub_cat_value in sub_cat_values]
+                    for sub_cat_key, sub_cat_values in sub_cats.items()
+                }
+                for cat, sub_cats in meta_data["init_sub_categories"].items()
+            }
+            if meta_data["init_sub_categories"] is not None
+            else None
+        )
+        return CustomDataset(**meta_data, dataflow_builder=dataflow_builder)
+
+    def as_dict(self) -> Mapping[str, Any]:
+        """
+        Return the meta-data of the dataset as a dictionary.
+
+        :return: A dictionary containing the meta-data of the dataset.
+        """
+        return {
+            "name": self.name,
+            "dataset_type": self.type,
+            "location": str(self.location),
+            "annotation_files": self.annotation_files,
+            "init_categories": [cat.value for cat in self.init_categories],
+            "init_sub_categories": {
+                cat.value: {
+                    sub_cat_key.value: [sub_cat_value.value for sub_cat_value in sub_cat_values]
+                    for sub_cat_key, sub_cat_values in sub_cats.items()
+                }
+                for cat, sub_cats in self.init_sub_categories.items()
+            }
+            if self.init_sub_categories is not None
+            else None,
+        }
+
+    def save_dataset_card(self, file_path: str) -> None:
+        """
+        Save the dataset card to a JSON file.
+
+        :param file_path: file_path
+        """
+        with open(file_path, "w", encoding="UTF-8") as file:
+            json.dump(self.as_dict(), file, indent=4)
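The new `from_dataset_card` / `save_dataset_card` pair lets a `CustomDataset` round-trip its meta-data through a JSON dataset card. A round-trip sketch with a hypothetical builder and placeholder names throughout; the constructor keywords are inferred from `as_dict` and may differ in detail:

from deepdoctection.datasets.base import CustomDataset
from deepdoctection.datasets.dataflow_builder import DataFlowBaseBuilder

class MyBuilder(DataFlowBaseBuilder):  # hypothetical; must keep the enforced __init__ signature
    def build(self, **kwargs):
        raise NotImplementedError()    # a real builder yields datapoints here

dataset = CustomDataset(
    name="my_dataset",                 # placeholder values throughout
    dataset_type="object_detection",
    location="my_dataset_dir",
    init_categories=["table"],
    annotation_files={"train": "train.json"},
    dataflow_builder=MyBuilder,
)
dataset.save_dataset_card("my_dataset_card.json")

# Later, rebuild the dataset from the card alone:
restored = CustomDataset.from_dataset_card("my_dataset_card.json", dataflow_builder=MyBuilder)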
deepdoctection/datasets/dataflow_builder.py
CHANGED

@@ -110,7 +110,7 @@ class DataFlowBaseBuilder(ABC):
        :param kwargs: A custom set of arguments/values
        :return: dataflow
        """
-        raise NotImplementedError
+        raise NotImplementedError()

    def get_annotation_file(self, split: str) -> str:
        """Get single annotation file."""
deepdoctection/datasets/info.py
CHANGED
@@ -24,7 +24,7 @@ from dataclasses import dataclass, field
 from itertools import chain
 from typing import Any, Dict, List, Literal, Mapping, Optional, Sequence, Set, Union, no_type_check, overload

-from ..utils.settings import
+from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
 from ..utils.utils import call_only_once

 __all__ = ["DatasetInfo", "DatasetCategories", "get_merged_categories"]
@@ -89,7 +89,7 @@ class DatasetInfo:
    license: str = field(default="")
    url: Union[str, Sequence[str]] = field(default="")
    splits: Mapping[str, str] = field(default_factory=dict)
-    type:
+    type: DatasetType = field(default=DatasetType.default)

    def get_split(self, key: str) -> str:
        """
@@ -306,7 +306,7 @@ class DatasetCategories:

        _cat_to_sub_cat = {get_type(key): get_type(value) for key, value in cat_to_sub_cat.items()}
        if not self._allow_update:
-            raise
+            raise RuntimeWarning("Replacing categories with sub categories is not allowed")
        self._categories_update = self.init_categories
        categories = self.get_categories(name_as_key=True)
        cats_or_sub_cats = [
@@ -332,7 +332,7 @@ class DatasetCategories:
        """

        if not self._allow_update:
-            raise
+            raise RuntimeWarning("Filtering categories is not allowed")
        if isinstance(categories, (ObjectTypes, str)):
            categories = [get_type(categories)]
        else:
deepdoctection/datasets/instances/doclaynet.py
CHANGED

@@ -25,6 +25,7 @@ Module for DocLayNet dataset. Place the dataset as follows
 ├── PNG
 │ ├── 0a0d43e301facee9e99cc33b9b16e732dd207135f4027e75f6aea2bf117535a2.png
 """
+from __future__ import annotations

 import os
 from typing import Mapping, Sequence, Union
@@ -109,7 +110,7 @@ class DocLayNet(DatasetBase):
    def _categories(self) -> DatasetCategories:
        return DatasetCategories(init_categories=_INIT_CATEGORIES, init_sub_categories=_SUB_CATEGORIES)

-    def _builder(self) ->
+    def _builder(self) -> DocLayNetBuilder:
        return DocLayNetBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)


@@ -209,7 +210,7 @@ class DocLayNetSeq(DatasetBase):
    def _categories(self) -> DatasetCategories:
        return DatasetCategories(init_categories=_INIT_CATEGORIES_SEQ)

-    def _builder(self) ->
+    def _builder(self) -> DocLayNetSeqBuilder:
        return DocLayNetSeqBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)

deepdoctection/datasets/instances/fintabnet.py
CHANGED

@@ -30,6 +30,7 @@ Module for Fintabnet dataset. Place the dataset as follows
 ├── FinTabNet_1.0.0_table_train.jsonl
 ├── FinTabNet_1.0.0_table_val.jsonl
 """
+from __future__ import annotations

 from pathlib import Path
 from typing import List, Mapping, Sequence, Union
@@ -133,7 +134,7 @@ class Fintabnet(_BuiltInDataset):
    def _categories(self) -> DatasetCategories:
        return DatasetCategories(init_categories=_INIT_CATEGORIES, init_sub_categories=_SUB_CATEGORIES)

-    def _builder(self) ->
+    def _builder(self) -> FintabnetBuilder:
        return FintabnetBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)

deepdoctection/datasets/instances/funsd.py
CHANGED

@@ -32,6 +32,7 @@ Module for Funsd dataset. Install the dataset following the folder structure
 │ ├── images
 │ │ ├── ...
 """
+from __future__ import annotations

 import os
 from typing import Dict, List, Mapping, Union
@@ -120,7 +121,7 @@ class Funsd(_BuiltInDataset):
    def _categories(self) -> DatasetCategories:
        return DatasetCategories(init_categories=_INIT_CATEGORIES, init_sub_categories=_SUB_CATEGORIES)

-    def _builder(self) ->
+    def _builder(self) -> FunsdBuilder:
        return FunsdBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)

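A pattern note on these instance modules: the `from __future__ import annotations` line added in each of them enables PEP 563 postponed evaluation, so the concrete `_builder` return types (`DocLayNetBuilder`, `FintabnetBuilder`, `FunsdBuilder`) can be named even though the builder classes are defined further down in the file. A minimal self-contained illustration of why this works:

from __future__ import annotations  # annotations are stored as strings (PEP 563)

class Dataset:
    def _builder(self) -> Builder:  # forward reference: Builder is defined only below
        return Builder()

class Builder:
    pass

print(Dataset()._builder())  # works; the annotation was never evaluated at definition time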