deepdoctection 0.43.6__py3-none-any.whl → 0.44.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +5 -1
- deepdoctection/datapoint/__init__.py +1 -1
- deepdoctection/datapoint/image.py +50 -1
- deepdoctection/datapoint/view.py +149 -54
- deepdoctection/datasets/base.py +196 -51
- deepdoctection/extern/fastlang.py +4 -2
- deepdoctection/mapper/laylmstruct.py +7 -7
- deepdoctection/pipe/base.py +29 -25
- deepdoctection/pipe/common.py +2 -2
- deepdoctection/pipe/concurrency.py +2 -2
- deepdoctection/pipe/language.py +2 -2
- deepdoctection/pipe/layout.py +2 -2
- deepdoctection/pipe/lm.py +13 -3
- deepdoctection/pipe/order.py +9 -5
- deepdoctection/pipe/refine.py +7 -7
- deepdoctection/pipe/segment.py +30 -30
- deepdoctection/pipe/sub_layout.py +2 -2
- deepdoctection/pipe/text.py +10 -5
- deepdoctection/pipe/transform.py +2 -4
- deepdoctection/utils/file_utils.py +34 -0
- deepdoctection/utils/types.py +0 -1
- {deepdoctection-0.43.6.dist-info → deepdoctection-0.44.1.dist-info}/METADATA +4 -4
- {deepdoctection-0.43.6.dist-info → deepdoctection-0.44.1.dist-info}/RECORD +26 -26
- {deepdoctection-0.43.6.dist-info → deepdoctection-0.44.1.dist-info}/WHEEL +0 -0
- {deepdoctection-0.43.6.dist-info → deepdoctection-0.44.1.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.43.6.dist-info → deepdoctection-0.44.1.dist-info}/top_level.txt +0 -0
deepdoctection/__init__.py
CHANGED
|
@@ -25,7 +25,7 @@ from .utils.logger import LoggingRecord, logger
|
|
|
25
25
|
|
|
26
26
|
# pylint: enable=wrong-import-position
|
|
27
27
|
|
|
28
|
-
__version__ = "0.43.6"
|
|
28
|
+
__version__ = "0.44.1"
|
|
29
29
|
|
|
30
30
|
_IMPORT_STRUCTURE = {
|
|
31
31
|
"analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
|
|
@@ -92,6 +92,7 @@ _IMPORT_STRUCTURE = {
|
|
|
92
92
|
"convert_pdf_bytes_to_np_array_v2",
|
|
93
93
|
"as_dict",
|
|
94
94
|
"ImageAnnotationBaseView",
|
|
95
|
+
"MetaAnnotation",
|
|
95
96
|
"Image",
|
|
96
97
|
"Word",
|
|
97
98
|
"Layout",
|
|
@@ -105,6 +106,7 @@ _IMPORT_STRUCTURE = {
|
|
|
105
106
|
"DatasetAdapter",
|
|
106
107
|
"DatasetBase",
|
|
107
108
|
"MergeDataset",
|
|
109
|
+
"DatasetCard",
|
|
108
110
|
"CustomDataset",
|
|
109
111
|
"DataFlowBaseBuilder",
|
|
110
112
|
"DatasetInfo",
|
|
@@ -313,6 +315,8 @@ _IMPORT_STRUCTURE = {
|
|
|
313
315
|
"get_apted_requirement",
|
|
314
316
|
"distance_available",
|
|
315
317
|
"get_distance_requirement",
|
|
318
|
+
"numpy_v1_available",
|
|
319
|
+
"get_numpy_v1_requirement",
|
|
316
320
|
"transformers_available",
|
|
317
321
|
"get_transformers_requirement",
|
|
318
322
|
"detectron2_available",
|
deepdoctection/datapoint/image.py
CHANGED
|
@@ -25,7 +25,7 @@ from collections import defaultdict
|
|
|
25
25
|
from dataclasses import dataclass, field
|
|
26
26
|
from os import environ, fspath
|
|
27
27
|
from pathlib import Path
|
|
28
|
-
from typing import Any, Optional, Sequence, Union, no_type_check
|
|
28
|
+
from typing import Any, Optional, Sequence, TypedDict, Union, no_type_check
|
|
29
29
|
|
|
30
30
|
import numpy as np
|
|
31
31
|
from numpy import uint8
|
|
@@ -40,6 +40,55 @@ from .box import crop_box_from_image, global_to_local_coords, intersection_box
|
|
|
40
40
|
from .convert import as_dict, convert_b64_to_np_array, convert_np_array_to_b64, convert_pdf_bytes_to_np_array_v2
|
|
41
41
|
|
|
42
42
|
|
|
43
|
+
class MetaAnnotationDict(TypedDict):
|
|
44
|
+
"""MetaAnnotationDict"""
|
|
45
|
+
|
|
46
|
+
image_annotations: list[str]
|
|
47
|
+
sub_categories: dict[str, dict[str, list[str]]]
|
|
48
|
+
relationships: dict[str, list[str]]
|
|
49
|
+
summaries: list[str]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass(frozen=True)
|
|
53
|
+
class MetaAnnotation:
|
|
54
|
+
"""
|
|
55
|
+
An immutable dataclass that stores information about what `Image` are being
|
|
56
|
+
modified through a pipeline component.
|
|
57
|
+
|
|
58
|
+
Attributes:
|
|
59
|
+
image_annotations: Tuple of `ObjectTypes` representing image annotations.
|
|
60
|
+
sub_categories: Dictionary mapping `ObjectTypes` to dicts of `ObjectTypes` to sets of `ObjectTypes`
|
|
61
|
+
for sub-categories.
|
|
62
|
+
relationships: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for relationships.
|
|
63
|
+
summaries: Tuple of `ObjectTypes` representing summaries.
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
image_annotations: tuple[ObjectTypes, ...] = field(default=())
|
|
67
|
+
sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = field(default_factory=dict)
|
|
68
|
+
relationships: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
|
|
69
|
+
summaries: tuple[ObjectTypes, ...] = field(default=())
|
|
70
|
+
|
|
71
|
+
def as_dict(self) -> MetaAnnotationDict:
|
|
72
|
+
"""
|
|
73
|
+
Returns the MetaAnnotation as a dictionary, with all `ObjectTypes` converted to strings.
|
|
74
|
+
|
|
75
|
+
Returns:
|
|
76
|
+
A dictionary representation of the MetaAnnotation where all `ObjectTypes` are converted to strings.
|
|
77
|
+
"""
|
|
78
|
+
return {
|
|
79
|
+
"image_annotations": [obj.value for obj in self.image_annotations],
|
|
80
|
+
"sub_categories": {
|
|
81
|
+
outer_key.value: {
|
|
82
|
+
inner_key.value: [val.value for val in inner_values]
|
|
83
|
+
for inner_key, inner_values in outer_value.items()
|
|
84
|
+
}
|
|
85
|
+
for outer_key, outer_value in self.sub_categories.items()
|
|
86
|
+
},
|
|
87
|
+
"relationships": {key.value: [val.value for val in values] for key, values in self.relationships.items()},
|
|
88
|
+
"summaries": [obj.value for obj in self.summaries],
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
|
|
43
92
|
@dataclass
|
|
44
93
|
class Image:
|
|
45
94
|
"""
|
deepdoctection/datapoint/view.py
CHANGED
|
@@ -42,13 +42,60 @@ from ..utils.settings import (
|
|
|
42
42
|
get_type,
|
|
43
43
|
)
|
|
44
44
|
from ..utils.transform import ResizeTransform, box_to_point4, point4_to_box
|
|
45
|
-
from ..utils.types import HTML, AnnotationDict, Chunks, ImageDict, PathLikeOrStr, PixelValues, Text_, csv
|
|
45
|
+
from ..utils.types import HTML, AnnotationDict, Chunks, ImageDict, PathLikeOrStr, PixelValues, csv
|
|
46
46
|
from ..utils.viz import draw_boxes, interactive_imshow, viz_handler
|
|
47
47
|
from .annotation import CategoryAnnotation, ContainerAnnotation, ImageAnnotation, ann_from_dict
|
|
48
48
|
from .box import BoundingBox, crop_box_from_image
|
|
49
49
|
from .image import Image
|
|
50
50
|
|
|
51
51
|
|
|
52
|
+
@dataclass(frozen=True)
|
|
53
|
+
class Text_:
|
|
54
|
+
"""
|
|
55
|
+
Immutable dataclass for storing structured text extraction results.
|
|
56
|
+
|
|
57
|
+
Attributes:
|
|
58
|
+
text: The concatenated text string.
|
|
59
|
+
words: List of word strings.
|
|
60
|
+
ann_ids: List of annotation IDs for each word.
|
|
61
|
+
token_classes: List of token class names for each word.
|
|
62
|
+
token_class_ann_ids: List of annotation IDs for each token class.
|
|
63
|
+
token_tags: List of token tag names for each word.
|
|
64
|
+
token_tag_ann_ids: List of annotation IDs for each token tag.
|
|
65
|
+
token_class_ids: List of token class IDs.
|
|
66
|
+
token_tag_ids: List of token tag IDs.
|
|
67
|
+
"""
|
|
68
|
+
|
|
69
|
+
text: str = ""
|
|
70
|
+
words: list[str] = field(default_factory=list)
|
|
71
|
+
ann_ids: list[str] = field(default_factory=list)
|
|
72
|
+
token_classes: list[str] = field(default_factory=list)
|
|
73
|
+
token_class_ann_ids: list[str] = field(default_factory=list)
|
|
74
|
+
token_tags: list[str] = field(default_factory=list)
|
|
75
|
+
token_tag_ann_ids: list[str] = field(default_factory=list)
|
|
76
|
+
token_class_ids: list[str] = field(default_factory=list)
|
|
77
|
+
token_tag_ids: list[str] = field(default_factory=list)
|
|
78
|
+
|
|
79
|
+
def as_dict(self) -> dict[str, Union[list[str], str]]:
|
|
80
|
+
"""
|
|
81
|
+
Returns the Text_ as a dictionary.
|
|
82
|
+
|
|
83
|
+
Returns:
|
|
84
|
+
A dictionary representation of the Text_ dataclass.
|
|
85
|
+
"""
|
|
86
|
+
return {
|
|
87
|
+
"text": self.text,
|
|
88
|
+
"words": self.words,
|
|
89
|
+
"ann_ids": self.ann_ids,
|
|
90
|
+
"token_classes": self.token_classes,
|
|
91
|
+
"token_class_ann_ids": self.token_class_ann_ids,
|
|
92
|
+
"token_tags": self.token_tags,
|
|
93
|
+
"token_tag_ann_ids": self.token_tag_ann_ids,
|
|
94
|
+
"token_class_ids": self.token_class_ids,
|
|
95
|
+
"token_tag_ids": self.token_tag_ids,
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
|
|
52
99
|
class ImageAnnotationBaseView(ImageAnnotation):
|
|
53
100
|
"""
|
|
54
101
|
Consumption class for having easier access to categories added to an `ImageAnnotation`.
|
|
@@ -263,13 +310,28 @@ class Layout(ImageAnnotationBaseView):
|
|
|
263
310
|
"""
|
|
264
311
|
words = self.get_ordered_words()
|
|
265
312
|
if words:
|
|
266
|
-
|
|
313
|
+
(
|
|
314
|
+
characters,
|
|
315
|
+
ann_ids,
|
|
316
|
+
token_classes,
|
|
317
|
+
token_class_ann_ids,
|
|
318
|
+
token_tags,
|
|
319
|
+
token_tag_ann_ids,
|
|
320
|
+
token_classes_ids,
|
|
321
|
+
token_tag_ids,
|
|
322
|
+
) = map(list, zip(
|
|
267
323
|
*[
|
|
268
324
|
(
|
|
269
325
|
word.characters,
|
|
270
326
|
word.annotation_id,
|
|
271
327
|
word.token_class,
|
|
328
|
+
word.get_sub_category(WordType.TOKEN_CLASS).annotation_id
|
|
329
|
+
if WordType.TOKEN_CLASS in word.sub_categories
|
|
330
|
+
else None,
|
|
272
331
|
word.token_tag,
|
|
332
|
+
word.get_sub_category(WordType.TOKEN_TAG).annotation_id
|
|
333
|
+
if WordType.TOKEN_TAG in word.sub_categories
|
|
334
|
+
else None,
|
|
273
335
|
word.get_sub_category(WordType.TOKEN_CLASS).category_id
|
|
274
336
|
if WordType.TOKEN_CLASS in word.sub_categories
|
|
275
337
|
else None,
|
|
@@ -279,25 +341,40 @@ class Layout(ImageAnnotationBaseView):
|
|
|
279
341
|
)
|
|
280
342
|
for word in words
|
|
281
343
|
]
|
|
282
|
-
)
|
|
344
|
+
))
|
|
283
345
|
else:
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
346
|
+
(
|
|
347
|
+
characters,
|
|
348
|
+
ann_ids,
|
|
349
|
+
token_classes,
|
|
350
|
+
token_class_ann_ids,
|
|
351
|
+
token_tags,
|
|
352
|
+
token_tag_ann_ids,
|
|
353
|
+
token_classes_ids,
|
|
354
|
+
token_tag_ids,
|
|
355
|
+
) = (
|
|
356
|
+
[],
|
|
357
|
+
[],
|
|
358
|
+
[],
|
|
359
|
+
[],
|
|
360
|
+
[],
|
|
361
|
+
[],
|
|
362
|
+
[],
|
|
363
|
+
[],
|
|
291
364
|
)
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
"
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
365
|
+
|
|
366
|
+
return Text_(
|
|
367
|
+
text=" ".join(characters), # type: ignore
|
|
368
|
+
words=characters, # type: ignore
|
|
369
|
+
ann_ids=ann_ids, # type: ignore
|
|
370
|
+
token_classes=token_classes, # type: ignore
|
|
371
|
+
token_class_ann_ids=token_class_ann_ids, # type: ignore
|
|
372
|
+
token_tags=token_tags, # type: ignore
|
|
373
|
+
token_tag_ann_ids=token_tag_ann_ids, # type: ignore
|
|
374
|
+
token_class_ids=token_classes_ids, # type: ignore
|
|
375
|
+
token_tag_ids=token_tag_ids, # type: ignore
|
|
376
|
+
)
|
|
377
|
+
|
|
301
378
|
|
|
302
379
|
def get_attribute_names(self) -> set[str]:
|
|
303
380
|
attr_names = (
|
|
@@ -590,14 +667,16 @@ class Table(Layout):
|
|
|
590
667
|
|
|
591
668
|
@property
|
|
592
669
|
def csv_(self) -> list[list[list[Text_]]]:
|
|
670
|
+
"""
|
|
671
|
+
Returns:
|
|
672
|
+
A csv-style representation of a table as list of lists of cell.text_.
|
|
673
|
+
"""
|
|
593
674
|
cells = self.cells
|
|
594
675
|
table_list = [[[] for _ in range(self.number_of_columns)] for _ in range(self.number_of_rows)] # type: ignore
|
|
595
676
|
for cell in cells:
|
|
596
677
|
table_list[cell.row_number - 1][cell.column_number - 1].append(cell.text_) # type: ignore
|
|
597
678
|
return table_list
|
|
598
679
|
|
|
599
|
-
|
|
600
|
-
|
|
601
680
|
def __str__(self) -> str:
|
|
602
681
|
out = " ".join([" ".join(row + ["\n"]) for row in self.csv])
|
|
603
682
|
return out
|
|
@@ -624,26 +703,34 @@ class Table(Layout):
|
|
|
624
703
|
words: list[str] = []
|
|
625
704
|
ann_ids: list[str] = []
|
|
626
705
|
token_classes: list[str] = []
|
|
706
|
+
token_class_ann_ids: list[str] = []
|
|
627
707
|
token_tags: list[str] = []
|
|
708
|
+
token_tag_ann_ids: list[str] = []
|
|
628
709
|
token_class_ids: list[str] = []
|
|
629
710
|
token_tag_ids: list[str] = []
|
|
630
711
|
for cell in cells:
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
"
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
712
|
+
text_ = cell.text_
|
|
713
|
+
text.append(text_.text)
|
|
714
|
+
words.extend(text_.words)
|
|
715
|
+
ann_ids.extend(text_.ann_ids)
|
|
716
|
+
token_classes.extend(text_.token_classes)
|
|
717
|
+
token_class_ann_ids.extend(text_.token_class_ann_ids)
|
|
718
|
+
token_tags.extend(text_.token_tags)
|
|
719
|
+
token_tag_ann_ids.extend(text_.token_tag_ann_ids)
|
|
720
|
+
token_class_ids.extend(text_.token_class_ids)
|
|
721
|
+
token_tag_ids.extend(text_.token_tag_ids)
|
|
722
|
+
return Text_(
|
|
723
|
+
text=" ".join(text),
|
|
724
|
+
words=words,
|
|
725
|
+
ann_ids=ann_ids,
|
|
726
|
+
token_classes=token_classes,
|
|
727
|
+
token_class_ann_ids=token_class_ann_ids,
|
|
728
|
+
token_tags=token_tags,
|
|
729
|
+
token_tag_ann_ids=token_tag_ann_ids,
|
|
730
|
+
token_class_ids=token_class_ids,
|
|
731
|
+
token_tag_ids=token_tag_ids,
|
|
732
|
+
)
|
|
733
|
+
|
|
647
734
|
|
|
648
735
|
@property
|
|
649
736
|
def words(self) -> list[ImageAnnotationBaseView]:
|
|
@@ -1051,7 +1138,7 @@ class Page(Image):
|
|
|
1051
1138
|
|
|
1052
1139
|
```python
|
|
1053
1140
|
{"text": text string,
|
|
1054
|
-
"
|
|
1141
|
+
"words": list of single words,
|
|
1055
1142
|
"annotation_ids": word annotation ids}
|
|
1056
1143
|
```
|
|
1057
1144
|
"""
|
|
@@ -1060,26 +1147,34 @@ class Page(Image):
|
|
|
1060
1147
|
words: list[str] = []
|
|
1061
1148
|
ann_ids: list[str] = []
|
|
1062
1149
|
token_classes: list[str] = []
|
|
1150
|
+
token_class_ann_ids: list[str] = []
|
|
1063
1151
|
token_tags: list[str] = []
|
|
1152
|
+
token_tag_ann_ids: list[str] = []
|
|
1064
1153
|
token_class_ids: list[str] = []
|
|
1065
1154
|
token_tag_ids: list[str] = []
|
|
1066
1155
|
for block in block_with_order:
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
"
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1156
|
+
text_ = block.text_
|
|
1157
|
+
text.append(text_.text) # type: ignore
|
|
1158
|
+
words.extend(text_.words) # type: ignore
|
|
1159
|
+
ann_ids.extend(text_.ann_ids) # type: ignore
|
|
1160
|
+
token_classes.extend(text_.token_classes) # type: ignore
|
|
1161
|
+
token_class_ann_ids.extend(text_.token_class_ann_ids) # type: ignore
|
|
1162
|
+
token_tags.extend(text_.token_tags) # type: ignore
|
|
1163
|
+
token_tag_ann_ids.extend(text_.token_tag_ann_ids) # type: ignore
|
|
1164
|
+
token_class_ids.extend(text_.token_class_ids) # type: ignore
|
|
1165
|
+
token_tag_ids.extend(text_.token_tag_ids) # type: ignore
|
|
1166
|
+
return Text_(
|
|
1167
|
+
text=" ".join(text),
|
|
1168
|
+
words=words,
|
|
1169
|
+
ann_ids=ann_ids,
|
|
1170
|
+
token_classes=token_classes,
|
|
1171
|
+
token_class_ann_ids=token_class_ann_ids,
|
|
1172
|
+
token_tags=token_tags,
|
|
1173
|
+
token_tag_ann_ids=token_tag_ann_ids,
|
|
1174
|
+
token_class_ids=token_class_ids,
|
|
1175
|
+
token_tag_ids=token_tag_ann_ids,
|
|
1176
|
+
)
|
|
1177
|
+
|
|
1083
1178
|
|
|
1084
1179
|
def get_layout_context(self, annotation_id: str, context_size: int = 3) -> list[ImageAnnotationBaseView]:
|
|
1085
1180
|
"""
|
deepdoctection/datasets/base.py
CHANGED
|
@@ -25,14 +25,15 @@ import os
|
|
|
25
25
|
import pprint
|
|
26
26
|
from abc import ABC, abstractmethod
|
|
27
27
|
from collections import defaultdict
|
|
28
|
+
from dataclasses import dataclass, field
|
|
28
29
|
from inspect import signature
|
|
29
30
|
from pathlib import Path
|
|
30
|
-
from typing import Any, Mapping, Optional, Sequence, Type, Union
|
|
31
|
+
from typing import Any, Mapping, Optional, Sequence, Type, TypedDict, Union
|
|
31
32
|
|
|
32
33
|
import numpy as np
|
|
33
34
|
|
|
34
35
|
from ..dataflow import CacheData, ConcatData, CustomDataFromList, DataFlow
|
|
35
|
-
from ..datapoint.image import Image
|
|
36
|
+
from ..datapoint.image import Image, MetaAnnotation
|
|
36
37
|
from ..utils.logger import LoggingRecord, logger
|
|
37
38
|
from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
|
|
38
39
|
from ..utils.types import PathLikeOrStr
|
|
@@ -405,6 +406,194 @@ class MergeDataset(DatasetBase):
|
|
|
405
406
|
self._dataflow_builder.categories = self._categories()
|
|
406
407
|
|
|
407
408
|
|
|
409
|
+
class DatasetCardDict(TypedDict):
|
|
410
|
+
"""DatasetCardDict"""
|
|
411
|
+
|
|
412
|
+
name: str
|
|
413
|
+
dataset_type: Union[str, Any]
|
|
414
|
+
location: str
|
|
415
|
+
init_categories: Sequence[Any]
|
|
416
|
+
init_sub_categories: dict[Any, dict[Any, list[Any]]]
|
|
417
|
+
annotation_files: Optional[dict[Any, Union[Any, Sequence[Any]]]]
|
|
418
|
+
description: str
|
|
419
|
+
service_id_to_meta_annotation: dict[str, Any]
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
# Usage:
|
|
423
|
+
# def as_dict(self, ...) -> DatasetCardDict:
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
@dataclass
|
|
427
|
+
class DatasetCard:
|
|
428
|
+
"""
|
|
429
|
+
An immutable dataclass representing the metadata of a dataset, including categories, sub-categories,
|
|
430
|
+
storage location, annotation files, and description. It facilitates management and consistency checks
|
|
431
|
+
for annotations generated by pipeline components.
|
|
432
|
+
|
|
433
|
+
Attributes:
|
|
434
|
+
name: Name of the dataset.
|
|
435
|
+
dataset_type: Type of the dataset as `ObjectTypes`.
|
|
436
|
+
location: Storage location of the dataset as `Path`.
|
|
437
|
+
init_categories: List of all initial categories (`ObjectTypes`) present in the dataset.
|
|
438
|
+
init_sub_categories: Mapping from main categories to sub-categories and their possible values.
|
|
439
|
+
annotation_files: Optional mapping from split names to annotation files.
|
|
440
|
+
description: Description of the dataset.
|
|
441
|
+
service_id_to_meta_annotation: Mapping from service IDs to `MetaAnnotation` objects, storing
|
|
442
|
+
annotation structure for different pipeline components.
|
|
443
|
+
"""
|
|
444
|
+
|
|
445
|
+
name: str
|
|
446
|
+
dataset_type: ObjectTypes
|
|
447
|
+
location: Path
|
|
448
|
+
init_categories: list[ObjectTypes] = field(default_factory=list)
|
|
449
|
+
init_sub_categories: dict[ObjectTypes, dict[ObjectTypes, list[ObjectTypes]]] = field(default_factory=dict)
|
|
450
|
+
annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None
|
|
451
|
+
description: str = field(default="")
|
|
452
|
+
service_id_to_meta_annotation: dict[str, MetaAnnotation] = field(default_factory=dict)
|
|
453
|
+
|
|
454
|
+
def save_dataset_card(self, file_path: Union[str, Path]) -> None:
|
|
455
|
+
"""Save the DatasetCard instance as a JSON file."""
|
|
456
|
+
with open(file_path, "w", encoding="utf-8") as f:
|
|
457
|
+
json.dump(self.as_dict(), f, indent=4)
|
|
458
|
+
|
|
459
|
+
@staticmethod
|
|
460
|
+
def load_dataset_card(file_path: PathLikeOrStr) -> DatasetCard:
|
|
461
|
+
"""Load a DatasetCard instance from a JSON file."""
|
|
462
|
+
with open(file_path, "r", encoding="utf-8") as f:
|
|
463
|
+
data = json.load(f)
|
|
464
|
+
service_id_to_meta_annotation = {}
|
|
465
|
+
if "service_id_to_meta_annotation" in data:
|
|
466
|
+
for service_id, meta_ann_dict in data.pop("service_id_to_meta_annotation").items():
|
|
467
|
+
meta_ann_dict["image_annotations"] = tuple(
|
|
468
|
+
get_type(cat) for cat in meta_ann_dict["image_annotations"]
|
|
469
|
+
)
|
|
470
|
+
meta_ann_dict["sub_categories"] = {
|
|
471
|
+
get_type(cat): {
|
|
472
|
+
get_type(sub_cat): set({get_type(value) for value in values})
|
|
473
|
+
for sub_cat, values in sub_cats.items()
|
|
474
|
+
}
|
|
475
|
+
for cat, sub_cats in meta_ann_dict["sub_categories"].items()
|
|
476
|
+
}
|
|
477
|
+
meta_ann_dict["relationships"] = {
|
|
478
|
+
get_type(key): set({get_type(value) for value in values})
|
|
479
|
+
for key, values in meta_ann_dict["relationships"].items()
|
|
480
|
+
}
|
|
481
|
+
meta_ann_dict["summaries"] = tuple(get_type(val) for val in meta_ann_dict["summaries"])
|
|
482
|
+
service_id_to_meta_annotation[service_id] = MetaAnnotation(**meta_ann_dict)
|
|
483
|
+
data["service_id_to_meta_annotation"] = service_id_to_meta_annotation
|
|
484
|
+
return DatasetCard(**data)
|
|
485
|
+
|
|
486
|
+
def as_dict(self, keep_object_types: bool = False) -> DatasetCardDict:
|
|
487
|
+
"""Convert the DatasetCard to a dictionary."""
|
|
488
|
+
if keep_object_types:
|
|
489
|
+
return {
|
|
490
|
+
"name": self.name,
|
|
491
|
+
"dataset_type": self.dataset_type,
|
|
492
|
+
"location": self.location.as_posix(),
|
|
493
|
+
"init_categories": self.init_categories,
|
|
494
|
+
"init_sub_categories": self.init_sub_categories,
|
|
495
|
+
"annotation_files": self.annotation_files, # type: ignore
|
|
496
|
+
"description": self.description,
|
|
497
|
+
"service_id_to_meta_annotation": {
|
|
498
|
+
key: val.as_dict() for key, val in self.service_id_to_meta_annotation.items()
|
|
499
|
+
},
|
|
500
|
+
}
|
|
501
|
+
return {
|
|
502
|
+
"name": self.name,
|
|
503
|
+
"dataset_type": self.dataset_type.value,
|
|
504
|
+
"location": self.location.as_posix(),
|
|
505
|
+
"init_categories": [cat.value for cat in self.init_categories],
|
|
506
|
+
"init_sub_categories": {
|
|
507
|
+
cat.value: {
|
|
508
|
+
sub_cat.value: list({value.value for value in values}) for sub_cat, values in sub_cats.items()
|
|
509
|
+
}
|
|
510
|
+
for cat, sub_cats in self.init_sub_categories.items()
|
|
511
|
+
},
|
|
512
|
+
"annotation_files": self.annotation_files, # type: ignore
|
|
513
|
+
"description": self.description,
|
|
514
|
+
"service_id_to_meta_annotation": {
|
|
515
|
+
key: val.as_dict() for key, val in self.service_id_to_meta_annotation.items()
|
|
516
|
+
},
|
|
517
|
+
}
|
|
518
|
+
|
|
519
|
+
def update_from_pipeline(
|
|
520
|
+
self, meta_annotations: MetaAnnotation, service_id_to_meta_annotation: Mapping[str, MetaAnnotation]
|
|
521
|
+
) -> None:
|
|
522
|
+
"""
|
|
523
|
+
Update the initial categories, sub-categories, and service ID to `MetaAnnotation` mapping
|
|
524
|
+
based on the results from a pipeline.
|
|
525
|
+
|
|
526
|
+
```python
|
|
527
|
+
analyzer = dd.get_dd_analyzer(config_overwrite=["USE_OCR=True","USE_TABLE_SEGMENTATION=True"])
|
|
528
|
+
meta_annotations = analyzer.get_meta_annotation()
|
|
529
|
+
service_id_to_meta_annotation = analyzer.get_service_id_to_meta_annotation()
|
|
530
|
+
card.update_from_pipeline(meta_annotations, service_id_to_meta_annotation)
|
|
531
|
+
```
|
|
532
|
+
|
|
533
|
+
Args:
|
|
534
|
+
meta_annotations: A `MetaAnnotation` object containing new or updated categories and sub-categories.
|
|
535
|
+
service_id_to_meta_annotation: A mapping from service IDs to `MetaAnnotation` objects generated by the
|
|
536
|
+
pipeline.
|
|
537
|
+
|
|
538
|
+
Adds any missing categories, sub-categories, and values to the respective attributes of the instance.
|
|
539
|
+
"""
|
|
540
|
+
for category in meta_annotations.image_annotations:
|
|
541
|
+
if category not in self.init_categories:
|
|
542
|
+
self.init_categories.append(category)
|
|
543
|
+
for cat, sub_cats in meta_annotations.sub_categories.items():
|
|
544
|
+
if cat not in self.init_sub_categories:
|
|
545
|
+
self.init_sub_categories[cat] = {}
|
|
546
|
+
for sub_cat, values in sub_cats.items():
|
|
547
|
+
if sub_cat not in self.init_sub_categories[cat]:
|
|
548
|
+
self.init_sub_categories[cat][sub_cat] = []
|
|
549
|
+
for value in values:
|
|
550
|
+
if value not in self.init_sub_categories[cat][sub_cat]:
|
|
551
|
+
self.init_sub_categories[cat][sub_cat].append(value)
|
|
552
|
+
|
|
553
|
+
for service_id, meta_annotation in service_id_to_meta_annotation.items():
|
|
554
|
+
if service_id not in self.service_id_to_meta_annotation:
|
|
555
|
+
self.service_id_to_meta_annotation[service_id] = meta_annotation
|
|
556
|
+
|
|
557
|
+
def __post_init__(self) -> None:
|
|
558
|
+
"""
|
|
559
|
+
Perform internal consistency checks ensuring `init_categories` and
|
|
560
|
+
`init_sub_categories` align with `service_id_to_meta_annotation`.
|
|
561
|
+
"""
|
|
562
|
+
self.dataset_type = get_type(self.dataset_type)
|
|
563
|
+
self.location = Path(self.location)
|
|
564
|
+
self.init_categories = [get_type(cat) for cat in self.init_categories]
|
|
565
|
+
self.init_sub_categories = {
|
|
566
|
+
get_type(outer_key): {
|
|
567
|
+
get_type(inner_key): [get_type(value) for value in inner_values]
|
|
568
|
+
for inner_key, inner_values in outer_value.items()
|
|
569
|
+
}
|
|
570
|
+
for outer_key, outer_value in self.init_sub_categories.items()
|
|
571
|
+
}
|
|
572
|
+
|
|
573
|
+
if self.service_id_to_meta_annotation is None:
|
|
574
|
+
return
|
|
575
|
+
|
|
576
|
+
# Check compatibility of image_annotations with init_categories
|
|
577
|
+
for service_id, meta_annotation in self.service_id_to_meta_annotation.items():
|
|
578
|
+
for annotation in meta_annotation.image_annotations:
|
|
579
|
+
if annotation not in self.init_categories:
|
|
580
|
+
raise ValueError(
|
|
581
|
+
f"Image annotation '{annotation}' in service ID '{service_id}' is not "
|
|
582
|
+
f"present in `init_categories`."
|
|
583
|
+
)
|
|
584
|
+
|
|
585
|
+
# Check compatibility of sub_categories
|
|
586
|
+
for cat, sub_cats in meta_annotation.sub_categories.items():
|
|
587
|
+
if not (
|
|
588
|
+
cat in self.init_sub_categories
|
|
589
|
+
and all(sub_cat in self.init_sub_categories[cat] for sub_cat in sub_cats)
|
|
590
|
+
):
|
|
591
|
+
raise ValueError(
|
|
592
|
+
f"Sub-categories for category '{cat}' in service ID '{service_id}' "
|
|
593
|
+
f"do not match with `init_sub_categories`."
|
|
594
|
+
)
|
|
595
|
+
|
|
596
|
+
|
|
408
597
|
class CustomDataset(DatasetBase):
|
|
409
598
|
"""
|
|
410
599
|
A simple dataset interface that implements the boilerplate code and reduces complexity by merely leaving
|
|
@@ -512,53 +701,9 @@ class CustomDataset(DatasetBase):
|
|
|
512
701
|
Returns:
|
|
513
702
|
A CustomDataset instance created from the dataset card.
|
|
514
703
|
"""
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
meta_data["init_categories"] = [get_type(cat) for cat in meta_data["init_categories"]]
|
|
521
|
-
meta_data["init_sub_categories"] = (
|
|
522
|
-
{
|
|
523
|
-
get_type(cat): {
|
|
524
|
-
get_type(sub_cat_key): [get_type(sub_cat_value) for sub_cat_value in sub_cat_values]
|
|
525
|
-
for sub_cat_key, sub_cat_values in sub_cats.items()
|
|
526
|
-
}
|
|
527
|
-
for cat, sub_cats in meta_data["init_sub_categories"].items()
|
|
528
|
-
}
|
|
529
|
-
if meta_data["init_sub_categories"] is not None
|
|
530
|
-
else None
|
|
704
|
+
dataset_card = DatasetCard.load_dataset_card(file_path)
|
|
705
|
+
dataset_card_as_dict = dataset_card.as_dict(True)
|
|
706
|
+
dataset_card_as_dict.pop("service_id_to_meta_annotation") # type: ignore # pylint: disable=E1123
|
|
707
|
+
return CustomDataset( # pylint: disable=E1123
|
|
708
|
+
**dataset_card_as_dict, dataflow_builder=dataflow_builder # type: ignore
|
|
531
709
|
)
|
|
532
|
-
return CustomDataset(**meta_data, dataflow_builder=dataflow_builder)
|
|
533
|
-
|
|
534
|
-
def as_dict(self) -> Mapping[str, Any]:
|
|
535
|
-
"""
|
|
536
|
-
Return:
|
|
537
|
-
The meta-data of the dataset as a dictionary.
|
|
538
|
-
"""
|
|
539
|
-
return {
|
|
540
|
-
"name": self.name,
|
|
541
|
-
"dataset_type": self.type,
|
|
542
|
-
"location": str(self.location),
|
|
543
|
-
"annotation_files": self.annotation_files,
|
|
544
|
-
"init_categories": [cat.value for cat in self.init_categories],
|
|
545
|
-
"init_sub_categories": {
|
|
546
|
-
cat.value: {
|
|
547
|
-
sub_cat_key.value: [sub_cat_value.value for sub_cat_value in sub_cat_values]
|
|
548
|
-
for sub_cat_key, sub_cat_values in sub_cats.items()
|
|
549
|
-
}
|
|
550
|
-
for cat, sub_cats in self.init_sub_categories.items()
|
|
551
|
-
}
|
|
552
|
-
if self.init_sub_categories is not None
|
|
553
|
-
else None,
|
|
554
|
-
}
|
|
555
|
-
|
|
556
|
-
def save_dataset_card(self, file_path: str) -> None:
|
|
557
|
-
"""
|
|
558
|
-
Save the dataset card to a `JSON` file.
|
|
559
|
-
|
|
560
|
-
Args:
|
|
561
|
-
file_path: file_path
|
|
562
|
-
"""
|
|
563
|
-
with open(file_path, "w", encoding="UTF-8") as file:
|
|
564
|
-
json.dump(self.as_dict(), file, indent=4)
|