deepdoctection 0.30__py3-none-any.whl → 0.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoctection/__init__.py +4 -2
- deepdoctection/analyzer/dd.py +6 -5
- deepdoctection/dataflow/base.py +0 -19
- deepdoctection/dataflow/custom.py +4 -3
- deepdoctection/dataflow/custom_serialize.py +14 -5
- deepdoctection/dataflow/parallel_map.py +12 -11
- deepdoctection/dataflow/serialize.py +5 -4
- deepdoctection/datapoint/annotation.py +33 -12
- deepdoctection/datapoint/box.py +1 -4
- deepdoctection/datapoint/convert.py +3 -1
- deepdoctection/datapoint/image.py +66 -29
- deepdoctection/datapoint/view.py +57 -25
- deepdoctection/datasets/adapter.py +1 -1
- deepdoctection/datasets/base.py +83 -10
- deepdoctection/datasets/dataflow_builder.py +1 -1
- deepdoctection/datasets/info.py +2 -2
- deepdoctection/datasets/instances/layouttest.py +2 -7
- deepdoctection/eval/accmetric.py +1 -1
- deepdoctection/eval/base.py +5 -4
- deepdoctection/eval/eval.py +2 -2
- deepdoctection/eval/tp_eval_callback.py +5 -4
- deepdoctection/extern/base.py +39 -13
- deepdoctection/extern/d2detect.py +164 -64
- deepdoctection/extern/deskew.py +32 -7
- deepdoctection/extern/doctrocr.py +227 -39
- deepdoctection/extern/fastlang.py +45 -7
- deepdoctection/extern/hfdetr.py +90 -33
- deepdoctection/extern/hflayoutlm.py +109 -22
- deepdoctection/extern/pdftext.py +2 -1
- deepdoctection/extern/pt/ptutils.py +3 -2
- deepdoctection/extern/tessocr.py +134 -22
- deepdoctection/extern/texocr.py +2 -0
- deepdoctection/extern/tp/tpcompat.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +2 -7
- deepdoctection/extern/tpdetect.py +50 -23
- deepdoctection/mapper/d2struct.py +1 -1
- deepdoctection/mapper/hfstruct.py +1 -1
- deepdoctection/mapper/laylmstruct.py +1 -1
- deepdoctection/mapper/maputils.py +13 -2
- deepdoctection/mapper/prodigystruct.py +1 -1
- deepdoctection/mapper/pubstruct.py +10 -10
- deepdoctection/mapper/tpstruct.py +1 -1
- deepdoctection/pipe/anngen.py +35 -8
- deepdoctection/pipe/base.py +53 -19
- deepdoctection/pipe/cell.py +29 -8
- deepdoctection/pipe/common.py +12 -4
- deepdoctection/pipe/doctectionpipe.py +2 -2
- deepdoctection/pipe/language.py +3 -2
- deepdoctection/pipe/layout.py +3 -2
- deepdoctection/pipe/lm.py +2 -2
- deepdoctection/pipe/refine.py +18 -10
- deepdoctection/pipe/segment.py +21 -16
- deepdoctection/pipe/text.py +14 -8
- deepdoctection/pipe/transform.py +16 -9
- deepdoctection/train/d2_frcnn_train.py +15 -12
- deepdoctection/train/hf_detr_train.py +8 -6
- deepdoctection/train/hf_layoutlm_train.py +16 -11
- deepdoctection/utils/__init__.py +3 -0
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +2 -2
- deepdoctection/utils/env_info.py +55 -22
- deepdoctection/utils/error.py +84 -0
- deepdoctection/utils/file_utils.py +4 -15
- deepdoctection/utils/fs.py +7 -7
- deepdoctection/utils/pdf_utils.py +5 -4
- deepdoctection/utils/settings.py +5 -1
- deepdoctection/utils/transform.py +1 -1
- deepdoctection/utils/utils.py +0 -6
- deepdoctection/utils/viz.py +44 -2
- {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/METADATA +33 -58
- {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/RECORD +74 -73
- {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/WHEEL +1 -1
- {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/LICENSE +0 -0
- {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/top_level.txt +0 -0

deepdoctection/datapoint/image.py
CHANGED

@@ -28,6 +28,7 @@ import numpy as np
 from numpy import uint8
 
 from ..utils.detection_types import ImageType, JsonDict, Pathlike
+from ..utils.error import AnnotationError, BoundingBoxError, ImageError, UUIDError
 from ..utils.identifier import get_uuid, is_uuid_like
 from ..utils.settings import ObjectTypes, get_type
 from .annotation import Annotation, BoundingBox, ImageAnnotation, SummaryAnnotation
@@ -108,7 +109,7 @@ class Image:
         """
         if self._image_id is not None:
             return self._image_id
-        raise
+        raise ImageError("image_id not set")
 
     @image_id.setter
     def image_id(self, input_id: str) -> None:
@@ -116,13 +117,13 @@ class Image:
         image_id setter
         """
         if self._image_id is not None:
-            raise
+            raise ImageError("image_id already defined and cannot be reset")
         if is_uuid_like(input_id):
             self._image_id = input_id
         elif isinstance(input_id, property):
             pass
         else:
-            raise
+            raise UUIDError("image_id must be uuid3 string")
 
     @property
     def image(self) -> Optional[ImageType]:
@@ -153,7 +154,7 @@ class Image:
             self._self_embedding()
         else:
             if not isinstance(image, np.ndarray):
-                raise
+                raise ImageError(f"Cannot load image is of type: {type(image)}")
             self._image = image.astype(uint8)
             self.set_width_height(self._image.shape[1], self._image.shape[0])
             self._self_embedding()
@@ -248,7 +249,7 @@ class Image:
         width
         """
         if self._bbox is None:
-            raise
+            raise ImageError("Width not available. Call set_width_height first")
         return self._bbox.width
 
     @property
@@ -257,7 +258,7 @@ class Image:
         height
         """
         if self._bbox is None:
-            raise
+            raise ImageError("Height not available. Call set_width_height first")
         return self._bbox.height
 
     def set_width_height(self, width: float, height: float) -> None:
@@ -281,7 +282,7 @@ class Image:
         :param bounding_box: bounding box of this image in terms of the embedding image.
         """
         if not isinstance(bounding_box, BoundingBox):
-            raise
+            raise BoundingBoxError(f"Bounding box must be of type BoundingBox, is of type {type(bounding_box)}")
         self.embeddings[image_id] = bounding_box
 
     def get_embedding(self, image_id: str) -> BoundingBox:
@@ -307,14 +308,14 @@ class Image:
         :param annotation: image annotation to store
         """
         if not isinstance(annotation, ImageAnnotation):
-            raise
+            raise AnnotationError(
                 f"Annotation must be of type ImageAnnotation: "
                 f"{annotation.annotation_id} but is of type {str(type(annotation))}"
             )
         if annotation._annotation_id is None:  # pylint: disable=W0212
             annotation.annotation_id = self.define_annotation_id(annotation)
         if annotation.annotation_id in self._annotation_ids:
-            raise
+            raise ImageError(f"Cannot dump annotation with already taken " f"id {annotation.annotation_id}")
         self._annotation_ids.append(annotation.annotation_id)
         self.annotations.append(annotation)
 
@@ -322,7 +323,10 @@ class Image:
         self,
         category_names: Optional[Union[str, ObjectTypes, Sequence[Union[str, ObjectTypes]]]] = None,
         annotation_ids: Optional[Union[str, Sequence[str]]] = None,
-
+        service_id: Optional[Union[str, Sequence[str]]] = None,
+        model_id: Optional[Union[str, Sequence[str]]] = None,
+        session_ids: Optional[Union[str, Sequence[str]]] = None,
+        ignore_inactive: bool = True,
     ) -> List[ImageAnnotation]:
         """
         Selection of annotations from the annotation container. Filter conditions can be defined by specifying
@@ -333,47 +337,80 @@ class Image:
 
         :param category_names: A single name or list of names
         :param annotation_ids: A single id or list of ids
-        :param
+        :param service_id: A single service name or list of service names
+        :param model_id: A single model name or list of model names
+        :param session_ids: A single session id or list of session ids
+        :param ignore_inactive: If set to `True` only active annotations are returned.
+
         :return: A (possibly empty) list of Annotations
         """
 
-
-
-
-
-
+        if category_names is not None:
+            category_names = (
+                [get_type(cat_name) for cat_name in category_names]
+                if isinstance(category_names, (list, set))
+                else [get_type(category_names)]  # type:ignore
+            )
 
-
+        ann_ids = [annotation_ids] if isinstance(annotation_ids, str) else annotation_ids
+        service_id = [service_id] if isinstance(service_id, str) else service_id
+        model_id = [model_id] if isinstance(model_id, str) else model_id
+        session_id = [session_ids] if isinstance(session_ids, str) else session_ids
 
-        if
-
-
+        if ignore_inactive:
+            anns = filter(lambda x: x.active, self.annotations)
+        else:
+            anns = self.annotations  # type:ignore
 
-        if
-        anns = filter(lambda x: x.category_name in
+        if category_names is not None:
+            anns = filter(lambda x: x.category_name in category_names, anns)  # type:ignore
 
         if ann_ids is not None:
             anns = filter(lambda x: x.annotation_id in ann_ids, anns)  # type:ignore
 
+        if service_id is not None:
+            anns = filter(lambda x: x.service_id in service_id, anns)  # type:ignore
+
+        if model_id is not None:
+            anns = filter(lambda x: x.model_id in model_id, anns)  # type:ignore
+
+        if session_id is not None:
+            anns = filter(lambda x: x.session_id in session_id, anns)  # type:ignore
+
         return list(anns)
 
     def get_annotation_iter(
         self,
         category_names: Optional[Union[str, ObjectTypes, Sequence[Union[str, ObjectTypes]]]] = None,
         annotation_ids: Optional[Union[str, Sequence[str]]] = None,
-
+        service_id: Optional[Union[str, Sequence[str]]] = None,
+        model_id: Optional[Union[str, Sequence[str]]] = None,
+        session_ids: Optional[Union[str, Sequence[str]]] = None,
+        ignore_inactive: bool = True,
     ) -> Iterable[ImageAnnotation]:
         """
         Get annotation as an iterator. Same as `get_annotation` but returns an iterator instead of a list.
 
         :param category_names: A single name or list of names
         :param annotation_ids: A single id or list of ids
-        :param
+        :param service_id: A single service name or list of service names
+        :param model_id: A single model name or list of model names
+        :param session_ids: A single session id or list of session ids
+        :param ignore_inactive: If set to `True` only active annotations are returned.
 
         :return: A (possibly empty) list of annotations
         """
 
-        return iter(
+        return iter(
+            self.get_annotation(
+                category_names=category_names,
+                annotation_ids=annotation_ids,
+                service_id=service_id,
+                model_id=model_id,
+                session_ids=session_ids,
+                ignore_inactive=ignore_inactive,
+            )
+        )
 
     def as_dict(self) -> Dict[str, Any]:
         """
@@ -439,7 +476,7 @@ class Image:
         new_image = Image(file_name=self.file_name, location=self.location, external_id=annotation_id)
 
         if self._bbox is None or ann.bounding_box is None:
-            raise
+            raise ImageError(f"Bounding box for image and ImageAnnotation ({annotation_id}) must be set")
 
         new_bounding_box = intersection_box(self._bbox, ann.bounding_box, self.width, self.height)
         if new_bounding_box.absolute_coords:
@@ -454,7 +491,7 @@ class Image:
         if crop_image and self.image is not None:
             new_image.image = crop_box_from_image(self.image, ann.bounding_box, self.width, self.height)
         elif crop_image and self.image is None:
-            raise
+            raise ImageError("crop_image = True requires self.image to be not None")
 
         ann.image = new_image
 
@@ -472,7 +509,7 @@ class Image:
 
         ann = self.get_annotation(annotation_ids=annotation_id)[0]
         if ann.image is None:
-            raise
+            raise ImageError("When adding sub images to ImageAnnotation then ImageAnnotation.image must not be None")
         assert ann.bounding_box is not None
         box = ann.bounding_box.to_list("xyxy")
         proposals = self.get_annotation(category_names)
@@ -485,7 +522,7 @@ class Image:
         sub_images = self.get_annotation(annotation_ids=selected_ids.tolist())
         for sub_image in sub_images:
             if sub_image.image is None:
-                raise
+                raise ImageError(
                     "When setting an embedding to ImageAnnotation then ImageAnnotation.image must not be None"
                 )
             sub_image.image.set_embedding(

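The hunks above extend `Image.get_annotation` and `get_annotation_iter` with `service_id`, `model_id`, `session_ids` and `ignore_inactive` filters. A minimal sketch of a call using the new keywords, assuming an `Image` that a pipeline has already populated with annotations; the category name and the service identifier are illustrative values, not taken from the diff:

    from deepdoctection.datapoint.image import Image

    image = Image(file_name="sample.png", location="/tmp/sample.png")
    # ... assume a pipeline has dumped annotations into `image` beforehand ...

    # New in 0.31: filter by the component that produced an annotation and
    # optionally include deactivated annotations in the result.
    tables = image.get_annotation(
        category_names="table",        # single name or list of names
        service_id="image_layout",     # hypothetical service identifier
        ignore_inactive=False,         # also return inactive annotations
    )
    print([ann.annotation_id for ann in tables])
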
deepdoctection/datapoint/view.py
CHANGED

@@ -26,6 +26,7 @@ from typing import Any, Dict, List, Mapping, Optional, Sequence, Set, Tuple, Typ
 import numpy as np
 
 from ..utils.detection_types import ImageType, JsonDict, Pathlike
+from ..utils.error import AnnotationError, ImageError
 from ..utils.logger import LoggingRecord, logger
 from ..utils.settings import (
     CellType,
@@ -96,7 +97,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
                 interactive_imshow(np_image)
                 return None
             return np_image
-        raise
+        raise AnnotationError(f"base_page.image is None for {self.annotation_id}")
 
     def __getattr__(self, item: str) -> Optional[Union[str, int, List[str]]]:
         """
@@ -115,7 +116,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
         :return: value according to the logic described above
         """
         if item not in self.get_attribute_names():
-            raise
+            raise AnnotationError(f"Attribute {item} is not supported for {type(self)}")
         if item in self.sub_categories:
             sub_cat = self.get_sub_category(get_type(item))
             if item != sub_cat.category_name:
@@ -326,7 +327,7 @@ class Table(Layout):
     def text(self) -> str:
         try:
             return str(self)
-        except TypeError:
+        except (TypeError, AnnotationError):
             return super().text
 
     @property
@@ -368,7 +369,7 @@ class Table(Layout):
             for cell in cells:
                 all_words.extend(cell.get_ordered_words())  # type: ignore
             return all_words
-        except TypeError:
+        except (TypeError, AnnotationError):
             return super().get_ordered_words()
 
 
@@ -452,40 +453,71 @@ class Page(Image):
         "page_number",
     }
 
-
-    def get_annotation(
+    def get_annotation(  # type: ignore
         self,
         category_names: Optional[Union[str, ObjectTypes, Sequence[Union[str, ObjectTypes]]]] = None,
         annotation_ids: Optional[Union[str, Sequence[str]]] = None,
-
+        service_id: Optional[Union[str, Sequence[str]]] = None,
+        model_id: Optional[Union[str, Sequence[str]]] = None,
+        session_ids: Optional[Union[str, Sequence[str]]] = None,
+        ignore_inactive: bool = True,
     ) -> List[ImageAnnotationBaseView]:
         """
+        Selection of annotations from the annotation container. Filter conditions can be defined by specifying
+        the annotation_id or the category name. (Since only image annotations are currently allowed in the container,
+        annotation_type is a redundant filter condition.) Only annotations that have active = 'True' are
+        returned. If more than one condition is provided, only annotations will be returned that satisfy all conditions.
+        If no condition is provided, it will return all active annotations.
+
         Identical to its base class method for having correct return types. If the base class changes, please
         change this method as well.
+
+        :param category_names: A single name or list of names
+        :param annotation_ids: A single id or list of ids
+        :param service_id: A single service name or list of service names
+        :param model_id: A single model name or list of model names
+        :param session_ids: A single session id or list of session ids
+        :param ignore_inactive: If set to `True` only active annotations are returned.
+
+        :return: A (possibly empty) list of Annotations
         """
-        cat_names = [category_names] if isinstance(category_names, (ObjectTypes, str)) else category_names
-        if cat_names is not None:
-            cat_names = [get_type(cat_name) for cat_name in cat_names]
-        ann_ids = [annotation_ids] if isinstance(annotation_ids, str) else annotation_ids
-        ann_types = [annotation_types] if isinstance(annotation_types, str) else annotation_types
 
-
+        if category_names is not None:
+            category_names = (
+                [get_type(cat_name) for cat_name in category_names]
+                if isinstance(category_names, list)
+                else [get_type(category_names)]  # type:ignore
+            )
+        ann_ids = [annotation_ids] if isinstance(annotation_ids, str) else annotation_ids
+        service_id = [service_id] if isinstance(service_id, str) else service_id
+        model_id = [model_id] if isinstance(model_id, str) else model_id
+        session_id = [session_ids] if isinstance(session_ids, str) else session_ids
 
-        if
-
-
+        if ignore_inactive:
+            anns = filter(lambda x: x.active, self.annotations)
+        else:
+            anns = self.annotations  # type:ignore
 
-        if
-        anns = filter(lambda x: x.category_name in
+        if category_names is not None:
+            anns = filter(lambda x: x.category_name in category_names, anns)  # type:ignore
 
         if ann_ids is not None:
-            anns = filter(lambda x: x.annotation_id in ann_ids, anns)
+            anns = filter(lambda x: x.annotation_id in ann_ids, anns)  # type:ignore
+
+        if service_id is not None:
+            anns = filter(lambda x: x.generating_service in service_id, anns)  # type:ignore
+
+        if model_id is not None:
+            anns = filter(lambda x: x.generating_model in model_id, anns)  # type:ignore
+
+        if session_id is not None:
+            anns = filter(lambda x: x.session_id in session_id, anns)  # type:ignore
 
-        return list(anns)
+        return list(anns)  # type:ignore
 
     def __getattr__(self, item: str) -> Any:
         if item not in self.get_attribute_names():
-            raise
+            raise ImageError(f"Attribute {item} is not supported for {type(self)}")
         if self.summary is not None:
             if item in self.summary.sub_categories:
                 sub_cat = self.summary.get_sub_category(get_type(item))
@@ -629,10 +661,10 @@ class Page(Image):
         """
         ann = self.get_annotation(annotation_ids=annotation_id)[0]
         if ann.category_name not in self.floating_text_block_categories:
-            raise
-            f"
-            f"
-            f"
+            raise ImageError(
+                f"Cannot get context. Make sure to parametrize this category to a floating text: "
+                f"annotation_id: {annotation_id},"
+                f"category_name: {ann.category_name}"
             )
         block_with_order = self._order("layouts")
         position = block_with_order.index(ann)

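`Page.get_annotation` mirrors the base class change and now filters on `generating_service`/`generating_model` as well. A sketch of page-level usage; the analyzer call and the service identifier are assumptions and only illustrate where the new keywords come into play:

    import deepdoctection as dd

    analyzer = dd.get_dd_analyzer()
    df = analyzer.analyze(path="/path/to/document.pdf")
    df.reset_state()

    for page in df:
        # annotations produced by one specific pipeline component,
        # including deactivated ones
        anns = page.get_annotation(service_id="layout_service", ignore_inactive=False)
        print(page.file_name, len(anns))
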
deepdoctection/datasets/base.py
CHANGED

@@ -18,17 +18,19 @@
 """
 Module for the base class of datasets.
 """
-
+import json
 import os
 import pprint
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from
+from inspect import signature
+from pathlib import Path
+from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union
 
 import numpy as np
 
 from ..dataflow import CacheData, ConcatData, CustomDataFromList, DataFlow
-from ..datapoint import Image
+from ..datapoint.image import Image
 from ..utils.detection_types import Pathlike
 from ..utils.logger import LoggingRecord, logger
 from ..utils.settings import ObjectTypes, TypeOrStr, get_type
@@ -51,9 +53,11 @@ class DatasetBase(ABC):
         self._dataflow_builder.splits = self._dataset_info.splits
 
         if not self.dataset_available() and self.is_built_in():
-
-
-
+            logger.warning(
+                LoggingRecord(
+                    f"Dataset {self._dataset_info.name} not locally found. Please download at {self._dataset_info.url}"
+                    f" and place under {self._dataflow_builder.get_workdir()}"
+                )
             )
 
     @property
@@ -76,7 +80,7 @@ class DatasetBase(ABC):
         Construct the DatasetCategory object.
         """
 
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @classmethod
     @abstractmethod
@@ -85,7 +89,7 @@ class DatasetBase(ABC):
         Construct the DatasetInfo object.
         """
 
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @abstractmethod
     def _builder(self) -> DataFlowBaseBuilder:
@@ -93,7 +97,7 @@ class DatasetBase(ABC):
         Construct the DataFlowBaseBuilder object. It needs to be implemented in the derived class.
         """
 
-        raise NotImplementedError
+        raise NotImplementedError()
 
     def dataset_available(self) -> bool:
         """
@@ -114,7 +118,7 @@ class DatasetBase(ABC):
 
 class _BuiltInDataset(DatasetBase, ABC):
     """
-    Dataclass for built-in dataset. Do not use this
+    Dataclass for built-in dataset. Do not use this
     """
 
     _name: Optional[str] = None
@@ -427,6 +431,11 @@ class CustomDataset(DatasetBase):
         else:
             self.init_sub_categories = init_sub_categories
         self.annotation_files = annotation_files
+        if signature(dataflow_builder.__init__).parameters.keys() != {"self", "location", "annotation_files"}:
+            raise TypeError(
+                "Dataflow builder must have the signature `def __init__(self, location: Pathlike, "
+                "annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None):`"
+            )
         self.dataflow_builder = dataflow_builder(self.location, self.annotation_files)
         super().__init__()
 
@@ -438,3 +447,67 @@ class CustomDataset(DatasetBase):
 
     def _builder(self) -> DataFlowBaseBuilder:
         return self.dataflow_builder
+
+    @staticmethod
+    def from_dataset_card(file_path: str, dataflow_builder: Type[DataFlowBaseBuilder]) -> "CustomDataset":
+        """
+        This static method creates a CustomDataset instance from a dataset card.
+
+        A dataset card is a JSON file that contains metadata about the dataset such as its name, type, location,
+        initial categories, initial sub categories, and annotation files. The dataflow_builder parameter is a class
+        that inherits from DataFlowBaseBuilder and is used to build the dataflow for the dataset.
+
+        :param file_path: The path to the dataset card (JSON file).
+        :param dataflow_builder: The class used to build the dataflow for the dataset.
+        :return: A CustomDataset instance created from the dataset card.
+        """
+
+        with open(file_path, "r", encoding="UTF-8") as file:
+            meta_data = json.load(file)
+        meta_data["dataset_type"] = get_type(meta_data["dataset_type"])
+        meta_data["location"] = Path(meta_data["location"])
+        meta_data["init_categories"] = [get_type(cat) for cat in meta_data["init_categories"]]
+        meta_data["init_sub_categories"] = (
+            {
+                get_type(cat): {
+                    get_type(sub_cat_key): [get_type(sub_cat_value) for sub_cat_value in sub_cat_values]
+                    for sub_cat_key, sub_cat_values in sub_cats.items()
+                }
+                for cat, sub_cats in meta_data["init_sub_categories"].items()
+            }
+            if meta_data["init_sub_categories"] is not None
+            else None
+        )
+        return CustomDataset(**meta_data, dataflow_builder=dataflow_builder)
+
+    def as_dict(self) -> Mapping[str, Any]:
+        """
+        Return the meta-data of the dataset as a dictionary.
+
+        :return: A dictionary containing the meta-data of the dataset.
+        """
+        return {
+            "name": self.name,
+            "dataset_type": self.type,
+            "location": str(self.location),
+            "annotation_files": self.annotation_files,
+            "init_categories": [cat.value for cat in self.init_categories],
+            "init_sub_categories": {
+                cat.value: {
+                    sub_cat_key.value: [sub_cat_value.value for sub_cat_value in sub_cat_values]
+                    for sub_cat_key, sub_cat_values in sub_cats.items()
+                }
+                for cat, sub_cats in self.init_sub_categories.items()
+            }
+            if self.init_sub_categories is not None
+            else None,
+        }
+
+    def save_dataset_card(self, file_path: str) -> None:
+        """
+        Save the dataset card to a JSON file.
+
+        :param file_path: file_path
+        """
+        with open(file_path, "w", encoding="UTF-8") as file:
+            json.dump(self.as_dict(), file, indent=4)

deepdoctection/datasets/dataflow_builder.py
CHANGED

@@ -110,7 +110,7 @@ class DataFlowBaseBuilder(ABC):
         :param kwargs: A custom set of arguments/values
         :return: dataflow
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     def get_annotation_file(self, split: str) -> str:
         """Get single annotation file."""

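`CustomDataset` now validates the dataflow builder's `__init__` signature and can round-trip its meta-data through a JSON dataset card via `save_dataset_card`/`from_dataset_card`. A sketch of how this might be wired up; the builder, the category and dataset names, the file paths, and the assumption that the constructor accepts plain strings (resolved via `get_type`) and a `build` method on the builder are illustrative, not taken from the diff:

    from typing import Mapping, Optional, Sequence, Union

    from deepdoctection.dataflow import CustomDataFromList, DataFlow
    from deepdoctection.datasets.base import CustomDataset
    from deepdoctection.datasets.dataflow_builder import DataFlowBaseBuilder
    from deepdoctection.utils.detection_types import Pathlike


    class MyBuilder(DataFlowBaseBuilder):
        # 0.31 rejects builders whose __init__ deviates from exactly this signature.
        def __init__(
            self,
            location: Pathlike,
            annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,
        ):
            super().__init__(location, annotation_files)

        def build(self, **kwargs) -> DataFlow:
            return CustomDataFromList([])  # placeholder dataflow for the sketch


    dataset = CustomDataset(
        name="my_dataset",
        dataset_type="object_detection",
        location="my_dataset",
        init_categories=["table"],
        dataflow_builder=MyBuilder,
    )
    dataset.save_dataset_card("my_dataset_card.json")
    restored = CustomDataset.from_dataset_card("my_dataset_card.json", MyBuilder)
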
deepdoctection/datasets/info.py
CHANGED

@@ -306,7 +306,7 @@ class DatasetCategories:
 
         _cat_to_sub_cat = {get_type(key): get_type(value) for key, value in cat_to_sub_cat.items()}
         if not self._allow_update:
-            raise
+            raise RuntimeWarning("Replacing categories with sub categories is not allowed")
         self._categories_update = self.init_categories
         categories = self.get_categories(name_as_key=True)
         cats_or_sub_cats = [
@@ -332,7 +332,7 @@ class DatasetCategories:
         """
 
         if not self._allow_update:
-            raise
+            raise RuntimeWarning("Filtering categories is not allowed")
         if isinstance(categories, (ObjectTypes, str)):
             categories = [get_type(categories)]
         else:

deepdoctection/datasets/instances/layouttest.py
CHANGED

@@ -49,12 +49,7 @@ _LICENSE (
     " – Permissive – Version 1.0 License. Dr. Janis Meyer does not own the copyright of the images. \n"
     " Use of the images must abide by the PMC Open Access Subset Terms of Use."
 )
-
-    "https://www.googleapis.com/drive/v3/files/1ZD4Ef4gd2FIfp7vR8jbnrZeXD3gSWNqE?alt"
-    "=media&key=AIzaSyDuoPG6naK-kRJikScR7cP_1sQBF1r3fWU",
-    "https://www.googleapis.com/drive/v3/files/18HD62LFLa1iAmqffo4SyjuEQ32MzyNQ0?alt"
-    "=media&key=AIzaSyDuoPG6naK-kRJikScR7cP_1sQBF1r3fWU",
-]
+
 _SPLITS: Mapping[str, str] = {"test": "test", "predict": "predict"}
 _TYPE = DatasetType.object_detection
 _LOCATION = "testlayout"
@@ -77,7 +72,7 @@ class LayoutTest(_BuiltInDataset):
 
     @classmethod
     def _info(cls) -> DatasetInfo:
-        return DatasetInfo(name=_NAME, description=_DESCRIPTION, license=_LICENSE,
+        return DatasetInfo(name=_NAME, description=_DESCRIPTION, license=_LICENSE, splits=_SPLITS, type=_TYPE)
 
     def _categories(self) -> DatasetCategories:
         return DatasetCategories(init_categories=_INIT_CATEGORIES)

deepdoctection/eval/accmetric.py
CHANGED

@@ -87,7 +87,7 @@ def accuracy(label_gt: Sequence[int], label_predictions: Sequence[int], masks: O
     np_label_gt, np_label_pr = np.asarray(label_gt), np.asarray(label_predictions)
     if len(np_label_gt) != len(np_label_pr):
         raise ValueError(
-            f"length
+            f"length label_gt: {len(np_label_gt)}, length label_predictions: ({len(np_label_pr)}) but must be equal"
         )
     if masks is not None:
         np_label_gt, np_label_pr = _mask_some_gt_and_pr_labels(np_label_gt, np_label_pr, masks)

deepdoctection/eval/base.py
CHANGED

@@ -25,6 +25,7 @@ from typing import Any, Callable, List, Optional, Tuple
 from ..dataflow import DataFlow
 from ..datasets.info import DatasetCategories
 from ..utils.detection_types import JsonDict
+from ..utils.error import DependencyError
 from ..utils.file_utils import Requirement
 
 
@@ -52,7 +53,7 @@ class MetricBase(ABC):
         requirements = cls.get_requirements()
         name = cls.__name__ if hasattr(cls, "__name__") else cls.__class__.__name__
         if not all(requirement[1] for requirement in requirements):
-            raise
+            raise DependencyError(
                 "\n".join(
                     [f"{name} has the following dependencies:"]
                     + [requirement[2] for requirement in requirements if not requirement[1]]
@@ -66,7 +67,7 @@ class MetricBase(ABC):
         """
         Get a list of requirements for running the detector
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @classmethod
     @abstractmethod
@@ -80,7 +81,7 @@ class MetricBase(ABC):
         :param dataflow_predictions: Dataflow with predictions.
         :param categories: DatasetCategories with respect to the underlying dataset.
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @classmethod
     @abstractmethod
@@ -95,7 +96,7 @@ class MetricBase(ABC):
         :param dataflow_predictions: Dataflow with predictions.
         :param categories: DatasetCategories with respect to the underlying dataset.
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @classmethod
     def result_list_to_dict(cls, results: List[JsonDict]) -> JsonDict:

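`MetricBase` now raises the new `DependencyError` (defined in the `utils/error.py` module added in 0.31) when a requirement is unavailable. A small sketch of that gate, assuming a requirement behaves like a `(name, available, error_message)` triple, which is how `requirement[1]` and `requirement[2]` are indexed in the hunk above; the requirement entries and the metric name are illustrative:

    from deepdoctection.utils.error import DependencyError

    requirements = [
        ("scipy", True, "scipy must be installed"),
        ("apted", False, "apted must be installed"),  # illustrative missing dependency
    ]

    if not all(req[1] for req in requirements):
        raise DependencyError(
            "\n".join(
                ["MyMetric has the following dependencies:"]
                + [req[2] for req in requirements if not req[1]]
            )
        )
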
deepdoctection/eval/eval.py
CHANGED

@@ -171,7 +171,7 @@ class Evaluator:
                     "metric has no attribute sub_cats and cannot be used for token classification datasets"
                 )
             else:
-                raise NotImplementedError
+                raise NotImplementedError()
 
         else:
             self.wandb_table_agent = None
@@ -271,7 +271,7 @@ class Evaluator:
             sub_cats_to_remove = meta_anns["sub_categories"]
             df_pr = MapData(df_pr, remove_cats(sub_categories=sub_cats_to_remove))
         else:
-            raise NotImplementedError
+            raise NotImplementedError()
 
         return df_pr
 