deepdoctection-0.30-py3-none-any.whl → deepdoctection-0.31-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (74)
  1. deepdoctection/__init__.py +4 -2
  2. deepdoctection/analyzer/dd.py +6 -5
  3. deepdoctection/dataflow/base.py +0 -19
  4. deepdoctection/dataflow/custom.py +4 -3
  5. deepdoctection/dataflow/custom_serialize.py +14 -5
  6. deepdoctection/dataflow/parallel_map.py +12 -11
  7. deepdoctection/dataflow/serialize.py +5 -4
  8. deepdoctection/datapoint/annotation.py +33 -12
  9. deepdoctection/datapoint/box.py +1 -4
  10. deepdoctection/datapoint/convert.py +3 -1
  11. deepdoctection/datapoint/image.py +66 -29
  12. deepdoctection/datapoint/view.py +57 -25
  13. deepdoctection/datasets/adapter.py +1 -1
  14. deepdoctection/datasets/base.py +83 -10
  15. deepdoctection/datasets/dataflow_builder.py +1 -1
  16. deepdoctection/datasets/info.py +2 -2
  17. deepdoctection/datasets/instances/layouttest.py +2 -7
  18. deepdoctection/eval/accmetric.py +1 -1
  19. deepdoctection/eval/base.py +5 -4
  20. deepdoctection/eval/eval.py +2 -2
  21. deepdoctection/eval/tp_eval_callback.py +5 -4
  22. deepdoctection/extern/base.py +39 -13
  23. deepdoctection/extern/d2detect.py +164 -64
  24. deepdoctection/extern/deskew.py +32 -7
  25. deepdoctection/extern/doctrocr.py +227 -39
  26. deepdoctection/extern/fastlang.py +45 -7
  27. deepdoctection/extern/hfdetr.py +90 -33
  28. deepdoctection/extern/hflayoutlm.py +109 -22
  29. deepdoctection/extern/pdftext.py +2 -1
  30. deepdoctection/extern/pt/ptutils.py +3 -2
  31. deepdoctection/extern/tessocr.py +134 -22
  32. deepdoctection/extern/texocr.py +2 -0
  33. deepdoctection/extern/tp/tpcompat.py +4 -4
  34. deepdoctection/extern/tp/tpfrcnn/preproc.py +2 -7
  35. deepdoctection/extern/tpdetect.py +50 -23
  36. deepdoctection/mapper/d2struct.py +1 -1
  37. deepdoctection/mapper/hfstruct.py +1 -1
  38. deepdoctection/mapper/laylmstruct.py +1 -1
  39. deepdoctection/mapper/maputils.py +13 -2
  40. deepdoctection/mapper/prodigystruct.py +1 -1
  41. deepdoctection/mapper/pubstruct.py +10 -10
  42. deepdoctection/mapper/tpstruct.py +1 -1
  43. deepdoctection/pipe/anngen.py +35 -8
  44. deepdoctection/pipe/base.py +53 -19
  45. deepdoctection/pipe/cell.py +29 -8
  46. deepdoctection/pipe/common.py +12 -4
  47. deepdoctection/pipe/doctectionpipe.py +2 -2
  48. deepdoctection/pipe/language.py +3 -2
  49. deepdoctection/pipe/layout.py +3 -2
  50. deepdoctection/pipe/lm.py +2 -2
  51. deepdoctection/pipe/refine.py +18 -10
  52. deepdoctection/pipe/segment.py +21 -16
  53. deepdoctection/pipe/text.py +14 -8
  54. deepdoctection/pipe/transform.py +16 -9
  55. deepdoctection/train/d2_frcnn_train.py +15 -12
  56. deepdoctection/train/hf_detr_train.py +8 -6
  57. deepdoctection/train/hf_layoutlm_train.py +16 -11
  58. deepdoctection/utils/__init__.py +3 -0
  59. deepdoctection/utils/concurrency.py +1 -1
  60. deepdoctection/utils/context.py +2 -2
  61. deepdoctection/utils/env_info.py +55 -22
  62. deepdoctection/utils/error.py +84 -0
  63. deepdoctection/utils/file_utils.py +4 -15
  64. deepdoctection/utils/fs.py +7 -7
  65. deepdoctection/utils/pdf_utils.py +5 -4
  66. deepdoctection/utils/settings.py +5 -1
  67. deepdoctection/utils/transform.py +1 -1
  68. deepdoctection/utils/utils.py +0 -6
  69. deepdoctection/utils/viz.py +44 -2
  70. {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/METADATA +33 -58
  71. {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/RECORD +74 -73
  72. {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/WHEEL +1 -1
  73. {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/LICENSE +0 -0
  74. {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/top_level.txt +0 -0
@@ -28,6 +28,7 @@ import numpy as np
28
28
  from numpy import uint8
29
29
 
30
30
  from ..utils.detection_types import ImageType, JsonDict, Pathlike
31
+ from ..utils.error import AnnotationError, BoundingBoxError, ImageError, UUIDError
31
32
  from ..utils.identifier import get_uuid, is_uuid_like
32
33
  from ..utils.settings import ObjectTypes, get_type
33
34
  from .annotation import Annotation, BoundingBox, ImageAnnotation, SummaryAnnotation
@@ -108,7 +109,7 @@ class Image:
108
109
  """
109
110
  if self._image_id is not None:
110
111
  return self._image_id
111
- raise ValueError("image_id not set")
112
+ raise ImageError("image_id not set")
112
113
 
113
114
  @image_id.setter
114
115
  def image_id(self, input_id: str) -> None:
@@ -116,13 +117,13 @@ class Image:
116
117
  image_id setter
117
118
  """
118
119
  if self._image_id is not None:
119
- raise ValueError("image_id already defined and cannot be reset")
120
+ raise ImageError("image_id already defined and cannot be reset")
120
121
  if is_uuid_like(input_id):
121
122
  self._image_id = input_id
122
123
  elif isinstance(input_id, property):
123
124
  pass
124
125
  else:
125
- raise ValueError("image_id must be uuid3 string")
126
+ raise UUIDError("image_id must be uuid3 string")
126
127
 
127
128
  @property
128
129
  def image(self) -> Optional[ImageType]:
@@ -153,7 +154,7 @@ class Image:
153
154
  self._self_embedding()
154
155
  else:
155
156
  if not isinstance(image, np.ndarray):
156
- raise TypeError(f"Cannot load image is of type: {type(image)}")
157
+ raise ImageError(f"Cannot load image is of type: {type(image)}")
157
158
  self._image = image.astype(uint8)
158
159
  self.set_width_height(self._image.shape[1], self._image.shape[0])
159
160
  self._self_embedding()
@@ -248,7 +249,7 @@ class Image:
248
249
  width
249
250
  """
250
251
  if self._bbox is None:
251
- raise ValueError("Width not available. Call set_width_height first")
252
+ raise ImageError("Width not available. Call set_width_height first")
252
253
  return self._bbox.width
253
254
 
254
255
  @property
@@ -257,7 +258,7 @@ class Image:
257
258
  height
258
259
  """
259
260
  if self._bbox is None:
260
- raise ValueError("Height not available. Call set_width_height first")
261
+ raise ImageError("Height not available. Call set_width_height first")
261
262
  return self._bbox.height
262
263
 
263
264
  def set_width_height(self, width: float, height: float) -> None:
@@ -281,7 +282,7 @@ class Image:
281
282
  :param bounding_box: bounding box of this image in terms of the embedding image.
282
283
  """
283
284
  if not isinstance(bounding_box, BoundingBox):
284
- raise TypeError(f"Bounding box must be of type BoundingBox, is of type {type(bounding_box)}")
285
+ raise BoundingBoxError(f"Bounding box must be of type BoundingBox, is of type {type(bounding_box)}")
285
286
  self.embeddings[image_id] = bounding_box
286
287
 
287
288
  def get_embedding(self, image_id: str) -> BoundingBox:
@@ -307,14 +308,14 @@ class Image:
307
308
  :param annotation: image annotation to store
308
309
  """
309
310
  if not isinstance(annotation, ImageAnnotation):
310
- raise TypeError(
311
+ raise AnnotationError(
311
312
  f"Annotation must be of type ImageAnnotation: "
312
313
  f"{annotation.annotation_id} but is of type {str(type(annotation))}"
313
314
  )
314
315
  if annotation._annotation_id is None: # pylint: disable=W0212
315
316
  annotation.annotation_id = self.define_annotation_id(annotation)
316
317
  if annotation.annotation_id in self._annotation_ids:
317
- raise ValueError(f"Cannot dump annotation with already taken " f"id {annotation.annotation_id}")
318
+ raise ImageError(f"Cannot dump annotation with already taken " f"id {annotation.annotation_id}")
318
319
  self._annotation_ids.append(annotation.annotation_id)
319
320
  self.annotations.append(annotation)
320
321
 
@@ -322,7 +323,10 @@ class Image:
322
323
  self,
323
324
  category_names: Optional[Union[str, ObjectTypes, Sequence[Union[str, ObjectTypes]]]] = None,
324
325
  annotation_ids: Optional[Union[str, Sequence[str]]] = None,
325
- annotation_types: Optional[Union[str, Sequence[str]]] = None,
326
+ service_id: Optional[Union[str, Sequence[str]]] = None,
327
+ model_id: Optional[Union[str, Sequence[str]]] = None,
328
+ session_ids: Optional[Union[str, Sequence[str]]] = None,
329
+ ignore_inactive: bool = True,
326
330
  ) -> List[ImageAnnotation]:
327
331
  """
328
332
  Selection of annotations from the annotation container. Filter conditions can be defined by specifying
@@ -333,47 +337,80 @@ class Image:
333
337
 
334
338
  :param category_names: A single name or list of names
335
339
  :param annotation_ids: A single id or list of ids
336
- :param annotation_types: A type name or list of type names.
340
+ :param service_id: A single service name or list of service names
341
+ :param model_id: A single model name or list of model names
342
+ :param session_ids: A single session id or list of session ids
343
+ :param ignore_inactive: If set to `True` only active annotations are returned.
344
+
337
345
  :return: A (possibly empty) list of Annotations
338
346
  """
339
347
 
340
- cat_names = [category_names] if isinstance(category_names, (ObjectTypes, str)) else category_names
341
- if cat_names is not None:
342
- cat_names = [get_type(cat_name) for cat_name in cat_names]
343
- ann_ids = [annotation_ids] if isinstance(annotation_ids, str) else annotation_ids
344
- ann_types = [annotation_types] if isinstance(annotation_types, str) else annotation_types
348
+ if category_names is not None:
349
+ category_names = (
350
+ [get_type(cat_name) for cat_name in category_names]
351
+ if isinstance(category_names, (list, set))
352
+ else [get_type(category_names)] # type:ignore
353
+ )
345
354
 
346
- anns = filter(lambda x: x.active, self.annotations)
355
+ ann_ids = [annotation_ids] if isinstance(annotation_ids, str) else annotation_ids
356
+ service_id = [service_id] if isinstance(service_id, str) else service_id
357
+ model_id = [model_id] if isinstance(model_id, str) else model_id
358
+ session_id = [session_ids] if isinstance(session_ids, str) else session_ids
347
359
 
348
- if ann_types is not None:
349
- for type_name in ann_types:
350
- anns = filter(lambda x: isinstance(x, eval(type_name)), anns) # pylint: disable=W0123, W0640
360
+ if ignore_inactive:
361
+ anns = filter(lambda x: x.active, self.annotations)
362
+ else:
363
+ anns = self.annotations # type:ignore
351
364
 
352
- if cat_names is not None:
353
- anns = filter(lambda x: x.category_name in cat_names, anns) # type:ignore
365
+ if category_names is not None:
366
+ anns = filter(lambda x: x.category_name in category_names, anns) # type:ignore
354
367
 
355
368
  if ann_ids is not None:
356
369
  anns = filter(lambda x: x.annotation_id in ann_ids, anns) # type:ignore
357
370
 
371
+ if service_id is not None:
372
+ anns = filter(lambda x: x.service_id in service_id, anns) # type:ignore
373
+
374
+ if model_id is not None:
375
+ anns = filter(lambda x: x.model_id in model_id, anns) # type:ignore
376
+
377
+ if session_id is not None:
378
+ anns = filter(lambda x: x.session_id in session_id, anns) # type:ignore
379
+
358
380
  return list(anns)
359
381
 
360
382
  def get_annotation_iter(
361
383
  self,
362
384
  category_names: Optional[Union[str, ObjectTypes, Sequence[Union[str, ObjectTypes]]]] = None,
363
385
  annotation_ids: Optional[Union[str, Sequence[str]]] = None,
364
- annotation_types: Optional[Union[str, Sequence[str]]] = None,
386
+ service_id: Optional[Union[str, Sequence[str]]] = None,
387
+ model_id: Optional[Union[str, Sequence[str]]] = None,
388
+ session_ids: Optional[Union[str, Sequence[str]]] = None,
389
+ ignore_inactive: bool = True,
365
390
  ) -> Iterable[ImageAnnotation]:
366
391
  """
367
392
  Get annotation as an iterator. Same as `get_annotation` but returns an iterator instead of a list.
368
393
 
369
394
  :param category_names: A single name or list of names
370
395
  :param annotation_ids: A single id or list of ids
371
- :param annotation_types: A type name or list of type names.
396
+ :param service_id: A single service name or list of service names
397
+ :param model_id: A single model name or list of model names
398
+ :param session_ids: A single session id or list of session ids
399
+ :param ignore_inactive: If set to `True` only active annotations are returned.
372
400
 
373
401
  :return: A (possibly empty) list of annotations
374
402
  """
375
403
 
376
- return iter(self.get_annotation(category_names, annotation_ids, annotation_types))
404
+ return iter(
405
+ self.get_annotation(
406
+ category_names=category_names,
407
+ annotation_ids=annotation_ids,
408
+ service_id=service_id,
409
+ model_id=model_id,
410
+ session_ids=session_ids,
411
+ ignore_inactive=ignore_inactive,
412
+ )
413
+ )
377
414
 
378
415
  def as_dict(self) -> Dict[str, Any]:
379
416
  """
@@ -439,7 +476,7 @@ class Image:
439
476
  new_image = Image(file_name=self.file_name, location=self.location, external_id=annotation_id)
440
477
 
441
478
  if self._bbox is None or ann.bounding_box is None:
442
- raise ValueError(f"Bounding box for image and ImageAnnotation ({annotation_id}) must be set")
479
+ raise ImageError(f"Bounding box for image and ImageAnnotation ({annotation_id}) must be set")
443
480
 
444
481
  new_bounding_box = intersection_box(self._bbox, ann.bounding_box, self.width, self.height)
445
482
  if new_bounding_box.absolute_coords:
@@ -454,7 +491,7 @@ class Image:
454
491
  if crop_image and self.image is not None:
455
492
  new_image.image = crop_box_from_image(self.image, ann.bounding_box, self.width, self.height)
456
493
  elif crop_image and self.image is None:
457
- raise ValueError("crop_image = True requires self.image to be not None")
494
+ raise ImageError("crop_image = True requires self.image to be not None")
458
495
 
459
496
  ann.image = new_image
460
497
 
@@ -472,7 +509,7 @@ class Image:
472
509
 
473
510
  ann = self.get_annotation(annotation_ids=annotation_id)[0]
474
511
  if ann.image is None:
475
- raise ValueError("When adding sub images to ImageAnnotation then ImageAnnotation.image must not be None")
512
+ raise ImageError("When adding sub images to ImageAnnotation then ImageAnnotation.image must not be None")
476
513
  assert ann.bounding_box is not None
477
514
  box = ann.bounding_box.to_list("xyxy")
478
515
  proposals = self.get_annotation(category_names)
@@ -485,7 +522,7 @@ class Image:
485
522
  sub_images = self.get_annotation(annotation_ids=selected_ids.tolist())
486
523
  for sub_image in sub_images:
487
524
  if sub_image.image is None:
488
- raise ValueError(
525
+ raise ImageError(
489
526
  "When setting an embedding to ImageAnnotation then ImageAnnotation.image must not be None"
490
527
  )
491
528
  sub_image.image.set_embedding(
@@ -26,6 +26,7 @@ from typing import Any, Dict, List, Mapping, Optional, Sequence, Set, Tuple, Typ
26
26
  import numpy as np
27
27
 
28
28
  from ..utils.detection_types import ImageType, JsonDict, Pathlike
29
+ from ..utils.error import AnnotationError, ImageError
29
30
  from ..utils.logger import LoggingRecord, logger
30
31
  from ..utils.settings import (
31
32
  CellType,
@@ -96,7 +97,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
96
97
  interactive_imshow(np_image)
97
98
  return None
98
99
  return np_image
99
- raise ValueError(f"base_page.image is None for {self.annotation_id}")
100
+ raise AnnotationError(f"base_page.image is None for {self.annotation_id}")
100
101
 
101
102
  def __getattr__(self, item: str) -> Optional[Union[str, int, List[str]]]:
102
103
  """
@@ -115,7 +116,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
115
116
  :return: value according to the logic described above
116
117
  """
117
118
  if item not in self.get_attribute_names():
118
- raise AttributeError(f"Attribute {item} is not supported for {type(self)}")
119
+ raise AnnotationError(f"Attribute {item} is not supported for {type(self)}")
119
120
  if item in self.sub_categories:
120
121
  sub_cat = self.get_sub_category(get_type(item))
121
122
  if item != sub_cat.category_name:
@@ -326,7 +327,7 @@ class Table(Layout):
326
327
  def text(self) -> str:
327
328
  try:
328
329
  return str(self)
329
- except TypeError:
330
+ except (TypeError, AnnotationError):
330
331
  return super().text
331
332
 
332
333
  @property
@@ -368,7 +369,7 @@ class Table(Layout):
368
369
  for cell in cells:
369
370
  all_words.extend(cell.get_ordered_words()) # type: ignore
370
371
  return all_words
371
- except TypeError:
372
+ except (TypeError, AnnotationError):
372
373
  return super().get_ordered_words()
373
374
 
374
375
 
@@ -452,40 +453,71 @@ class Page(Image):
452
453
  "page_number",
453
454
  }
454
455
 
455
- @no_type_check
456
- def get_annotation(
456
+ def get_annotation( # type: ignore
457
457
  self,
458
458
  category_names: Optional[Union[str, ObjectTypes, Sequence[Union[str, ObjectTypes]]]] = None,
459
459
  annotation_ids: Optional[Union[str, Sequence[str]]] = None,
460
- annotation_types: Optional[Union[str, Sequence[str]]] = None,
460
+ service_id: Optional[Union[str, Sequence[str]]] = None,
461
+ model_id: Optional[Union[str, Sequence[str]]] = None,
462
+ session_ids: Optional[Union[str, Sequence[str]]] = None,
463
+ ignore_inactive: bool = True,
461
464
  ) -> List[ImageAnnotationBaseView]:
462
465
  """
466
+ Selection of annotations from the annotation container. Filter conditions can be defined by specifying
467
+ the annotation_id or the category name. (Since only image annotations are currently allowed in the container,
468
+ annotation_type is a redundant filter condition.) Only annotations that have active = 'True' are
469
+ returned. If more than one condition is provided, only annotations will be returned that satisfy all conditions.
470
+ If no condition is provided, it will return all active annotations.
471
+
463
472
  Identical to its base class method for having correct return types. If the base class changes, please
464
473
  change this method as well.
474
+
475
+ :param category_names: A single name or list of names
476
+ :param annotation_ids: A single id or list of ids
477
+ :param service_id: A single service name or list of service names
478
+ :param model_id: A single model name or list of model names
479
+ :param session_ids: A single session id or list of session ids
480
+ :param ignore_inactive: If set to `True` only active annotations are returned.
481
+
482
+ :return: A (possibly empty) list of Annotations
465
483
  """
466
- cat_names = [category_names] if isinstance(category_names, (ObjectTypes, str)) else category_names
467
- if cat_names is not None:
468
- cat_names = [get_type(cat_name) for cat_name in cat_names]
469
- ann_ids = [annotation_ids] if isinstance(annotation_ids, str) else annotation_ids
470
- ann_types = [annotation_types] if isinstance(annotation_types, str) else annotation_types
471
484
 
472
- anns = filter(lambda x: x.active, self.annotations)
485
+ if category_names is not None:
486
+ category_names = (
487
+ [get_type(cat_name) for cat_name in category_names]
488
+ if isinstance(category_names, list)
489
+ else [get_type(category_names)] # type:ignore
490
+ )
491
+ ann_ids = [annotation_ids] if isinstance(annotation_ids, str) else annotation_ids
492
+ service_id = [service_id] if isinstance(service_id, str) else service_id
493
+ model_id = [model_id] if isinstance(model_id, str) else model_id
494
+ session_id = [session_ids] if isinstance(session_ids, str) else session_ids
473
495
 
474
- if ann_types is not None:
475
- for type_name in ann_types:
476
- anns = filter(lambda x: isinstance(x, eval(type_name)), anns) # pylint: disable=W0123, W0640
496
+ if ignore_inactive:
497
+ anns = filter(lambda x: x.active, self.annotations)
498
+ else:
499
+ anns = self.annotations # type:ignore
477
500
 
478
- if cat_names is not None:
479
- anns = filter(lambda x: x.category_name in cat_names, anns)
501
+ if category_names is not None:
502
+ anns = filter(lambda x: x.category_name in category_names, anns) # type:ignore
480
503
 
481
504
  if ann_ids is not None:
482
- anns = filter(lambda x: x.annotation_id in ann_ids, anns)
505
+ anns = filter(lambda x: x.annotation_id in ann_ids, anns) # type:ignore
506
+
507
+ if service_id is not None:
508
+ anns = filter(lambda x: x.generating_service in service_id, anns) # type:ignore
509
+
510
+ if model_id is not None:
511
+ anns = filter(lambda x: x.generating_model in model_id, anns) # type:ignore
512
+
513
+ if session_id is not None:
514
+ anns = filter(lambda x: x.session_id in session_id, anns) # type:ignore
483
515
 
484
- return list(anns)
516
+ return list(anns) # type:ignore
485
517
 
486
518
  def __getattr__(self, item: str) -> Any:
487
519
  if item not in self.get_attribute_names():
488
- raise AttributeError(f"Attribute {item} is not supported for {type(self)}")
520
+ raise ImageError(f"Attribute {item} is not supported for {type(self)}")
489
521
  if self.summary is not None:
490
522
  if item in self.summary.sub_categories:
491
523
  sub_cat = self.summary.get_sub_category(get_type(item))
@@ -629,10 +661,10 @@ class Page(Image):
629
661
  """
630
662
  ann = self.get_annotation(annotation_ids=annotation_id)[0]
631
663
  if ann.category_name not in self.floating_text_block_categories:
632
- raise ValueError(
633
- f"Annotation {annotation_id} with category_name {ann.category_name} is not a floating text "
634
- f"block category. Cannot get context. Make sure to make this category a floating text "
635
- f"block"
664
+ raise ImageError(
665
+ f"Cannot get context. Make sure to parametrize this category to a floating text: "
666
+ f"annotation_id: {annotation_id},"
667
+ f"category_name: {ann.category_name}"
636
668
  )
637
669
  block_with_order = self._order("layouts")
638
670
  position = block_with_order.index(ann)
@@ -165,4 +165,4 @@ class DatasetAdapter(IterableDataset): # type: ignore
165
165
  return len(self.df)
166
166
 
167
167
  def __getitem__(self, item: Any) -> None:
168
- raise NotImplementedError
168
+ raise NotImplementedError()
@@ -18,17 +18,19 @@
18
18
  """
19
19
  Module for the base class of datasets.
20
20
  """
21
-
21
+ import json
22
22
  import os
23
23
  import pprint
24
24
  from abc import ABC, abstractmethod
25
25
  from collections import defaultdict
26
- from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union
26
+ from inspect import signature
27
+ from pathlib import Path
28
+ from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union
27
29
 
28
30
  import numpy as np
29
31
 
30
32
  from ..dataflow import CacheData, ConcatData, CustomDataFromList, DataFlow
31
- from ..datapoint import Image
33
+ from ..datapoint.image import Image
32
34
  from ..utils.detection_types import Pathlike
33
35
  from ..utils.logger import LoggingRecord, logger
34
36
  from ..utils.settings import ObjectTypes, TypeOrStr, get_type
@@ -51,9 +53,11 @@ class DatasetBase(ABC):
51
53
  self._dataflow_builder.splits = self._dataset_info.splits
52
54
 
53
55
  if not self.dataset_available() and self.is_built_in():
54
- print(
55
- f"Dataset {self._dataset_info.name} not locally found. Please download at {self._dataset_info.url}"
56
- f" and place under {self._dataflow_builder.get_workdir()}"
56
+ logger.warning(
57
+ LoggingRecord(
58
+ f"Dataset {self._dataset_info.name} not locally found. Please download at {self._dataset_info.url}"
59
+ f" and place under {self._dataflow_builder.get_workdir()}"
60
+ )
57
61
  )
58
62
 
59
63
  @property
@@ -76,7 +80,7 @@ class DatasetBase(ABC):
76
80
  Construct the DatasetCategory object.
77
81
  """
78
82
 
79
- raise NotImplementedError
83
+ raise NotImplementedError()
80
84
 
81
85
  @classmethod
82
86
  @abstractmethod
@@ -85,7 +89,7 @@ class DatasetBase(ABC):
85
89
  Construct the DatasetInfo object.
86
90
  """
87
91
 
88
- raise NotImplementedError
92
+ raise NotImplementedError()
89
93
 
90
94
  @abstractmethod
91
95
  def _builder(self) -> DataFlowBaseBuilder:
@@ -93,7 +97,7 @@ class DatasetBase(ABC):
93
97
  Construct the DataFlowBaseBuilder object. It needs to be implemented in the derived class.
94
98
  """
95
99
 
96
- raise NotImplementedError
100
+ raise NotImplementedError()
97
101
 
98
102
  def dataset_available(self) -> bool:
99
103
  """
@@ -114,7 +118,7 @@ class DatasetBase(ABC):
114
118
 
115
119
  class _BuiltInDataset(DatasetBase, ABC):
116
120
  """
117
- Dataclass for built-in dataset. Do not use this it
121
+ Dataclass for built-in dataset. Do not use this
118
122
  """
119
123
 
120
124
  _name: Optional[str] = None
@@ -427,6 +431,11 @@ class CustomDataset(DatasetBase):
427
431
  else:
428
432
  self.init_sub_categories = init_sub_categories
429
433
  self.annotation_files = annotation_files
434
+ if signature(dataflow_builder.__init__).parameters.keys() != {"self", "location", "annotation_files"}:
435
+ raise TypeError(
436
+ "Dataflow builder must have the signature `def __init__(self, location: Pathlike, "
437
+ "annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None):`"
438
+ )
430
439
  self.dataflow_builder = dataflow_builder(self.location, self.annotation_files)
431
440
  super().__init__()
432
441
 
@@ -438,3 +447,67 @@ class CustomDataset(DatasetBase):
438
447
 
439
448
  def _builder(self) -> DataFlowBaseBuilder:
440
449
  return self.dataflow_builder
450
+
451
+ @staticmethod
452
+ def from_dataset_card(file_path: str, dataflow_builder: Type[DataFlowBaseBuilder]) -> "CustomDataset":
453
+ """
454
+ This static method creates a CustomDataset instance from a dataset card.
455
+
456
+ A dataset card is a JSON file that contains metadata about the dataset such as its name, type, location,
457
+ initial categories, initial sub categories, and annotation files. The dataflow_builder parameter is a class
458
+ that inherits from DataFlowBaseBuilder and is used to build the dataflow for the dataset.
459
+
460
+ :param file_path: The path to the dataset card (JSON file).
461
+ :param dataflow_builder: The class used to build the dataflow for the dataset.
462
+ :return: A CustomDataset instance created from the dataset card.
463
+ """
464
+
465
+ with open(file_path, "r", encoding="UTF-8") as file:
466
+ meta_data = json.load(file)
467
+ meta_data["dataset_type"] = get_type(meta_data["dataset_type"])
468
+ meta_data["location"] = Path(meta_data["location"])
469
+ meta_data["init_categories"] = [get_type(cat) for cat in meta_data["init_categories"]]
470
+ meta_data["init_sub_categories"] = (
471
+ {
472
+ get_type(cat): {
473
+ get_type(sub_cat_key): [get_type(sub_cat_value) for sub_cat_value in sub_cat_values]
474
+ for sub_cat_key, sub_cat_values in sub_cats.items()
475
+ }
476
+ for cat, sub_cats in meta_data["init_sub_categories"].items()
477
+ }
478
+ if meta_data["init_sub_categories"] is not None
479
+ else None
480
+ )
481
+ return CustomDataset(**meta_data, dataflow_builder=dataflow_builder)
482
+
483
+ def as_dict(self) -> Mapping[str, Any]:
484
+ """
485
+ Return the meta-data of the dataset as a dictionary.
486
+
487
+ :return: A dictionary containing the meta-data of the dataset.
488
+ """
489
+ return {
490
+ "name": self.name,
491
+ "dataset_type": self.type,
492
+ "location": str(self.location),
493
+ "annotation_files": self.annotation_files,
494
+ "init_categories": [cat.value for cat in self.init_categories],
495
+ "init_sub_categories": {
496
+ cat.value: {
497
+ sub_cat_key.value: [sub_cat_value.value for sub_cat_value in sub_cat_values]
498
+ for sub_cat_key, sub_cat_values in sub_cats.items()
499
+ }
500
+ for cat, sub_cats in self.init_sub_categories.items()
501
+ }
502
+ if self.init_sub_categories is not None
503
+ else None,
504
+ }
505
+
506
+ def save_dataset_card(self, file_path: str) -> None:
507
+ """
508
+ Save the dataset card to a JSON file.
509
+
510
+ :param file_path: file_path
511
+ """
512
+ with open(file_path, "w", encoding="UTF-8") as file:
513
+ json.dump(self.as_dict(), file, indent=4)
@@ -110,7 +110,7 @@ class DataFlowBaseBuilder(ABC):
110
110
  :param kwargs: A custom set of arguments/values
111
111
  :return: dataflow
112
112
  """
113
- raise NotImplementedError
113
+ raise NotImplementedError()
114
114
 
115
115
  def get_annotation_file(self, split: str) -> str:
116
116
  """Get single annotation file."""
@@ -306,7 +306,7 @@ class DatasetCategories:
306
306
 
307
307
  _cat_to_sub_cat = {get_type(key): get_type(value) for key, value in cat_to_sub_cat.items()}
308
308
  if not self._allow_update:
309
- raise PermissionError("Replacing categories with sub categories is not allowed")
309
+ raise RuntimeWarning("Replacing categories with sub categories is not allowed")
310
310
  self._categories_update = self.init_categories
311
311
  categories = self.get_categories(name_as_key=True)
312
312
  cats_or_sub_cats = [
@@ -332,7 +332,7 @@ class DatasetCategories:
332
332
  """
333
333
 
334
334
  if not self._allow_update:
335
- raise PermissionError("Filtering categories is not allowed")
335
+ raise RuntimeWarning("Filtering categories is not allowed")
336
336
  if isinstance(categories, (ObjectTypes, str)):
337
337
  categories = [get_type(categories)]
338
338
  else:
@@ -49,12 +49,7 @@ _LICENSE = (
49
49
  " – Permissive – Version 1.0 License. Dr. Janis Meyer does not own the copyright of the images. \n"
50
50
  " Use of the images must abide by the PMC Open Access Subset Terms of Use."
51
51
  )
52
- _URL = [
53
- "https://www.googleapis.com/drive/v3/files/1ZD4Ef4gd2FIfp7vR8jbnrZeXD3gSWNqE?alt"
54
- "=media&key=AIzaSyDuoPG6naK-kRJikScR7cP_1sQBF1r3fWU",
55
- "https://www.googleapis.com/drive/v3/files/18HD62LFLa1iAmqffo4SyjuEQ32MzyNQ0?alt"
56
- "=media&key=AIzaSyDuoPG6naK-kRJikScR7cP_1sQBF1r3fWU",
57
- ]
52
+
58
53
  _SPLITS: Mapping[str, str] = {"test": "test", "predict": "predict"}
59
54
  _TYPE = DatasetType.object_detection
60
55
  _LOCATION = "testlayout"
@@ -77,7 +72,7 @@ class LayoutTest(_BuiltInDataset):
77
72
 
78
73
  @classmethod
79
74
  def _info(cls) -> DatasetInfo:
80
- return DatasetInfo(name=_NAME, description=_DESCRIPTION, license=_LICENSE, url=_URL, splits=_SPLITS, type=_TYPE)
75
+ return DatasetInfo(name=_NAME, description=_DESCRIPTION, license=_LICENSE, splits=_SPLITS, type=_TYPE)
81
76
 
82
77
  def _categories(self) -> DatasetCategories:
83
78
  return DatasetCategories(init_categories=_INIT_CATEGORIES)
@@ -87,7 +87,7 @@ def accuracy(label_gt: Sequence[int], label_predictions: Sequence[int], masks: O
87
87
  np_label_gt, np_label_pr = np.asarray(label_gt), np.asarray(label_predictions)
88
88
  if len(np_label_gt) != len(np_label_pr):
89
89
  raise ValueError(
90
- f"length of label_gt ({len(np_label_gt)}) and label_predictions" f" ({len(np_label_pr)}) must be equal"
90
+ f"length label_gt: {len(np_label_gt)}, length label_predictions: ({len(np_label_pr)}) but must be equal"
91
91
  )
92
92
  if masks is not None:
93
93
  np_label_gt, np_label_pr = _mask_some_gt_and_pr_labels(np_label_gt, np_label_pr, masks)
@@ -25,6 +25,7 @@ from typing import Any, Callable, List, Optional, Tuple
25
25
  from ..dataflow import DataFlow
26
26
  from ..datasets.info import DatasetCategories
27
27
  from ..utils.detection_types import JsonDict
28
+ from ..utils.error import DependencyError
28
29
  from ..utils.file_utils import Requirement
29
30
 
30
31
 
@@ -52,7 +53,7 @@ class MetricBase(ABC):
52
53
  requirements = cls.get_requirements()
53
54
  name = cls.__name__ if hasattr(cls, "__name__") else cls.__class__.__name__
54
55
  if not all(requirement[1] for requirement in requirements):
55
- raise ImportError(
56
+ raise DependencyError(
56
57
  "\n".join(
57
58
  [f"{name} has the following dependencies:"]
58
59
  + [requirement[2] for requirement in requirements if not requirement[1]]
@@ -66,7 +67,7 @@ class MetricBase(ABC):
66
67
  """
67
68
  Get a list of requirements for running the detector
68
69
  """
69
- raise NotImplementedError
70
+ raise NotImplementedError()
70
71
 
71
72
  @classmethod
72
73
  @abstractmethod
@@ -80,7 +81,7 @@ class MetricBase(ABC):
80
81
  :param dataflow_predictions: Dataflow with predictions.
81
82
  :param categories: DatasetCategories with respect to the underlying dataset.
82
83
  """
83
- raise NotImplementedError
84
+ raise NotImplementedError()
84
85
 
85
86
  @classmethod
86
87
  @abstractmethod
@@ -95,7 +96,7 @@ class MetricBase(ABC):
95
96
  :param dataflow_predictions: Dataflow with predictions.
96
97
  :param categories: DatasetCategories with respect to the underlying dataset.
97
98
  """
98
- raise NotImplementedError
99
+ raise NotImplementedError()
99
100
 
100
101
  @classmethod
101
102
  def result_list_to_dict(cls, results: List[JsonDict]) -> JsonDict:
@@ -171,7 +171,7 @@ class Evaluator:
171
171
  "metric has no attribute sub_cats and cannot be used for token classification datasets"
172
172
  )
173
173
  else:
174
- raise NotImplementedError
174
+ raise NotImplementedError()
175
175
 
176
176
  else:
177
177
  self.wandb_table_agent = None
@@ -271,7 +271,7 @@ class Evaluator:
271
271
  sub_cats_to_remove = meta_anns["sub_categories"]
272
272
  df_pr = MapData(df_pr, remove_cats(sub_categories=sub_cats_to_remove))
273
273
  else:
274
- raise NotImplementedError
274
+ raise NotImplementedError()
275
275
 
276
276
  return df_pr
277
277