deepdoctection 0.30-py3-none-any.whl → 0.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (120)
  1. deepdoctection/__init__.py +38 -29
  2. deepdoctection/analyzer/dd.py +36 -29
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/base.py +0 -19
  5. deepdoctection/dataflow/custom.py +4 -3
  6. deepdoctection/dataflow/custom_serialize.py +14 -5
  7. deepdoctection/dataflow/parallel_map.py +12 -11
  8. deepdoctection/dataflow/serialize.py +5 -4
  9. deepdoctection/datapoint/annotation.py +35 -13
  10. deepdoctection/datapoint/box.py +3 -5
  11. deepdoctection/datapoint/convert.py +3 -1
  12. deepdoctection/datapoint/image.py +79 -36
  13. deepdoctection/datapoint/view.py +152 -49
  14. deepdoctection/datasets/__init__.py +1 -4
  15. deepdoctection/datasets/adapter.py +6 -3
  16. deepdoctection/datasets/base.py +86 -11
  17. deepdoctection/datasets/dataflow_builder.py +1 -1
  18. deepdoctection/datasets/info.py +4 -4
  19. deepdoctection/datasets/instances/doclaynet.py +3 -2
  20. deepdoctection/datasets/instances/fintabnet.py +2 -1
  21. deepdoctection/datasets/instances/funsd.py +2 -1
  22. deepdoctection/datasets/instances/iiitar13k.py +5 -2
  23. deepdoctection/datasets/instances/layouttest.py +4 -8
  24. deepdoctection/datasets/instances/publaynet.py +2 -2
  25. deepdoctection/datasets/instances/pubtables1m.py +6 -3
  26. deepdoctection/datasets/instances/pubtabnet.py +2 -1
  27. deepdoctection/datasets/instances/rvlcdip.py +2 -1
  28. deepdoctection/datasets/instances/xfund.py +2 -1
  29. deepdoctection/eval/__init__.py +1 -4
  30. deepdoctection/eval/accmetric.py +1 -1
  31. deepdoctection/eval/base.py +5 -4
  32. deepdoctection/eval/cocometric.py +2 -1
  33. deepdoctection/eval/eval.py +19 -15
  34. deepdoctection/eval/tedsmetric.py +14 -11
  35. deepdoctection/eval/tp_eval_callback.py +14 -7
  36. deepdoctection/extern/__init__.py +2 -7
  37. deepdoctection/extern/base.py +39 -13
  38. deepdoctection/extern/d2detect.py +182 -90
  39. deepdoctection/extern/deskew.py +36 -9
  40. deepdoctection/extern/doctrocr.py +265 -83
  41. deepdoctection/extern/fastlang.py +49 -9
  42. deepdoctection/extern/hfdetr.py +106 -55
  43. deepdoctection/extern/hflayoutlm.py +441 -122
  44. deepdoctection/extern/hflm.py +225 -0
  45. deepdoctection/extern/model.py +56 -47
  46. deepdoctection/extern/pdftext.py +10 -5
  47. deepdoctection/extern/pt/__init__.py +1 -3
  48. deepdoctection/extern/pt/nms.py +6 -2
  49. deepdoctection/extern/pt/ptutils.py +27 -18
  50. deepdoctection/extern/tessocr.py +134 -22
  51. deepdoctection/extern/texocr.py +6 -2
  52. deepdoctection/extern/tp/tfutils.py +43 -9
  53. deepdoctection/extern/tp/tpcompat.py +14 -11
  54. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  55. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  56. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  58. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  60. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  61. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  62. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  67. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  68. deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
  69. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  70. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  71. deepdoctection/extern/tpdetect.py +54 -30
  72. deepdoctection/mapper/__init__.py +3 -8
  73. deepdoctection/mapper/d2struct.py +9 -7
  74. deepdoctection/mapper/hfstruct.py +7 -2
  75. deepdoctection/mapper/laylmstruct.py +164 -21
  76. deepdoctection/mapper/maputils.py +16 -3
  77. deepdoctection/mapper/misc.py +6 -3
  78. deepdoctection/mapper/prodigystruct.py +1 -1
  79. deepdoctection/mapper/pubstruct.py +10 -10
  80. deepdoctection/mapper/tpstruct.py +3 -3
  81. deepdoctection/pipe/__init__.py +1 -1
  82. deepdoctection/pipe/anngen.py +35 -8
  83. deepdoctection/pipe/base.py +53 -19
  84. deepdoctection/pipe/common.py +23 -13
  85. deepdoctection/pipe/concurrency.py +2 -1
  86. deepdoctection/pipe/doctectionpipe.py +2 -2
  87. deepdoctection/pipe/language.py +3 -2
  88. deepdoctection/pipe/layout.py +6 -3
  89. deepdoctection/pipe/lm.py +34 -66
  90. deepdoctection/pipe/order.py +142 -35
  91. deepdoctection/pipe/refine.py +26 -24
  92. deepdoctection/pipe/segment.py +21 -16
  93. deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
  94. deepdoctection/pipe/text.py +14 -8
  95. deepdoctection/pipe/transform.py +16 -9
  96. deepdoctection/train/__init__.py +6 -12
  97. deepdoctection/train/d2_frcnn_train.py +36 -28
  98. deepdoctection/train/hf_detr_train.py +26 -17
  99. deepdoctection/train/hf_layoutlm_train.py +133 -111
  100. deepdoctection/train/tp_frcnn_train.py +21 -19
  101. deepdoctection/utils/__init__.py +3 -0
  102. deepdoctection/utils/concurrency.py +1 -1
  103. deepdoctection/utils/context.py +2 -2
  104. deepdoctection/utils/env_info.py +41 -84
  105. deepdoctection/utils/error.py +84 -0
  106. deepdoctection/utils/file_utils.py +4 -15
  107. deepdoctection/utils/fs.py +7 -7
  108. deepdoctection/utils/logger.py +1 -0
  109. deepdoctection/utils/mocks.py +93 -0
  110. deepdoctection/utils/pdf_utils.py +5 -4
  111. deepdoctection/utils/settings.py +6 -1
  112. deepdoctection/utils/transform.py +1 -1
  113. deepdoctection/utils/utils.py +0 -6
  114. deepdoctection/utils/viz.py +48 -5
  115. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
  116. deepdoctection-0.32.dist-info/RECORD +146 -0
  117. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
  118. deepdoctection-0.30.dist-info/RECORD +0 -143
  119. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
  120. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0

deepdoctection/datapoint/view.py

@@ -19,6 +19,7 @@
 Subclasses for ImageAnnotation and Image objects with various properties. These classes
 simplify consumption
 """
+from __future__ import annotations

 from copy import copy
 from typing import Any, Dict, List, Mapping, Optional, Sequence, Set, Tuple, Type, Union, no_type_check
@@ -26,6 +27,7 @@ from typing import Any, Dict, List, Mapping, Optional, Sequence, Set, Tuple, Type, Union, no_type_check
 import numpy as np

 from ..utils.detection_types import ImageType, JsonDict, Pathlike
+from ..utils.error import AnnotationError, ImageError
 from ..utils.logger import LoggingRecord, logger
 from ..utils.settings import (
     CellType,
@@ -63,7 +65,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
         base_page: `Page` class instantiated by the lowest hierarchy `Image`
     """

-    base_page: "Page"
+    base_page: Page

     @property
     def bbox(self) -> List[float]:
@@ -96,7 +98,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
                 interactive_imshow(np_image)
                 return None
             return np_image
-        raise ValueError(f"base_page.image is None for {self.annotation_id}")
+        raise AnnotationError(f"base_page.image is None for {self.annotation_id}")

     def __getattr__(self, item: str) -> Optional[Union[str, int, List[str]]]:
         """
@@ -115,7 +117,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
         :return: value according to the logic described above
         """
         if item not in self.get_attribute_names():
-            raise AttributeError(f"Attribute {item} is not supported for {type(self)}")
+            raise AnnotationError(f"Attribute {item} is not supported for {type(self)}")
         if item in self.sub_categories:
             sub_cat = self.get_sub_category(get_type(item))
             if item != sub_cat.category_name:
@@ -147,7 +149,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
         return attribute_names

     @classmethod
-    def from_dict(cls, **kwargs: JsonDict) -> "ImageAnnotationBaseView":
+    def from_dict(cls, **kwargs: JsonDict) -> ImageAnnotationBaseView:
         """
         Identical to its base class method for having correct return types. If the base class changes, please
         change this method as well.
@@ -204,15 +206,38 @@ class Layout(ImageAnnotationBaseView):
         return words_with_reading_order

     @property
-    def text_(self) -> Dict[str, Union[str, List[str]]]:
+    def text_(self) -> JsonDict:
         """Returns a dict `{"text": text string,
         "text_list": list of single words,
         "annotation_ids": word annotation ids`"""
         words = self.get_ordered_words()
+        characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
+            *[
+                (
+                    word.characters,
+                    word.annotation_id,
+                    word.token_class,
+                    word.token_tag,
+                    (
+                        word.get_sub_category(WordType.token_class).category_id
+                        if WordType.token_class in word.sub_categories
+                        else None
+                    ),
+                    (word.get_sub_category(WordType.token_tag).category_id)
+                    if WordType.token_tag in word.sub_categories
+                    else None,
+                )
+                for word in words
+            ]
+        )
         return {
-            "text": " ".join([word.characters for word in words]),  # type: ignore
-            "text_list": [word.characters for word in words],  # type: ignore
-            "annotation_ids": [word.annotation_id for word in words],
+            "text": " ".join(characters),
+            "words": characters,
+            "ann_ids": ann_ids,
+            "token_classes": token_classes,
+            "token_tags": token_tags,
+            "token_class_ids": token_classes_ids,
+            "token_tag_ids": token_tag_ids,
         }

     def get_attribute_names(self) -> Set[str]:
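
Migration note on the `text_` hunk above: the payload keys `text_list` and `annotation_ids` are renamed to `words` and `ann_ids`, and four token-level keys are added, so 0.30-era consumers of `text_` will break. A minimal sketch of reading the new payload, assuming the default analyzer and a placeholder input path:

import deepdoctection as dd

analyzer = dd.get_dd_analyzer()
df = analyzer.analyze(path="sample.pdf")  # placeholder path
df.reset_state()

for page in df:
    for layout in page.layouts:
        payload = layout.text_
        # 0.32 keys: text, words, ann_ids, token_classes, token_tags,
        # token_class_ids, token_tag_ids
        for word, ann_id in zip(payload["words"], payload["ann_ids"]):
            print(word, ann_id)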

@@ -326,23 +351,37 @@ class Table(Layout):
     def text(self) -> str:
         try:
             return str(self)
-        except TypeError:
+        except (TypeError, AnnotationError):
             return super().text

     @property
-    def text_(self) -> Dict[str, Union[str, List[str]]]:
+    def text_(self) -> JsonDict:
         cells = self.cells
         if not cells:
             return super().text_
-        text_list: List[str] = []
-        annotation_id_list: List[str] = []
+        text: List[str] = []
+        words: List[str] = []
+        ann_ids: List[str] = []
+        token_classes: List[str] = []
+        token_tags: List[str] = []
+        token_class_ids: List[str] = []
+        token_tag_ids: List[str] = []
         for cell in cells:
-            text_list.extend(cell.text_["text_list"])  # type: ignore
-            annotation_id_list.extend(cell.text_["annotation_ids"])  # type: ignore
+            text.extend(cell.text_["text"])  # type: ignore
+            words.extend(cell.text_["words"])  # type: ignore
+            ann_ids.extend(cell.text_["ann_ids"])  # type: ignore
+            token_classes.extend(cell.text_["token_classes"])  # type: ignore
+            token_tags.extend(cell.text_["token_tags"])  # type: ignore
+            token_class_ids.extend(cell.text_["token_class_ids"])  # type: ignore
+            token_tag_ids.extend(cell.text_["token_tag_ids"])  # type: ignore
         return {
-            "text": " ".join([cell.text for cell in cells]),  # type: ignore
-            "text_list": text_list,
-            "annotation_ids": annotation_id_list,
+            "text": " ".join(text),
+            "words": words,
+            "ann_ids": ann_ids,
+            "token_classes": token_classes,
+            "token_tags": token_tags,
+            "token_class_ids": token_class_ids,
+            "token_tag_ids": token_tag_ids,
         }

     @property
@@ -368,7 +407,7 @@ class Table(Layout):
             for cell in cells:
                 all_words.extend(cell.get_ordered_words())  # type: ignore
             return all_words
-        except TypeError:
+        except (TypeError, AnnotationError):
             return super().get_ordered_words()


@@ -451,41 +490,73 @@ class Page(Image):
         "document_id",
         "page_number",
     }
+    include_residual_text_container: bool = True

-    @no_type_check
-    def get_annotation(
+    def get_annotation(  # type: ignore
         self,
         category_names: Optional[Union[str, ObjectTypes, Sequence[Union[str, ObjectTypes]]]] = None,
         annotation_ids: Optional[Union[str, Sequence[str]]] = None,
-        annotation_types: Optional[Union[str, Sequence[str]]] = None,
+        service_id: Optional[Union[str, Sequence[str]]] = None,
+        model_id: Optional[Union[str, Sequence[str]]] = None,
+        session_ids: Optional[Union[str, Sequence[str]]] = None,
+        ignore_inactive: bool = True,
     ) -> List[ImageAnnotationBaseView]:
         """
+        Selection of annotations from the annotation container. Filter conditions can be defined by specifying
+        the annotation_id or the category name. (Since only image annotations are currently allowed in the container,
+        annotation_type is a redundant filter condition.) Only annotations that have active = 'True' are
+        returned. If more than one condition is provided, only annotations will be returned that satisfy all conditions.
+        If no condition is provided, it will return all active annotations.
+
         Identical to its base class method for having correct return types. If the base class changes, please
         change this method as well.
+
+        :param category_names: A single name or list of names
+        :param annotation_ids: A single id or list of ids
+        :param service_id: A single service name or list of service names
+        :param model_id: A single model name or list of model names
+        :param session_ids: A single session id or list of session ids
+        :param ignore_inactive: If set to `True` only active annotations are returned.
+
+        :return: A (possibly empty) list of Annotations
         """
-        cat_names = [category_names] if isinstance(category_names, (ObjectTypes, str)) else category_names
-        if cat_names is not None:
-            cat_names = [get_type(cat_name) for cat_name in cat_names]
-        ann_ids = [annotation_ids] if isinstance(annotation_ids, str) else annotation_ids
-        ann_types = [annotation_types] if isinstance(annotation_types, str) else annotation_types

-        anns = filter(lambda x: x.active, self.annotations)
+        if category_names is not None:
+            category_names = (
+                [get_type(cat_name) for cat_name in category_names]
+                if isinstance(category_names, list)
+                else [get_type(category_names)]  # type:ignore
+            )
+        ann_ids = [annotation_ids] if isinstance(annotation_ids, str) else annotation_ids
+        service_id = [service_id] if isinstance(service_id, str) else service_id
+        model_id = [model_id] if isinstance(model_id, str) else model_id
+        session_id = [session_ids] if isinstance(session_ids, str) else session_ids

-        if ann_types is not None:
-            for type_name in ann_types:
-                anns = filter(lambda x: isinstance(x, eval(type_name)), anns)  # pylint: disable=W0123, W0640
+        if ignore_inactive:
+            anns = filter(lambda x: x.active, self.annotations)
+        else:
+            anns = self.annotations  # type:ignore

-        if cat_names is not None:
-            anns = filter(lambda x: x.category_name in cat_names, anns)
+        if category_names is not None:
+            anns = filter(lambda x: x.category_name in category_names, anns)  # type:ignore

         if ann_ids is not None:
-            anns = filter(lambda x: x.annotation_id in ann_ids, anns)
+            anns = filter(lambda x: x.annotation_id in ann_ids, anns)  # type:ignore

-        return list(anns)
+        if service_id is not None:
+            anns = filter(lambda x: x.generating_service in service_id, anns)  # type:ignore
+
+        if model_id is not None:
+            anns = filter(lambda x: x.generating_model in model_id, anns)  # type:ignore
+
+        if session_id is not None:
+            anns = filter(lambda x: x.session_id in session_id, anns)  # type:ignore
+
+        return list(anns)  # type:ignore

     def __getattr__(self, item: str) -> Any:
         if item not in self.get_attribute_names():
-            raise AttributeError(f"Attribute {item} is not supported for {type(self)}")
+            raise ImageError(f"Attribute {item} is not supported for {type(self)}")
         if self.summary is not None:
             if item in self.summary.sub_categories:
                 sub_cat = self.summary.get_sub_category(get_type(item))
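
Migration note: the `annotation_types` parameter (whose implementation relied on `eval`) is gone; `service_id`, `model_id`, `session_ids` and `ignore_inactive` replace it, and the new filters assume annotations carry the corresponding provenance attributes. A hedged sketch against the new signature, with placeholder ids and path:

from deepdoctection import Page

page = Page.from_file("page.json")  # placeholder path

tables = page.get_annotation(category_names="table")  # unchanged usage

# 0.32: filter by the pipeline component that generated the annotation,
# including annotations that have been deactivated
anns = page.get_annotation(service_id="some-service-id", ignore_inactive=False)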

@@ -524,8 +595,8 @@ class Page(Image):
         text_container: Optional[ObjectTypes] = None,
         floating_text_block_categories: Optional[Sequence[ObjectTypes]] = None,
         include_residual_text_container: bool = True,
-        base_page: Optional["Page"] = None,
-    ) -> "Page":
+        base_page: Optional[Page] = None,
+    ) -> Page:
         """
         Factory function for generating a `Page` instance from `image_orig`.

@@ -583,6 +654,7 @@
         page.summary = SummaryAnnotation.from_dict(**summary_dict)
         page.floating_text_block_categories = floating_text_block_categories  # type: ignore
         page.text_container = text_container  # type: ignore
+        page.include_residual_text_container = include_residual_text_container
         return page

     def _order(self, block: str) -> List[ImageAnnotationBaseView]:
@@ -596,7 +668,7 @@
         break_str = "\n" if line_break else " "
         for block in block_with_order:
             text += f"{block.text}{break_str}"
-        return text
+        return text[:-1]

     @property
     def text(self) -> str:
@@ -606,17 +678,35 @@
         return self._make_text()

     @property
-    def text_(self) -> Dict[str, Union[str, List[str]]]:
+    def text_(self) -> JsonDict:
         """Returns a dict `{"text": text string,
         "text_list": list of single words,
         "annotation_ids": word annotation ids`"""
         block_with_order = self._order("layouts")
-        text_list: List[str] = []
-        annotation_id_list: List[str] = []
+        text: List[str] = []
+        words: List[str] = []
+        ann_ids: List[str] = []
+        token_classes: List[str] = []
+        token_tags: List[str] = []
+        token_class_ids: List[str] = []
+        token_tag_ids: List[str] = []
         for block in block_with_order:
-            text_list.extend(block.text_["text_list"])  # type: ignore
-            annotation_id_list.extend(block.text_["annotation_ids"])  # type: ignore
-        return {"text": self.text, "text_list": text_list, "annotation_ids": annotation_id_list}
+            text.append(block.text_["text"])  # type: ignore
+            words.extend(block.text_["words"])  # type: ignore
+            ann_ids.extend(block.text_["ann_ids"])  # type: ignore
+            token_classes.extend(block.text_["token_classes"])  # type: ignore
+            token_tags.extend(block.text_["token_tags"])  # type: ignore
+            token_class_ids.extend(block.text_["token_class_ids"])  # type: ignore
+            token_tag_ids.extend(block.text_["token_tag_ids"])  # type: ignore
+        return {
+            "text": " ".join(text),
+            "words": words,
+            "ann_ids": ann_ids,
+            "token_classes": token_classes,
+            "token_tags": token_tags,
+            "token_class_ids": token_class_ids,
+            "token_tag_ids": token_tag_ids,
+        }

     def get_layout_context(self, annotation_id: str, context_size: int = 3) -> List[ImageAnnotationBaseView]:
         """For a given `annotation_id` get a list of `ImageAnnotation` that are nearby in terms of reading order.
@@ -629,10 +719,10 @@
         """
         ann = self.get_annotation(annotation_ids=annotation_id)[0]
         if ann.category_name not in self.floating_text_block_categories:
-            raise ValueError(
-                f"Annotation {annotation_id} with category_name {ann.category_name} is not a floating text "
-                f"block category. Cannot get context. Make sure to make this category a floating text "
-                f"block"
+            raise ImageError(
+                f"Cannot get context. Make sure to parametrize this category to a floating text: "
+                f"annotation_id: {annotation_id},"
+                f"category_name: {ann.category_name}"
             )
         block_with_order = self._order("layouts")
         position = block_with_order.index(ann)
@@ -727,6 +817,11 @@
         box_stack = []
         cells_found = False

+        if self.image is None and interactive:
+            logger.warning(
+                LoggingRecord("No image provided. Cannot display image in interactive mode", {"page_id": self.image_id})
+            )
+
         if debug_kwargs:
             anns = self.get_annotation(category_names=list(debug_kwargs.keys()))
             for ann in anns:
@@ -874,7 +969,7 @@
         text_container: Optional[ObjectTypes] = None,
         floating_text_block_categories: Optional[List[ObjectTypes]] = None,
         include_residual_text_container: bool = True,
-    ) -> "Page":
+    ) -> Page:
         """Reading JSON file and building a `Page` object with given config.
         :param file_path: Path to file
         :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
@@ -897,3 +992,11 @@
             for word in all_words
             if word.token_tag not in (TokenClasses.other, None)
         ]
+
+    def __copy__(self) -> Page:
+        return self.__class__.from_image(
+            self.image_orig,
+            self.text_container,
+            self.floating_text_block_categories,
+            self.include_residual_text_container,
+        )
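
The new `__copy__` makes `copy.copy` on a `Page` rebuild the view from the underlying `image_orig` with the same text container, floating text blocks and residual-container setting, rather than performing a plain shallow copy. A sketch, with a placeholder path:

from copy import copy

from deepdoctection import Page

page = Page.from_file("page.json")  # placeholder path
page_copy = copy(page)
# the copy is re-derived from page.image_orig, so both views share the image_id
assert page_copy.image_id == page.image_id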

deepdoctection/datasets/__init__.py

@@ -26,13 +26,10 @@ Create an info card, a DataFlowBaseBuilder derived instance, possibly a category
 DatasetBase derived instance to create a data set.
 """

-from ..utils.file_utils import pytorch_available
+from .adapter import *
 from .base import *
 from .dataflow_builder import DataFlowBaseBuilder
 from .info import *
 from .instances import *
 from .registry import *
 from .save import *
-
-if pytorch_available():
-    from .adapter import *

deepdoctection/datasets/adapter.py

@@ -22,19 +22,22 @@ Module for wrapping datasets into a pytorch dataset framework.

 from typing import Any, Callable, Iterator, Mapping, Optional, Union

+from lazy_imports import try_import
+
 from ..dataflow import CacheData, CustomDataFromList, MapData, RepeatedData
 from ..datapoint.image import Image
 from ..datasets.base import DatasetBase
 from ..mapper.maputils import LabelSummarizer
 from ..utils.detection_types import DP, JsonDict
-from ..utils.file_utils import pytorch_available
 from ..utils.logger import LoggingRecord, log_once, logger
 from ..utils.settings import DatasetType, LayoutType, ObjectTypes, PageType, WordType
 from ..utils.tqdm import get_tqdm
 from .registry import get_dataset

-if pytorch_available():
+with try_import() as import_guard:
     from torch.utils.data import IterableDataset
+if not import_guard.is_successful():
+    from ..utils.mocks import IterableDataset  # type: ignore


 class DatasetAdapter(IterableDataset):  # type: ignore
@@ -165,4 +168,4 @@ class DatasetAdapter(IterableDataset):  # type: ignore
         return len(self.df)

     def __getitem__(self, item: Any) -> None:
-        raise NotImplementedError
+        raise NotImplementedError()
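
The eager `pytorch_available()` guard gives way to the `try_import` context manager from the `lazy-imports` package, with a stand-in class from the new `utils/mocks.py` when torch is absent. The same pattern in isolation, with a locally defined mock instead of the package's own:

from lazy_imports import try_import

with try_import() as import_guard:
    from torch.utils.data import IterableDataset

if not import_guard.is_successful():
    # torch is not installed: define a stand-in so subclassing still works
    class IterableDataset:  # minimal local mock
        pass


class DatasetAdapter(IterableDataset):
    pass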

deepdoctection/datasets/base.py

@@ -18,20 +18,24 @@
 """
 Module for the base class of datasets.
 """
+from __future__ import annotations

+import json
 import os
 import pprint
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union
+from inspect import signature
+from pathlib import Path
+from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union

 import numpy as np

 from ..dataflow import CacheData, ConcatData, CustomDataFromList, DataFlow
-from ..datapoint import Image
+from ..datapoint.image import Image
 from ..utils.detection_types import Pathlike
 from ..utils.logger import LoggingRecord, logger
-from ..utils.settings import ObjectTypes, TypeOrStr, get_type
+from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
 from .dataflow_builder import DataFlowBaseBuilder
 from .info import DatasetCategories, DatasetInfo, get_merged_categories

@@ -51,9 +55,11 @@ class DatasetBase(ABC):
             self._dataflow_builder.splits = self._dataset_info.splits

         if not self.dataset_available() and self.is_built_in():
-            print(
-                f"Dataset {self._dataset_info.name} not locally found. Please download at {self._dataset_info.url}"
-                f" and place under {self._dataflow_builder.get_workdir()}"
+            logger.warning(
+                LoggingRecord(
+                    f"Dataset {self._dataset_info.name} not locally found. Please download at {self._dataset_info.url}"
+                    f" and place under {self._dataflow_builder.get_workdir()}"
+                )
             )

     @property
@@ -76,7 +82,7 @@
         Construct the DatasetCategory object.
         """

-        raise NotImplementedError
+        raise NotImplementedError()

     @classmethod
     @abstractmethod
@@ -85,7 +91,7 @@
         Construct the DatasetInfo object.
         """

-        raise NotImplementedError
+        raise NotImplementedError()

     @abstractmethod
     def _builder(self) -> DataFlowBaseBuilder:
@@ -93,7 +99,7 @@
         Construct the DataFlowBaseBuilder object. It needs to be implemented in the derived class.
         """

-        raise NotImplementedError
+        raise NotImplementedError()

     def dataset_available(self) -> bool:
         """
@@ -114,7 +120,7 @@

 class _BuiltInDataset(DatasetBase, ABC):
     """
-    Dataclass for built-in dataset. Do not use this it
+    Dataclass for built-in dataset. Do not use this
     """

     _name: Optional[str] = None
@@ -419,7 +425,7 @@ class CustomDataset(DatasetBase):
         """

         self.name = name
-        self.type = get_type(dataset_type)
+        self.type: DatasetType = get_type(dataset_type)  # type: ignore
         self.location = location
         self.init_categories = init_categories
         if init_sub_categories is None:
@@ -427,6 +433,11 @@
         else:
             self.init_sub_categories = init_sub_categories
         self.annotation_files = annotation_files
+        if signature(dataflow_builder.__init__).parameters.keys() != {"self", "location", "annotation_files"}:
+            raise TypeError(
+                "Dataflow builder must have the signature `def __init__(self, location: Pathlike, "
+                "annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None):`"
+            )
         self.dataflow_builder = dataflow_builder(self.location, self.annotation_files)
         super().__init__()

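This signature check rejects any dataflow builder whose constructor adds, renames or drops parameters, so custom builders must keep exactly the two-argument form. A conforming skeleton with a placeholder `build` body:

from typing import Any, Mapping, Optional, Sequence, Union

from deepdoctection.dataflow import DataFlow
from deepdoctection.datasets import DataFlowBaseBuilder
from deepdoctection.utils.detection_types import Pathlike


class MyBuilder(DataFlowBaseBuilder):
    def __init__(
        self,
        location: Pathlike,
        annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,
    ):
        super().__init__(location, annotation_files)

    def build(self, **kwargs: Any) -> DataFlow:
        raise NotImplementedError()  # placeholder: yield Image datapoints here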

@@ -438,3 +449,67 @@

     def _builder(self) -> DataFlowBaseBuilder:
         return self.dataflow_builder
+
+    @staticmethod
+    def from_dataset_card(file_path: str, dataflow_builder: Type[DataFlowBaseBuilder]) -> CustomDataset:
+        """
+        This static method creates a CustomDataset instance from a dataset card.
+
+        A dataset card is a JSON file that contains metadata about the dataset such as its name, type, location,
+        initial categories, initial sub categories, and annotation files. The dataflow_builder parameter is a class
+        that inherits from DataFlowBaseBuilder and is used to build the dataflow for the dataset.
+
+        :param file_path: The path to the dataset card (JSON file).
+        :param dataflow_builder: The class used to build the dataflow for the dataset.
+        :return: A CustomDataset instance created from the dataset card.
+        """
+
+        with open(file_path, "r", encoding="UTF-8") as file:
+            meta_data = json.load(file)
+        meta_data["dataset_type"] = get_type(meta_data["dataset_type"])
+        meta_data["location"] = Path(meta_data["location"])
+        meta_data["init_categories"] = [get_type(cat) for cat in meta_data["init_categories"]]
+        meta_data["init_sub_categories"] = (
+            {
+                get_type(cat): {
+                    get_type(sub_cat_key): [get_type(sub_cat_value) for sub_cat_value in sub_cat_values]
+                    for sub_cat_key, sub_cat_values in sub_cats.items()
+                }
+                for cat, sub_cats in meta_data["init_sub_categories"].items()
+            }
+            if meta_data["init_sub_categories"] is not None
+            else None
+        )
+        return CustomDataset(**meta_data, dataflow_builder=dataflow_builder)
+
+    def as_dict(self) -> Mapping[str, Any]:
+        """
+        Return the meta-data of the dataset as a dictionary.
+
+        :return: A dictionary containing the meta-data of the dataset.
+        """
+        return {
+            "name": self.name,
+            "dataset_type": self.type,
+            "location": str(self.location),
+            "annotation_files": self.annotation_files,
+            "init_categories": [cat.value for cat in self.init_categories],
+            "init_sub_categories": {
+                cat.value: {
+                    sub_cat_key.value: [sub_cat_value.value for sub_cat_value in sub_cat_values]
+                    for sub_cat_key, sub_cat_values in sub_cats.items()
+                }
+                for cat, sub_cats in self.init_sub_categories.items()
+            }
+            if self.init_sub_categories is not None
+            else None,
+        }
+
+    def save_dataset_card(self, file_path: str) -> None:
+        """
+        Save the dataset card to a JSON file.
+
+        :param file_path: file_path
+        """
+        with open(file_path, "w", encoding="UTF-8") as file:
+            json.dump(self.as_dict(), file, indent=4)
110
110
  :param kwargs: A custom set of arguments/values
111
111
  :return: dataflow
112
112
  """
113
- raise NotImplementedError
113
+ raise NotImplementedError()
114
114
 
115
115
  def get_annotation_file(self, split: str) -> str:
116
116
  """Get single annotation file."""

deepdoctection/datasets/info.py

@@ -24,7 +24,7 @@ from dataclasses import dataclass, field
 from itertools import chain
 from typing import Any, Dict, List, Literal, Mapping, Optional, Sequence, Set, Union, no_type_check, overload

-from ..utils.settings import DefaultType, ObjectTypes, TypeOrStr, get_type
+from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
 from ..utils.utils import call_only_once

 __all__ = ["DatasetInfo", "DatasetCategories", "get_merged_categories"]
@@ -89,7 +89,7 @@
     license: str = field(default="")
     url: Union[str, Sequence[str]] = field(default="")
     splits: Mapping[str, str] = field(default_factory=dict)
-    type: ObjectTypes = field(default=DefaultType.default_type)
+    type: DatasetType = field(default=DatasetType.default)

     def get_split(self, key: str) -> str:
         """
@@ -306,7 +306,7 @@

         _cat_to_sub_cat = {get_type(key): get_type(value) for key, value in cat_to_sub_cat.items()}
         if not self._allow_update:
-            raise PermissionError("Replacing categories with sub categories is not allowed")
+            raise RuntimeWarning("Replacing categories with sub categories is not allowed")
         self._categories_update = self.init_categories
         categories = self.get_categories(name_as_key=True)
         cats_or_sub_cats = [
@@ -332,7 +332,7 @@
         """

         if not self._allow_update:
-            raise PermissionError("Filtering categories is not allowed")
+            raise RuntimeWarning("Filtering categories is not allowed")
         if isinstance(categories, (ObjectTypes, str)):
             categories = [get_type(categories)]
         else:
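
Behavioural note: both guards now raise `RuntimeWarning` (used as an exception here, despite the name) instead of `PermissionError`, so callers catching the old type must adjust. A hedged sketch, assuming the built-in doclaynet dataset is registered and that filtering is locked for the instance at hand:

from deepdoctection.datasets import get_dataset

dataset = get_dataset("doclaynet")
try:
    dataset.dataflow.categories.filter_categories(categories="table")
except RuntimeWarning:
    pass  # 0.32 raises RuntimeWarning where 0.30 raised PermissionError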

deepdoctection/datasets/instances/doclaynet.py

@@ -25,6 +25,7 @@ Module for DocLayNet dataset. Place the dataset as follows
 ├── PNG
 │ ├── 0a0d43e301facee9e99cc33b9b16e732dd207135f4027e75f6aea2bf117535a2.png
 """
+from __future__ import annotations

 import os
 from typing import Mapping, Sequence, Union
@@ -109,7 +110,7 @@ class DocLayNet(DatasetBase):
     def _categories(self) -> DatasetCategories:
         return DatasetCategories(init_categories=_INIT_CATEGORIES, init_sub_categories=_SUB_CATEGORIES)

-    def _builder(self) -> "DocLayNetBuilder":
+    def _builder(self) -> DocLayNetBuilder:
         return DocLayNetBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)


@@ -209,7 +210,7 @@ class DocLayNetSeq(DatasetBase):
     def _categories(self) -> DatasetCategories:
         return DatasetCategories(init_categories=_INIT_CATEGORIES_SEQ)

-    def _builder(self) -> "DocLayNetSeqBuilder":
+    def _builder(self) -> DocLayNetSeqBuilder:
         return DocLayNetSeqBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)


deepdoctection/datasets/instances/fintabnet.py

@@ -30,6 +30,7 @@ Module for Fintabnet dataset. Place the dataset as follows
 ├── FinTabNet_1.0.0_table_train.jsonl
 ├── FinTabNet_1.0.0_table_val.jsonl
 """
+from __future__ import annotations

 from pathlib import Path
 from typing import List, Mapping, Sequence, Union
@@ -133,7 +134,7 @@ class Fintabnet(_BuiltInDataset):
     def _categories(self) -> DatasetCategories:
         return DatasetCategories(init_categories=_INIT_CATEGORIES, init_sub_categories=_SUB_CATEGORIES)

-    def _builder(self) -> "FintabnetBuilder":
+    def _builder(self) -> FintabnetBuilder:
         return FintabnetBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)


deepdoctection/datasets/instances/funsd.py

@@ -32,6 +32,7 @@ Module for Funsd dataset. Install the dataset following the folder structure
 │ ├── images
 │ │ ├── ...
 """
+from __future__ import annotations

 import os
 from typing import Dict, List, Mapping, Union
@@ -120,7 +121,7 @@ class Funsd(_BuiltInDataset):
     def _categories(self) -> DatasetCategories:
         return DatasetCategories(init_categories=_INIT_CATEGORIES, init_sub_categories=_SUB_CATEGORIES)

-    def _builder(self) -> "FunsdBuilder":
+    def _builder(self) -> FunsdBuilder:
         return FunsdBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
