deepdoctection-0.31-py3-none-any.whl → deepdoctection-0.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection might be problematic.
Files changed (91)
  1. deepdoctection/__init__.py +35 -28
  2. deepdoctection/analyzer/dd.py +30 -24
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/datapoint/annotation.py +2 -1
  5. deepdoctection/datapoint/box.py +2 -1
  6. deepdoctection/datapoint/image.py +13 -7
  7. deepdoctection/datapoint/view.py +95 -24
  8. deepdoctection/datasets/__init__.py +1 -4
  9. deepdoctection/datasets/adapter.py +5 -2
  10. deepdoctection/datasets/base.py +5 -3
  11. deepdoctection/datasets/info.py +2 -2
  12. deepdoctection/datasets/instances/doclaynet.py +3 -2
  13. deepdoctection/datasets/instances/fintabnet.py +2 -1
  14. deepdoctection/datasets/instances/funsd.py +2 -1
  15. deepdoctection/datasets/instances/iiitar13k.py +5 -2
  16. deepdoctection/datasets/instances/layouttest.py +2 -1
  17. deepdoctection/datasets/instances/publaynet.py +2 -2
  18. deepdoctection/datasets/instances/pubtables1m.py +6 -3
  19. deepdoctection/datasets/instances/pubtabnet.py +2 -1
  20. deepdoctection/datasets/instances/rvlcdip.py +2 -1
  21. deepdoctection/datasets/instances/xfund.py +2 -1
  22. deepdoctection/eval/__init__.py +1 -4
  23. deepdoctection/eval/cocometric.py +2 -1
  24. deepdoctection/eval/eval.py +17 -13
  25. deepdoctection/eval/tedsmetric.py +14 -11
  26. deepdoctection/eval/tp_eval_callback.py +9 -3
  27. deepdoctection/extern/__init__.py +2 -7
  28. deepdoctection/extern/d2detect.py +24 -32
  29. deepdoctection/extern/deskew.py +4 -2
  30. deepdoctection/extern/doctrocr.py +75 -81
  31. deepdoctection/extern/fastlang.py +4 -2
  32. deepdoctection/extern/hfdetr.py +22 -28
  33. deepdoctection/extern/hflayoutlm.py +335 -103
  34. deepdoctection/extern/hflm.py +225 -0
  35. deepdoctection/extern/model.py +56 -47
  36. deepdoctection/extern/pdftext.py +8 -4
  37. deepdoctection/extern/pt/__init__.py +1 -3
  38. deepdoctection/extern/pt/nms.py +6 -2
  39. deepdoctection/extern/pt/ptutils.py +27 -19
  40. deepdoctection/extern/texocr.py +4 -2
  41. deepdoctection/extern/tp/tfutils.py +43 -9
  42. deepdoctection/extern/tp/tpcompat.py +10 -7
  43. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  44. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  45. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  46. deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  47. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  48. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  49. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  50. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  51. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  52. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  53. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  54. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  55. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  56. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  57. deepdoctection/extern/tp/tpfrcnn/preproc.py +7 -3
  58. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  60. deepdoctection/extern/tpdetect.py +5 -8
  61. deepdoctection/mapper/__init__.py +3 -8
  62. deepdoctection/mapper/d2struct.py +8 -6
  63. deepdoctection/mapper/hfstruct.py +6 -1
  64. deepdoctection/mapper/laylmstruct.py +163 -20
  65. deepdoctection/mapper/maputils.py +3 -1
  66. deepdoctection/mapper/misc.py +6 -3
  67. deepdoctection/mapper/tpstruct.py +2 -2
  68. deepdoctection/pipe/__init__.py +1 -1
  69. deepdoctection/pipe/common.py +11 -9
  70. deepdoctection/pipe/concurrency.py +2 -1
  71. deepdoctection/pipe/layout.py +3 -1
  72. deepdoctection/pipe/lm.py +32 -64
  73. deepdoctection/pipe/order.py +142 -35
  74. deepdoctection/pipe/refine.py +8 -14
  75. deepdoctection/pipe/{cell.py → sub_layout.py} +1 -1
  76. deepdoctection/train/__init__.py +6 -12
  77. deepdoctection/train/d2_frcnn_train.py +21 -16
  78. deepdoctection/train/hf_detr_train.py +18 -11
  79. deepdoctection/train/hf_layoutlm_train.py +118 -101
  80. deepdoctection/train/tp_frcnn_train.py +21 -19
  81. deepdoctection/utils/env_info.py +41 -117
  82. deepdoctection/utils/logger.py +1 -0
  83. deepdoctection/utils/mocks.py +93 -0
  84. deepdoctection/utils/settings.py +1 -0
  85. deepdoctection/utils/viz.py +4 -3
  86. {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/METADATA +27 -18
  87. deepdoctection-0.32.dist-info/RECORD +146 -0
  88. deepdoctection-0.31.dist-info/RECORD +0 -144
  89. {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
  90. {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/WHEEL +0 -0
  91. {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
deepdoctection/__init__.py
@@ -19,15 +19,13 @@ import os
 import sys
 from typing import TYPE_CHECKING

-from packaging import version
-
-from .utils.env_info import auto_select_lib_and_device
+from .utils.env_info import collect_env_info
 from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
-from .utils.logger import logger
+from .utils.logger import LoggingRecord, logger

 # pylint: enable=wrong-import-position

-__version__ = 0.31
+__version__ = 0.32

 _IMPORT_STRUCTURE = {
     "analyzer": [
@@ -182,6 +180,7 @@ _IMPORT_STRUCTURE = {
         "DocTrRotationTransformer",
         "FasttextLangDetector",
         "HFDetrDerivedDetector",
+        "get_tokenizer_from_architecture",
         "HFLayoutLmTokenClassifierBase",
         "HFLayoutLmTokenClassifier",
         "HFLayoutLmv2TokenClassifier",
@@ -189,6 +188,9 @@ _IMPORT_STRUCTURE = {
         "HFLayoutLmSequenceClassifier",
         "HFLayoutLmv2SequenceClassifier",
         "HFLayoutLmv3SequenceClassifier",
+        "HFLiltTokenClassifier",
+        "HFLiltSequenceClassifier",
+        "HFLmSequenceClassifier",
         "ModelProfile",
         "ModelCatalog",
         "print_model_infos",
@@ -268,11 +270,11 @@ _IMPORT_STRUCTURE = {
         "DoctectionPipe",
         "LanguageDetectionService",
         "ImageLayoutService",
-        "get_tokenizer_from_architecture",
         "LMTokenClassifierService",
         "LMSequenceClassifierService",
         "OrderGenerator",
         "TextLineGenerator",
+        "TextLineService",
         "TextOrderService",
         "TableSegmentationRefinementService",
         "generate_html_string",
@@ -297,14 +299,13 @@ _IMPORT_STRUCTURE = {
         "save_tmp_file",
         "timed_operation",
         "collect_env_info",
-        "get_device",
-        "auto_select_lib_and_device",
         "auto_select_viz_library",
         "get_tensorflow_requirement",
         "tf_addons_available",
         "get_tf_addons_requirements",
         "tensorpack_available",
         "get_tensorpack_requirement",
+        "pytorch_available",
         "get_pytorch_requirement",
         "lxml_available",
         "get_lxml_requirement",
@@ -418,25 +419,31 @@ _IMPORT_STRUCTURE = {
     ],
 }

+# Setting some environment variables so that standard functions can be invoked with available hardware
+env_info = collect_env_info()
+logger.debug(LoggingRecord(msg=env_info))

-# disable TF warnings for versions > 2.4.1
-if tf_available():
-    if version.parse(get_tf_version()) > version.parse("2.4.1"):
-        os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
-    try:
-        import tensorflow.python.util.deprecation as deprecation  # type: ignore # pylint: disable=E0401,R0402
-
-        deprecation._PRINT_DEPRECATION_WARNINGS = False  # pylint: disable=W0212
-    except Exception:  # pylint: disable=W0703
-        try:
-            from tensorflow.python.util import deprecation  # type: ignore # pylint: disable=E0401
-
-            deprecation._PRINT_DEPRECATION_WARNINGS = False  # pylint: disable=W0212
-        except Exception:  # pylint: disable=W0703
-            pass
+if os.environ.get("PYTORCH_AVAILABLE") and os.environ.get("DD_USE_TORCH") is None:
+    os.environ["DD_USE_TORCH"] = "1"
+    os.environ["USE_TORCH"] = "1"
+if os.environ.get("TENSORFLOW_AVAILABLE") and os.environ.get("DD_USE_TF") is None:
+    os.environ["DD_USE_TF"] = "1"
+    os.environ["USE_TF"] = "1"
+if os.environ.get("DD_USE_TORCH") and os.environ.get("DD_USE_TF"):
+    logger.warning(
+        "Both DD_USE_TORCH and DD_USE_TF are set. Defaulting to PyTorch. If you want a different "
+        "behaviour, set DD_USE_TORCH to None before importing deepdoctection."
+    )
+    os.environ.pop("DD_USE_TF")
+    os.environ.pop("USE_TF")

-# Setting some environment variables so that standard functions can be invoked with available hardware
-auto_select_lib_and_device()
+if not os.environ.get("PYTORCH_AVAILABLE") and not os.environ.get("TENSORFLOW_AVAILABLE"):
+    logger.warning(
+        LoggingRecord(
+            msg="Neither Tensorflow or Pytorch are available. You will not be able to use any Deep Learning "
+            "model from the library."
+        )
+    )


 # Direct imports for type-checking
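The two warning branches above make the new selection logic traceable at runtime. A minimal inspection sketch (assuming, as the code above suggests, that collect_env_info() records PYTORCH_AVAILABLE and TENSORFLOW_AVAILABLE in the environment):

```python
import os

import deepdoctection  # noqa: F401  # importing runs the selection logic above

# Print the variables the selection logic reads and writes.
for var in ("PYTORCH_AVAILABLE", "TENSORFLOW_AVAILABLE", "DD_USE_TORCH", "DD_USE_TF"):
    print(var, "=", os.environ.get(var))
```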
@@ -444,10 +451,10 @@ if TYPE_CHECKING:
     from .analyzer import *
     from .dataflow import *
     from .datapoint import *
-    from .datasets import *
+    from .datasets import *  # type: ignore
     from .eval import *
-    from .extern import *
-    from .mapper import *
+    from .extern import *  # type: ignore
+    from .mapper import *  # type: ignore
     from .pipe import *
     from .train import *
     from .utils import *
deepdoctection/analyzer/dd.py
@@ -23,51 +23,43 @@ Module for **deep**doctection analyzer.
 -user factory with a reduced config setting
 """

-import ast
 import os
 from os import environ
 from shutil import copyfile
 from typing import List, Optional, Union

+from lazy_imports import try_import
+
 from ..extern.base import ObjectDetector
+from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
 from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
+from ..extern.hfdetr import HFDetrDerivedDetector
 from ..extern.model import ModelCatalog, ModelDownloadManager
 from ..extern.pdftext import PdfPlumberTextDetector
+from ..extern.pt.ptutils import get_torch_device
 from ..extern.tessocr import TesseractOcrDetector
 from ..extern.texocr import TextractOcrDetector
+from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
+from ..extern.tpdetect import TPFrcnnDetector
 from ..pipe.base import PipelineComponent
-from ..pipe.cell import DetectResultGenerator, SubImageLayoutService
 from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
 from ..pipe.doctectionpipe import DoctectionPipe
 from ..pipe.layout import ImageLayoutService
 from ..pipe.order import TextOrderService
 from ..pipe.refine import TableSegmentationRefinementService
 from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
+from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
 from ..pipe.text import TextExtractionService
 from ..utils.detection_types import Pathlike
-from ..utils.env_info import get_device
-from ..utils.file_utils import (
-    boto3_available,
-    detectron2_available,
-    pytorch_available,
-    tensorpack_available,
-    tf_available,
-)
+from ..utils.error import DependencyError
+from ..utils.file_utils import detectron2_available, tensorpack_available
 from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
 from ..utils.logger import LoggingRecord, logger
 from ..utils.metacfg import AttrDict, set_config_by_yaml
 from ..utils.settings import CellType, LayoutType
 from ..utils.transform import PadTransform

-if tf_available() and tensorpack_available():
-    from ..extern.tp.tfutils import disable_tp_layer_logging
-    from ..extern.tpdetect import TPFrcnnDetector
-
-if pytorch_available():
-    from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
-    from ..extern.hfdetr import HFDetrDerivedDetector
-
-if boto3_available():
+with try_import() as image_guard:
     from botocore.config import Config  # type: ignore
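The conditional imports guarded by *_available() checks are replaced throughout the release by lazy_imports.try_import, whose guard records whether the import succeeded. A self-contained sketch of the pattern (the package name is a placeholder):

```python
from lazy_imports import try_import

with try_import() as guard:
    import some_optional_backend  # placeholder for an optional dependency

# The guard swallows the ImportError; availability can be queried later,
# when the functionality is actually requested rather than at module load.
if guard.is_successful():
    print("optional backend available")
else:
    print("optional backend missing; running in degraded mode")
```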
@@ -344,11 +336,20 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
         pipe_component_list.append(table_segmentation)

     if cfg.USE_TABLE_REFINEMENT:
-        table_segmentation_refinement = TableSegmentationRefinementService()
+        table_segmentation_refinement = TableSegmentationRefinementService(
+            [LayoutType.table, LayoutType.table_rotated],
+            [
+                LayoutType.cell,
+                CellType.column_header,
+                CellType.projected_row_header,
+                CellType.spanning,
+                CellType.row_header,
+            ],
+        )
         pipe_component_list.append(table_segmentation_refinement)

     if cfg.USE_PDF_MINER:
-        pdf_text = PdfPlumberTextDetector()
+        pdf_text = PdfPlumberTextDetector(x_tolerance=cfg.PDF_MINER.X_TOLERANCE, y_tolerance=cfg.PDF_MINER.Y_TOLERANCE)
         d_text = TextExtractionService(pdf_text)
         pipe_component_list.append(d_text)

@@ -401,7 +402,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:


 def get_dd_analyzer(
-    reset_config_file: bool = False,
+    reset_config_file: bool = True,
     config_overwrite: Optional[List[str]] = None,
     path_config_file: Optional[Pathlike] = None,
 ) -> DoctectionPipe:
@@ -430,8 +431,13 @@
     :return: A DoctectionPipe instance with given configs
     """
     config_overwrite = [] if config_overwrite is None else config_overwrite
-    lib = "TF" if ast.literal_eval(os.environ.get("USE_TENSORFLOW", "False")) else "PT"
-    device = get_device(False)
+    lib = "TF" if os.environ.get("DD_USE_TF") else "PT"
+    if lib == "TF":
+        device = get_tf_device()
+    elif lib == "PT":
+        device = get_torch_device()
+    else:
+        raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
     dd_one_config_path = maybe_copy_config_to_cache(
         get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
     )
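Device resolution now happens through the framework-specific helpers imported at the top of the module; the removed utils.env_info.get_device is gone. A sketch mirroring the branch above, with module paths taken from the import hunk:

```python
import os

# DD_USE_TF selects the TensorFlow helper, otherwise PyTorch is used.
# Both helpers are called without arguments, exactly as in the diff above.
if os.environ.get("DD_USE_TF"):
    from deepdoctection.extern.tp.tfutils import get_tf_device

    device = get_tf_device()
else:
    from deepdoctection.extern.pt.ptutils import get_torch_device

    device = get_torch_device()
print(device)
```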
deepdoctection/configs/conf_dd_one.yaml
@@ -1,38 +1,38 @@
 USE_LAYOUT: True
 USE_TABLE_SEGMENTATION: True
 TF:
-  LAYOUT:
-    WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
-    FILTER:
-  CELL:
-    WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
-    FILTER:
-  ITEM:
-    WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
-    FILTER:
+  LAYOUT:
+    WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
+    FILTER:
+  CELL:
+    WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
+    FILTER:
+  ITEM:
+    WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
+    FILTER:
 PT:
-  LAYOUT:
-    WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
-    WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
-    FILTER:
-    PAD:
-      TOP: 60
-      RIGHT: 60
-      BOTTOM: 60
-      LEFT: 60
-  ITEM:
-    WEIGHTS: item/d2_model_1639999_item_inf_only.pt
-    WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
-    FILTER:
-    PAD:
-      TOP: 60
-      RIGHT: 60
-      BOTTOM: 60
-      LEFT: 60
-  CELL:
-    WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
-    WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
-    FILTER:
+  LAYOUT:
+    WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
+    WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
+    FILTER:
+    PAD:
+      TOP: 60
+      RIGHT: 60
+      BOTTOM: 60
+      LEFT: 60
+  ITEM:
+    WEIGHTS: item/d2_model_1639999_item_inf_only.pt
+    WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
+    FILTER:
+    PAD:
+      TOP: 60
+      RIGHT: 60
+      BOTTOM: 60
+      LEFT: 60
+  CELL:
+    WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
+    WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
+    FILTER:
 LAYOUT_NMS_PAIRS:
   COMBINATIONS:
   THRESHOLDS:
@@ -48,6 +48,9 @@ SEGMENTATION:
   STRETCH_RULE: equal
 USE_TABLE_REFINEMENT: True
 USE_PDF_MINER: False
+PDF_MINER:
+  X_TOLERANCE: 3
+  Y_TOLERANCE: 3
 USE_OCR: True
 OCR:
   USE_TESSERACT: True
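The new PDF_MINER block means the pdfplumber tolerances can be tuned per run through get_dd_analyzer's config_overwrite mechanism instead of editing the cached YAML. A hedged usage sketch, assuming the usual "KEY=value" overwrite convention with dotted paths for nested keys:

```python
import deepdoctection as dd

# Each string overrides one entry of conf_dd_one.yaml shown above.
analyzer = dd.get_dd_analyzer(
    config_overwrite=[
        "USE_PDF_MINER=True",
        "PDF_MINER.X_TOLERANCE=5",
        "PDF_MINER.Y_TOLERANCE=5",
    ]
)
```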
deepdoctection/datapoint/annotation.py
@@ -504,5 +504,6 @@ class ContainerAnnotation(CategoryAnnotation):
     @classmethod
     def from_dict(cls, **kwargs: JsonDict) -> "SummaryAnnotation":
         container_ann = ann_from_dict(cls, **kwargs)
-        container_ann.value = kwargs.get("value")
+        value = kwargs.get("value", "")
+        container_ann.value = value if isinstance(value, str) else list(value)
         return container_ann
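In isolation, the new normalization behaves as follows (a minimal sketch of the two branches, not deepdoctection code):

```python
def normalize(value=""):
    # Strings pass through untouched; any other sequence (e.g. a tuple that
    # came back from deserialization) is materialized as a list; a missing
    # value defaults to the empty string.
    return value if isinstance(value, str) else list(value)

assert normalize("header") == "header"
assert normalize(("a", "b")) == ["a", "b"]
assert normalize() == ""
```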
deepdoctection/datapoint/box.py
@@ -25,6 +25,7 @@ from typing import List, Optional, Sequence, no_type_check

 import numpy as np
 import numpy.typing as npt
+from lazy_imports import try_import
 from numpy import float32

 from ..utils.detection_types import ImageType
@@ -32,7 +33,7 @@ from ..utils.error import BoundingBoxError
 from ..utils.file_utils import cocotools_available
 from ..utils.logger import LoggingRecord, logger

-if cocotools_available():
+with try_import() as import_guard:
     import pycocotools.mask as coco_mask

deepdoctection/datapoint/image.py
@@ -18,6 +18,8 @@
 """
 Dataclass Image
 """
+from __future__ import annotations
+
 import json
 from dataclasses import dataclass, field
 from os import environ
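The new `from __future__ import annotations` is what lets the quoted forward references disappear in the hunks below: annotations are stored as strings and evaluated lazily, so a class can name itself. A minimal illustration:

```python
from __future__ import annotations


class Image:
    # Without the future import this return annotation would need quotes,
    # because the class name is not bound yet while the body executes.
    def clone(self) -> Image:
        return Image()
```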
@@ -202,7 +204,7 @@ class Image:
         self._bbox = None
         self.embeddings.pop(self.image_id)

-    def get_image(self) -> "_Img":  # type: ignore
+    def get_image(self) -> _Img:  # type: ignore # pylint: disable=E0602
         """
         Get the image either in base64 string representation or as np.array.

@@ -531,16 +533,20 @@ class Image:
         )
         ann.image.dump(sub_image)

-    def remove_image_from_lower_hierachy(self) -> None:
+    def remove_image_from_lower_hierachy(self, pixel_values_only: bool = False) -> None:
         """Will remove all images from image annotations."""
         for ann in self.annotations:
-            absolute_bounding_box = ann.get_bounding_box(self.image_id)
-            ann.bounding_box = absolute_bounding_box
-            ann.image = None
+            if pixel_values_only:
+                if ann.image is not None:
+                    ann.image.clear_image()
+            else:
+                absolute_bounding_box = ann.get_bounding_box(self.image_id)
+                ann.bounding_box = absolute_bounding_box
+                ann.image = None

     @classmethod
     @no_type_check
-    def from_dict(cls, **kwargs) -> "Image":
+    def from_dict(cls, **kwargs) -> Image:
         """
         Create `Image` instance from dict.

@@ -571,7 +577,7 @@ class Image:

     @classmethod
     @no_type_check
-    def from_file(cls, file_path: str) -> "Image":
+    def from_file(cls, file_path: str) -> Image:
         """
         Create `Image` instance from .json file.

deepdoctection/datapoint/view.py
@@ -19,6 +19,7 @@
 Subclasses for ImageAnnotation and Image objects with various properties. These classes
 simplify consumption
 """
+from __future__ import annotations

 from copy import copy
 from typing import Any, Dict, List, Mapping, Optional, Sequence, Set, Tuple, Type, Union, no_type_check
@@ -64,7 +65,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
         base_page: `Page` class instantiated by the lowest hierarchy `Image`
     """

-    base_page: "Page"
+    base_page: Page

     @property
     def bbox(self) -> List[float]:
@@ -148,7 +149,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
         return attribute_names

     @classmethod
-    def from_dict(cls, **kwargs: JsonDict) -> "ImageAnnotationBaseView":
+    def from_dict(cls, **kwargs: JsonDict) -> ImageAnnotationBaseView:
         """
         Identical to its base class method for having correct return types. If the base class changes, please
         change this method as well.
@@ -205,15 +206,38 @@ class Layout(ImageAnnotationBaseView):
         return words_with_reading_order

     @property
-    def text_(self) -> Dict[str, Union[str, List[str]]]:
+    def text_(self) -> JsonDict:
         """Returns a dict `{"text": text string,
         "text_list": list of single words,
         "annotation_ids": word annotation ids`"""
         words = self.get_ordered_words()
+        characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
+            *[
+                (
+                    word.characters,
+                    word.annotation_id,
+                    word.token_class,
+                    word.token_tag,
+                    (
+                        word.get_sub_category(WordType.token_class).category_id
+                        if WordType.token_class in word.sub_categories
+                        else None
+                    ),
+                    (word.get_sub_category(WordType.token_tag).category_id)
+                    if WordType.token_tag in word.sub_categories
+                    else None,
+                )
+                for word in words
+            ]
+        )
         return {
-            "text": " ".join([word.characters for word in words]),  # type: ignore
-            "text_list": [word.characters for word in words],  # type: ignore
-            "annotation_ids": [word.annotation_id for word in words],
+            "text": " ".join(characters),
+            "words": characters,
+            "ann_ids": ann_ids,
+            "token_classes": token_classes,
+            "token_tags": token_tags,
+            "token_class_ids": token_classes_ids,
+            "token_tag_ids": token_tag_ids,
         }

     def get_attribute_names(self) -> Set[str]:
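The enriched payload can be consumed positionally, since all sequences are parallel per word. A sketch (assuming a parsed deepdoctection Layout, Table or Page view is at hand):

```python
def dump_tokens(view) -> None:
    # `view` is assumed to expose the text_ payload with the keys introduced
    # above; all lists are word-aligned.
    payload = view.text_
    print(payload["text"])
    for word, ann_id, token_class, token_tag in zip(
        payload["words"], payload["ann_ids"], payload["token_classes"], payload["token_tags"]
    ):
        print(f"{word!r:20} {ann_id} {token_class} {token_tag}")
```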
@@ -331,19 +355,33 @@ class Table(Layout):
         return super().text

     @property
-    def text_(self) -> Dict[str, Union[str, List[str]]]:
+    def text_(self) -> JsonDict:
         cells = self.cells
         if not cells:
             return super().text_
-        text_list: List[str] = []
-        annotation_id_list: List[str] = []
+        text: List[str] = []
+        words: List[str] = []
+        ann_ids: List[str] = []
+        token_classes: List[str] = []
+        token_tags: List[str] = []
+        token_class_ids: List[str] = []
+        token_tag_ids: List[str] = []
         for cell in cells:
-            text_list.extend(cell.text_["text_list"])  # type: ignore
-            annotation_id_list.extend(cell.text_["annotation_ids"])  # type: ignore
+            text.extend(cell.text_["text"])  # type: ignore
+            words.extend(cell.text_["words"])  # type: ignore
+            ann_ids.extend(cell.text_["ann_ids"])  # type: ignore
+            token_classes.extend(cell.text_["token_classes"])  # type: ignore
+            token_tags.extend(cell.text_["token_tags"])  # type: ignore
+            token_class_ids.extend(cell.text_["token_class_ids"])  # type: ignore
+            token_tag_ids.extend(cell.text_["token_tag_ids"])  # type: ignore
         return {
-            "text": " ".join([cell.text for cell in cells]),  # type: ignore
-            "text_list": text_list,
-            "annotation_ids": annotation_id_list,
+            "text": " ".join(text),
+            "words": words,
+            "ann_ids": ann_ids,
+            "token_classes": token_classes,
+            "token_tags": token_tags,
+            "token_class_ids": token_class_ids,
+            "token_tag_ids": token_tag_ids,
         }

     @property
@@ -452,6 +490,7 @@ class Page(Image):
         "document_id",
         "page_number",
     }
+    include_residual_text_container: bool = True

     def get_annotation(  # type: ignore
         self,
@@ -556,8 +595,8 @@ class Page(Image):
         text_container: Optional[ObjectTypes] = None,
         floating_text_block_categories: Optional[Sequence[ObjectTypes]] = None,
         include_residual_text_container: bool = True,
-        base_page: Optional["Page"] = None,
-    ) -> "Page":
+        base_page: Optional[Page] = None,
+    ) -> Page:
         """
         Factory function for generating a `Page` instance from `image_orig` .

@@ -615,6 +654,7 @@ class Page(Image):
         page.summary = SummaryAnnotation.from_dict(**summary_dict)
         page.floating_text_block_categories = floating_text_block_categories  # type: ignore
         page.text_container = text_container  # type: ignore
+        page.include_residual_text_container = include_residual_text_container
         return page

     def _order(self, block: str) -> List[ImageAnnotationBaseView]:
@@ -628,7 +668,7 @@ class Page(Image):
         break_str = "\n" if line_break else " "
         for block in block_with_order:
             text += f"{block.text}{break_str}"
-        return text
+        return text[:-1]

     @property
     def text(self) -> str:
@@ -638,17 +678,35 @@
         return self._make_text()

     @property
-    def text_(self) -> Dict[str, Union[str, List[str]]]:
+    def text_(self) -> JsonDict:
         """Returns a dict `{"text": text string,
         "text_list": list of single words,
         "annotation_ids": word annotation ids`"""
         block_with_order = self._order("layouts")
-        text_list: List[str] = []
-        annotation_id_list: List[str] = []
+        text: List[str] = []
+        words: List[str] = []
+        ann_ids: List[str] = []
+        token_classes: List[str] = []
+        token_tags: List[str] = []
+        token_class_ids: List[str] = []
+        token_tag_ids: List[str] = []
         for block in block_with_order:
-            text_list.extend(block.text_["text_list"])  # type: ignore
-            annotation_id_list.extend(block.text_["annotation_ids"])  # type: ignore
-        return {"text": self.text, "text_list": text_list, "annotation_ids": annotation_id_list}
+            text.append(block.text_["text"])  # type: ignore
+            words.extend(block.text_["words"])  # type: ignore
+            ann_ids.extend(block.text_["ann_ids"])  # type: ignore
+            token_classes.extend(block.text_["token_classes"])  # type: ignore
+            token_tags.extend(block.text_["token_tags"])  # type: ignore
+            token_class_ids.extend(block.text_["token_class_ids"])  # type: ignore
+            token_tag_ids.extend(block.text_["token_tag_ids"])  # type: ignore
+        return {
+            "text": " ".join(text),
+            "words": words,
+            "ann_ids": ann_ids,
+            "token_classes": token_classes,
+            "token_tags": token_tags,
+            "token_class_ids": token_class_ids,
+            "token_tag_ids": token_tag_ids,
+        }

     def get_layout_context(self, annotation_id: str, context_size: int = 3) -> List[ImageAnnotationBaseView]:
         """For a given `annotation_id` get a list of `ImageAnnotation` that are nearby in terms of reading order.
@@ -759,6 +817,11 @@
         box_stack = []
         cells_found = False

+        if self.image is None and interactive:
+            logger.warning(
+                LoggingRecord("No image provided. Cannot display image in interactive mode", {"page_id": self.image_id})
+            )
+
         if debug_kwargs:
             anns = self.get_annotation(category_names=list(debug_kwargs.keys()))
             for ann in anns:
906
969
  text_container: Optional[ObjectTypes] = None,
907
970
  floating_text_block_categories: Optional[List[ObjectTypes]] = None,
908
971
  include_residual_text_container: bool = True,
909
- ) -> "Page":
972
+ ) -> Page:
910
973
  """Reading JSON file and building a `Page` object with given config.
911
974
  :param file_path: Path to file
912
975
  :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
@@ -929,3 +992,11 @@ class Page(Image):
929
992
  for word in all_words
930
993
  if word.token_tag not in (TokenClasses.other, None)
931
994
  ]
995
+
996
+ def __copy__(self) -> Page:
997
+ return self.__class__.from_image(
998
+ self.image_orig,
999
+ self.text_container,
1000
+ self.floating_text_block_categories,
1001
+ self.include_residual_text_container,
1002
+ )
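With the hook in place, copy.copy rebuilds a Page through from_image with the page's own parsing configuration instead of performing a shallow field copy. A small sketch:

```python
from copy import copy

from deepdoctection.datapoint.view import Page


def clone_page(page: Page) -> Page:
    # Dispatches to Page.__copy__ above, re-running from_image with the page's
    # text_container, floating block categories and residual-container setting.
    return copy(page)
```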
deepdoctection/datasets/__init__.py
@@ -26,13 +26,10 @@ Create an info card, a DataFlowBaseBuilder derived instance, possibly a category
 DatasetBase derived instance to create a data set.
 """

-from ..utils.file_utils import pytorch_available
+from .adapter import *
 from .base import *
 from .dataflow_builder import DataFlowBaseBuilder
 from .info import *
 from .instances import *
 from .registry import *
 from .save import *
-
-if pytorch_available():
-    from .adapter import *
deepdoctection/datasets/adapter.py
@@ -22,19 +22,22 @@ Module for wrapping datasets into a pytorch dataset framework.

 from typing import Any, Callable, Iterator, Mapping, Optional, Union

+from lazy_imports import try_import
+
 from ..dataflow import CacheData, CustomDataFromList, MapData, RepeatedData
 from ..datapoint.image import Image
 from ..datasets.base import DatasetBase
 from ..mapper.maputils import LabelSummarizer
 from ..utils.detection_types import DP, JsonDict
-from ..utils.file_utils import pytorch_available
 from ..utils.logger import LoggingRecord, log_once, logger
 from ..utils.settings import DatasetType, LayoutType, ObjectTypes, PageType, WordType
 from ..utils.tqdm import get_tqdm
 from .registry import get_dataset

-if pytorch_available():
+with try_import() as import_guard:
     from torch.utils.data import IterableDataset
+if not import_guard.is_successful():
+    from ..utils.mocks import IterableDataset  # type: ignore


 class DatasetAdapter(IterableDataset):  # type: ignore
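Together with the new utils/mocks.py this forms the general fallback pattern of the release: import the real dependency lazily and substitute an inert stand-in when it is missing, so modules import cleanly without the optional framework. Sketched in isolation:

```python
from lazy_imports import try_import

with try_import() as import_guard:
    from torch.utils.data import IterableDataset

if not import_guard.is_successful():
    class IterableDataset:  # type: ignore
        """Inert stand-in, mirroring deepdoctection.utils.mocks, so that
        subclass definitions below still evaluate without torch."""


class DatasetLike(IterableDataset):  # defined with or without torch installed
    pass
```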