deepdoctection-0.29-py3-none-any.whl → deepdoctection-0.31-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection might be problematic.
Files changed (83)
  1. deepdoctection/__init__.py +6 -2
  2. deepdoctection/analyzer/dd.py +13 -8
  3. deepdoctection/dataflow/base.py +0 -19
  4. deepdoctection/dataflow/custom.py +6 -5
  5. deepdoctection/dataflow/custom_serialize.py +20 -5
  6. deepdoctection/dataflow/parallel_map.py +22 -17
  7. deepdoctection/dataflow/serialize.py +5 -4
  8. deepdoctection/dataflow/stats.py +5 -5
  9. deepdoctection/datapoint/annotation.py +35 -14
  10. deepdoctection/datapoint/box.py +9 -6
  11. deepdoctection/datapoint/convert.py +3 -1
  12. deepdoctection/datapoint/image.py +66 -29
  13. deepdoctection/datapoint/view.py +62 -24
  14. deepdoctection/datasets/adapter.py +4 -5
  15. deepdoctection/datasets/base.py +87 -14
  16. deepdoctection/datasets/dataflow_builder.py +1 -1
  17. deepdoctection/datasets/info.py +2 -2
  18. deepdoctection/datasets/instances/fintabnet.py +3 -3
  19. deepdoctection/datasets/instances/layouttest.py +2 -7
  20. deepdoctection/datasets/instances/pubtabnet.py +3 -3
  21. deepdoctection/eval/accmetric.py +7 -5
  22. deepdoctection/eval/base.py +5 -4
  23. deepdoctection/eval/eval.py +9 -7
  24. deepdoctection/eval/tedsmetric.py +9 -3
  25. deepdoctection/eval/tp_eval_callback.py +8 -7
  26. deepdoctection/extern/base.py +39 -13
  27. deepdoctection/extern/d2detect.py +164 -64
  28. deepdoctection/extern/deskew.py +32 -7
  29. deepdoctection/extern/doctrocr.py +268 -29
  30. deepdoctection/extern/fastlang.py +45 -7
  31. deepdoctection/extern/hfdetr.py +90 -33
  32. deepdoctection/extern/hflayoutlm.py +109 -22
  33. deepdoctection/extern/model.py +30 -11
  34. deepdoctection/extern/pdftext.py +2 -1
  35. deepdoctection/extern/pt/ptutils.py +3 -2
  36. deepdoctection/extern/tessocr.py +134 -22
  37. deepdoctection/extern/texocr.py +4 -2
  38. deepdoctection/extern/tp/tpcompat.py +4 -4
  39. deepdoctection/extern/tp/tpfrcnn/preproc.py +2 -7
  40. deepdoctection/extern/tpdetect.py +50 -23
  41. deepdoctection/mapper/d2struct.py +1 -1
  42. deepdoctection/mapper/hfstruct.py +1 -1
  43. deepdoctection/mapper/laylmstruct.py +1 -1
  44. deepdoctection/mapper/maputils.py +19 -5
  45. deepdoctection/mapper/prodigystruct.py +15 -13
  46. deepdoctection/mapper/pubstruct.py +10 -10
  47. deepdoctection/mapper/tpstruct.py +1 -1
  48. deepdoctection/pipe/anngen.py +35 -8
  49. deepdoctection/pipe/base.py +53 -19
  50. deepdoctection/pipe/cell.py +29 -8
  51. deepdoctection/pipe/common.py +12 -4
  52. deepdoctection/pipe/doctectionpipe.py +4 -3
  53. deepdoctection/pipe/language.py +3 -2
  54. deepdoctection/pipe/layout.py +3 -2
  55. deepdoctection/pipe/lm.py +2 -2
  56. deepdoctection/pipe/order.py +67 -39
  57. deepdoctection/pipe/refine.py +18 -10
  58. deepdoctection/pipe/segment.py +34 -20
  59. deepdoctection/pipe/text.py +14 -8
  60. deepdoctection/pipe/transform.py +16 -8
  61. deepdoctection/train/d2_frcnn_train.py +17 -14
  62. deepdoctection/train/hf_detr_train.py +13 -9
  63. deepdoctection/train/hf_layoutlm_train.py +31 -19
  64. deepdoctection/utils/__init__.py +3 -0
  65. deepdoctection/utils/concurrency.py +1 -1
  66. deepdoctection/utils/context.py +5 -5
  67. deepdoctection/utils/develop.py +2 -2
  68. deepdoctection/utils/env_info.py +64 -27
  69. deepdoctection/utils/error.py +84 -0
  70. deepdoctection/utils/file_utils.py +28 -17
  71. deepdoctection/utils/fs.py +16 -14
  72. deepdoctection/utils/logger.py +43 -19
  73. deepdoctection/utils/pdf_utils.py +14 -7
  74. deepdoctection/utils/settings.py +5 -1
  75. deepdoctection/utils/transform.py +1 -1
  76. deepdoctection/utils/utils.py +0 -6
  77. deepdoctection/utils/viz.py +83 -14
  78. {deepdoctection-0.29.dist-info → deepdoctection-0.31.dist-info}/METADATA +39 -61
  79. deepdoctection-0.31.dist-info/RECORD +144 -0
  80. {deepdoctection-0.29.dist-info → deepdoctection-0.31.dist-info}/WHEEL +1 -1
  81. deepdoctection-0.29.dist-info/RECORD +0 -143
  82. {deepdoctection-0.29.dist-info → deepdoctection-0.31.dist-info}/LICENSE +0 -0
  83. {deepdoctection-0.29.dist-info → deepdoctection-0.31.dist-info}/top_level.txt +0 -0
deepdoctection/__init__.py

@@ -27,7 +27,7 @@ from .utils.logger import logger
 
 # pylint: enable=wrong-import-position
 
-__version__ = 0.29
+__version__ = 0.31
 
 _IMPORT_STRUCTURE = {
     "analyzer": [
@@ -179,6 +179,7 @@ _IMPORT_STRUCTURE = {
         "Jdeskewer",
         "DoctrTextlineDetector",
         "DoctrTextRecognizer",
+        "DocTrRotationTransformer",
         "FasttextLangDetector",
         "HFDetrDerivedDetector",
         "HFLayoutLmTokenClassifierBase",
@@ -194,6 +195,7 @@ _IMPORT_STRUCTURE = {
         "ModelDownloadManager",
         "PdfPlumberTextDetector",
         "TesseractOcrDetector",
+        "TesseractRotationTransformer",
         "TextractOcrDetector",
         "TPFrcnnDetector",
     ],
@@ -279,7 +281,7 @@ _IMPORT_STRUCTURE = {
         "PubtablesSegmentationService",
         "SegmentationResult",
         "TextExtractionService",
-        "SimpleTransformPipelineComponent",
+        "SimpleTransformService",
     ],
     "train": [
         "D2Trainer",
@@ -343,6 +345,8 @@ _IMPORT_STRUCTURE = {
         "get_opencv_requirement",
         "pillow_available",
         "get_pillow_requirement",
+        "spacy_available",
+        "get_spacy_requirement",
         "load_image_from_file",
         "load_bytes_from_pdf_file",
         "get_load_image_func",
deepdoctection/analyzer/dd.py

@@ -54,7 +54,7 @@ from ..utils.file_utils import (
     tf_available,
 )
 from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
-from ..utils.logger import logger
+from ..utils.logger import LoggingRecord, logger
 from ..utils.metacfg import AttrDict, set_config_by_yaml
 from ..utils.settings import CellType, LayoutType
 from ..utils.transform import PadTransform
@@ -113,11 +113,12 @@ def config_sanity_checks(cfg: AttrDict) -> None:
     """Some config sanity checks"""
     if cfg.USE_PDF_MINER and cfg.USE_OCR and cfg.OCR.USE_DOCTR:
         raise ValueError("Configuration USE_PDF_MINER= True and USE_OCR=True and USE_DOCTR=True is not allowed")
-    if cfg.OCR.USE_TESSERACT + cfg.OCR.USE_DOCTR + cfg.OCR.USE_TEXTRACT != 1:
-        raise ValueError(
-            "Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True and set the other two "
-            "to False. Only one OCR system can be activated."
-        )
+    if cfg.USE_OCR:
+        if cfg.OCR.USE_TESSERACT + cfg.OCR.USE_DOCTR + cfg.OCR.USE_TEXTRACT != 1:
+            raise ValueError(
+                "Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True "
+                "and set the other two to False. Only one OCR system can be activated."
+            )
 
 
 def build_detector(
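In 0.31 the OCR-exclusivity rule is only enforced when OCR is enabled at all. A minimal sketch of a configuration that 0.29 rejected but 0.31 accepts (`get_dd_analyzer` and its `config_overwrite` list appear in the hunks below; the flag values are illustrative):

import deepdoctection as dd

# With USE_OCR=False the exclusivity check is skipped entirely,
# so all three OCR.USE_* flags may be False at the same time.
analyzer = dd.get_dd_analyzer(
    config_overwrite=[
        "USE_OCR=False",
        "OCR.USE_TESSERACT=False",
        "OCR.USE_DOCTR=False",
        "OCR.USE_TEXTRACT=False",
    ]
)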
@@ -231,9 +232,13 @@ def build_ocr(cfg: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer,
         weights = cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.TF if cfg.LIB == "TF" else cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.PT
         weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
         profile = ModelCatalog.get_profile(weights)
+        # get_full_path_configs will complete the path even if the model is not registered
+        config_path = ModelCatalog.get_full_path_configs(weights) if profile.config is not None else None
         if profile.architecture is None:
             raise ValueError("model profile.architecture must be specified")
-        return DoctrTextRecognizer(profile.architecture, weights_path, cfg.DEVICE, lib=cfg.LIB)
+        return DoctrTextRecognizer(
+            profile.architecture, weights_path, cfg.DEVICE, lib=cfg.LIB, path_config_json=config_path
+        )
     if cfg.OCR.USE_TEXTRACT:
         credentials_kwargs = {
             "aws_access_key_id": environ.get("ACCESS_KEY"),
@@ -445,7 +450,7 @@ def get_dd_analyzer(
     cfg.update_args(config_overwrite)
 
     config_sanity_checks(cfg)
-    logger.info("Config: \n %s", str(cfg), cfg.to_dict())
+    logger.info(LoggingRecord(f"Config: \n {str(cfg)}", cfg.to_dict()))  # type: ignore
 
     # will silent all TP logging while building the tower
     if tensorpack_available():
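Throughout this release, %-style logger calls are replaced by `LoggingRecord` objects. A minimal sketch of the pattern, assuming the one- and two-argument forms used in the hunks (a message, plus an optional attribute dict):

from deepdoctection.utils.logger import LoggingRecord, logger

logger.info(LoggingRecord("Config loaded"))                     # plain message
logger.info(LoggingRecord("Config loaded", {"USE_OCR": True}))  # message with context dict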
deepdoctection/dataflow/base.py

@@ -17,25 +17,6 @@ from typing import Any, Iterator, no_type_check
 from ..utils.utils import get_rng
 
 
-class DataFlowTerminated(BaseException):
-    """
-    An exception indicating that the DataFlow is unable to produce any more
-    data, i.e. something wrong happened so that calling `__iter__`
-    cannot give a valid iterator anymore.
-    In most DataFlow this will never be raised.
-    """
-
-
-class DataFlowResetStateNotCalled(BaseException):
-    """
-    An exception indicating that `reset_state()` has not been called before starting
-    iteration.
-    """
-
-    def __init__(self) -> None:
-        super().__init__("Iterating a dataflow requires .reset_state() to be called first")
-
-
 class DataFlowReentrantGuard:
     """
deepdoctection/dataflow/custom.py

@@ -25,10 +25,11 @@ from typing import Any, Callable, Iterable, Iterator, List, Optional
 
 import numpy as np
 
-from ..utils.logger import logger
+from ..utils.error import DataFlowResetStateNotCalledError
+from ..utils.logger import LoggingRecord, logger
 from ..utils.tqdm import get_tqdm
 from ..utils.utils import get_rng
-from .base import DataFlow, DataFlowReentrantGuard, DataFlowResetStateNotCalled, ProxyDataFlow
+from .base import DataFlow, DataFlowReentrantGuard, ProxyDataFlow
 from .serialize import DataFromIterable, DataFromList
 
 __all__ = ["CacheData", "CustomDataFromList", "CustomDataFromIterable"]
@@ -65,7 +66,7 @@ class CacheData(ProxyDataFlow):
 
     def __iter__(self) -> Iterator[Any]:
         if self._guard is None:
-            raise DataFlowResetStateNotCalled()
+            raise DataFlowResetStateNotCalledError()
 
         with self._guard:
             if self.buffer:
@@ -139,10 +140,10 @@ class CustomDataFromList(DataFromList):
 
     def __iter__(self) -> Iterator[Any]:
         if self.rng is None:
-            raise DataFlowResetStateNotCalled()
+            raise DataFlowResetStateNotCalledError()
         if self.rebalance_func is not None:
             lst_tmp = self.rebalance_func(self.lst)
-            logger.info("subset size after re-balancing: %s", len(lst_tmp))
+            logger.info(LoggingRecord(f"CustomDataFromList: subset size after re-balancing: {len(lst_tmp)}"))
         else:
             lst_tmp = self.lst
 
deepdoctection/dataflow/custom_serialize.py

@@ -23,16 +23,20 @@ import itertools
 import json
 import os
 from collections import defaultdict
+from pathlib import Path
 from typing import DefaultDict, Dict, List, Optional, Sequence, Union
 
 from jsonlines import Reader, Writer
+from tabulate import tabulate
+from termcolor import colored
 
 from ..utils.context import timed_operation
 from ..utils.detection_types import JsonDict, Pathlike
+from ..utils.error import FileExtensionError
 from ..utils.identifier import get_uuid_from_str
 from ..utils.pdf_utils import PDFStreamer
 from ..utils.tqdm import get_tqdm
-from ..utils.utils import FileExtensionError, is_file_extension
+from ..utils.utils import is_file_extension
 from .base import DataFlow
 from .common import FlattenData, JoinData, MapData
 from .custom import CacheData, CustomDataFromIterable, CustomDataFromList
@@ -186,6 +190,11 @@ class SerializerFiles:
         df2: DataFlow
         df3: DataFlow
 
+        if isinstance(path, str):
+            path = Path(path)
+        if not path.exists():
+            raise NotADirectoryError(f"The path {path} to the directory or file does not exist")
+
         if shuffle:
             sort = False
         it1 = os.walk(path, topdown=False)
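With this check, a missing directory now fails fast instead of producing an empty os.walk iterator. A sketch (the load signature with a file_type argument is an assumption about this serializer, not confirmed by the hunk):

from deepdoctection.dataflow.custom_serialize import SerializerFiles

# Raises NotADirectoryError immediately if /data/pdf_collection does not exist,
# rather than silently yielding nothing.
df = SerializerFiles.load("/data/pdf_collection", file_type=".pdf")
df.reset_state()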
@@ -217,7 +226,7 @@
         """
         Not implemented
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class CocoParser:
@@ -277,8 +286,14 @@ class CocoParser:
         """
         Print information about the annotation file.
        """
+        rows = []
         for key, value in self.dataset["info"].items():
-            print(f"{key}: {value}")
+            row = [key, value]
+            rows.append(row)
+
+        header = ["key", "value"]
+        table = tabulate(rows, headers=header, tablefmt="fancy_grid", stralign="left", numalign="left")
+        print(colored(table, "cyan"))
 
     def get_ann_ids(
         self,
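The replacement renders the COCO info block as a colored grid instead of plain key: value lines. A standalone illustration using the two dependencies pulled in by the import hunk above (tabulate, termcolor); the sample rows are hypothetical:

from tabulate import tabulate
from termcolor import colored

rows = [["description", "COCO 2017 Dataset"], ["version", "1.0"], ["year", 2017]]
table = tabulate(rows, headers=["key", "value"], tablefmt="fancy_grid", stralign="left", numalign="left")
print(colored(table, "cyan"))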
@@ -493,7 +508,7 @@
         """
         Not implemented
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class SerializerPdfDoc:
@@ -541,7 +556,7 @@
         """
         Not implemented
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @staticmethod
     def split(path: Pathlike, path_target: Optional[Pathlike] = None, max_datapoint: Optional[int] = None) -> None:
deepdoctection/dataflow/parallel_map.py

@@ -28,8 +28,9 @@ from typing import Any, Callable, Iterator, List, no_type_check
 import zmq
 
 from ..utils.concurrency import StoppableThread, enable_death_signal, start_proc_mask_signal
-from ..utils.logger import logger
-from .base import DataFlow, DataFlowReentrantGuard, DataFlowTerminated, ProxyDataFlow
+from ..utils.error import DataFlowTerminatedError
+from ..utils.logger import LoggingRecord, logger
+from .base import DataFlow, DataFlowReentrantGuard, ProxyDataFlow
 from .common import RepeatedData
 from .serialize import PickleSerializer
 
@@ -48,15 +49,15 @@ def _zmq_catch_error(name):
     try:
         yield
     except zmq.ContextTerminated as exc:
-        logger.info("[%s] Context terminated.", name)
-        raise DataFlowTerminated() from exc
+        logger.info(LoggingRecord(f"_zmq_catch_error: [{name}] Context terminated."))
+        raise DataFlowTerminatedError() from exc
     except zmq.ZMQError as exc:
         if exc.errno == errno.ENOTSOCK:  # socket closed
-            logger.info("[%s] Socket closed.", name)
-            raise DataFlowTerminated() from exc
-        raise ValueError from exc
+            logger.info(LoggingRecord(f"_zmq_catch_error: [{name}] Socket closed."))
+            raise DataFlowTerminatedError() from exc
+        raise ValueError() from exc
     except Exception as exc:
-        raise ValueError from exc
+        raise ValueError() from exc
 
 
 @no_type_check
@@ -78,8 +79,8 @@ def _get_pipe_name(name):
 class _ParallelMapData(ProxyDataFlow, ABC):
     def __init__(self, df: DataFlow, buffer_size: int, strict: bool = False) -> None:
         super().__init__(df)
-        if not buffer_size:
-            raise ValueError("buffer_size must be a positive number")
+        if buffer_size <= 0:
+            raise ValueError(f"buffer_size must be a positive number, got {buffer_size}")
         self._buffer_size = buffer_size
         self._buffer_occupancy = 0  # actual #elements in buffer, only useful in strict mode
         self._strict = strict
@@ -95,12 +96,12 @@ class _ParallelMapData(ProxyDataFlow, ABC):
     @no_type_check
     @abstractmethod
     def _recv(self):
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @no_type_check
     @abstractmethod
     def _send(self, dp: Any):
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @no_type_check
     def _recv_filter_none(self):
@@ -312,7 +313,8 @@ class _MultiProcessZMQDataFlow(DataFlow, ABC):
             for x in self._procs:
                 x.terminate()
                 x.join(5)
-            logger.info("%s successfully cleaned-up.", type(self).__name__)
+            logger.info(LoggingRecord(f"_MultiProcessZMQDataFlow [{type(self).__name__}] successfully cleaned-up."))
+
         except Exception:  # pylint: disable=W0703
             pass
 
@@ -323,9 +325,12 @@ def _bind_guard(sock, name):
         sock.bind(name)
     except zmq.ZMQError:
         logger.error(
-            "ZMQError in socket.bind('{name}'). Perhaps you're using pipes on a non-local file system. "
-            "See documentation of MultiProcessRunnerZMQ for more information."
+            LoggingRecord(
+                f"ZMQError in socket.bind('{name}'). Perhaps you're using pipes on a non-local file system. "
+                "See documentation of MultiProcessRunnerZMQ for more information."
+            )
        )
+
         raise
 
 
@@ -394,8 +399,8 @@ class MultiProcessMapData(_ParallelMapData, _MultiProcessZMQDataFlow):
 
         _ParallelMapData.__init__(self, df, buffer_size, strict)
         _MultiProcessZMQDataFlow.__init__(self)
-        if not num_proc:
-            raise ValueError("num_proc must be a positive number")
+        if num_proc <= 0:
+            raise ValueError(f"num_proc must be a positive number, got {num_proc}")
         self.num_proc = num_proc
         self.map_func = map_func
         self._strict = strict
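Both constructors switch from a falsy check (which let negative values through) to an explicit <= 0 test. A usage sketch for the multi-process mapper (constructor argument names come from the hunk above; the exact defaults are assumptions):

from deepdoctection.dataflow.parallel_map import MultiProcessMapData
from deepdoctection.dataflow.serialize import DataFromList

def square(x):
    return x * x

df = DataFromList(list(range(100)), shuffle=False)
# num_proc=-1 or buffer_size=-1 now raise ValueError naming the offending value.
df = MultiProcessMapData(df, num_proc=2, map_func=square, buffer_size=16)
df.reset_state()
for dp in df:
    print(dp)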
deepdoctection/dataflow/serialize.py

@@ -16,7 +16,8 @@ from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union
 
 import numpy as np
 
-from .base import DataFlow, DataFlowResetStateNotCalled, RNGDataFlow
+from ..utils.error import DataFlowResetStateNotCalledError
+from .base import DataFlow, RNGDataFlow
 
 
 class DataFromList(RNGDataFlow):
@@ -44,7 +45,7 @@ class DataFromList(RNGDataFlow):
             for k in idxs:
                 yield self.lst[k]
         else:
-            raise DataFlowResetStateNotCalled()
+            raise DataFlowResetStateNotCalledError()
 
 
 class DataFromIterable(DataFlow):
@@ -63,7 +64,7 @@ class DataFromIterable(DataFlow):
 
     def __len__(self) -> int:
         if self._len is None:
-            raise NotImplementedError
+            raise NotImplementedError()
         return self._len
 
     def __iter__(self) -> Iterator[Any]:
@@ -107,7 +108,7 @@ class FakeData(RNGDataFlow):
 
     def __iter__(self) -> Iterator[Any]:
         if self.rng is None:
-            raise DataFlowResetStateNotCalled()
+            raise DataFlowResetStateNotCalledError()
         if self.random:
             for _ in range(self._size):
                 val = []
deepdoctection/dataflow/stats.py

@@ -23,7 +23,7 @@ from typing import Any, Optional, Tuple, Union
 import numpy as np
 import numpy.typing as npt
 
-from ..utils.logger import logger
+from ..utils.logger import LoggingRecord, logger
 from ..utils.tqdm import get_tqdm
 from .base import DataFlow, ProxyDataFlow
 
@@ -95,7 +95,7 @@ class MeanFromDataFlow(ProxyDataFlow):
         self.df.reset_state()
         itr = iter(self.df)
 
-        logger.info("Calculating mean")
+        logger.info(LoggingRecord("Calculating mean"))
 
         len_df: Optional[int]
         try:
@@ -139,7 +139,7 @@
             if n == self.max_datapoints:
                 break
 
-        logger.info("Mean from %s datapoints along axis %s: %s", n, self.axis, self.mean)
+        logger.info(LoggingRecord(f"Mean from {n} datapoints along axis {self.axis}: {self.mean}"))
 
         return self.mean
 
@@ -216,7 +216,7 @@ class StdFromDataFlow(ProxyDataFlow):
         self.df.reset_state()
         itr = iter(self.df)
 
-        logger.info("Calculating standard deviation")
+        logger.info(LoggingRecord("Calculating standard deviation"))
         try:
             len_df = len(self.df)
         except NotImplementedError:
@@ -266,6 +266,6 @@ class StdFromDataFlow(ProxyDataFlow):
         var = (ex2 - (ex * ex) / n) / (n - 1)
         self.std = np.sqrt(var)
 
-        logger.info("Standard deviation from %s datapoints along axis %s: %s", n, self.axis, self.std)
+        logger.info(LoggingRecord(f"Standard deviation from {n} datapoints along axis {self.axis}: {self.std}"))
 
         return self.std
deepdoctection/datapoint/annotation.py

@@ -24,8 +24,9 @@ from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Union, no_type_check
 
 from ..utils.detection_types import JsonDict
+from ..utils.error import AnnotationError, UUIDError
 from ..utils.identifier import get_uuid, is_uuid_like
-from ..utils.logger import logger
+from ..utils.logger import LoggingRecord, logger
 from ..utils.settings import DefaultType, ObjectTypes, SummaryType, TypeOrStr, get_type
 from .box import BoundingBox
 from .convert import as_dict
@@ -36,7 +37,16 @@ def ann_from_dict(cls, **kwargs):
     """
     A factory function to create subclasses of annotations from a given dict
     """
-    ann = cls(kwargs.get("external_id"), kwargs.get("category_name"), kwargs.get("category_id"), kwargs.get("score"))
+    _init_kwargs = {
+        "external_id": kwargs.get("external_id"),
+        "category_name": kwargs.get("category_name"),
+        "category_id": kwargs.get("category_id"),
+        "score": kwargs.get("score"),
+        "service_id": kwargs.get("service_id"),
+        "model_id": kwargs.get("model_id"),
+        "session_id": kwargs.get("session_id"),
+    }
+    ann = cls(**_init_kwargs)
     ann.active = kwargs.get("active")
     ann._annotation_id = kwargs.get("_annotation_id")  # pylint: disable=W0212
     if isinstance(kwargs.get("sub_categories"), dict):
@@ -74,11 +84,17 @@ class Annotation(ABC):
     id will not depend on the defining attributes.
 
     `_annotation_id`: Unique id for annotations. Will always be given as string representation of a md5-hash.
+    `service_id`: Service that generated the annotation. This will be the name of a pipeline component
+    `model_id`: Model that generated the annotation. This will be the name of particular model
+    `session_id`: Session id for the annotation. This will be the id of the session in which the annotation was created.
     """
 
     active: bool = field(default=True, init=False, repr=True)
     external_id: Optional[Union[str, int]] = field(default=None, init=True, repr=False)
     _annotation_id: Optional[str] = field(default=None, init=False, repr=True)
+    service_id: Optional[str] = field(default=None)
+    model_id: Optional[str] = field(default=None)
+    session_id: Optional[str] = field(default=None)
 
     def __post_init__(self) -> None:
         """
@@ -101,7 +117,7 @@
         """
         if self._annotation_id:
             return self._annotation_id
-        raise ValueError("Dump annotation first or pass external_id to create an annotation id")
+        raise AnnotationError("Dump annotation first or pass external_id to create an annotation id")
 
     @annotation_id.setter
     def annotation_id(self, input_id: str) -> None:
@@ -109,13 +125,13 @@
         annotation_id setter
         """
         if self._annotation_id is not None:
-            raise AssertionError("Annotation_id already defined and cannot be reset")
+            raise AnnotationError("Annotation_id already defined and cannot be reset")
         if is_uuid_like(input_id):
             self._annotation_id = input_id
         elif isinstance(input_id, property):
             pass
         else:
-            raise ValueError("Annotation_id must be uuid3 string")
+            raise AnnotationError("Annotation_id must be uuid3 string")
 
     @abstractmethod
     def get_defining_attributes(self) -> List[str]:
@@ -126,13 +142,13 @@
 
         :return: A list of attributes.
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     def _assert_attributes_have_str(self, state_id: bool = False) -> None:
         defining_attributes = self.get_state_attributes() if state_id else self.get_defining_attributes()
         for attr in defining_attributes:
             if not hasattr(eval("self." + attr), "__str__"):  # pylint: disable=W0123
-                raise AttributeError(f"Attribute {attr} must have __str__ method")
+                raise AnnotationError(f"Attribute {attr} must have __str__ method")
 
     @staticmethod
     def set_annotation_id(annotation: "CategoryAnnotation", *container_id_context: Optional[str]) -> str:
@@ -179,7 +195,7 @@
 
         :return: Annotation instance
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @staticmethod
     @abstractmethod
@@ -189,7 +205,7 @@
 
         :return: A list of attributes.
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @property
     def state_id(self) -> str:
@@ -290,7 +306,12 @@ class CategoryAnnotation(Annotation):
         """
 
         if sub_category_name in self.sub_categories:
-            raise KeyError(f"{sub_category_name} as sub category already defined for " f"{self.annotation_id}")
+            raise AnnotationError(
+                f"sub category {sub_category_name} already defined: "
+                f"annotation_id: {self.annotation_id}, "
+                f"category_name: {self.category_name}, "
+                f"category_id: {self.category_id}"
+            )
 
         if self._annotation_id is not None:
             if annotation._annotation_id is None:  # pylint: disable=W0212
@@ -333,7 +354,7 @@
         :param annotation_id: An annotation id
         """
         if not is_uuid_like(annotation_id):
-            raise ValueError("Annotation_id must be uuid")
+            raise UUIDError("Annotation_id must be uuid")
 
         key_type = get_type(key)
         if key not in self.relationships:
@@ -369,7 +390,7 @@
         try:
             self.relationships[key].remove(ann_id)
         except ValueError:
-            logger.warning("Relationship %s cannot be removed because it does not exist", key)
+            logger.warning(LoggingRecord(f"Relationship {key} cannot be removed because it does not exist"))
         else:
             self.relationships[key].clear()
 
@@ -436,14 +457,14 @@ class ImageAnnotation(CategoryAnnotation):
         box = self.bounding_box
         if box:
             return box
-        raise ValueError(f"bounding_box has not been initialized for {self.annotation_id}")
+        raise AnnotationError(f"bounding_box has not been initialized for {self.annotation_id}")
 
     def get_summary(self, key: ObjectTypes) -> CategoryAnnotation:
         """Get summary sub categories from `image`. Raises `ValueError` if `key` is not available"""
         if self.image:
             if self.image.summary:
                 return self.image.summary.get_sub_category(key)
-        raise ValueError(f"Summary does not exist for {self.annotation_id} and key: {key}")
+        raise AnnotationError(f"Summary does not exist for {self.annotation_id} and key: {key}")
 
 
 @dataclass
deepdoctection/datapoint/box.py

@@ -28,8 +28,9 @@ import numpy.typing as npt
 from numpy import float32
 
 from ..utils.detection_types import ImageType
+from ..utils.error import BoundingBoxError
 from ..utils.file_utils import cocotools_available
-from ..utils.logger import logger
+from ..utils.logger import LoggingRecord, logger
 
 if cocotools_available():
     import pycocotools.mask as coco_mask
@@ -140,10 +141,6 @@ def iou(boxes1: npt.NDArray[float32], boxes2: npt.NDArray[float32]) -> npt.NDArr
     return np_iou(boxes1, boxes2)
 
 
-class BoundingBoxError(BaseException):
-    """Special exception only for `BoundingBox`"""
-
-
 @dataclass
 class BoundingBox:
     """
@@ -558,6 +555,12 @@ def intersection_boxes(boxes_1: Sequence[BoundingBox], boxes_2: Sequence[Boundin
     :param boxes_2: sequence of n BoundingBox
     :return: list of at most mxn BoundingBox
     """
+    if not boxes_1 and boxes_2:
+        return boxes_2
+    if not boxes_2 and boxes_1:
+        return boxes_1
+    if not boxes_1 and not boxes_2:
+        return []
     if boxes_1[0].absolute_coords != boxes_2[0].absolute_coords:
         raise ValueError("absolute_coords of boxes_1 and boxes_2 mus be equal")
     absolute_coords = boxes_1[0].absolute_coords
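Previously an empty input raised IndexError on boxes_1[0]; 0.31 short-circuits and returns the non-empty side (or an empty list) instead. A sketch of the new behavior (the BoundingBox keyword arguments are an assumption about the class, which is not fully shown here):

from deepdoctection.datapoint.box import BoundingBox, intersection_boxes

box = BoundingBox(absolute_coords=True, ulx=10.0, uly=10.0, lrx=50.0, lry=50.0)

print(intersection_boxes([], [box]))  # -> [box]; the non-empty sequence is returned unchanged
print(intersection_boxes([], []))     # -> []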
@@ -596,6 +599,6 @@ def intersection_boxes(boxes_1: Sequence[BoundingBox], boxes_2: Sequence[Boundin
                 "height": np_boxes_output[idx][3],
             }
 
-            logger.warning("intersection_boxes error %s", "", log_dict)
+            logger.warning(LoggingRecord("intersection_boxes", log_dict))  # type: ignore
 
     return boxes_output
deepdoctection/datapoint/convert.py

@@ -32,6 +32,7 @@ from pypdf import PdfReader
 
 from ..utils.detection_types import ImageType
 from ..utils.develop import deprecated
+from ..utils.error import DependencyError
 from ..utils.pdf_utils import pdf_to_np_array
 from ..utils.viz import viz_handler
 
@@ -121,7 +122,8 @@ def convert_pdf_bytes_to_np_array(pdf_bytes: bytes, dpi: Optional[int] = None) -
     """
     from pdf2image import convert_from_bytes  # type: ignore # pylint: disable=C0415, E0401
 
-    assert which("pdftoppm") is not None, "convert_pdf_bytes_to_np_array requires poppler to be installed"
+    if which("pdftoppm") is None:
+        raise DependencyError("convert_pdf_bytes_to_np_array requires poppler to be installed")
 
     with BytesIO(pdf_bytes) as pdf_file:
         pdf = PdfReader(pdf_file).pages[0]
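The assert becomes a catchable, descriptive exception. A sketch of handling it (the sample file path is hypothetical):

from deepdoctection.datapoint.convert import convert_pdf_bytes_to_np_array
from deepdoctection.utils.error import DependencyError

with open("sample.pdf", "rb") as f:
    pdf_bytes = f.read()

try:
    np_image = convert_pdf_bytes_to_np_array(pdf_bytes, dpi=300)
except DependencyError as err:
    print(f"poppler (pdftoppm) is missing: {err}")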