deepdoctection 0.29-py3-none-any.whl → 0.31-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoctection/__init__.py +6 -2
- deepdoctection/analyzer/dd.py +13 -8
- deepdoctection/dataflow/base.py +0 -19
- deepdoctection/dataflow/custom.py +6 -5
- deepdoctection/dataflow/custom_serialize.py +20 -5
- deepdoctection/dataflow/parallel_map.py +22 -17
- deepdoctection/dataflow/serialize.py +5 -4
- deepdoctection/dataflow/stats.py +5 -5
- deepdoctection/datapoint/annotation.py +35 -14
- deepdoctection/datapoint/box.py +9 -6
- deepdoctection/datapoint/convert.py +3 -1
- deepdoctection/datapoint/image.py +66 -29
- deepdoctection/datapoint/view.py +62 -24
- deepdoctection/datasets/adapter.py +4 -5
- deepdoctection/datasets/base.py +87 -14
- deepdoctection/datasets/dataflow_builder.py +1 -1
- deepdoctection/datasets/info.py +2 -2
- deepdoctection/datasets/instances/fintabnet.py +3 -3
- deepdoctection/datasets/instances/layouttest.py +2 -7
- deepdoctection/datasets/instances/pubtabnet.py +3 -3
- deepdoctection/eval/accmetric.py +7 -5
- deepdoctection/eval/base.py +5 -4
- deepdoctection/eval/eval.py +9 -7
- deepdoctection/eval/tedsmetric.py +9 -3
- deepdoctection/eval/tp_eval_callback.py +8 -7
- deepdoctection/extern/base.py +39 -13
- deepdoctection/extern/d2detect.py +164 -64
- deepdoctection/extern/deskew.py +32 -7
- deepdoctection/extern/doctrocr.py +268 -29
- deepdoctection/extern/fastlang.py +45 -7
- deepdoctection/extern/hfdetr.py +90 -33
- deepdoctection/extern/hflayoutlm.py +109 -22
- deepdoctection/extern/model.py +30 -11
- deepdoctection/extern/pdftext.py +2 -1
- deepdoctection/extern/pt/ptutils.py +3 -2
- deepdoctection/extern/tessocr.py +134 -22
- deepdoctection/extern/texocr.py +4 -2
- deepdoctection/extern/tp/tpcompat.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +2 -7
- deepdoctection/extern/tpdetect.py +50 -23
- deepdoctection/mapper/d2struct.py +1 -1
- deepdoctection/mapper/hfstruct.py +1 -1
- deepdoctection/mapper/laylmstruct.py +1 -1
- deepdoctection/mapper/maputils.py +19 -5
- deepdoctection/mapper/prodigystruct.py +15 -13
- deepdoctection/mapper/pubstruct.py +10 -10
- deepdoctection/mapper/tpstruct.py +1 -1
- deepdoctection/pipe/anngen.py +35 -8
- deepdoctection/pipe/base.py +53 -19
- deepdoctection/pipe/cell.py +29 -8
- deepdoctection/pipe/common.py +12 -4
- deepdoctection/pipe/doctectionpipe.py +4 -3
- deepdoctection/pipe/language.py +3 -2
- deepdoctection/pipe/layout.py +3 -2
- deepdoctection/pipe/lm.py +2 -2
- deepdoctection/pipe/order.py +67 -39
- deepdoctection/pipe/refine.py +18 -10
- deepdoctection/pipe/segment.py +34 -20
- deepdoctection/pipe/text.py +14 -8
- deepdoctection/pipe/transform.py +16 -8
- deepdoctection/train/d2_frcnn_train.py +17 -14
- deepdoctection/train/hf_detr_train.py +13 -9
- deepdoctection/train/hf_layoutlm_train.py +31 -19
- deepdoctection/utils/__init__.py +3 -0
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +5 -5
- deepdoctection/utils/develop.py +2 -2
- deepdoctection/utils/env_info.py +64 -27
- deepdoctection/utils/error.py +84 -0
- deepdoctection/utils/file_utils.py +28 -17
- deepdoctection/utils/fs.py +16 -14
- deepdoctection/utils/logger.py +43 -19
- deepdoctection/utils/pdf_utils.py +14 -7
- deepdoctection/utils/settings.py +5 -1
- deepdoctection/utils/transform.py +1 -1
- deepdoctection/utils/utils.py +0 -6
- deepdoctection/utils/viz.py +83 -14
- {deepdoctection-0.29.dist-info → deepdoctection-0.31.dist-info}/METADATA +39 -61
- deepdoctection-0.31.dist-info/RECORD +144 -0
- {deepdoctection-0.29.dist-info → deepdoctection-0.31.dist-info}/WHEEL +1 -1
- deepdoctection-0.29.dist-info/RECORD +0 -143
- {deepdoctection-0.29.dist-info → deepdoctection-0.31.dist-info}/LICENSE +0 -0
- {deepdoctection-0.29.dist-info → deepdoctection-0.31.dist-info}/top_level.txt +0 -0
deepdoctection/__init__.py
CHANGED
@@ -27,7 +27,7 @@ from .utils.logger import logger

 # pylint: enable=wrong-import-position

-__version__ = 0.29
+__version__ = 0.31

 _IMPORT_STRUCTURE = {
     "analyzer": [
@@ -179,6 +179,7 @@ _IMPORT_STRUCTURE = {
         "Jdeskewer",
         "DoctrTextlineDetector",
         "DoctrTextRecognizer",
+        "DocTrRotationTransformer",
         "FasttextLangDetector",
         "HFDetrDerivedDetector",
         "HFLayoutLmTokenClassifierBase",
@@ -194,6 +195,7 @@ _IMPORT_STRUCTURE = {
         "ModelDownloadManager",
         "PdfPlumberTextDetector",
         "TesseractOcrDetector",
+        "TesseractRotationTransformer",
         "TextractOcrDetector",
         "TPFrcnnDetector",
     ],
@@ -279,7 +281,7 @@ _IMPORT_STRUCTURE = {
         "PubtablesSegmentationService",
         "SegmentationResult",
         "TextExtractionService",
-        "
+        "SimpleTransformService",
     ],
     "train": [
         "D2Trainer",
@@ -343,6 +345,8 @@ _IMPORT_STRUCTURE = {
         "get_opencv_requirement",
         "pillow_available",
         "get_pillow_requirement",
+        "spacy_available",
+        "get_spacy_requirement",
         "load_image_from_file",
         "load_bytes_from_pdf_file",
         "get_load_image_func",
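Note: `_IMPORT_STRUCTURE` drives the package's lazy top-level exports, so the names added above should become reachable from the package root in 0.31. A minimal sketch, assuming the lazy-import machinery re-exports these names unchanged:

    import deepdoctection as dd

    # New in 0.31: attribute access triggers the lazy import of the owning module.
    transform_service_cls = dd.SimpleTransformService
    # New availability helpers for the spaCy integration:
    spacy_installed = dd.spacy_available()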
deepdoctection/analyzer/dd.py
CHANGED
@@ -54,7 +54,7 @@ from ..utils.file_utils import (
     tf_available,
 )
 from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
-from ..utils.logger import logger
+from ..utils.logger import LoggingRecord, logger
 from ..utils.metacfg import AttrDict, set_config_by_yaml
 from ..utils.settings import CellType, LayoutType
 from ..utils.transform import PadTransform
@@ -113,11 +113,12 @@ def config_sanity_checks(cfg: AttrDict) -> None:
     """Some config sanity checks"""
     if cfg.USE_PDF_MINER and cfg.USE_OCR and cfg.OCR.USE_DOCTR:
         raise ValueError("Configuration USE_PDF_MINER= True and USE_OCR=True and USE_DOCTR=True is not allowed")
-    if cfg.
-
-
-
-
+    if cfg.USE_OCR:
+        if cfg.OCR.USE_TESSERACT + cfg.OCR.USE_DOCTR + cfg.OCR.USE_TEXTRACT != 1:
+            raise ValueError(
+                "Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True "
+                "and set the other two to False. Only one OCR system can be activated."
+            )


 def build_detector(
@@ -231,9 +232,13 @@ def build_ocr(cfg: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer,
         weights = cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.TF if cfg.LIB == "TF" else cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.PT
         weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
         profile = ModelCatalog.get_profile(weights)
+        # get_full_path_configs will complete the path even if the model is not registered
+        config_path = ModelCatalog.get_full_path_configs(weights) if profile.config is not None else None
         if profile.architecture is None:
             raise ValueError("model profile.architecture must be specified")
-        return DoctrTextRecognizer(
+        return DoctrTextRecognizer(
+            profile.architecture, weights_path, cfg.DEVICE, lib=cfg.LIB, path_config_json=config_path
+        )
     if cfg.OCR.USE_TEXTRACT:
         credentials_kwargs = {
             "aws_access_key_id": environ.get("ACCESS_KEY"),
@@ -445,7 +450,7 @@ def get_dd_analyzer(
     cfg.update_args(config_overwrite)

     config_sanity_checks(cfg)
-    logger.info("Config: \n
+    logger.info(LoggingRecord(f"Config: \n {str(cfg)}", cfg.to_dict()))  # type: ignore

     # will silent all TP logging while building the tower
     if tensorpack_available():
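Note: the new sanity check counts boolean config flags by integer addition (`bool` is an `int` subclass), enforcing that exactly one OCR backend is active. A standalone sketch of the pattern, with hypothetical flags:

    def exactly_one_enabled(*flags: bool) -> bool:
        # True sums as 1, False as 0, so the sum counts enabled flags
        return sum(flags) == 1

    assert exactly_one_enabled(True, False, False)
    assert not exactly_one_enabled(True, True, False)    # two backends -> invalid
    assert not exactly_one_enabled(False, False, False)  # none -> invalid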
deepdoctection/dataflow/base.py
CHANGED
@@ -17,25 +17,6 @@ from typing import Any, Iterator, no_type_check
 from ..utils.utils import get_rng


-class DataFlowTerminated(BaseException):
-    """
-    An exception indicating that the DataFlow is unable to produce any more
-    data, i.e. something wrong happened so that calling `__iter__`
-    cannot give a valid iterator anymore.
-    In most DataFlow this will never be raised.
-    """
-
-
-class DataFlowResetStateNotCalled(BaseException):
-    """
-    An exception indicating that `reset_state()` has not been called before starting
-    iteration.
-    """
-
-    def __init__(self) -> None:
-        super().__init__("Iterating a dataflow requires .reset_state() to be called first")
-
-
 class DataFlowReentrantGuard:
     """
     A tool to enforce non-reentrancy.
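Note: the two exceptions removed here reappear in the new `deepdoctection/utils/error.py` (+84 lines in the file list) under the spellings used throughout this diff, i.e. with an `Error` suffix. Downstream `except` clauses need the new import path; a sketch:

    # 0.29 (old location and spelling):
    # from deepdoctection.dataflow.base import DataFlowResetStateNotCalled
    # 0.31 (new location and spelling):
    from deepdoctection.utils.error import DataFlowResetStateNotCalledError, DataFlowTerminatedError

    def first_datapoint(df):
        try:
            return next(iter(df))
        except DataFlowResetStateNotCalledError:
            df.reset_state()  # the error message says: call reset_state() first
            return next(iter(df))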
deepdoctection/dataflow/custom.py
CHANGED

@@ -25,10 +25,11 @@ from typing import Any, Callable, Iterable, Iterator, List, Optional

 import numpy as np

-from ..utils.
+from ..utils.error import DataFlowResetStateNotCalledError
+from ..utils.logger import LoggingRecord, logger
 from ..utils.tqdm import get_tqdm
 from ..utils.utils import get_rng
-from .base import DataFlow, DataFlowReentrantGuard,
+from .base import DataFlow, DataFlowReentrantGuard, ProxyDataFlow
 from .serialize import DataFromIterable, DataFromList

 __all__ = ["CacheData", "CustomDataFromList", "CustomDataFromIterable"]
@@ -65,7 +66,7 @@ class CacheData(ProxyDataFlow):

     def __iter__(self) -> Iterator[Any]:
         if self._guard is None:
-            raise
+            raise DataFlowResetStateNotCalledError()

         with self._guard:
             if self.buffer:
@@ -139,10 +140,10 @@ class CustomDataFromList(DataFromList):

     def __iter__(self) -> Iterator[Any]:
         if self.rng is None:
-            raise
+            raise DataFlowResetStateNotCalledError()
         if self.rebalance_func is not None:
             lst_tmp = self.rebalance_func(self.lst)
-            logger.info("subset size after re-balancing:
+            logger.info(LoggingRecord(f"CustomDataFromList: subset size after re-balancing: {len(lst_tmp)}"))
         else:
             lst_tmp = self.lst
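Note: iterating any of these dataflows before `reset_state()` now raises the dedicated error instead of a bare `raise`. Usage sketch (the `deepdoctection.dataflow` re-export and constructor arguments beyond `lst` are assumptions):

    from deepdoctection.dataflow import CustomDataFromList

    df = CustomDataFromList(lst=[{"page": 1}, {"page": 2}])
    # Iterating before reset_state() raises DataFlowResetStateNotCalledError
    df.reset_state()  # seeds df.rng
    for datapoint in df:
        print(datapoint)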
deepdoctection/dataflow/custom_serialize.py
CHANGED

@@ -23,16 +23,20 @@ import itertools
 import json
 import os
 from collections import defaultdict
+from pathlib import Path
 from typing import DefaultDict, Dict, List, Optional, Sequence, Union

 from jsonlines import Reader, Writer
+from tabulate import tabulate
+from termcolor import colored

 from ..utils.context import timed_operation
 from ..utils.detection_types import JsonDict, Pathlike
+from ..utils.error import FileExtensionError
 from ..utils.identifier import get_uuid_from_str
 from ..utils.pdf_utils import PDFStreamer
 from ..utils.tqdm import get_tqdm
-from ..utils.utils import
+from ..utils.utils import is_file_extension
 from .base import DataFlow
 from .common import FlattenData, JoinData, MapData
 from .custom import CacheData, CustomDataFromIterable, CustomDataFromList
@@ -186,6 +190,11 @@ class SerializerFiles:
         df2: DataFlow
         df3: DataFlow

+        if isinstance(path, str):
+            path = Path(path)
+        if not path.exists():
+            raise NotADirectoryError(f"The path {path} to the directory or file does not exist")
+
         if shuffle:
             sort = False
         it1 = os.walk(path, topdown=False)
@@ -217,7 +226,7 @@ class SerializerFiles:
         """
         Not implemented
         """
-        raise NotImplementedError
+        raise NotImplementedError()


 class CocoParser:
@@ -277,8 +286,14 @@ class CocoParser:
         """
         Print information about the annotation file.
        """
+        rows = []
         for key, value in self.dataset["info"].items():
-
+            row = [key, value]
+            rows.append(row)
+
+        header = ["key", "value"]
+        table = tabulate(rows, headers=header, tablefmt="fancy_grid", stralign="left", numalign="left")
+        print(colored(table, "cyan"))

     def get_ann_ids(
         self,
@@ -493,7 +508,7 @@ class SerializerCoco:
         """
         Not implemented
         """
-        raise NotImplementedError
+        raise NotImplementedError()


 class SerializerPdfDoc:
@@ -541,7 +556,7 @@ class SerializerPdfDoc:
         """
         Not implemented
         """
-        raise NotImplementedError
+        raise NotImplementedError()

     @staticmethod
     def split(path: Pathlike, path_target: Optional[Pathlike] = None, max_datapoint: Optional[int] = None) -> None:
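Note: `CocoParser.info` now renders the COCO `info` block as a colored table. The rendering, extracted as a standalone sketch (`info` stands in for `self.dataset["info"]`):

    from tabulate import tabulate
    from termcolor import colored

    info = {"description": "sample COCO file", "version": "1.0"}  # stand-in data
    rows = [[key, value] for key, value in info.items()]
    table = tabulate(rows, headers=["key", "value"], tablefmt="fancy_grid", stralign="left", numalign="left")
    print(colored(table, "cyan"))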
deepdoctection/dataflow/parallel_map.py
CHANGED

@@ -28,8 +28,9 @@ from typing import Any, Callable, Iterator, List, no_type_check
 import zmq

 from ..utils.concurrency import StoppableThread, enable_death_signal, start_proc_mask_signal
-from ..utils.
-from .
+from ..utils.error import DataFlowTerminatedError
+from ..utils.logger import LoggingRecord, logger
+from .base import DataFlow, DataFlowReentrantGuard, ProxyDataFlow
 from .common import RepeatedData
 from .serialize import PickleSerializer

@@ -48,15 +49,15 @@ def _zmq_catch_error(name):
     try:
         yield
     except zmq.ContextTerminated as exc:
-        logger.info("[
-        raise
+        logger.info(LoggingRecord(f"_zmq_catch_error: [{name}] Context terminated."))
+        raise DataFlowTerminatedError() from exc
     except zmq.ZMQError as exc:
         if exc.errno == errno.ENOTSOCK:  # socket closed
-            logger.info("[
-            raise
-        raise ValueError from exc
+            logger.info(LoggingRecord(f"_zmq_catch_error: [{name}] Socket closed."))
+            raise DataFlowTerminatedError() from exc
+        raise ValueError() from exc
     except Exception as exc:
-        raise ValueError from exc
+        raise ValueError() from exc


 @no_type_check
@@ -78,8 +79,8 @@ def _get_pipe_name(name):
 class _ParallelMapData(ProxyDataFlow, ABC):
     def __init__(self, df: DataFlow, buffer_size: int, strict: bool = False) -> None:
         super().__init__(df)
-        if
-            raise ValueError("buffer_size must be a positive number")
+        if buffer_size <= 0:
+            raise ValueError(f"buffer_size must be a positive number, got {buffer_size}")
         self._buffer_size = buffer_size
         self._buffer_occupancy = 0  # actual #elements in buffer, only useful in strict mode
         self._strict = strict
@@ -95,12 +96,12 @@ class _ParallelMapData(ProxyDataFlow, ABC):
     @no_type_check
     @abstractmethod
     def _recv(self):
-        raise NotImplementedError
+        raise NotImplementedError()

     @no_type_check
     @abstractmethod
     def _send(self, dp: Any):
-        raise NotImplementedError
+        raise NotImplementedError()

     @no_type_check
     def _recv_filter_none(self):
@@ -312,7 +313,8 @@ class _MultiProcessZMQDataFlow(DataFlow, ABC):
             for x in self._procs:
                 x.terminate()
                 x.join(5)
-            logger.info("
+            logger.info(LoggingRecord(f"_MultiProcessZMQDataFlow [{type(self).__name__}] successfully cleaned-up."))
+
         except Exception:  # pylint: disable=W0703
             pass

@@ -323,9 +325,12 @@ def _bind_guard(sock, name):
         sock.bind(name)
     except zmq.ZMQError:
         logger.error(
-
-
+            LoggingRecord(
+                f"ZMQError in socket.bind('{name}'). Perhaps you're using pipes on a non-local file system. "
+                "See documentation of MultiProcessRunnerZMQ for more information."
+            )
         )
+
         raise


@@ -394,8 +399,8 @@ class MultiProcessMapData(_ParallelMapData, _MultiProcessZMQDataFlow):

         _ParallelMapData.__init__(self, df, buffer_size, strict)
         _MultiProcessZMQDataFlow.__init__(self)
-        if
-            raise ValueError("num_proc must be a positive number")
+        if num_proc <= 0:
+            raise ValueError(f"num_proc must be a positive number, got {num_proc}")
         self.num_proc = num_proc
         self.map_func = map_func
         self._strict = strict
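Note: `_zmq_catch_error` translates low-level ZMQ shutdown conditions into the new `DataFlowTerminatedError`. Reconstructed as a standalone sketch from the hunk above (the `@contextmanager` decorator is assumed, as in the tensorpack original this module derives from; logging via `LoggingRecord` omitted for brevity):

    import errno
    from contextlib import contextmanager

    import zmq

    from deepdoctection.utils.error import DataFlowTerminatedError

    @contextmanager
    def zmq_catch_error(name):
        try:
            yield
        except zmq.ContextTerminated as exc:
            # context torn down: the dataflow cannot produce more data
            raise DataFlowTerminatedError() from exc
        except zmq.ZMQError as exc:
            if exc.errno == errno.ENOTSOCK:  # socket closed
                raise DataFlowTerminatedError() from exc
            raise ValueError() from exc
        except Exception as exc:
            raise ValueError() from exc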
deepdoctection/dataflow/serialize.py
CHANGED

@@ -16,7 +16,8 @@ from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union

 import numpy as np

-from .
+from ..utils.error import DataFlowResetStateNotCalledError
+from .base import DataFlow, RNGDataFlow


 class DataFromList(RNGDataFlow):
@@ -44,7 +45,7 @@ class DataFromList(RNGDataFlow):
             for k in idxs:
                 yield self.lst[k]
         else:
-            raise
+            raise DataFlowResetStateNotCalledError()


 class DataFromIterable(DataFlow):
@@ -63,7 +64,7 @@ class DataFromIterable(DataFlow):

     def __len__(self) -> int:
         if self._len is None:
-            raise NotImplementedError
+            raise NotImplementedError()
         return self._len

     def __iter__(self) -> Iterator[Any]:
@@ -107,7 +108,7 @@ class FakeData(RNGDataFlow):

     def __iter__(self) -> Iterator[Any]:
         if self.rng is None:
-            raise
+            raise DataFlowResetStateNotCalledError()
         if self.random:
             for _ in range(self._size):
                 val = []
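Note: `DataFromIterable` only knows its length when one could be inferred from the input; otherwise `__len__` now raises `NotImplementedError()` explicitly. Sketch (assuming `_len` is taken from `len(iterable)` when the input supports it):

    from deepdoctection.dataflow import DataFromIterable

    df = DataFromIterable([1, 2, 3])  # len() is available, so _len is set
    df.reset_state()
    assert len(df) == 3

    gen = DataFromIterable(x * x for x in range(3))  # generators have no len()
    gen.reset_state()
    # len(gen) raises NotImplementedError()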
deepdoctection/dataflow/stats.py
CHANGED
@@ -23,7 +23,7 @@ from typing import Any, Optional, Tuple, Union
 import numpy as np
 import numpy.typing as npt

-from ..utils.logger import logger
+from ..utils.logger import LoggingRecord, logger
 from ..utils.tqdm import get_tqdm
 from .base import DataFlow, ProxyDataFlow

@@ -95,7 +95,7 @@ class MeanFromDataFlow(ProxyDataFlow):
         self.df.reset_state()
         itr = iter(self.df)

-        logger.info("Calculating mean")
+        logger.info(LoggingRecord("Calculating mean"))

         len_df: Optional[int]
         try:
@@ -139,7 +139,7 @@ class MeanFromDataFlow(ProxyDataFlow):
             if n == self.max_datapoints:
                 break

-        logger.info("Mean from
+        logger.info(LoggingRecord(f"Mean from {n} datapoints along axis {self.axis}: {self.mean}"))

         return self.mean

@@ -216,7 +216,7 @@ class StdFromDataFlow(ProxyDataFlow):
         self.df.reset_state()
         itr = iter(self.df)

-        logger.info("Calculating standard deviation")
+        logger.info(LoggingRecord("Calculating standard deviation"))
         try:
             len_df = len(self.df)
         except NotImplementedError:
@@ -266,6 +266,6 @@ class StdFromDataFlow(ProxyDataFlow):
         var = (ex2 - (ex * ex) / n) / (n - 1)
         self.std = np.sqrt(var)

-        logger.info("Standard deviation from
+        logger.info(LoggingRecord(f"Standard deviation from {n} datapoints along axis {self.axis}: {self.std}"))

         return self.std
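Note: the closing hunk's `var = (ex2 - (ex * ex) / n) / (n - 1)` is the shifted-data single-pass variance estimator: with a shift constant K, ex = Σ(x−K) and ex2 = Σ(x−K)², the sample variance is (ex2 − ex²/n)/(n−1). A numeric check:

    import numpy as np

    x = np.array([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])
    K = x[0]  # any shift constant works; shifting improves numerical stability
    ex, ex2, n = np.sum(x - K), np.sum((x - K) ** 2), x.size
    var = (ex2 - (ex * ex) / n) / (n - 1)
    assert np.isclose(np.sqrt(var), np.std(x, ddof=1))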
deepdoctection/datapoint/annotation.py
CHANGED

@@ -24,8 +24,9 @@ from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Union, no_type_check

 from ..utils.detection_types import JsonDict
+from ..utils.error import AnnotationError, UUIDError
 from ..utils.identifier import get_uuid, is_uuid_like
-from ..utils.logger import logger
+from ..utils.logger import LoggingRecord, logger
 from ..utils.settings import DefaultType, ObjectTypes, SummaryType, TypeOrStr, get_type
 from .box import BoundingBox
 from .convert import as_dict
@@ -36,7 +37,16 @@ def ann_from_dict(cls, **kwargs):
     """
     A factory function to create subclasses of annotations from a given dict
     """
-
+    _init_kwargs = {
+        "external_id": kwargs.get("external_id"),
+        "category_name": kwargs.get("category_name"),
+        "category_id": kwargs.get("category_id"),
+        "score": kwargs.get("score"),
+        "service_id": kwargs.get("service_id"),
+        "model_id": kwargs.get("model_id"),
+        "session_id": kwargs.get("session_id"),
+    }
+    ann = cls(**_init_kwargs)
     ann.active = kwargs.get("active")
     ann._annotation_id = kwargs.get("_annotation_id")  # pylint: disable=W0212
     if isinstance(kwargs.get("sub_categories"), dict):
@@ -74,11 +84,17 @@ class Annotation(ABC):
     id will not depend on the defining attributes.

     `_annotation_id`: Unique id for annotations. Will always be given as string representation of a md5-hash.
+    `service_id`: Service that generated the annotation. This will be the name of a pipeline component
+    `model_id`: Model that generated the annotation. This will be the name of particular model
+    `session_id`: Session id for the annotation. This will be the id of the session in which the annotation was created.
     """

     active: bool = field(default=True, init=False, repr=True)
     external_id: Optional[Union[str, int]] = field(default=None, init=True, repr=False)
     _annotation_id: Optional[str] = field(default=None, init=False, repr=True)
+    service_id: Optional[str] = field(default=None)
+    model_id: Optional[str] = field(default=None)
+    session_id: Optional[str] = field(default=None)

     def __post_init__(self) -> None:
         """
@@ -101,7 +117,7 @@ class Annotation(ABC):
         """
         if self._annotation_id:
             return self._annotation_id
-        raise
+        raise AnnotationError("Dump annotation first or pass external_id to create an annotation id")

     @annotation_id.setter
     def annotation_id(self, input_id: str) -> None:
@@ -109,13 +125,13 @@ class Annotation(ABC):
         annotation_id setter
         """
         if self._annotation_id is not None:
-            raise
+            raise AnnotationError("Annotation_id already defined and cannot be reset")
         if is_uuid_like(input_id):
             self._annotation_id = input_id
         elif isinstance(input_id, property):
             pass
         else:
-            raise
+            raise AnnotationError("Annotation_id must be uuid3 string")

     @abstractmethod
     def get_defining_attributes(self) -> List[str]:
@@ -126,13 +142,13 @@ class Annotation(ABC):

         :return: A list of attributes.
         """
-        raise NotImplementedError
+        raise NotImplementedError()

     def _assert_attributes_have_str(self, state_id: bool = False) -> None:
         defining_attributes = self.get_state_attributes() if state_id else self.get_defining_attributes()
         for attr in defining_attributes:
             if not hasattr(eval("self." + attr), "__str__"):  # pylint: disable=W0123
-                raise
+                raise AnnotationError(f"Attribute {attr} must have __str__ method")

     @staticmethod
     def set_annotation_id(annotation: "CategoryAnnotation", *container_id_context: Optional[str]) -> str:
@@ -179,7 +195,7 @@ class Annotation(ABC):

         :return: Annotation instance
         """
-        raise NotImplementedError
+        raise NotImplementedError()

     @staticmethod
     @abstractmethod
@@ -189,7 +205,7 @@ class Annotation(ABC):

         :return: A list of attributes.
         """
-        raise NotImplementedError
+        raise NotImplementedError()

     @property
     def state_id(self) -> str:
@@ -290,7 +306,12 @@ class CategoryAnnotation(Annotation):
         """

         if sub_category_name in self.sub_categories:
-            raise
+            raise AnnotationError(
+                f"sub category {sub_category_name} already defined: "
+                f"annotation_id: {self.annotation_id}, "
+                f"category_name: {self.category_name}, "
+                f"category_id: {self.category_id}"
+            )

         if self._annotation_id is not None:
             if annotation._annotation_id is None:  # pylint: disable=W0212
@@ -333,7 +354,7 @@ class CategoryAnnotation(Annotation):
         :param annotation_id: An annotation id
         """
         if not is_uuid_like(annotation_id):
-            raise
+            raise UUIDError("Annotation_id must be uuid")

         key_type = get_type(key)
         if key not in self.relationships:
@@ -369,7 +390,7 @@ class CategoryAnnotation(Annotation):
             try:
                 self.relationships[key].remove(ann_id)
             except ValueError:
-                logger.warning("Relationship
+                logger.warning(LoggingRecord(f"Relationship {key} cannot be removed because it does not exist"))
         else:
             self.relationships[key].clear()

@@ -436,14 +457,14 @@ class ImageAnnotation(CategoryAnnotation):
         box = self.bounding_box
         if box:
             return box
-        raise
+        raise AnnotationError(f"bounding_box has not been initialized for {self.annotation_id}")

     def get_summary(self, key: ObjectTypes) -> CategoryAnnotation:
         """Get summary sub categories from `image`. Raises `ValueError` if `key` is not available"""
         if self.image:
             if self.image.summary:
                 return self.image.summary.get_sub_category(key)
-        raise
+        raise AnnotationError(f"Summary does not exist for {self.annotation_id} and key: {key}")


 @dataclass
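Note: the three new provenance fields are plain optional dataclass fields that `ann_from_dict` now round-trips. A construction sketch (treating `CategoryAnnotation`'s `category_name`/`category_id` keywords and the example string values as assumptions):

    from deepdoctection.datapoint.annotation import CategoryAnnotation

    ann = CategoryAnnotation(
        category_name="table",
        category_id="1",
        service_id="image_layout",  # pipeline component that produced the annotation
        model_id="layout_model",    # hypothetical model name
    )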
deepdoctection/datapoint/box.py
CHANGED
@@ -28,8 +28,9 @@ import numpy.typing as npt
 from numpy import float32

 from ..utils.detection_types import ImageType
+from ..utils.error import BoundingBoxError
 from ..utils.file_utils import cocotools_available
-from ..utils.logger import logger
+from ..utils.logger import LoggingRecord, logger

 if cocotools_available():
     import pycocotools.mask as coco_mask
@@ -140,10 +141,6 @@ def iou(boxes1: npt.NDArray[float32], boxes2: npt.NDArray[float32]) -> npt.NDArr
     return np_iou(boxes1, boxes2)


-class BoundingBoxError(BaseException):
-    """Special exception only for `BoundingBox`"""
-
-
 @dataclass
 class BoundingBox:
     """
@@ -558,6 +555,12 @@ def intersection_boxes(boxes_1: Sequence[BoundingBox], boxes_2: Sequence[Boundin
     :param boxes_2: sequence of n BoundingBox
     :return: list of at most mxn BoundingBox
     """
+    if not boxes_1 and boxes_2:
+        return boxes_2
+    if not boxes_2 and boxes_1:
+        return boxes_1
+    if not boxes_1 and not boxes_2:
+        return []
     if boxes_1[0].absolute_coords != boxes_2[0].absolute_coords:
         raise ValueError("absolute_coords of boxes_1 and boxes_2 mus be equal")
     absolute_coords = boxes_1[0].absolute_coords
@@ -596,6 +599,6 @@ def intersection_boxes(boxes_1: Sequence[BoundingBox], boxes_2: Sequence[Boundin
             "height": np_boxes_output[idx][3],
         }

-    logger.warning("intersection_boxes
+    logger.warning(LoggingRecord("intersection_boxes", log_dict))  # type: ignore

     return boxes_output
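Note: the new guards in `intersection_boxes` make empty inputs explicit; notably, when one side is empty the other side is returned unchanged rather than an empty intersection. Sketch (keyword construction of `BoundingBox` is an assumption):

    from deepdoctection.datapoint.box import BoundingBox, intersection_boxes

    box = BoundingBox(absolute_coords=True, ulx=0, uly=0, lrx=10, lry=10)
    assert intersection_boxes([], []) == []
    boxes = intersection_boxes([], [box])  # returns [box], not []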
deepdoctection/datapoint/convert.py
CHANGED

@@ -32,6 +32,7 @@ from pypdf import PdfReader

 from ..utils.detection_types import ImageType
 from ..utils.develop import deprecated
+from ..utils.error import DependencyError
 from ..utils.pdf_utils import pdf_to_np_array
 from ..utils.viz import viz_handler

@@ -121,7 +122,8 @@ def convert_pdf_bytes_to_np_array(pdf_bytes: bytes, dpi: Optional[int] = None) -
     """
     from pdf2image import convert_from_bytes  # type: ignore # pylint: disable=C0415, E0401

-
+    if which("pdftoppm") is None:
+        raise DependencyError("convert_pdf_bytes_to_np_array requires poppler to be installed")

     with BytesIO(pdf_bytes) as pdf_file:
         pdf = PdfReader(pdf_file).pages[0]