deepdoctection 0.36__tar.gz → 0.37.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- {deepdoctection-0.36 → deepdoctection-0.37.1}/PKG-INFO +1 -1
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/__init__.py +3 -1
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/analyzer/factory.py +3 -3
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/dataflow/custom_serialize.py +1 -1
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datapoint/convert.py +11 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datapoint/image.py +9 -5
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/save.py +1 -1
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/base.py +2 -3
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/hflayoutlm.py +1 -1
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/mapper/misc.py +5 -1
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/pipe/base.py +29 -9
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/pipe/doctectionpipe.py +77 -10
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/fs.py +8 -7
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/pdf_utils.py +45 -17
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/viz.py +33 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection.egg-info/PKG-INFO +1 -1
- {deepdoctection-0.36 → deepdoctection-0.37.1}/setup.py +1 -1
- {deepdoctection-0.36 → deepdoctection-0.37.1}/LICENSE +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/README.md +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/analyzer/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/analyzer/_config.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/analyzer/dd.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/configs/conf_dd_one.yaml +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/dataflow/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/dataflow/base.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/dataflow/common.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/dataflow/custom.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/dataflow/parallel_map.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/dataflow/serialize.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/dataflow/stats.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datapoint/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datapoint/annotation.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datapoint/box.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datapoint/view.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/adapter.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/base.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/dataflow_builder.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/info.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/instances/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/instances/doclaynet.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/instances/fintabnet.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/instances/funsd.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/instances/iiitar13k.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/instances/layouttest.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/instances/publaynet.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/instances/pubtables1m.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/instances/pubtabnet.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/instances/rvlcdip.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/instances/xfund.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/registry.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/eval/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/eval/accmetric.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/eval/base.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/eval/cocometric.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/eval/eval.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/eval/registry.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/eval/tedsmetric.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/eval/tp_eval_callback.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/d2detect.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/deskew.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/doctrocr.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/fastlang.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/hfdetr.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/hflm.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/model.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/pdftext.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/pt/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/pt/nms.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/pt/ptutils.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tessocr.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/texocr.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tfutils.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpcompat.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/tpdetect.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/mapper/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/mapper/cats.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/mapper/cocostruct.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/mapper/d2struct.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/mapper/hfstruct.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/mapper/laylmstruct.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/mapper/maputils.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/mapper/match.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/mapper/pascalstruct.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/mapper/prodigystruct.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/mapper/pubstruct.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/mapper/tpstruct.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/mapper/xfundstruct.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/pipe/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/pipe/anngen.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/pipe/common.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/pipe/concurrency.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/pipe/language.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/pipe/layout.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/pipe/lm.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/pipe/order.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/pipe/refine.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/pipe/registry.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/pipe/segment.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/pipe/sub_layout.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/pipe/text.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/pipe/transform.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/py.typed +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/train/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/train/d2_frcnn_train.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/train/hf_detr_train.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/train/hf_layoutlm_train.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/train/tp_frcnn_train.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/__init__.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/concurrency.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/context.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/develop.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/env_info.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/error.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/file_utils.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/identifier.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/logger.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/metacfg.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/mocks.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/settings.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/tqdm.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/transform.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/types.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/utils.py +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection.egg-info/SOURCES.txt +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection.egg-info/requires.txt +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection.egg-info/top_level.txt +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/setup.cfg +0 -0
- {deepdoctection-0.36 → deepdoctection-0.37.1}/tests/test_utils.py +0 -0
|
@@ -24,7 +24,7 @@ from .utils.logger import LoggingRecord, logger
|
|
|
24
24
|
|
|
25
25
|
# pylint: enable=wrong-import-position
|
|
26
26
|
|
|
27
|
-
__version__ = 0.
|
|
27
|
+
__version__ = "0.37.1"
|
|
28
28
|
|
|
29
29
|
_IMPORT_STRUCTURE = {
|
|
30
30
|
"analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
|
|
@@ -87,6 +87,7 @@ _IMPORT_STRUCTURE = {
|
|
|
87
87
|
"convert_b64_to_np_array",
|
|
88
88
|
"convert_np_array_to_b64",
|
|
89
89
|
"convert_np_array_to_b64_b",
|
|
90
|
+
"convert_bytes_to_np_array",
|
|
90
91
|
"convert_pdf_bytes_to_np_array_v2",
|
|
91
92
|
"box_to_point4",
|
|
92
93
|
"point4_to_box",
|
|
@@ -371,6 +372,7 @@ _IMPORT_STRUCTURE = {
|
|
|
371
372
|
"save_config_to_yaml",
|
|
372
373
|
"config_to_cli_str",
|
|
373
374
|
"decrypt_pdf_document",
|
|
375
|
+
"decrypt_pdf_document_from_bytes",
|
|
374
376
|
"get_pdf_file_reader",
|
|
375
377
|
"get_pdf_file_writer",
|
|
376
378
|
"PDFStreamer",
|
|
@@ -327,9 +327,9 @@ class ServiceFactory:
|
|
|
327
327
|
)
|
|
328
328
|
if config.OCR.USE_TEXTRACT:
|
|
329
329
|
credentials_kwargs = {
|
|
330
|
-
"aws_access_key_id": environ.get("
|
|
331
|
-
"aws_secret_access_key": environ.get("
|
|
332
|
-
"config": Config(region_name=environ.get("
|
|
330
|
+
"aws_access_key_id": environ.get("AWS_ACCESS_KEY", None),
|
|
331
|
+
"aws_secret_access_key": environ.get("AWS_SECRET_KEY", None),
|
|
332
|
+
"config": Config(region_name=environ.get("AWS_REGION", None)),
|
|
333
333
|
}
|
|
334
334
|
return TextractOcrDetector(**credentials_kwargs)
|
|
335
335
|
raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")
|
|
@@ -593,7 +593,7 @@ class SerializerPdfDoc:
|
|
|
593
593
|
file_name = os.path.split(path)[1]
|
|
594
594
|
prefix, suffix = os.path.splitext(file_name)
|
|
595
595
|
df: DataFlow
|
|
596
|
-
df = CustomDataFromIterable(PDFStreamer(
|
|
596
|
+
df = CustomDataFromIterable(PDFStreamer(path_or_bytes=path), max_datapoints=max_datapoints)
|
|
597
597
|
df = MapData(
|
|
598
598
|
df,
|
|
599
599
|
lambda dp: {
|
|
@@ -40,6 +40,7 @@ __all__ = [
|
|
|
40
40
|
"convert_b64_to_np_array",
|
|
41
41
|
"convert_np_array_to_b64",
|
|
42
42
|
"convert_np_array_to_b64_b",
|
|
43
|
+
"convert_bytes_to_np_array",
|
|
43
44
|
"convert_pdf_bytes_to_np_array_v2",
|
|
44
45
|
"box_to_point4",
|
|
45
46
|
"point4_to_box",
|
|
@@ -107,6 +108,16 @@ def convert_np_array_to_b64_b(np_image: PixelValues) -> bytes:
|
|
|
107
108
|
return viz_handler.encode(np_image)
|
|
108
109
|
|
|
109
110
|
|
|
111
|
+
def convert_bytes_to_np_array(image_bytes: bytes) -> PixelValues:
|
|
112
|
+
"""
|
|
113
|
+
Converts an image in bytes to a numpy array
|
|
114
|
+
|
|
115
|
+
:param image_bytes: An image as bytes.
|
|
116
|
+
:return: numpy array.
|
|
117
|
+
"""
|
|
118
|
+
return viz_handler.convert_bytes_to_np(image_bytes)
|
|
119
|
+
|
|
120
|
+
|
|
110
121
|
@deprecated("Use convert_pdf_bytes_to_np_array_v2", "2022-02-23")
|
|
111
122
|
def convert_pdf_bytes_to_np_array(pdf_bytes: bytes, dpi: Optional[int] = None) -> PixelValues:
|
|
112
123
|
"""
|
|
@@ -34,6 +34,7 @@ from ..utils.error import AnnotationError, BoundingBoxError, ImageError, UUIDErr
|
|
|
34
34
|
from ..utils.identifier import get_uuid, is_uuid_like
|
|
35
35
|
from ..utils.settings import ObjectTypes, SummaryType, get_type
|
|
36
36
|
from ..utils.types import ImageDict, PathLikeOrStr, PixelValues
|
|
37
|
+
from ..utils.logger import LoggingRecord, logger
|
|
37
38
|
from .annotation import Annotation, AnnotationMap, BoundingBox, CategoryAnnotation, ImageAnnotation
|
|
38
39
|
from .box import crop_box_from_image, global_to_local_coords, intersection_box
|
|
39
40
|
from .convert import as_dict, convert_b64_to_np_array, convert_np_array_to_b64, convert_pdf_bytes_to_np_array_v2
|
|
@@ -474,8 +475,11 @@ class Image:
|
|
|
474
475
|
|
|
475
476
|
for service_id in service_ids:
|
|
476
477
|
if service_id not in service_id_to_annotation_id:
|
|
477
|
-
|
|
478
|
-
|
|
478
|
+
logger.info(
|
|
479
|
+
LoggingRecord(
|
|
480
|
+
f"Service_id {service_id} for image_id: {self.image_id} not found. Skipping removal."))
|
|
481
|
+
|
|
482
|
+
annotation_ids = service_id_to_annotation_id.get(service_id, [])
|
|
479
483
|
|
|
480
484
|
for ann_id in annotation_ids:
|
|
481
485
|
if ann_id not in ann_id_to_annotation_maps:
|
|
@@ -587,7 +591,7 @@ class Image:
|
|
|
587
591
|
)
|
|
588
592
|
ann.image.dump(sub_image)
|
|
589
593
|
|
|
590
|
-
def
|
|
594
|
+
def remove_image_from_lower_hierarchy(self, pixel_values_only: bool = False) -> None:
|
|
591
595
|
"""Will remove all images from image annotations."""
|
|
592
596
|
for ann in self.annotations:
|
|
593
597
|
if pixel_values_only:
|
|
@@ -717,7 +721,7 @@ class Image:
|
|
|
717
721
|
else:
|
|
718
722
|
path_json = fspath(path) + ".json"
|
|
719
723
|
if highest_hierarchy_only:
|
|
720
|
-
self.
|
|
724
|
+
self.remove_image_from_lower_hierarchy()
|
|
721
725
|
export_dict = self.as_dict()
|
|
722
726
|
export_dict["location"] = fspath(export_dict["location"])
|
|
723
727
|
if not image_to_json:
|
|
@@ -747,7 +751,7 @@ class Image:
|
|
|
747
751
|
if sub_cat.service_id:
|
|
748
752
|
service_id_dict[sub_cat.service_id].append(sub_cat.annotation_id)
|
|
749
753
|
if ann.image is not None:
|
|
750
|
-
for summary_cat_key in ann.image.summary:
|
|
754
|
+
for summary_cat_key in ann.image.summary.sub_categories:
|
|
751
755
|
summary_cat = ann.get_summary(summary_cat_key)
|
|
752
756
|
if summary_cat.service_id:
|
|
753
757
|
service_id_dict[summary_cat.service_id].append(summary_cat.annotation_id)
|
|
@@ -69,8 +69,7 @@ class ModelCategories:
|
|
|
69
69
|
if self.init_categories:
|
|
70
70
|
self._init_categories = MappingProxyType({key: get_type(val) for key, val in self.init_categories.items()})
|
|
71
71
|
else:
|
|
72
|
-
|
|
73
|
-
self._init_categories = MappingProxyType({})
|
|
72
|
+
self._init_categories = MappingProxyType({})
|
|
74
73
|
self.categories = self._init_categories
|
|
75
74
|
|
|
76
75
|
@overload
|
|
@@ -181,7 +180,7 @@ class NerModelCategories(ModelCategories):
|
|
|
181
180
|
self._init_categories = self.merge_bio_semantics_categories(
|
|
182
181
|
self._categories_semantics, self._categories_bio
|
|
183
182
|
)
|
|
184
|
-
|
|
183
|
+
self.categories = self._init_categories
|
|
185
184
|
|
|
186
185
|
@staticmethod
|
|
187
186
|
def merge_bio_semantics_categories(
|
|
@@ -48,7 +48,7 @@ with try_import() as pt_import_guard:
|
|
|
48
48
|
import torch.nn.functional as F
|
|
49
49
|
|
|
50
50
|
with try_import() as tr_import_guard:
|
|
51
|
-
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
|
|
51
|
+
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
|
|
52
52
|
from transformers import (
|
|
53
53
|
LayoutLMForSequenceClassification,
|
|
54
54
|
LayoutLMForTokenClassification,
|
|
@@ -27,7 +27,7 @@ from typing import Mapping, Optional, Sequence, Union
|
|
|
27
27
|
|
|
28
28
|
from lazy_imports import try_import
|
|
29
29
|
|
|
30
|
-
from ..datapoint.convert import convert_pdf_bytes_to_np_array_v2
|
|
30
|
+
from ..datapoint.convert import convert_bytes_to_np_array, convert_pdf_bytes_to_np_array_v2
|
|
31
31
|
from ..datapoint.image import Image
|
|
32
32
|
from ..utils.fs import get_load_image_func, load_image_from_file
|
|
33
33
|
from ..utils.types import JsonDict
|
|
@@ -49,6 +49,7 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
|
|
|
49
49
|
|
|
50
50
|
file_name: Optional[str]
|
|
51
51
|
location: Optional[str]
|
|
52
|
+
image_bytes: Optional[bytes] = None
|
|
52
53
|
|
|
53
54
|
if isinstance(dp, str):
|
|
54
55
|
_, file_name = os.path.split(dp)
|
|
@@ -62,6 +63,7 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
|
|
|
62
63
|
document_id = dp.get("document_id")
|
|
63
64
|
if location == "":
|
|
64
65
|
location = str(dp.get("path", ""))
|
|
66
|
+
image_bytes = dp.get("image_bytes")
|
|
65
67
|
else:
|
|
66
68
|
raise TypeError("datapoint not of expected type for converting to image")
|
|
67
69
|
|
|
@@ -76,6 +78,8 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
|
|
|
76
78
|
if dp_image.pdf_bytes is not None:
|
|
77
79
|
if isinstance(dp_image.pdf_bytes, bytes):
|
|
78
80
|
dp_image.image = convert_pdf_bytes_to_np_array_v2(dp_image.pdf_bytes, dpi=dpi)
|
|
81
|
+
elif image_bytes is not None:
|
|
82
|
+
dp_image.image = convert_bytes_to_np_array(image_bytes)
|
|
79
83
|
else:
|
|
80
84
|
dp_image.image = load_image_from_file(location)
|
|
81
85
|
|
|
@@ -29,6 +29,7 @@ from uuid import uuid1
|
|
|
29
29
|
|
|
30
30
|
from ..dataflow import DataFlow, MapData
|
|
31
31
|
from ..datapoint.image import Image
|
|
32
|
+
from ..mapper.misc import curry
|
|
32
33
|
from ..utils.context import timed_operation
|
|
33
34
|
from ..utils.identifier import get_uuid_from_str
|
|
34
35
|
from ..utils.settings import ObjectTypes
|
|
@@ -247,17 +248,24 @@ class Pipeline(ABC):
|
|
|
247
248
|
"""
|
|
248
249
|
raise NotImplementedError()
|
|
249
250
|
|
|
250
|
-
|
|
251
|
+
@staticmethod
|
|
252
|
+
@curry
|
|
253
|
+
def _undo(dp: Image, service_ids: Optional[list[str]] = None) -> Image:
|
|
251
254
|
"""
|
|
252
|
-
|
|
255
|
+
Remove annotations from a datapoint
|
|
253
256
|
"""
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
257
|
+
dp.remove(service_ids=service_ids)
|
|
258
|
+
return dp
|
|
259
|
+
|
|
260
|
+
def undo(self, df: DataFlow, service_ids: Optional[set[str]] = None) -> DataFlow:
|
|
261
|
+
"""
|
|
262
|
+
Mapping a datapoint via `_undo` within a dataflow pipeline
|
|
263
|
+
|
|
264
|
+
:param df: An input dataflow of Images
|
|
265
|
+
:param service_ids: A set of service ids to remove
|
|
266
|
+
:return: A output dataflow of Images
|
|
267
|
+
"""
|
|
268
|
+
return MapData(df, self._undo(service_ids=service_ids))
|
|
261
269
|
|
|
262
270
|
@abstractmethod
|
|
263
271
|
def analyze(self, **kwargs: Any) -> DataFlow:
|
|
@@ -273,6 +281,18 @@ class Pipeline(ABC):
|
|
|
273
281
|
"""
|
|
274
282
|
raise NotImplementedError()
|
|
275
283
|
|
|
284
|
+
def _build_pipe(self, df: DataFlow, session_id: Optional[str] = None) -> DataFlow:
|
|
285
|
+
"""
|
|
286
|
+
Composition of the backbone
|
|
287
|
+
"""
|
|
288
|
+
if session_id is None and self.set_session_id:
|
|
289
|
+
session_id = self.get_session_id()
|
|
290
|
+
for component in self.pipe_component_list:
|
|
291
|
+
component.timer_on = True
|
|
292
|
+
component.dp_manager.session_id = session_id
|
|
293
|
+
df = component.predict_dataflow(df)
|
|
294
|
+
return df
|
|
295
|
+
|
|
276
296
|
def get_meta_annotation(self) -> MetaAnnotation:
|
|
277
297
|
"""
|
|
278
298
|
Collects meta annotations from all pipeline components and summarizes the returned results
|
|
@@ -23,31 +23,38 @@ import os
|
|
|
23
23
|
from pathlib import Path
|
|
24
24
|
from typing import List, Mapping, Optional, Sequence, Tuple, Union
|
|
25
25
|
|
|
26
|
-
from ..dataflow import DataFlow, MapData
|
|
26
|
+
from ..dataflow import CustomDataFromIterable, DataFlow, DataFromList, MapData
|
|
27
27
|
from ..dataflow.custom_serialize import SerializerFiles, SerializerPdfDoc
|
|
28
28
|
from ..datapoint.image import Image
|
|
29
29
|
from ..datapoint.view import IMAGE_DEFAULTS
|
|
30
30
|
from ..mapper.maputils import curry
|
|
31
31
|
from ..mapper.misc import to_image
|
|
32
32
|
from ..utils.fs import maybe_path_or_pdf
|
|
33
|
+
from ..utils.identifier import get_uuid_from_str
|
|
33
34
|
from ..utils.logger import LoggingRecord, logger
|
|
35
|
+
from ..utils.pdf_utils import PDFStreamer
|
|
34
36
|
from ..utils.types import PathLikeOrStr
|
|
37
|
+
from ..utils.utils import is_file_extension
|
|
35
38
|
from .base import Pipeline, PipelineComponent
|
|
36
39
|
from .common import PageParsingService
|
|
37
40
|
|
|
38
41
|
|
|
39
42
|
def _collect_from_kwargs(
|
|
40
|
-
**kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
|
|
41
|
-
) -> Tuple[Optional[str],
|
|
43
|
+
**kwargs: Union[Optional[str], bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
|
|
44
|
+
) -> Tuple[Optional[str], Union[str, Sequence[str]], bool, int, str, DataFlow, Optional[bytes]]:
|
|
45
|
+
b_bytes = kwargs.get("bytes")
|
|
42
46
|
dataset_dataflow = kwargs.get("dataset_dataflow")
|
|
43
47
|
path = kwargs.get("path")
|
|
44
48
|
if path is None and dataset_dataflow is None:
|
|
45
49
|
raise ValueError("Pass either path or dataset_dataflow as argument")
|
|
50
|
+
if path is None and b_bytes:
|
|
51
|
+
raise ValueError("When passing bytes, a path to the source document must be provided")
|
|
46
52
|
|
|
47
53
|
shuffle = kwargs.get("shuffle", False)
|
|
48
54
|
if not isinstance(shuffle, bool):
|
|
49
55
|
raise TypeError(f"shuffle must be of type bool but is of type {type(shuffle)}")
|
|
50
56
|
|
|
57
|
+
file_type = None
|
|
51
58
|
doc_path = None
|
|
52
59
|
if path:
|
|
53
60
|
if not isinstance(path, (str, Path)):
|
|
@@ -56,15 +63,27 @@ def _collect_from_kwargs(
|
|
|
56
63
|
if path_type == 2:
|
|
57
64
|
doc_path = path
|
|
58
65
|
path = None
|
|
66
|
+
file_type = ".pdf"
|
|
67
|
+
elif path_type == 3:
|
|
68
|
+
if is_file_extension(path, ".jpg"):
|
|
69
|
+
file_type = ".jpg"
|
|
70
|
+
if is_file_extension(path, ".png"):
|
|
71
|
+
file_type = ".png"
|
|
72
|
+
if is_file_extension(path, ".jpeg"):
|
|
73
|
+
file_type = ".jpeg"
|
|
74
|
+
if not b_bytes:
|
|
75
|
+
raise ValueError("When passing a path to a single image, bytes of the image must be passed")
|
|
59
76
|
elif not path_type:
|
|
60
77
|
raise ValueError("Pass only a path to a directory or to a pdf file")
|
|
61
78
|
|
|
62
|
-
file_type = kwargs.get(
|
|
79
|
+
file_type = kwargs.get(
|
|
80
|
+
"file_type", [".jpg", ".png", ".jpeg", ".tif"] if file_type is None else file_type # type: ignore
|
|
81
|
+
)
|
|
63
82
|
|
|
64
83
|
max_datapoints = kwargs.get("max_datapoints")
|
|
65
84
|
if not isinstance(max_datapoints, (int, type(None))):
|
|
66
85
|
raise TypeError(f"max_datapoints must be of type int, but is of type {type(max_datapoints)}")
|
|
67
|
-
return path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow # type: ignore
|
|
86
|
+
return path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow, b_bytes # type: ignore
|
|
68
87
|
|
|
69
88
|
|
|
70
89
|
@curry
|
|
@@ -142,12 +161,18 @@ class DoctectionPipe(Pipeline):
|
|
|
142
161
|
|
|
143
162
|
super().__init__(pipeline_component_list)
|
|
144
163
|
|
|
145
|
-
def _entry(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]])
|
|
146
|
-
|
|
164
|
+
def _entry(self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) \
|
|
165
|
+
-> DataFlow:
|
|
166
|
+
path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow, b_bytes = _collect_from_kwargs(**kwargs)
|
|
147
167
|
|
|
148
168
|
df: DataFlow
|
|
149
169
|
|
|
150
|
-
if isinstance(
|
|
170
|
+
if isinstance(b_bytes, bytes):
|
|
171
|
+
df = DoctectionPipe.bytes_to_dataflow(path=doc_path if path is None else path,
|
|
172
|
+
b_bytes=b_bytes,
|
|
173
|
+
file_type=file_type)
|
|
174
|
+
|
|
175
|
+
elif isinstance(path, (str, Path)):
|
|
151
176
|
if not isinstance(file_type, (str, list)):
|
|
152
177
|
raise TypeError(f"file_type must be of type string or list, but is of type {type(file_type)}")
|
|
153
178
|
df = DoctectionPipe.path_to_dataflow(path=path, file_type=file_type, shuffle=shuffle)
|
|
@@ -162,7 +187,7 @@ class DoctectionPipe(Pipeline):
|
|
|
162
187
|
|
|
163
188
|
df = MapData(df, _proto_process(path, doc_path))
|
|
164
189
|
if dataset_dataflow is None:
|
|
165
|
-
df = MapData(df, _to_image(dpi=300)) # pylint: disable=E1120
|
|
190
|
+
df = MapData(df, _to_image(dpi=os.environ.get("DPI", 300))) # pylint: disable=E1120
|
|
166
191
|
return df
|
|
167
192
|
|
|
168
193
|
@staticmethod
|
|
@@ -197,6 +222,44 @@ class DoctectionPipe(Pipeline):
|
|
|
197
222
|
"""
|
|
198
223
|
return _doc_to_dataflow(path, max_datapoints)
|
|
199
224
|
|
|
225
|
+
@staticmethod
|
|
226
|
+
def bytes_to_dataflow(
|
|
227
|
+
path: str, b_bytes: bytes, file_type: Union[str, Sequence[str]], max_datapoints: Optional[int] = None
|
|
228
|
+
) -> DataFlow:
|
|
229
|
+
"""
|
|
230
|
+
Converts a bytes object to a dataflow
|
|
231
|
+
|
|
232
|
+
:param path: path to directory or an image file
|
|
233
|
+
:param b_bytes: bytes object
|
|
234
|
+
:param file_type: e.g. ".pdf", ".jpg" or [".jpg", ".png", ".jpeg", ".tif"]
|
|
235
|
+
:param max_datapoints: max number of datapoints to consider
|
|
236
|
+
:return: DataFlow
|
|
237
|
+
"""
|
|
238
|
+
|
|
239
|
+
file_name = os.path.split(path)[1]
|
|
240
|
+
if isinstance(file_type, str):
|
|
241
|
+
if file_type == ".pdf":
|
|
242
|
+
prefix, suffix = os.path.splitext(file_name)
|
|
243
|
+
df: DataFlow
|
|
244
|
+
df = CustomDataFromIterable(PDFStreamer(path_or_bytes=b_bytes), max_datapoints=max_datapoints)
|
|
245
|
+
df = MapData(
|
|
246
|
+
df,
|
|
247
|
+
lambda dp: {
|
|
248
|
+
"path": path,
|
|
249
|
+
"file_name": prefix + f"_{dp[1]}" + suffix,
|
|
250
|
+
"pdf_bytes": dp[0],
|
|
251
|
+
"page_number": dp[1],
|
|
252
|
+
"document_id": get_uuid_from_str(prefix),
|
|
253
|
+
},
|
|
254
|
+
)
|
|
255
|
+
else:
|
|
256
|
+
df = DataFromList(lst=[{"path": path, "file_name": file_name, "image_bytes": b_bytes}])
|
|
257
|
+
return df
|
|
258
|
+
raise ValueError(
|
|
259
|
+
f"pass: {path}, b_bytes: {b_bytes!r}, file_type: {file_type} and max_datapoints: {max_datapoints} "
|
|
260
|
+
f"not supported"
|
|
261
|
+
)
|
|
262
|
+
|
|
200
263
|
def dataflow_to_page(self, df: DataFlow) -> DataFlow:
|
|
201
264
|
"""
|
|
202
265
|
Converts a dataflow of images to a dataflow of pages
|
|
@@ -206,7 +269,9 @@ class DoctectionPipe(Pipeline):
|
|
|
206
269
|
"""
|
|
207
270
|
return self.page_parser.predict_dataflow(df)
|
|
208
271
|
|
|
209
|
-
def analyze(
|
|
272
|
+
def analyze(
|
|
273
|
+
self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
|
|
274
|
+
) -> DataFlow:
|
|
210
275
|
"""
|
|
211
276
|
`kwargs key dataset_dataflow:` Transfer a dataflow of a dataset via its dataflow builder
|
|
212
277
|
|
|
@@ -215,6 +280,8 @@ class DoctectionPipe(Pipeline):
|
|
|
215
280
|
only the first page is processed through the pipeline.
|
|
216
281
|
Alternatively, a path to a pdf document with multiple pages.
|
|
217
282
|
|
|
283
|
+
`kwargs key bytes:` A bytes object of an image
|
|
284
|
+
|
|
218
285
|
`kwargs key file_type:` Selection of the file type, if: args:`file_type` is passed
|
|
219
286
|
|
|
220
287
|
`kwargs key max_datapoints:` Stops processing as soon as max_datapoints images have been processed
|
|
@@ -227,20 +227,21 @@ def get_load_image_func(
|
|
|
227
227
|
|
|
228
228
|
def maybe_path_or_pdf(path: PathLikeOrStr) -> int:
|
|
229
229
|
"""
|
|
230
|
-
Checks if the path points to a directory or a
|
|
231
|
-
if the path points to a pdf doc or 0
|
|
230
|
+
Checks if the path points to a directory, a pdf document or a single image. Returns 1 if the path points to a
|
|
231
|
+
directory, 2 if the path points to a pdf doc, 3 if it points to a PNG, JPG, JPEG or TIF image or 0 if none of the
|
|
232
|
+
previous is true.
|
|
232
233
|
|
|
233
234
|
:param path: A path
|
|
234
|
-
:return: A value of 0,1,2
|
|
235
|
+
:return: A value of 0,1,2,3
|
|
235
236
|
"""
|
|
236
237
|
|
|
237
|
-
|
|
238
|
-
if is_dir:
|
|
238
|
+
if os.path.isdir(path):
|
|
239
239
|
return 1
|
|
240
240
|
file_name = os.path.split(path)[1]
|
|
241
|
-
|
|
242
|
-
if is_pdf:
|
|
241
|
+
if is_file_extension(file_name, ".pdf"):
|
|
243
242
|
return 2
|
|
243
|
+
if is_file_extension(file_name, [".png", ".jpeg", ".jpg", ".tif"]):
|
|
244
|
+
return 3
|
|
244
245
|
return 0
|
|
245
246
|
|
|
246
247
|
|
|
@@ -26,7 +26,7 @@ from errno import ENOENT
|
|
|
26
26
|
from io import BytesIO
|
|
27
27
|
from pathlib import Path
|
|
28
28
|
from shutil import copyfile
|
|
29
|
-
from typing import Generator, Literal, Optional
|
|
29
|
+
from typing import Generator, Literal, Optional, Union
|
|
30
30
|
|
|
31
31
|
from lazy_imports import try_import
|
|
32
32
|
from numpy import uint8
|
|
@@ -46,6 +46,7 @@ with try_import() as pt_import_guard:
|
|
|
46
46
|
|
|
47
47
|
__all__ = [
|
|
48
48
|
"decrypt_pdf_document",
|
|
49
|
+
"decrypt_pdf_document_from_bytes",
|
|
49
50
|
"get_pdf_file_reader",
|
|
50
51
|
"get_pdf_file_writer",
|
|
51
52
|
"PDFStreamer",
|
|
@@ -68,7 +69,6 @@ def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
|
|
|
68
69
|
:param path: A path to the pdf file
|
|
69
70
|
:return: True if document has been successfully decrypted
|
|
70
71
|
"""
|
|
71
|
-
|
|
72
72
|
if qpdf_available():
|
|
73
73
|
path_base, file_name = os.path.split(path)
|
|
74
74
|
file_name_tmp = os.path.splitext(file_name)[0] + "tmp.pdf"
|
|
@@ -86,41 +86,69 @@ def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
|
|
|
86
86
|
return False
|
|
87
87
|
|
|
88
88
|
|
|
89
|
-
def
|
|
89
|
+
def decrypt_pdf_document_from_bytes(input_bytes: bytes) -> bytes:
|
|
90
|
+
"""
|
|
91
|
+
Decrypting a pdf given as bytes. Under the hood, it saves the bytes to a temporary file and then calls
|
|
92
|
+
|
|
93
|
+
qpdf: <http://qpdf.sourceforge.net/>
|
|
94
|
+
|
|
95
|
+
:param input_bytes: A bytes object representing the pdf file
|
|
96
|
+
:return: The decrypted bytes object
|
|
97
|
+
"""
|
|
98
|
+
with save_tmp_file(input_bytes, "pdf_") as (_, input_file_name):
|
|
99
|
+
is_decrypted = decrypt_pdf_document(input_file_name)
|
|
100
|
+
if is_decrypted:
|
|
101
|
+
with open(input_file_name, "rb") as file:
|
|
102
|
+
return file.read()
|
|
103
|
+
else:
|
|
104
|
+
logger.error(LoggingRecord("pdf bytes cannot be decrypted and therefore cannot be processed further."))
|
|
105
|
+
sys.exit()
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def get_pdf_file_reader(path_or_bytes: Union[PathLikeOrStr, bytes]) -> PdfReader:
|
|
90
109
|
"""
|
|
91
110
|
Creates a file reader object from a pdf document. Will try to decrypt the document if it is
|
|
92
111
|
encrypted. (See `decrypt_pdf_document` to understand what is meant with "decrypt").
|
|
93
112
|
|
|
94
|
-
:param
|
|
113
|
+
:param path_or_bytes: A path to a pdf document or the pdf document as a bytes object
|
|
95
114
|
:return: A file reader object from which you can iterate through the document.
|
|
96
115
|
"""
|
|
97
116
|
|
|
98
|
-
if
|
|
99
|
-
|
|
100
|
-
|
|
117
|
+
if isinstance(path_or_bytes, bytes):
|
|
118
|
+
try:
|
|
119
|
+
reader = PdfReader(BytesIO(path_or_bytes))
|
|
120
|
+
except (errors.PdfReadError, AttributeError):
|
|
121
|
+
decrypted_bytes = decrypt_pdf_document_from_bytes(path_or_bytes)
|
|
122
|
+
reader = PdfReader(BytesIO(decrypted_bytes))
|
|
123
|
+
return reader
|
|
124
|
+
|
|
125
|
+
if not os.path.isfile(path_or_bytes):
|
|
126
|
+
raise FileNotFoundError(str(path_or_bytes))
|
|
127
|
+
file_name = os.path.split(path_or_bytes)[1]
|
|
101
128
|
if not is_file_extension(file_name, ".pdf"):
|
|
102
129
|
raise FileExtensionError(f"must be a pdf file: {file_name}")
|
|
103
130
|
|
|
104
|
-
with open(
|
|
131
|
+
with open(path_or_bytes, "rb") as file:
|
|
105
132
|
qpdf_called = False
|
|
106
133
|
try:
|
|
107
|
-
|
|
134
|
+
reader = PdfReader(file)
|
|
108
135
|
except (errors.PdfReadError, AttributeError):
|
|
109
|
-
_ = decrypt_pdf_document(
|
|
136
|
+
_ = decrypt_pdf_document(path_or_bytes)
|
|
110
137
|
qpdf_called = True
|
|
111
138
|
|
|
112
139
|
if not qpdf_called:
|
|
113
|
-
if
|
|
114
|
-
is_decrypted = decrypt_pdf_document(
|
|
140
|
+
if reader.is_encrypted:
|
|
141
|
+
is_decrypted = decrypt_pdf_document(path_or_bytes)
|
|
115
142
|
if not is_decrypted:
|
|
116
143
|
logger.error(
|
|
117
144
|
LoggingRecord(
|
|
118
|
-
f"pdf document {
|
|
145
|
+
f"pdf document {path_or_bytes} cannot be decrypted and therefore cannot "
|
|
146
|
+
f"be processed further."
|
|
119
147
|
)
|
|
120
148
|
)
|
|
121
149
|
sys.exit()
|
|
122
150
|
|
|
123
|
-
return PdfReader(os.fspath(
|
|
151
|
+
return PdfReader(os.fspath(path_or_bytes))
|
|
124
152
|
|
|
125
153
|
|
|
126
154
|
def get_pdf_file_writer() -> PdfWriter:
|
|
@@ -157,11 +185,11 @@ class PDFStreamer:
|
|
|
157
185
|
|
|
158
186
|
"""
|
|
159
187
|
|
|
160
|
-
def __init__(self,
|
|
188
|
+
def __init__(self, path_or_bytes: Union[PathLikeOrStr, bytes]) -> None:
|
|
161
189
|
"""
|
|
162
|
-
:param
|
|
190
|
+
:param path_or_bytes: A path to a pdf document or the pdf document as a bytes object.
|
|
163
191
|
"""
|
|
164
|
-
self.file_reader = get_pdf_file_reader(
|
|
192
|
+
self.file_reader = get_pdf_file_reader(path_or_bytes)
|
|
165
193
|
self.file_writer = PdfWriter()
|
|
166
194
|
|
|
167
195
|
def __len__(self) -> int:
|
|
@@ -312,6 +312,7 @@ class VizPackageHandler:
|
|
|
312
312
|
"interactive_imshow": "_cv2_interactive_imshow",
|
|
313
313
|
"encode": "_cv2_encode",
|
|
314
314
|
"rotate_image": "_cv2_rotate_image",
|
|
315
|
+
"convert_bytes_to_np": "_cv2_convert_bytes_to_np",
|
|
315
316
|
},
|
|
316
317
|
"pillow": {
|
|
317
318
|
"read_image": "_pillow_read_image",
|
|
@@ -325,6 +326,7 @@ class VizPackageHandler:
|
|
|
325
326
|
"interactive_imshow": "_pillow_interactive_imshow",
|
|
326
327
|
"encode": "_pillow_encode",
|
|
327
328
|
"rotate_image": "_pillow_rotate_image",
|
|
329
|
+
"convert_bytes_to_np": "_pillow_convert_bytes_to_np",
|
|
328
330
|
},
|
|
329
331
|
}
|
|
330
332
|
|
|
@@ -484,6 +486,37 @@ class VizPackageHandler:
|
|
|
484
486
|
pil_image = Image.open(im_file)
|
|
485
487
|
return np.array(pil_image)[:, :, ::-1]
|
|
486
488
|
|
|
489
|
+
def convert_bytes_to_np(self, image_bytes: bytes) -> PixelValues:
|
|
490
|
+
"""Converting an image as bytes into np.array
|
|
491
|
+
|
|
492
|
+
:param image_bytes: Image as bytes
|
|
493
|
+
"""
|
|
494
|
+
return getattr(self, self.pkg_func_dict["convert_bytes_to_np"])(image_bytes)
|
|
495
|
+
|
|
496
|
+
@staticmethod
|
|
497
|
+
def _cv2_convert_bytes_to_np(image_bytes: bytes) -> PixelValues:
|
|
498
|
+
"""
|
|
499
|
+
Convert image bytes to a numpy array using OpenCV.
|
|
500
|
+
|
|
501
|
+
:param image_bytes: Image bytes
|
|
502
|
+
:return: Image as numpy array
|
|
503
|
+
"""
|
|
504
|
+
np_array = np.frombuffer(image_bytes, np.uint8)
|
|
505
|
+
np_image = cv2.imdecode(np_array, cv2.IMREAD_COLOR)
|
|
506
|
+
return np_image
|
|
507
|
+
|
|
508
|
+
@staticmethod
|
|
509
|
+
def _pillow_convert_bytes_to_np(image_bytes: bytes) -> PixelValues:
|
|
510
|
+
"""
|
|
511
|
+
Convert image bytes to a numpy array using Pillow.
|
|
512
|
+
|
|
513
|
+
:param image_bytes: Image bytes
|
|
514
|
+
:return: Image as numpy array
|
|
515
|
+
"""
|
|
516
|
+
image = Image.open(BytesIO(image_bytes))
|
|
517
|
+
np_image = np.array(image)
|
|
518
|
+
return np_image
|
|
519
|
+
|
|
487
520
|
def resize(self, image: PixelValues, width: int, height: int, interpolation: str) -> PixelValues:
|
|
488
521
|
"""
|
|
489
522
|
Resize a given image to new width, height. Specifying an interpolation method is required. Depending on the
|
|
File without changes
|
|
File without changes
|