PyPI - deepdoctection - Versions diffs - 0.36__py3-none-any.whl → 0.37.1__py3-none-any.whl - Mend

deepdoctection 0.36 (py3-none-any.whl) → 0.37.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (19)

deepdoctection/__init__.py +3 -1
deepdoctection/analyzer/factory.py +3 -3
deepdoctection/dataflow/custom_serialize.py +1 -1
deepdoctection/datapoint/convert.py +11 -0
deepdoctection/datapoint/image.py +9 -5
deepdoctection/datasets/save.py +1 -1
deepdoctection/extern/base.py +2 -3
deepdoctection/extern/hflayoutlm.py +1 -1
deepdoctection/mapper/misc.py +5 -1
deepdoctection/pipe/base.py +29 -9
deepdoctection/pipe/doctectionpipe.py +77 -10
deepdoctection/utils/fs.py +8 -7
deepdoctection/utils/pdf_utils.py +45 -17
deepdoctection/utils/viz.py +33 -0
{deepdoctection-0.36.dist-info → deepdoctection-0.37.1.dist-info}/METADATA +1 -1
{deepdoctection-0.36.dist-info → deepdoctection-0.37.1.dist-info}/RECORD +19 -19
{deepdoctection-0.36.dist-info → deepdoctection-0.37.1.dist-info}/LICENSE +0 -0
{deepdoctection-0.36.dist-info → deepdoctection-0.37.1.dist-info}/WHEEL +0 -0
{deepdoctection-0.36.dist-info → deepdoctection-0.37.1.dist-info}/top_level.txt +0 -0

deepdoctection/__init__.py CHANGED Viewed

@@ -24,7 +24,7 @@ from .utils.logger import LoggingRecord, logger
 # pylint: enable=wrong-import-position
-__version__ = 0.36
+__version__ = "0.37.1"
 _IMPORT_STRUCTURE = {
     "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
@@ -87,6 +87,7 @@ _IMPORT_STRUCTURE = {
         "convert_b64_to_np_array",
         "convert_np_array_to_b64",
         "convert_np_array_to_b64_b",
+        "convert_bytes_to_np_array",
         "convert_pdf_bytes_to_np_array_v2",
         "box_to_point4",
         "point4_to_box",
@@ -371,6 +372,7 @@ _IMPORT_STRUCTURE = {
         "save_config_to_yaml",
         "config_to_cli_str",
         "decrypt_pdf_document",
+        "decrypt_pdf_document_from_bytes",
         "get_pdf_file_reader",
         "get_pdf_file_writer",
         "PDFStreamer",

deepdoctection/analyzer/factory.py CHANGED Viewed

@@ -327,9 +327,9 @@ class ServiceFactory:
             )
         if config.OCR.USE_TEXTRACT:
             credentials_kwargs = {
-                "aws_access_key_id": environ.get("ACCESS_KEY", None),
-                "aws_secret_access_key": environ.get("SECRET_KEY", None),
-                "config": Config(region_name=environ.get("REGION", None)),
+                "aws_access_key_id": environ.get("AWS_ACCESS_KEY", None),
+                "aws_secret_access_key": environ.get("AWS_SECRET_KEY", None),
+                "config": Config(region_name=environ.get("AWS_REGION", None)),
             }
             return TextractOcrDetector(**credentials_kwargs)
         raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")

deepdoctection/dataflow/custom_serialize.py CHANGED Viewed

@@ -593,7 +593,7 @@ class SerializerPdfDoc:
         file_name = os.path.split(path)[1]
         prefix, suffix = os.path.splitext(file_name)
         df: DataFlow
-        df = CustomDataFromIterable(PDFStreamer(path=path), max_datapoints=max_datapoints)
+        df = CustomDataFromIterable(PDFStreamer(path_or_bytes=path), max_datapoints=max_datapoints)
         df = MapData(
             df,
             lambda dp: {

deepdoctection/datapoint/convert.py CHANGED Viewed

@@ -40,6 +40,7 @@ __all__ = [
     "convert_b64_to_np_array",
     "convert_np_array_to_b64",
     "convert_np_array_to_b64_b",
+    "convert_bytes_to_np_array",
     "convert_pdf_bytes_to_np_array_v2",
     "box_to_point4",
     "point4_to_box",
@@ -107,6 +108,16 @@ def convert_np_array_to_b64_b(np_image: PixelValues) -> bytes:
     return viz_handler.encode(np_image)
+def convert_bytes_to_np_array(image_bytes: bytes) -> PixelValues:
+    """
+    Converts an image in bytes to a numpy array
+    :param image_bytes: An image as bytes.
+    :return: numpy array.
+    """
+    return viz_handler.convert_bytes_to_np(image_bytes)
 @deprecated("Use convert_pdf_bytes_to_np_array_v2", "2022-02-23")
 def convert_pdf_bytes_to_np_array(pdf_bytes: bytes, dpi: Optional[int] = None) -> PixelValues:
     """

deepdoctection/datapoint/image.py CHANGED Viewed

@@ -34,6 +34,7 @@ from ..utils.error import AnnotationError, BoundingBoxError, ImageError, UUIDErr
 from ..utils.identifier import get_uuid, is_uuid_like
 from ..utils.settings import ObjectTypes, SummaryType, get_type
 from ..utils.types import ImageDict, PathLikeOrStr, PixelValues
+from ..utils.logger import LoggingRecord, logger
 from .annotation import Annotation, AnnotationMap, BoundingBox, CategoryAnnotation, ImageAnnotation
 from .box import crop_box_from_image, global_to_local_coords, intersection_box
 from .convert import as_dict, convert_b64_to_np_array, convert_np_array_to_b64, convert_pdf_bytes_to_np_array_v2
@@ -474,8 +475,11 @@ class Image:
             for service_id in service_ids:
                 if service_id not in service_id_to_annotation_id:
-                    raise ImageError(f"Service id {service_id} not found")
-                annotation_ids = service_id_to_annotation_id[service_id]
+                    logger.info(
+                        LoggingRecord(
+                            f"Service_id {service_id} for image_id: {self.image_id} not found. Skipping removal."))
+                annotation_ids = service_id_to_annotation_id.get(service_id, [])
                 for ann_id in annotation_ids:
                     if ann_id not in ann_id_to_annotation_maps:
@@ -587,7 +591,7 @@ class Image:
             )
             ann.image.dump(sub_image)
-    def remove_image_from_lower_hierachy(self, pixel_values_only: bool = False) -> None:
+    def remove_image_from_lower_hierarchy(self, pixel_values_only: bool = False) -> None:
         """Will remove all images from image annotations."""
         for ann in self.annotations:
             if pixel_values_only:
@@ -717,7 +721,7 @@ class Image:
         else:
             path_json = fspath(path) + ".json"
         if highest_hierarchy_only:
-            self.remove_image_from_lower_hierachy()
+            self.remove_image_from_lower_hierarchy()
         export_dict = self.as_dict()
         export_dict["location"] = fspath(export_dict["location"])
         if not image_to_json:
@@ -747,7 +751,7 @@ class Image:
                 if sub_cat.service_id:
                     service_id_dict[sub_cat.service_id].append(sub_cat.annotation_id)
             if ann.image is not None:
-                for summary_cat_key in ann.image.summary:
+                for summary_cat_key in ann.image.summary.sub_categories:
                     summary_cat = ann.get_summary(summary_cat_key)
                     if summary_cat.service_id:
                         service_id_dict[summary_cat.service_id].append(summary_cat.annotation_id)

deepdoctection/datasets/save.py CHANGED Viewed

@@ -62,7 +62,7 @@ def dataflow_to_json(
     if highest_hierarchy_only:
         def _remove_hh(dp: Image) -> Image:
-            dp.remove_image_from_lower_hierachy()
+            dp.remove_image_from_lower_hierarchy()
             return dp
         df = MapData(df, _remove_hh)

deepdoctection/extern/base.py CHANGED Viewed

@@ -69,8 +69,7 @@ class ModelCategories:
         if self.init_categories:
             self._init_categories = MappingProxyType({key: get_type(val) for key, val in self.init_categories.items()})
         else:
-            if self._init_categories is None:
-                self._init_categories = MappingProxyType({})
+            self._init_categories = MappingProxyType({})
         self.categories = self._init_categories
     @overload
@@ -181,7 +180,7 @@ class NerModelCategories(ModelCategories):
             self._init_categories = self.merge_bio_semantics_categories(
                 self._categories_semantics, self._categories_bio
             )
-        super().__post_init__()
+        self.categories = self._init_categories
     @staticmethod
     def merge_bio_semantics_categories(

deepdoctection/extern/hflayoutlm.py CHANGED Viewed

@@ -48,7 +48,7 @@ with try_import() as pt_import_guard:
     import torch.nn.functional as F
 with try_import() as tr_import_guard:
-    from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD  # type: ignore
+    from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
     from transformers import (
         LayoutLMForSequenceClassification,
         LayoutLMForTokenClassification,

deepdoctection/mapper/misc.py CHANGED Viewed

@@ -27,7 +27,7 @@ from typing import Mapping, Optional, Sequence, Union
 from lazy_imports import try_import
-from ..datapoint.convert import convert_pdf_bytes_to_np_array_v2
+from ..datapoint.convert import convert_bytes_to_np_array, convert_pdf_bytes_to_np_array_v2
 from ..datapoint.image import Image
 from ..utils.fs import get_load_image_func, load_image_from_file
 from ..utils.types import JsonDict
@@ -49,6 +49,7 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
     file_name: Optional[str]
     location: Optional[str]
+    image_bytes: Optional[bytes] = None
     if isinstance(dp, str):
         _, file_name = os.path.split(dp)
@@ -62,6 +63,7 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
         document_id = dp.get("document_id")
         if location == "":
             location = str(dp.get("path", ""))
+        image_bytes = dp.get("image_bytes")
     else:
         raise TypeError("datapoint not of expected type for converting to image")
@@ -76,6 +78,8 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
                 if dp_image.pdf_bytes is not None:
                     if isinstance(dp_image.pdf_bytes, bytes):
                         dp_image.image = convert_pdf_bytes_to_np_array_v2(dp_image.pdf_bytes, dpi=dpi)
+            elif image_bytes is not None:
+                dp_image.image = convert_bytes_to_np_array(image_bytes)
             else:
                 dp_image.image = load_image_from_file(location)

deepdoctection/pipe/base.py CHANGED Viewed

@@ -29,6 +29,7 @@ from uuid import uuid1
 from ..dataflow import DataFlow, MapData
 from ..datapoint.image import Image
+from ..mapper.misc import curry
 from ..utils.context import timed_operation
 from ..utils.identifier import get_uuid_from_str
 from ..utils.settings import ObjectTypes
@@ -247,17 +248,24 @@ class Pipeline(ABC):
         """
         raise NotImplementedError()
-    def _build_pipe(self, df: DataFlow, session_id: Optional[str] = None) -> DataFlow:
+    @staticmethod
+    @curry
+    def _undo(dp: Image, service_ids: Optional[list[str]] = None) -> Image:
         """
-        Composition of the backbone
+        Remove annotations from a datapoint
         """
-        if session_id is None and self.set_session_id:
-            session_id = self.get_session_id()
-        for component in self.pipe_component_list:
-            component.timer_on = True
-            component.dp_manager.session_id = session_id
-            df = component.predict_dataflow(df)
-        return df
+        dp.remove(service_ids=service_ids)
+        return dp
+    def undo(self, df: DataFlow, service_ids: Optional[set[str]] = None) -> DataFlow:
+        """
+        Mapping a datapoint via `_undo` within a dataflow pipeline
+        :param df: An input dataflow of Images
+        :param service_ids: A set of service ids to remove
+        :return: A output dataflow of Images
+        """
+        return MapData(df, self._undo(service_ids=service_ids))
     @abstractmethod
     def analyze(self, **kwargs: Any) -> DataFlow:
@@ -273,6 +281,18 @@ class Pipeline(ABC):
         """
         raise NotImplementedError()
+    def _build_pipe(self, df: DataFlow, session_id: Optional[str] = None) -> DataFlow:
+        """
+        Composition of the backbone
+        """
+        if session_id is None and self.set_session_id:
+            session_id = self.get_session_id()
+        for component in self.pipe_component_list:
+            component.timer_on = True
+            component.dp_manager.session_id = session_id
+            df = component.predict_dataflow(df)
+        return df
     def get_meta_annotation(self) -> MetaAnnotation:
         """
         Collects meta annotations from all pipeline components and summarizes the returned results

deepdoctection/pipe/doctectionpipe.py CHANGED Viewed

@@ -23,31 +23,38 @@ import os
 from pathlib import Path
 from typing import List, Mapping, Optional, Sequence, Tuple, Union
-from ..dataflow import DataFlow, MapData
+from ..dataflow import CustomDataFromIterable, DataFlow, DataFromList, MapData
 from ..dataflow.custom_serialize import SerializerFiles, SerializerPdfDoc
 from ..datapoint.image import Image
 from ..datapoint.view import IMAGE_DEFAULTS
 from ..mapper.maputils import curry
 from ..mapper.misc import to_image
 from ..utils.fs import maybe_path_or_pdf
+from ..utils.identifier import get_uuid_from_str
 from ..utils.logger import LoggingRecord, logger
+from ..utils.pdf_utils import PDFStreamer
 from ..utils.types import PathLikeOrStr
+from ..utils.utils import is_file_extension
 from .base import Pipeline, PipelineComponent
 from .common import PageParsingService
 def _collect_from_kwargs(
-    **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
-) -> Tuple[Optional[str], Optional[str], bool, int, str, DataFlow]:
+    **kwargs: Union[Optional[str], bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
+) -> Tuple[Optional[str], Union[str, Sequence[str]], bool, int, str, DataFlow, Optional[bytes]]:
+    b_bytes = kwargs.get("bytes")
     dataset_dataflow = kwargs.get("dataset_dataflow")
     path = kwargs.get("path")
     if path is None and dataset_dataflow is None:
         raise ValueError("Pass either path or dataset_dataflow as argument")
+    if path is None and b_bytes:
+        raise ValueError("When passing bytes, a path to the source document must be provided")
     shuffle = kwargs.get("shuffle", False)
     if not isinstance(shuffle, bool):
         raise TypeError(f"shuffle must be of type bool but is of type {type(shuffle)}")
+    file_type = None
     doc_path = None
     if path:
         if not isinstance(path, (str, Path)):
@@ -56,15 +63,27 @@ def _collect_from_kwargs(
         if path_type == 2:
             doc_path = path
             path = None
+            file_type = ".pdf"
+        elif path_type == 3:
+            if is_file_extension(path, ".jpg"):
+                file_type = ".jpg"
+            if is_file_extension(path, ".png"):
+                file_type = ".png"
+            if is_file_extension(path, ".jpeg"):
+                file_type = ".jpeg"
+            if not b_bytes:
+                raise ValueError("When passing a path to a single image, bytes of the image must be passed")
         elif not path_type:
             raise ValueError("Pass only a path to a directory or to a pdf file")
-    file_type = kwargs.get("file_type", [".jpg", ".png", ".tif"])
+    file_type = kwargs.get(
+        "file_type", [".jpg", ".png", ".jpeg", ".tif"] if file_type is None else file_type  # type: ignore
+    )
     max_datapoints = kwargs.get("max_datapoints")
     if not isinstance(max_datapoints, (int, type(None))):
         raise TypeError(f"max_datapoints must be of type int, but is of type {type(max_datapoints)}")
-    return path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow  # type: ignore
+    return path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow, b_bytes  # type: ignore
 @curry
@@ -142,12 +161,18 @@ class DoctectionPipe(Pipeline):
         super().__init__(pipeline_component_list)
-    def _entry(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
-        path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow = _collect_from_kwargs(**kwargs)
+    def _entry(self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) \
+            -> DataFlow:
+        path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow, b_bytes = _collect_from_kwargs(**kwargs)
         df: DataFlow
-        if isinstance(path, (str, Path)):
+        if isinstance(b_bytes, bytes):
+            df = DoctectionPipe.bytes_to_dataflow(path=doc_path if path is None else path,
+                                                  b_bytes=b_bytes,
+                                                  file_type=file_type)
+        elif isinstance(path, (str, Path)):
             if not isinstance(file_type, (str, list)):
                 raise TypeError(f"file_type must be of type string or list, but is of type {type(file_type)}")
             df = DoctectionPipe.path_to_dataflow(path=path, file_type=file_type, shuffle=shuffle)
@@ -162,7 +187,7 @@ class DoctectionPipe(Pipeline):
         df = MapData(df, _proto_process(path, doc_path))
         if dataset_dataflow is None:
-            df = MapData(df, _to_image(dpi=300))  # pylint: disable=E1120
+            df = MapData(df, _to_image(dpi=os.environ.get("DPI", 300)))  # pylint: disable=E1120
         return df
     @staticmethod
@@ -197,6 +222,44 @@ class DoctectionPipe(Pipeline):
         """
         return _doc_to_dataflow(path, max_datapoints)
+    @staticmethod
+    def bytes_to_dataflow(
+        path: str, b_bytes: bytes, file_type: Union[str, Sequence[str]], max_datapoints: Optional[int] = None
+    ) -> DataFlow:
+        """
+        Converts a bytes object to a dataflow
+        :param path: path to directory or an image file
+        :param b_bytes: bytes object
+        :param file_type: e.g. ".pdf", ".jpg" or [".jpg", ".png", ".jpeg", ".tif"]
+        :param max_datapoints: max number of datapoints to consider
+        :return: DataFlow
+        """
+        file_name = os.path.split(path)[1]
+        if isinstance(file_type, str):
+            if file_type == ".pdf":
+                prefix, suffix = os.path.splitext(file_name)
+                df: DataFlow
+                df = CustomDataFromIterable(PDFStreamer(path_or_bytes=b_bytes), max_datapoints=max_datapoints)
+                df = MapData(
+                    df,
+                    lambda dp: {
+                        "path": path,
+                        "file_name": prefix + f"_{dp[1]}" + suffix,
+                        "pdf_bytes": dp[0],
+                        "page_number": dp[1],
+                        "document_id": get_uuid_from_str(prefix),
+                    },
+                )
+            else:
+                df = DataFromList(lst=[{"path": path, "file_name": file_name, "image_bytes": b_bytes}])
+            return df
+        raise ValueError(
+            f"pass: {path}, b_bytes: {b_bytes!r}, file_type: {file_type} and max_datapoints: {max_datapoints} "
+            f"not supported"
+        )
     def dataflow_to_page(self, df: DataFlow) -> DataFlow:
         """
         Converts a dataflow of images to a dataflow of pages
@@ -206,7 +269,9 @@ class DoctectionPipe(Pipeline):
         """
         return self.page_parser.predict_dataflow(df)
-    def analyze(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
+    def analyze(
+        self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
+    ) -> DataFlow:
         """
         `kwargs key dataset_dataflow:` Transfer a dataflow of a dataset via its dataflow builder
@@ -215,6 +280,8 @@ class DoctectionPipe(Pipeline):
                            only the first page is processed through the pipeline.
                            Alternatively, a path to a pdf document with multiple pages.
+        `kwargs key bytes:` A bytes object of an image
         `kwargs key file_type:` Selection of the file type, if: args:`file_type` is passed
         `kwargs key max_datapoints:` Stops processing as soon as max_datapoints images have been processed

deepdoctection/utils/fs.py CHANGED Viewed

@@ -227,20 +227,21 @@ def get_load_image_func(
 def maybe_path_or_pdf(path: PathLikeOrStr) -> int:
     """
-    Checks if the path points to a directory or a pdf document. Returns 1 if the path points to a directory, 2
-    if the path points to a pdf doc or 0, if none of the previous is true.
+    Checks if the path points to a directory, a pdf document or a single image. Returns 1 if the path points to a
+    directory, 2 if the path points to a pdf doc and 3 if path points to either a PNG, JPG or JPEG or 0 if none of the
+    previous is true.
     :param path: A path
-    :return: A value of 0,1,2
+    :return: A value of 0,1,2,3
     """
-    is_dir = os.path.isdir(path)
-    if is_dir:
+    if os.path.isdir(path):
         return 1
     file_name = os.path.split(path)[1]
-    is_pdf = is_file_extension(file_name, ".pdf")
-    if is_pdf:
+    if is_file_extension(file_name, ".pdf"):
         return 2
+    if is_file_extension(file_name, [".png", ".jpeg", ".jpg", ".tif"]):
+        return 3
     return 0

deepdoctection/utils/pdf_utils.py CHANGED Viewed

@@ -26,7 +26,7 @@ from errno import ENOENT
 from io import BytesIO
 from pathlib import Path
 from shutil import copyfile
-from typing import Generator, Literal, Optional
+from typing import Generator, Literal, Optional, Union
 from lazy_imports import try_import
 from numpy import uint8
@@ -46,6 +46,7 @@ with try_import() as pt_import_guard:
 __all__ = [
     "decrypt_pdf_document",
+    "decrypt_pdf_document_from_bytes",
     "get_pdf_file_reader",
     "get_pdf_file_writer",
     "PDFStreamer",
@@ -68,7 +69,6 @@ def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
     :param path: A path to the pdf file
     :return: True if document has been successfully decrypted
     """
     if qpdf_available():
         path_base, file_name = os.path.split(path)
         file_name_tmp = os.path.splitext(file_name)[0] + "tmp.pdf"
@@ -86,41 +86,69 @@ def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
     return False
-def get_pdf_file_reader(path: PathLikeOrStr) -> PdfReader:
+def decrypt_pdf_document_from_bytes(input_bytes: bytes) -> bytes:
+    """
+    Decrypting a pdf given as bytes. Under the hood, it saves the bytes to a temporary file and then calls
+    qpdf: <http://qpdf.sourceforge.net/>
+    :param input_bytes: A bytes object representing the pdf file
+    :return: The decrypted bytes object
+    """
+    with save_tmp_file(input_bytes, "pdf_") as (_, input_file_name):
+        is_decrypted = decrypt_pdf_document(input_file_name)
+        if is_decrypted:
+            with open(input_file_name, "rb") as file:
+                return file.read()
+        else:
+            logger.error(LoggingRecord("pdf bytes cannot be decrypted and therefore cannot be processed further."))
+            sys.exit()
+def get_pdf_file_reader(path_or_bytes: Union[PathLikeOrStr, bytes]) -> PdfReader:
     """
     Creates a file reader object from a pdf document. Will try to decrypt the document if it is
     encrypted. (See `decrypt_pdf_document` to understand what is meant with "decrypt").
-    :param path: A path to a pdf document
+    :param path_or_bytes: A path to a pdf document
     :return: A file reader object from which you can iterate through the document.
     """
-    if not os.path.isfile(path):
-        raise FileNotFoundError(str(path))
-    file_name = os.path.split(path)[1]
+    if isinstance(path_or_bytes, bytes):
+        try:
+            reader = PdfReader(BytesIO(path_or_bytes))
+        except (errors.PdfReadError, AttributeError):
+            decrypted_bytes = decrypt_pdf_document_from_bytes(path_or_bytes)
+            reader = PdfReader(BytesIO(decrypted_bytes))
+        return reader
+    if not os.path.isfile(path_or_bytes):
+        raise FileNotFoundError(str(path_or_bytes))
+    file_name = os.path.split(path_or_bytes)[1]
     if not is_file_extension(file_name, ".pdf"):
         raise FileExtensionError(f"must be a pdf file: {file_name}")
-    with open(path, "rb") as file:
+    with open(path_or_bytes, "rb") as file:
         qpdf_called = False
         try:
-            input_pdf_as_bytes = PdfReader(file)
+            reader = PdfReader(file)
         except (errors.PdfReadError, AttributeError):
-            _ = decrypt_pdf_document(path)
+            _ = decrypt_pdf_document(path_or_bytes)
             qpdf_called = True
         if not qpdf_called:
-            if input_pdf_as_bytes.is_encrypted:
-                is_decrypted = decrypt_pdf_document(path)
+            if reader.is_encrypted:
+                is_decrypted = decrypt_pdf_document(path_or_bytes)
                 if not is_decrypted:
                     logger.error(
                         LoggingRecord(
-                            f"pdf document {path} cannot be decrypted and therefore cannot be " f"processed further."
+                            f"pdf document {path_or_bytes} cannot be decrypted and therefore cannot "
+                            f"be processed further."
                         )
                     )
                     sys.exit()
-    return PdfReader(os.fspath(path))
+    return PdfReader(os.fspath(path_or_bytes))
 def get_pdf_file_writer() -> PdfWriter:
@@ -157,11 +185,11 @@ class PDFStreamer:
     """
-    def __init__(self, path: PathLikeOrStr) -> None:
+    def __init__(self, path_or_bytes: Union[PathLikeOrStr, bytes]) -> None:
         """
-        :param path: to a pdf.
+        :param path_or_bytes: to a pdf.
         """
-        self.file_reader = get_pdf_file_reader(path)
+        self.file_reader = get_pdf_file_reader(path_or_bytes)
         self.file_writer = PdfWriter()
     def __len__(self) -> int:

deepdoctection/utils/viz.py CHANGED Viewed

@@ -312,6 +312,7 @@ class VizPackageHandler:
             "interactive_imshow": "_cv2_interactive_imshow",
             "encode": "_cv2_encode",
             "rotate_image": "_cv2_rotate_image",
+            "convert_bytes_to_np": "_cv2_convert_bytes_to_np",
         },
         "pillow": {
             "read_image": "_pillow_read_image",
@@ -325,6 +326,7 @@ class VizPackageHandler:
             "interactive_imshow": "_pillow_interactive_imshow",
             "encode": "_pillow_encode",
             "rotate_image": "_pillow_rotate_image",
+            "convert_bytes_to_np": "_pillow_convert_bytes_to_np",
         },
     }
@@ -484,6 +486,37 @@ class VizPackageHandler:
         pil_image = Image.open(im_file)
         return np.array(pil_image)[:, :, ::-1]
+    def convert_bytes_to_np(self, image_bytes: bytes) -> PixelValues:
+        """Converting an image as bytes into np.array
+        :param image_bytes: Image as np.array
+        """
+        return getattr(self, self.pkg_func_dict["convert_bytes_to_np"])(image_bytes)
+    @staticmethod
+    def _cv2_convert_bytes_to_np(image_bytes: bytes) -> PixelValues:
+        """
+        Convert image bytes to a numpy array using OpenCV.
+        :param image_bytes: Image bytes
+        :return: Image as numpy array
+        """
+        np_array = np.frombuffer(image_bytes, np.uint8)
+        np_image = cv2.imdecode(np_array, cv2.IMREAD_COLOR)
+        return np_image
+    @staticmethod
+    def _pillow_convert_bytes_to_np(image_bytes: bytes) -> PixelValues:
+        """
+        Convert image bytes to a numpy array using Pillow.
+        :param image_bytes: Image bytes
+        :return: Image as numpy array
+        """
+        image = Image.open(BytesIO(image_bytes))
+        np_image = np.array(image)
+        return np_image
     def resize(self, image: PixelValues, width: int, height: int, interpolation: str) -> PixelValues:
         """
         Resize a given image to new width, height. Specifying an interpolation method is required. Depending on the

{deepdoctection-0.36.dist-info → deepdoctection-0.37.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepdoctection
-Version: 0.36
+Version: 0.37.1
 Summary: Repository for Document AI
 Home-page: https://github.com/deepdoctection/deepdoctection
 Author: Dr. Janis Meyer

{deepdoctection-0.36.dist-info → deepdoctection-0.37.1.dist-info}/RECORD RENAMED Viewed

@@ -1,9 +1,9 @@
-deepdoctection/__init__.py,sha256=fNUbaFAlK1JUXgPCmTu2UOLUMqW4HIgkaW4uOUYjYYg,12571
+deepdoctection/__init__.py,sha256=i23UZBqMlkcvUILJxvUQAdj-3d2yV9edzxFsC5RoMHA,12655
 deepdoctection/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deepdoctection/analyzer/__init__.py,sha256=icClxrd20XutD6LxLgEPIWceSs4j_QfI3szCE-9BL2w,729
 deepdoctection/analyzer/_config.py,sha256=NZl_REM8Ge2xfxvHN-mZR5KURcHfZii3xfMlKQwckbA,4864
 deepdoctection/analyzer/dd.py,sha256=DUOhOtwipHw5nabYqn3WGR9aZcgP0ma_bi_tjf9xscw,5973
-deepdoctection/analyzer/factory.py,sha256=xmo5F9X7I6lp0ZWJv8QavpMyG8UWYLvMi4qogsZV1_s,31507
+deepdoctection/analyzer/factory.py,sha256=dEUOtdBS3yQGLqMqLR_kq5EYCR3IE30DjHNzE0spoQE,31519
 deepdoctection/configs/__init__.py,sha256=TX_P6tqDOF1LK1mi9ruAl7x0mtv1Asm8cYWCz3Pe2dk,646
 deepdoctection/configs/conf_dd_one.yaml,sha256=td7XsyVhdXkhh5Pie7sT_WNjGTaxBOWgpxhkobHd1H0,2325
 deepdoctection/configs/conf_tesseract.yaml,sha256=oF6szDyoi15FHvq7yFUNIEjfA_jNLhGxoowiRsz_zY4,35
@@ -11,15 +11,15 @@ deepdoctection/dataflow/__init__.py,sha256=CWRHMpmJaPk4xY_oIIFubCt-z11SguWrMWxHZ
 deepdoctection/dataflow/base.py,sha256=z4DCComSj5wStEPjtk0093cNNGfUMiDqx8dqz36nS_o,6221
 deepdoctection/dataflow/common.py,sha256=MyGA2VWlNMjQdIN_Jd-o0Ec3bDJmjQit4Nv0v43OCSQ,10119
 deepdoctection/dataflow/custom.py,sha256=3CK_1oL9p6nbOq8WtH5_vQUo70_8Z8pXY7kG0OFqzug,6803
-deepdoctection/dataflow/custom_serialize.py,sha256=CKeyw2Ayq_qAl0O5BoKkIOFJgteCt78h9QFTI23XhmQ,22818
+deepdoctection/dataflow/custom_serialize.py,sha256=WocuiYo2gkih5Z9lWAoIIfUewwYSDOhHzG7ZZjKlUic,22827
 deepdoctection/dataflow/parallel_map.py,sha256=8FhxJBWV-kjJrJ27jQtP3yYF6Ev6rz98worO60oi96c,15837
 deepdoctection/dataflow/serialize.py,sha256=4pYC7m9h53JCu99waVeKpHDpsCDDdYCrSZpP2QYSsgs,4555
 deepdoctection/dataflow/stats.py,sha256=Bsr6v7lcesKXUYtO9wjqlzx_Yq_uyIF3Lel-tQ0i4wI,9619
 deepdoctection/datapoint/__init__.py,sha256=3K406GbOPhoEp8koVaSbMocmSsmWifnSZ1SPb7C1lOY,1643
 deepdoctection/datapoint/annotation.py,sha256=FEgz4COxVDfjic0gG7kS6iHnWLBIgFnquQ63Cbj2a4Y,22531
 deepdoctection/datapoint/box.py,sha256=tkFuVM6xfx2jL7W4UED4qHXV572LSRdIsVJbrEiyIxI,23524
-deepdoctection/datapoint/convert.py,sha256=Gw2IjNiEotPu1yuMZqrIYB0mCAwafKt-VgMnrHj6S7U,6808
-deepdoctection/datapoint/image.py,sha256=EvZlVwJjMAcL1z8RNPBvZ8fwdJvkGuGpcFxCP1y26Go,33045
+deepdoctection/datapoint/convert.py,sha256=O7920pIomyEkzXwxpFsrzfhn7Pl6UzVGhNzv90VcuKU,7099
+deepdoctection/datapoint/image.py,sha256=AM34br9eM1syTIUXcJIrAaP7pEnejbUl-w-CK5pr9z8,33233
 deepdoctection/datapoint/view.py,sha256=1rVMuqucCrI5zlwyXMADJQBV38V_zSNFqFyBi3cMA1E,44914
 deepdoctection/datasets/__init__.py,sha256=-A3aR90aDsHPmVM35JavfnQ2itYSCn3ujl4krRni1QU,1076
 deepdoctection/datasets/adapter.py,sha256=Ly_vbOAgVI73V41FUccnSX1ECTOyesW_qsuvQuvOZbw,7796
@@ -27,7 +27,7 @@ deepdoctection/datasets/base.py,sha256=DT4i-d74sIEiUNC6UspIHNJuHSK0t1dBv7qwadg4r
 deepdoctection/datasets/dataflow_builder.py,sha256=cYU2zV3gZW2bFvMHimlO9VIl3BAUaCwML08cCIQ8Em4,4107
 deepdoctection/datasets/info.py,sha256=6y5TfiUhQppynbMFP5JmUPk95ggsVCtGIw4dYh2lVus,20501
 deepdoctection/datasets/registry.py,sha256=ZjzVzjsCgNXJuZZZtR98_yKocADmh4EBGV5JqJbGjWk,2543
-deepdoctection/datasets/save.py,sha256=khYQ4t94FOu9RWMimP9E4kASq25f61SIow78NHaX1pg,3349
+deepdoctection/datasets/save.py,sha256=Y9508Qqp8gIGN7pbGgVBBnkiC6NdCb9L2YR4wVvEUxM,3350
 deepdoctection/datasets/instances/__init__.py,sha256=XEc_4vT5lDn6bbZID9ujDEumWu8Ec2W-QS4pI_bfWWE,1388
 deepdoctection/datasets/instances/doclaynet.py,sha256=wRZT7wMTilZBLZ1gKY2cWReD1EGT735vOOTy0pD0N6M,12038
 deepdoctection/datasets/instances/fintabnet.py,sha256=qYzFK1dWF6MEPkHamP255DvAzlQT_GnkvDe1aM7CgjA,12006
@@ -50,13 +50,13 @@ deepdoctection/eval/registry.py,sha256=v4mp-s67vBVRu1nQzuGlYPViQnMSeIXEcF_WmvfUC
 deepdoctection/eval/tedsmetric.py,sha256=rKw-734Y9CpBtIfkBSPQF2vAZxnIdWrI9Zc723P7RxI,9529
 deepdoctection/eval/tp_eval_callback.py,sha256=SXsXumoyxq-MIH9Cep5eUOwnNshMbKmC6mYOGwCg0pM,5283
 deepdoctection/extern/__init__.py,sha256=9Iks9b4Q_LynjcV167TVCoK8YsQRUcA2jjmAmDNA_X8,1056
-deepdoctection/extern/base.py,sha256=ajzFzD9BrFwnly4SziN8PadI-PBOzzVRlIGPm_sNllE,24142
+deepdoctection/extern/base.py,sha256=ONPgappl_P5HSwQr42FatuRnwMTvUPecPsCztDTN0Hw,24108
 deepdoctection/extern/d2detect.py,sha256=zrKv1yurApnjD7QZIZk_8LYCahjmN82MQUjHjv8zvkQ,22127
 deepdoctection/extern/deskew.py,sha256=sPoixu8S9he-0wbs-jgxtPE2V9BiP4-3uZlb6F5Y1SA,3077
 deepdoctection/extern/doctrocr.py,sha256=T3_tvlih22_dVCBZypS1Y8tjQQB1fkAxIbGdUGHIapQ,24473
 deepdoctection/extern/fastlang.py,sha256=F4gK-SEwcCujjxH327ZDzMGWToJ49xS_dCKcePQ9IlY,4780
 deepdoctection/extern/hfdetr.py,sha256=1NPW_u5eH2tP3ixZ91l4WR-O-wLVcrFsLWA7BqID0oM,12055
-deepdoctection/extern/hflayoutlm.py,sha256=KfoWx9_Rpa1Y2L51HLrYvenfWaTB4SVTmVJH00Cqb-s,56510
+deepdoctection/extern/hflayoutlm.py,sha256=_OUeQsbNgfjbV7TPYBjkqc4HoTBQqkOINnwpewPJpl8,56494
 deepdoctection/extern/hflm.py,sha256=kwS6kcSlY_2m9u0RzBLTRq-UMM7c1PhyUaDTvSdejus,9217
 deepdoctection/extern/model.py,sha256=ViHHKPvbGmLCPw7ZESv_rmjlkA90UiBU6oZiHOMqNSw,59869
 deepdoctection/extern/pdftext.py,sha256=KS_t27SUiYn_IOS_J2lF9lSSo22vLagxmxvYCY3CqXA,7228
@@ -95,7 +95,7 @@ deepdoctection/mapper/hfstruct.py,sha256=2PjGKsYturVJBimLT1CahYh09KSRAFEHz_QNtC1
 deepdoctection/mapper/laylmstruct.py,sha256=abMZkYU2W0e_VcCm_c0ZXNFuv-lfMFWcTedcZS5EYvE,42935
 deepdoctection/mapper/maputils.py,sha256=eI6ZcDg9W5uB6xQNBZpMIdEd86HlCxTtkJuyROdTqiw,8146
 deepdoctection/mapper/match.py,sha256=pCWZpz2R8JahiKXCw7dxKRTLiPgJXeVDgkddDPLy_c0,9643
-deepdoctection/mapper/misc.py,sha256=rCqHOcsCfVPXs36AWK0rZ2kk0CUM3yXV370_zyIGBJ4,6518
+deepdoctection/mapper/misc.py,sha256=NLSSgk066Tkrrdi075HkqV7cP-iqT9fv_MtyAJ-8gOg,6743
 deepdoctection/mapper/pascalstruct.py,sha256=TzVU1p0oiw0nOuxTFFbEB9vXJxH1v6VUvTJ7MD0manU,3828
 deepdoctection/mapper/prodigystruct.py,sha256=Re4Sd_zAp6qOvbXZLmMJeG0IGEfMQxebuyDeZgMcTa8,6827
 deepdoctection/mapper/pubstruct.py,sha256=YxsrZ-E0pD45Mm_VCPQB9yEgHsTPkw4htt-3DwCRX1k,23361
@@ -103,10 +103,10 @@ deepdoctection/mapper/tpstruct.py,sha256=YNABRibvcISD5Lavg3jouoE4FMdqXEJoM-hNoB_
 deepdoctection/mapper/xfundstruct.py,sha256=_3r3c0K82fnF2h1HxA85h-9ETYrHwcERa6MNc6Ko6Z8,8807
 deepdoctection/pipe/__init__.py,sha256=ywTVoetftdL6plXg2YlBzMfmqBZupq7yXblSVyvvkcQ,1127
 deepdoctection/pipe/anngen.py,sha256=3319l4aaXzcY4w6ItVBNPX8LGS5fHFDVtyVY9KMefac,16393
-deepdoctection/pipe/base.py,sha256=Davjkf3D837y9AIITcx7yXdebmVaz6Moyw_5Wi3nfmg,13561
+deepdoctection/pipe/base.py,sha256=ynNg5SSRuUVxN69VWOO3Oi7WSeGrYwn3A56NQMBJDvw,14222
 deepdoctection/pipe/common.py,sha256=haOb4v0jLX3r41BSC8cVseX2E320_HkSrGlZsQiKE2g,17728
 deepdoctection/pipe/concurrency.py,sha256=AAKRsVgaBEYNluntbDa46SBF1JZ_XqnWLDSWrNvAzEo,9657
-deepdoctection/pipe/doctectionpipe.py,sha256=I6B6HT_BG2ByQ3Rjsui3-Ct31yLmodx-iuZnujXaiSc,8953
+deepdoctection/pipe/doctectionpipe.py,sha256=uhsrSuwaHcOMj8b8i6wCpPaZlSxCTaeHVhMokJ8vRSI,11835
 deepdoctection/pipe/language.py,sha256=5zI0UQC6Fh12_r2pfVL42HoCGz2hpHrOhpXAn5m-rYw,5451
 deepdoctection/pipe/layout.py,sha256=xIhnJpyUSbvLbhTXyAKXY1hmG9352jihGYFSclTH_1g,5567
 deepdoctection/pipe/lm.py,sha256=Sp-b7smeslNDyioEfNjuNBUxAuFKn3-OKpCZkGXri_c,16643
@@ -129,20 +129,20 @@ deepdoctection/utils/develop.py,sha256=4HyTarkFbJwctL-Hgu1TU_LSJppHvaroDbcyHsxhI
 deepdoctection/utils/env_info.py,sha256=TnCA-LOTj4WIHd9yvn1AaoPWsLmPgc42l-BJmGV6zmM,19147
 deepdoctection/utils/error.py,sha256=_3q9VepKfEhsM3H033_Fu0hwBzMSjsWALsjyJbGAZr8,2367
 deepdoctection/utils/file_utils.py,sha256=IRElrcND0YEiU1QELw5hfXeNA39uE2_nyzh9-X7YcxI,19477
-deepdoctection/utils/fs.py,sha256=C4ktrzjoVtX9kgycv5YrEigDI9byi65b6_D0aKsGM4Y,10161
+deepdoctection/utils/fs.py,sha256=x842BxUP5bbjJ2cofw-g4dKJv4QAaGzda4qnAazabO4,10281
 deepdoctection/utils/identifier.py,sha256=QkNaGGqPynHwDPnd3_m8iur4Cv64rcQa7qolCE7Qphk,2159
 deepdoctection/utils/logger.py,sha256=J0OVKiXP_2A82MWbbJoOeMEJ-75aZu5npgaS_yI6mVA,10003
 deepdoctection/utils/metacfg.py,sha256=hD76KQ_RnD_5B02qLI2Zxf3WfnsnXhEI_KUTKpw91RI,5711
 deepdoctection/utils/mocks.py,sha256=IkN3-IzAl4eX0ibgKIHg8IY7ykVw6BnpF6XnxKnKaZI,2389
-deepdoctection/utils/pdf_utils.py,sha256=OAQjE9xHVNcDsFqAvX47Lu-mgmoMpVXqIf5pOK8AwxY,11595
+deepdoctection/utils/pdf_utils.py,sha256=G0m8kUn2HwwyZWH_BcrDkm-m3MP9GN9SWHj5VhB7swY,12845
 deepdoctection/utils/settings.py,sha256=k6OyuWbj-IPeaO9zT9RZ-5Yad1wNhWGYqGLZdtgXAZY,12464
 deepdoctection/utils/tqdm.py,sha256=cBUtR0L1x0KMeYrLP2rrzyzCamCjpQAKroHXLv81_pk,1820
 deepdoctection/utils/transform.py,sha256=3kCgsEeRkG1efCdkfvj7tUFMs-e2jbjbflq826F2GPU,8502
 deepdoctection/utils/types.py,sha256=_3dmPdCIZNLbgU5QP5k_c5phDf18xLe1kYL6t2nM45s,2953
 deepdoctection/utils/utils.py,sha256=csVs_VvCq4QBETPoE2JdTTL4MFYnD4xh-Js5vRb612g,6492
-deepdoctection/utils/viz.py,sha256=Mok1d0V7NwlhAvO1S1Iq5YitKpVmOfH_XHTSlRelCB0,25902
-deepdoctection-0.36.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
-deepdoctection-0.36.dist-info/METADATA,sha256=E-zXgx0bTdSqbd88D_abscR_poEJaKJGIwlv2RFbQs8,19543
-deepdoctection-0.36.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-deepdoctection-0.36.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
-deepdoctection-0.36.dist-info/RECORD,,
+deepdoctection/utils/viz.py,sha256=Jf8ePNYWlpuyaS6SeTYQ4OyA3eNhtgjvAQZnGNdgHC0,27051
+deepdoctection-0.37.1.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
+deepdoctection-0.37.1.dist-info/METADATA,sha256=M-HjpJpxuM4tHN0ld8DscsZPgKRUoNmsbx9slFkj6tg,19545
+deepdoctection-0.37.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+deepdoctection-0.37.1.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
+deepdoctection-0.37.1.dist-info/RECORD,,

{deepdoctection-0.36.dist-info → deepdoctection-0.37.1.dist-info}/LICENSE RENAMED Viewed

File without changes

{deepdoctection-0.36.dist-info → deepdoctection-0.37.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{deepdoctection-0.36.dist-info → deepdoctection-0.37.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

deepdoctection 0.36__py3-none-any.whl → 0.37.1__py3-none-any.whl

Potentially problematic release.

deepdoctection 0.36py3-none-any.whl → 0.37.1py3-none-any.whl