PyPI - deepdoctection - Versions diffs - 0.36__tar.gz → 0.37.1__tar.gz - Mend

deepdoctection 0.36__tar.gz → 0.37.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (154) hide show

{deepdoctection-0.36 → deepdoctection-0.37.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepdoctection
-Version: 0.36
+Version: 0.37.1
 Summary: Repository for Document AI
 Home-page: https://github.com/deepdoctection/deepdoctection
 Author: Dr. Janis Meyer

{deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/__init__.py RENAMED Viewed

@@ -24,7 +24,7 @@ from .utils.logger import LoggingRecord, logger
 # pylint: enable=wrong-import-position
-__version__ = 0.36
+__version__ = "0.37.1"
 _IMPORT_STRUCTURE = {
     "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
@@ -87,6 +87,7 @@ _IMPORT_STRUCTURE = {
         "convert_b64_to_np_array",
         "convert_np_array_to_b64",
         "convert_np_array_to_b64_b",
+        "convert_bytes_to_np_array",
         "convert_pdf_bytes_to_np_array_v2",
         "box_to_point4",
         "point4_to_box",
@@ -371,6 +372,7 @@ _IMPORT_STRUCTURE = {
         "save_config_to_yaml",
         "config_to_cli_str",
         "decrypt_pdf_document",
+        "decrypt_pdf_document_from_bytes",
         "get_pdf_file_reader",
         "get_pdf_file_writer",
         "PDFStreamer",

{deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/analyzer/factory.py RENAMED Viewed

@@ -327,9 +327,9 @@ class ServiceFactory:
             )
         if config.OCR.USE_TEXTRACT:
             credentials_kwargs = {
-                "aws_access_key_id": environ.get("ACCESS_KEY", None),
-                "aws_secret_access_key": environ.get("SECRET_KEY", None),
-                "config": Config(region_name=environ.get("REGION", None)),
+                "aws_access_key_id": environ.get("AWS_ACCESS_KEY", None),
+                "aws_secret_access_key": environ.get("AWS_SECRET_KEY", None),
+                "config": Config(region_name=environ.get("AWS_REGION", None)),
             }
             return TextractOcrDetector(**credentials_kwargs)
         raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")

{deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/dataflow/custom_serialize.py RENAMED Viewed

@@ -593,7 +593,7 @@ class SerializerPdfDoc:
         file_name = os.path.split(path)[1]
         prefix, suffix = os.path.splitext(file_name)
         df: DataFlow
-        df = CustomDataFromIterable(PDFStreamer(path=path), max_datapoints=max_datapoints)
+        df = CustomDataFromIterable(PDFStreamer(path_or_bytes=path), max_datapoints=max_datapoints)
         df = MapData(
             df,
             lambda dp: {

{deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datapoint/convert.py RENAMED Viewed

@@ -40,6 +40,7 @@ __all__ = [
     "convert_b64_to_np_array",
     "convert_np_array_to_b64",
     "convert_np_array_to_b64_b",
+    "convert_bytes_to_np_array",
     "convert_pdf_bytes_to_np_array_v2",
     "box_to_point4",
     "point4_to_box",
@@ -107,6 +108,16 @@ def convert_np_array_to_b64_b(np_image: PixelValues) -> bytes:
     return viz_handler.encode(np_image)
+def convert_bytes_to_np_array(image_bytes: bytes) -> PixelValues:
+    """
+    Converts an image in bytes to a numpy array
+    :param image_bytes: An image as bytes.
+    :return: numpy array.
+    """
+    return viz_handler.convert_bytes_to_np(image_bytes)
 @deprecated("Use convert_pdf_bytes_to_np_array_v2", "2022-02-23")
 def convert_pdf_bytes_to_np_array(pdf_bytes: bytes, dpi: Optional[int] = None) -> PixelValues:
     """

{deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datapoint/image.py RENAMED Viewed

@@ -34,6 +34,7 @@ from ..utils.error import AnnotationError, BoundingBoxError, ImageError, UUIDErr
 from ..utils.identifier import get_uuid, is_uuid_like
 from ..utils.settings import ObjectTypes, SummaryType, get_type
 from ..utils.types import ImageDict, PathLikeOrStr, PixelValues
+from ..utils.logger import LoggingRecord, logger
 from .annotation import Annotation, AnnotationMap, BoundingBox, CategoryAnnotation, ImageAnnotation
 from .box import crop_box_from_image, global_to_local_coords, intersection_box
 from .convert import as_dict, convert_b64_to_np_array, convert_np_array_to_b64, convert_pdf_bytes_to_np_array_v2
@@ -474,8 +475,11 @@ class Image:
             for service_id in service_ids:
                 if service_id not in service_id_to_annotation_id:
-                    raise ImageError(f"Service id {service_id} not found")
-                annotation_ids = service_id_to_annotation_id[service_id]
+                    logger.info(
+                        LoggingRecord(
+                            f"Service_id {service_id} for image_id: {self.image_id} not found. Skipping removal."))
+                annotation_ids = service_id_to_annotation_id.get(service_id, [])
                 for ann_id in annotation_ids:
                     if ann_id not in ann_id_to_annotation_maps:
@@ -587,7 +591,7 @@ class Image:
             )
             ann.image.dump(sub_image)
-    def remove_image_from_lower_hierachy(self, pixel_values_only: bool = False) -> None:
+    def remove_image_from_lower_hierarchy(self, pixel_values_only: bool = False) -> None:
         """Will remove all images from image annotations."""
         for ann in self.annotations:
             if pixel_values_only:
@@ -717,7 +721,7 @@ class Image:
         else:
             path_json = fspath(path) + ".json"
         if highest_hierarchy_only:
-            self.remove_image_from_lower_hierachy()
+            self.remove_image_from_lower_hierarchy()
         export_dict = self.as_dict()
         export_dict["location"] = fspath(export_dict["location"])
         if not image_to_json:
@@ -747,7 +751,7 @@ class Image:
                 if sub_cat.service_id:
                     service_id_dict[sub_cat.service_id].append(sub_cat.annotation_id)
             if ann.image is not None:
-                for summary_cat_key in ann.image.summary:
+                for summary_cat_key in ann.image.summary.sub_categories:
                     summary_cat = ann.get_summary(summary_cat_key)
                     if summary_cat.service_id:
                         service_id_dict[summary_cat.service_id].append(summary_cat.annotation_id)

{deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/datasets/save.py RENAMED Viewed

@@ -62,7 +62,7 @@ def dataflow_to_json(
     if highest_hierarchy_only:
         def _remove_hh(dp: Image) -> Image:
-            dp.remove_image_from_lower_hierachy()
+            dp.remove_image_from_lower_hierarchy()
             return dp
         df = MapData(df, _remove_hh)

{deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/base.py RENAMED Viewed

@@ -69,8 +69,7 @@ class ModelCategories:
         if self.init_categories:
             self._init_categories = MappingProxyType({key: get_type(val) for key, val in self.init_categories.items()})
         else:
-            if self._init_categories is None:
-                self._init_categories = MappingProxyType({})
+            self._init_categories = MappingProxyType({})
         self.categories = self._init_categories
     @overload
@@ -181,7 +180,7 @@ class NerModelCategories(ModelCategories):
             self._init_categories = self.merge_bio_semantics_categories(
                 self._categories_semantics, self._categories_bio
             )
-        super().__post_init__()
+        self.categories = self._init_categories
     @staticmethod
     def merge_bio_semantics_categories(

{deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/extern/hflayoutlm.py RENAMED Viewed

@@ -48,7 +48,7 @@ with try_import() as pt_import_guard:
     import torch.nn.functional as F
 with try_import() as tr_import_guard:
-    from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD  # type: ignore
+    from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
     from transformers import (
         LayoutLMForSequenceClassification,
         LayoutLMForTokenClassification,

{deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/mapper/misc.py RENAMED Viewed

@@ -27,7 +27,7 @@ from typing import Mapping, Optional, Sequence, Union
 from lazy_imports import try_import
-from ..datapoint.convert import convert_pdf_bytes_to_np_array_v2
+from ..datapoint.convert import convert_bytes_to_np_array, convert_pdf_bytes_to_np_array_v2
 from ..datapoint.image import Image
 from ..utils.fs import get_load_image_func, load_image_from_file
 from ..utils.types import JsonDict
@@ -49,6 +49,7 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
     file_name: Optional[str]
     location: Optional[str]
+    image_bytes: Optional[bytes] = None
     if isinstance(dp, str):
         _, file_name = os.path.split(dp)
@@ -62,6 +63,7 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
         document_id = dp.get("document_id")
         if location == "":
             location = str(dp.get("path", ""))
+        image_bytes = dp.get("image_bytes")
     else:
         raise TypeError("datapoint not of expected type for converting to image")
@@ -76,6 +78,8 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
                 if dp_image.pdf_bytes is not None:
                     if isinstance(dp_image.pdf_bytes, bytes):
                         dp_image.image = convert_pdf_bytes_to_np_array_v2(dp_image.pdf_bytes, dpi=dpi)
+            elif image_bytes is not None:
+                dp_image.image = convert_bytes_to_np_array(image_bytes)
             else:
                 dp_image.image = load_image_from_file(location)

{deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/pipe/base.py RENAMED Viewed

@@ -29,6 +29,7 @@ from uuid import uuid1
 from ..dataflow import DataFlow, MapData
 from ..datapoint.image import Image
+from ..mapper.misc import curry
 from ..utils.context import timed_operation
 from ..utils.identifier import get_uuid_from_str
 from ..utils.settings import ObjectTypes
@@ -247,17 +248,24 @@ class Pipeline(ABC):
         """
         raise NotImplementedError()
-    def _build_pipe(self, df: DataFlow, session_id: Optional[str] = None) -> DataFlow:
+    @staticmethod
+    @curry
+    def _undo(dp: Image, service_ids: Optional[list[str]] = None) -> Image:
         """
-        Composition of the backbone
+        Remove annotations from a datapoint
         """
-        if session_id is None and self.set_session_id:
-            session_id = self.get_session_id()
-        for component in self.pipe_component_list:
-            component.timer_on = True
-            component.dp_manager.session_id = session_id
-            df = component.predict_dataflow(df)
-        return df
+        dp.remove(service_ids=service_ids)
+        return dp
+    def undo(self, df: DataFlow, service_ids: Optional[set[str]] = None) -> DataFlow:
+        """
+        Mapping a datapoint via `_undo` within a dataflow pipeline
+        :param df: An input dataflow of Images
+        :param service_ids: A set of service ids to remove
+        :return: A output dataflow of Images
+        """
+        return MapData(df, self._undo(service_ids=service_ids))
     @abstractmethod
     def analyze(self, **kwargs: Any) -> DataFlow:
@@ -273,6 +281,18 @@ class Pipeline(ABC):
         """
         raise NotImplementedError()
+    def _build_pipe(self, df: DataFlow, session_id: Optional[str] = None) -> DataFlow:
+        """
+        Composition of the backbone
+        """
+        if session_id is None and self.set_session_id:
+            session_id = self.get_session_id()
+        for component in self.pipe_component_list:
+            component.timer_on = True
+            component.dp_manager.session_id = session_id
+            df = component.predict_dataflow(df)
+        return df
     def get_meta_annotation(self) -> MetaAnnotation:
         """
         Collects meta annotations from all pipeline components and summarizes the returned results

{deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/pipe/doctectionpipe.py RENAMED Viewed

@@ -23,31 +23,38 @@ import os
 from pathlib import Path
 from typing import List, Mapping, Optional, Sequence, Tuple, Union
-from ..dataflow import DataFlow, MapData
+from ..dataflow import CustomDataFromIterable, DataFlow, DataFromList, MapData
 from ..dataflow.custom_serialize import SerializerFiles, SerializerPdfDoc
 from ..datapoint.image import Image
 from ..datapoint.view import IMAGE_DEFAULTS
 from ..mapper.maputils import curry
 from ..mapper.misc import to_image
 from ..utils.fs import maybe_path_or_pdf
+from ..utils.identifier import get_uuid_from_str
 from ..utils.logger import LoggingRecord, logger
+from ..utils.pdf_utils import PDFStreamer
 from ..utils.types import PathLikeOrStr
+from ..utils.utils import is_file_extension
 from .base import Pipeline, PipelineComponent
 from .common import PageParsingService
 def _collect_from_kwargs(
-    **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
-) -> Tuple[Optional[str], Optional[str], bool, int, str, DataFlow]:
+    **kwargs: Union[Optional[str], bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
+) -> Tuple[Optional[str], Union[str, Sequence[str]], bool, int, str, DataFlow, Optional[bytes]]:
+    b_bytes = kwargs.get("bytes")
     dataset_dataflow = kwargs.get("dataset_dataflow")
     path = kwargs.get("path")
     if path is None and dataset_dataflow is None:
         raise ValueError("Pass either path or dataset_dataflow as argument")
+    if path is None and b_bytes:
+        raise ValueError("When passing bytes, a path to the source document must be provided")
     shuffle = kwargs.get("shuffle", False)
     if not isinstance(shuffle, bool):
         raise TypeError(f"shuffle must be of type bool but is of type {type(shuffle)}")
+    file_type = None
     doc_path = None
     if path:
         if not isinstance(path, (str, Path)):
@@ -56,15 +63,27 @@ def _collect_from_kwargs(
         if path_type == 2:
             doc_path = path
             path = None
+            file_type = ".pdf"
+        elif path_type == 3:
+            if is_file_extension(path, ".jpg"):
+                file_type = ".jpg"
+            if is_file_extension(path, ".png"):
+                file_type = ".png"
+            if is_file_extension(path, ".jpeg"):
+                file_type = ".jpeg"
+            if not b_bytes:
+                raise ValueError("When passing a path to a single image, bytes of the image must be passed")
         elif not path_type:
             raise ValueError("Pass only a path to a directory or to a pdf file")
-    file_type = kwargs.get("file_type", [".jpg", ".png", ".tif"])
+    file_type = kwargs.get(
+        "file_type", [".jpg", ".png", ".jpeg", ".tif"] if file_type is None else file_type  # type: ignore
+    )
     max_datapoints = kwargs.get("max_datapoints")
     if not isinstance(max_datapoints, (int, type(None))):
         raise TypeError(f"max_datapoints must be of type int, but is of type {type(max_datapoints)}")
-    return path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow  # type: ignore
+    return path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow, b_bytes  # type: ignore
 @curry
@@ -142,12 +161,18 @@ class DoctectionPipe(Pipeline):
         super().__init__(pipeline_component_list)
-    def _entry(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
-        path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow = _collect_from_kwargs(**kwargs)
+    def _entry(self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) \
+            -> DataFlow:
+        path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow, b_bytes = _collect_from_kwargs(**kwargs)
         df: DataFlow
-        if isinstance(path, (str, Path)):
+        if isinstance(b_bytes, bytes):
+            df = DoctectionPipe.bytes_to_dataflow(path=doc_path if path is None else path,
+                                                  b_bytes=b_bytes,
+                                                  file_type=file_type)
+        elif isinstance(path, (str, Path)):
             if not isinstance(file_type, (str, list)):
                 raise TypeError(f"file_type must be of type string or list, but is of type {type(file_type)}")
             df = DoctectionPipe.path_to_dataflow(path=path, file_type=file_type, shuffle=shuffle)
@@ -162,7 +187,7 @@ class DoctectionPipe(Pipeline):
         df = MapData(df, _proto_process(path, doc_path))
         if dataset_dataflow is None:
-            df = MapData(df, _to_image(dpi=300))  # pylint: disable=E1120
+            df = MapData(df, _to_image(dpi=os.environ.get("DPI", 300)))  # pylint: disable=E1120
         return df
     @staticmethod
@@ -197,6 +222,44 @@ class DoctectionPipe(Pipeline):
         """
         return _doc_to_dataflow(path, max_datapoints)
+    @staticmethod
+    def bytes_to_dataflow(
+        path: str, b_bytes: bytes, file_type: Union[str, Sequence[str]], max_datapoints: Optional[int] = None
+    ) -> DataFlow:
+        """
+        Converts a bytes object to a dataflow
+        :param path: path to directory or an image file
+        :param b_bytes: bytes object
+        :param file_type: e.g. ".pdf", ".jpg" or [".jpg", ".png", ".jpeg", ".tif"]
+        :param max_datapoints: max number of datapoints to consider
+        :return: DataFlow
+        """
+        file_name = os.path.split(path)[1]
+        if isinstance(file_type, str):
+            if file_type == ".pdf":
+                prefix, suffix = os.path.splitext(file_name)
+                df: DataFlow
+                df = CustomDataFromIterable(PDFStreamer(path_or_bytes=b_bytes), max_datapoints=max_datapoints)
+                df = MapData(
+                    df,
+                    lambda dp: {
+                        "path": path,
+                        "file_name": prefix + f"_{dp[1]}" + suffix,
+                        "pdf_bytes": dp[0],
+                        "page_number": dp[1],
+                        "document_id": get_uuid_from_str(prefix),
+                    },
+                )
+            else:
+                df = DataFromList(lst=[{"path": path, "file_name": file_name, "image_bytes": b_bytes}])
+            return df
+        raise ValueError(
+            f"pass: {path}, b_bytes: {b_bytes!r}, file_type: {file_type} and max_datapoints: {max_datapoints} "
+            f"not supported"
+        )
     def dataflow_to_page(self, df: DataFlow) -> DataFlow:
         """
         Converts a dataflow of images to a dataflow of pages
@@ -206,7 +269,9 @@ class DoctectionPipe(Pipeline):
         """
         return self.page_parser.predict_dataflow(df)
-    def analyze(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
+    def analyze(
+        self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
+    ) -> DataFlow:
         """
         `kwargs key dataset_dataflow:` Transfer a dataflow of a dataset via its dataflow builder
@@ -215,6 +280,8 @@ class DoctectionPipe(Pipeline):
                            only the first page is processed through the pipeline.
                            Alternatively, a path to a pdf document with multiple pages.
+        `kwargs key bytes:` A bytes object of an image
         `kwargs key file_type:` Selection of the file type, if: args:`file_type` is passed
         `kwargs key max_datapoints:` Stops processing as soon as max_datapoints images have been processed

{deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/fs.py RENAMED Viewed

@@ -227,20 +227,21 @@ def get_load_image_func(
 def maybe_path_or_pdf(path: PathLikeOrStr) -> int:
     """
-    Checks if the path points to a directory or a pdf document. Returns 1 if the path points to a directory, 2
-    if the path points to a pdf doc or 0, if none of the previous is true.
+    Checks if the path points to a directory, a pdf document or a single image. Returns 1 if the path points to a
+    directory, 2 if the path points to a pdf doc and 3 if path points to either a PNG, JPG or JPEG or 0 if none of the
+    previous is true.
     :param path: A path
-    :return: A value of 0,1,2
+    :return: A value of 0,1,2,3
     """
-    is_dir = os.path.isdir(path)
-    if is_dir:
+    if os.path.isdir(path):
         return 1
     file_name = os.path.split(path)[1]
-    is_pdf = is_file_extension(file_name, ".pdf")
-    if is_pdf:
+    if is_file_extension(file_name, ".pdf"):
         return 2
+    if is_file_extension(file_name, [".png", ".jpeg", ".jpg", ".tif"]):
+        return 3
     return 0

{deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/pdf_utils.py RENAMED Viewed

@@ -26,7 +26,7 @@ from errno import ENOENT
 from io import BytesIO
 from pathlib import Path
 from shutil import copyfile
-from typing import Generator, Literal, Optional
+from typing import Generator, Literal, Optional, Union
 from lazy_imports import try_import
 from numpy import uint8
@@ -46,6 +46,7 @@ with try_import() as pt_import_guard:
 __all__ = [
     "decrypt_pdf_document",
+    "decrypt_pdf_document_from_bytes",
     "get_pdf_file_reader",
     "get_pdf_file_writer",
     "PDFStreamer",
@@ -68,7 +69,6 @@ def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
     :param path: A path to the pdf file
     :return: True if document has been successfully decrypted
     """
     if qpdf_available():
         path_base, file_name = os.path.split(path)
         file_name_tmp = os.path.splitext(file_name)[0] + "tmp.pdf"
@@ -86,41 +86,69 @@ def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
     return False
-def get_pdf_file_reader(path: PathLikeOrStr) -> PdfReader:
+def decrypt_pdf_document_from_bytes(input_bytes: bytes) -> bytes:
+    """
+    Decrypting a pdf given as bytes. Under the hood, it saves the bytes to a temporary file and then calls
+    qpdf: <http://qpdf.sourceforge.net/>
+    :param input_bytes: A bytes object representing the pdf file
+    :return: The decrypted bytes object
+    """
+    with save_tmp_file(input_bytes, "pdf_") as (_, input_file_name):
+        is_decrypted = decrypt_pdf_document(input_file_name)
+        if is_decrypted:
+            with open(input_file_name, "rb") as file:
+                return file.read()
+        else:
+            logger.error(LoggingRecord("pdf bytes cannot be decrypted and therefore cannot be processed further."))
+            sys.exit()
+def get_pdf_file_reader(path_or_bytes: Union[PathLikeOrStr, bytes]) -> PdfReader:
     """
     Creates a file reader object from a pdf document. Will try to decrypt the document if it is
     encrypted. (See `decrypt_pdf_document` to understand what is meant with "decrypt").
-    :param path: A path to a pdf document
+    :param path_or_bytes: A path to a pdf document
     :return: A file reader object from which you can iterate through the document.
     """
-    if not os.path.isfile(path):
-        raise FileNotFoundError(str(path))
-    file_name = os.path.split(path)[1]
+    if isinstance(path_or_bytes, bytes):
+        try:
+            reader = PdfReader(BytesIO(path_or_bytes))
+        except (errors.PdfReadError, AttributeError):
+            decrypted_bytes = decrypt_pdf_document_from_bytes(path_or_bytes)
+            reader = PdfReader(BytesIO(decrypted_bytes))
+        return reader
+    if not os.path.isfile(path_or_bytes):
+        raise FileNotFoundError(str(path_or_bytes))
+    file_name = os.path.split(path_or_bytes)[1]
     if not is_file_extension(file_name, ".pdf"):
         raise FileExtensionError(f"must be a pdf file: {file_name}")
-    with open(path, "rb") as file:
+    with open(path_or_bytes, "rb") as file:
         qpdf_called = False
         try:
-            input_pdf_as_bytes = PdfReader(file)
+            reader = PdfReader(file)
         except (errors.PdfReadError, AttributeError):
-            _ = decrypt_pdf_document(path)
+            _ = decrypt_pdf_document(path_or_bytes)
             qpdf_called = True
         if not qpdf_called:
-            if input_pdf_as_bytes.is_encrypted:
-                is_decrypted = decrypt_pdf_document(path)
+            if reader.is_encrypted:
+                is_decrypted = decrypt_pdf_document(path_or_bytes)
                 if not is_decrypted:
                     logger.error(
                         LoggingRecord(
-                            f"pdf document {path} cannot be decrypted and therefore cannot be " f"processed further."
+                            f"pdf document {path_or_bytes} cannot be decrypted and therefore cannot "
+                            f"be processed further."
                         )
                     )
                     sys.exit()
-    return PdfReader(os.fspath(path))
+    return PdfReader(os.fspath(path_or_bytes))
 def get_pdf_file_writer() -> PdfWriter:
@@ -157,11 +185,11 @@ class PDFStreamer:
     """
-    def __init__(self, path: PathLikeOrStr) -> None:
+    def __init__(self, path_or_bytes: Union[PathLikeOrStr, bytes]) -> None:
         """
-        :param path: to a pdf.
+        :param path_or_bytes: to a pdf.
         """
-        self.file_reader = get_pdf_file_reader(path)
+        self.file_reader = get_pdf_file_reader(path_or_bytes)
         self.file_writer = PdfWriter()
     def __len__(self) -> int:

{deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection/utils/viz.py RENAMED Viewed

@@ -312,6 +312,7 @@ class VizPackageHandler:
             "interactive_imshow": "_cv2_interactive_imshow",
             "encode": "_cv2_encode",
             "rotate_image": "_cv2_rotate_image",
+            "convert_bytes_to_np": "_cv2_convert_bytes_to_np",
         },
         "pillow": {
             "read_image": "_pillow_read_image",
@@ -325,6 +326,7 @@ class VizPackageHandler:
             "interactive_imshow": "_pillow_interactive_imshow",
             "encode": "_pillow_encode",
             "rotate_image": "_pillow_rotate_image",
+            "convert_bytes_to_np": "_pillow_convert_bytes_to_np",
         },
     }
@@ -484,6 +486,37 @@ class VizPackageHandler:
         pil_image = Image.open(im_file)
         return np.array(pil_image)[:, :, ::-1]
+    def convert_bytes_to_np(self, image_bytes: bytes) -> PixelValues:
+        """Converting an image as bytes into np.array
+        :param image_bytes: Image as np.array
+        """
+        return getattr(self, self.pkg_func_dict["convert_bytes_to_np"])(image_bytes)
+    @staticmethod
+    def _cv2_convert_bytes_to_np(image_bytes: bytes) -> PixelValues:
+        """
+        Convert image bytes to a numpy array using OpenCV.
+        :param image_bytes: Image bytes
+        :return: Image as numpy array
+        """
+        np_array = np.frombuffer(image_bytes, np.uint8)
+        np_image = cv2.imdecode(np_array, cv2.IMREAD_COLOR)
+        return np_image
+    @staticmethod
+    def _pillow_convert_bytes_to_np(image_bytes: bytes) -> PixelValues:
+        """
+        Convert image bytes to a numpy array using Pillow.
+        :param image_bytes: Image bytes
+        :return: Image as numpy array
+        """
+        image = Image.open(BytesIO(image_bytes))
+        np_image = np.array(image)
+        return np_image
     def resize(self, image: PixelValues, width: int, height: int, interpolation: str) -> PixelValues:
         """
         Resize a given image to new width, height. Specifying an interpolation method is required. Depending on the

{deepdoctection-0.36 → deepdoctection-0.37.1}/deepdoctection.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepdoctection
-Version: 0.36
+Version: 0.37.1
 Summary: Repository for Document AI
 Home-page: https://github.com/deepdoctection/deepdoctection
 Author: Dr. Janis Meyer

{deepdoctection-0.36 → deepdoctection-0.37.1}/setup.py RENAMED Viewed

@@ -215,7 +215,7 @@ dev_deps = deps_list(
 EXTRA_DEPS = {
     "tf": tf_deps,
     "pt": pt_deps,
-#    "source-pt": source_pt_deps,
+    #"source-pt": source_pt_deps,
     "docs": docs_deps,
     "dev": dev_deps,
     "test": test_deps,

{deepdoctection-0.36 → deepdoctection-0.37.1}/LICENSE RENAMED Viewed

File without changes

{deepdoctection-0.36 → deepdoctection-0.37.1}/README.md RENAMED Viewed

File without changes

deepdoctection 0.36__tar.gz → 0.37.1__tar.gz

Potentially problematic release.

deepdoctection 0.36__tar.gz → 0.37.1__tar.gz