deepdoctection 0.35__py3-none-any.whl → 0.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -25,6 +25,7 @@ from copy import copy
 from typing import Any, Mapping, Optional, Sequence, Type, TypedDict, Union, no_type_check
 
 import numpy as np
+from typing_extensions import LiteralString
 
 from ..utils.error import AnnotationError, ImageError
 from ..utils.logger import LoggingRecord, logger
@@ -40,10 +41,12 @@ from ..utils.settings import (
     WordType,
     get_type,
 )
+from ..utils.transform import ResizeTransform
 from ..utils.types import HTML, AnnotationDict, Chunks, ImageDict, PathLikeOrStr, PixelValues, Text_, csv
 from ..utils.viz import draw_boxes, interactive_imshow, viz_handler
 from .annotation import CategoryAnnotation, ContainerAnnotation, ImageAnnotation, ann_from_dict
 from .box import BoundingBox, crop_box_from_image
+from .convert import box_to_point4, point4_to_box
 from .image import Image
 
 
@@ -101,7 +104,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
             return np_image
         raise AnnotationError(f"base_page.image is None for {self.annotation_id}")
 
-    def __getattr__(self, item: str) -> Optional[Union[str, int, list[str]]]:
+    def __getattr__(self, item: str) -> Optional[Union[str, int, list[str], list[ImageAnnotationBaseView]]]:
         """
         Get attributes defined by registered `self.get_attribute_names()` in a multi step process:
 
@@ -126,6 +129,9 @@ class ImageAnnotationBaseView(ImageAnnotation):
             if isinstance(sub_cat, ContainerAnnotation):
                 return sub_cat.value
             return sub_cat.category_id
+        if item in self.relationships:
+            relationship_ids = self.get_relationship(get_type(item))
+            return self.base_page.get_annotation(annotation_ids=relationship_ids)
         if self.image is not None:
             if item in self.image.summary.sub_categories:
                 sub_cat = self.get_summary(get_type(item))
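The new branch above lets attribute lookup fall through to relationships: if the requested name is a registered relationship, the linked annotations are pulled from the base page. A hedged sketch of what this enables, assuming the pipeline has written a `Relationships.LAYOUT_LINK` relationship and that the enum member maps to the attribute name `layout_link` (both assumptions, not shown in this diff):

# `word` is any ImageAnnotationBaseView from a parsed page, e.g. page.layouts[0].words[0]
linked = word.layout_link  # resolved via get_relationship() and base_page.get_annotation()
if linked:
    print([ann.category_name for ann in linked])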
@@ -165,7 +171,11 @@ class Word(ImageAnnotationBaseView):
     """
 
     def get_attribute_names(self) -> set[str]:
-        return set(WordType).union(super().get_attribute_names()).union({Relationships.READING_ORDER})
+        return (
+            set(WordType)
+            .union(super().get_attribute_names())
+            .union({Relationships.READING_ORDER, Relationships.LAYOUT_LINK})
+        )
 
 
 class Layout(ImageAnnotationBaseView):
@@ -246,7 +256,11 @@ class Layout(ImageAnnotationBaseView):
     }
 
     def get_attribute_names(self) -> set[str]:
-        return {"words", "text"}.union(super().get_attribute_names()).union({Relationships.READING_ORDER})
+        return (
+            {"words", "text"}
+            .union(super().get_attribute_names())
+            .union({Relationships.READING_ORDER, Relationships.LAYOUT_LINK})
+        )
 
     def __len__(self) -> int:
         """len of text counted by number of characters"""
@@ -433,8 +447,8 @@ class ImageDefaults(TypedDict):
     """ImageDefaults"""
 
     text_container: LayoutType
-    floating_text_block_categories: tuple[LayoutType, ...]
-    text_block_categories: tuple[LayoutType, ...]
+    floating_text_block_categories: tuple[Union[LayoutType, CellType], ...]
+    text_block_categories: tuple[Union[LayoutType, CellType], ...]
 
 
 IMAGE_DEFAULTS: ImageDefaults = {
@@ -448,9 +462,13 @@ IMAGE_DEFAULTS: ImageDefaults = {
     "text_block_categories": (
         LayoutType.TEXT,
         LayoutType.TITLE,
-        LayoutType.FIGURE,
         LayoutType.LIST,
         LayoutType.CELL,
+        LayoutType.FIGURE,
+        CellType.COLUMN_HEADER,
+        CellType.PROJECTED_ROW_HEADER,
+        CellType.SPANNING,
+        CellType.ROW_HEADER,
     ),
 }
 
@@ -510,6 +528,8 @@ class Page(Image):
         "document_id",
         "page_number",
         "angle",
+        "figures",
+        "residual_layouts",
     }
     include_residual_text_container: bool = True
 
@@ -608,6 +628,41 @@ class Page(Image):
         """
         return self.get_annotation(category_names=LayoutType.TABLE)
 
+    @property
+    def figures(self) -> list[ImageAnnotationBaseView]:
+        """
+        A list of all figures.
+        """
+        return self.get_annotation(category_names=LayoutType.FIGURE)
+
+    @property
+    def residual_layouts(self) -> list[ImageAnnotationBaseView]:
+        """
+        A list of all residual layouts. Residual layouts are all layouts that are
+        - not floating text blocks,
+        - not text containers,
+        - not tables,
+        - not figures,
+        - not cells,
+        - not rows,
+        - not columns
+        """
+        return self.get_annotation(category_names=self._get_residual_layout())
+
+    def _get_residual_layout(self) -> list[LiteralString]:
+        layouts = copy(list(self.floating_text_block_categories))
+        layouts.extend(
+            [
+                LayoutType.TABLE,
+                LayoutType.FIGURE,
+                self.text_container,
+                LayoutType.CELL,
+                LayoutType.ROW,
+                LayoutType.COLUMN,
+            ]
+        )
+        return [layout for layout in LayoutType if layout not in layouts]
+
     @classmethod
     def from_image(
         cls,
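The two new `Page` properties make figures and left-over layout sections directly accessible. A minimal usage sketch, assuming a page produced by the standard analyzer (`get_dd_analyzer`, `analyze` and the `reset_state` iteration pattern are not part of this diff):

import deepdoctection as dd

analyzer = dd.get_dd_analyzer()
df = analyzer.analyze(path="sample.pdf")  # placeholder path
df.reset_state()
page = next(iter(df))

for figure in page.figures:  # every LayoutType.FIGURE annotation of the page
    print(figure.category_name, figure.bbox)

for layout in page.residual_layouts:  # layouts that are neither text blocks, tables, figures, cells, rows nor columns
    print(layout.category_name, layout.bbox)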
@@ -801,12 +856,15 @@ class Page(Image):
         self,
         show_tables: bool = True,
         show_layouts: bool = True,
+        show_figures: bool = False,
+        show_residual_layouts: bool = False,
         show_cells: bool = True,
         show_table_structure: bool = True,
         show_words: bool = False,
         show_token_class: bool = True,
         ignore_default_token_class: bool = False,
         interactive: bool = False,
+        scaled_width: int = 600,
         **debug_kwargs: str,
     ) -> Optional[PixelValues]:
         """
@@ -827,12 +885,14 @@ class Page(Image):
 
         :param show_tables: Will display all tables boxes as well as cells, rows and columns
         :param show_layouts: Will display all other layout components.
+        :param show_figures: Will display all figures
         :param show_cells: Will display cells within tables. (Only available if `show_tables=True`)
         :param show_table_structure: Will display rows and columns
         :param show_words: Will display bounding boxes around words labeled with token class and bio tag (experimental)
         :param show_token_class: Will display token class instead of token tags (i.e. token classes with tags)
         :param interactive: If set to True will open an interactive image, otherwise it will return a numpy array that
                             can be displayed differently.
+        :param scaled_width: Width of the image to display
         :param ignore_default_token_class: Will ignore displaying word bounding boxes with default or None token class
                                            label
         :return: If `interactive=False` will return a numpy array.
@@ -858,6 +918,11 @@ class Page(Image):
                 box_stack.append(item.bbox)
                 category_names_list.append(item.category_name.value)
 
+        if show_figures and not debug_kwargs:
+            for item in self.figures:
+                box_stack.append(item.bbox)
+                category_names_list.append(item.category_name.value)
+
         if show_tables and not debug_kwargs:
             for table in self.tables:
                 box_stack.append(table.bbox)
@@ -914,24 +979,34 @@ class Page(Image):
                 else:
                     category_names_list.append(word.token_tag.value if word.token_tag is not None else None)
 
+        if show_residual_layouts and not debug_kwargs:
+            for item in self.residual_layouts:
+                box_stack.append(item.bbox)
+                category_names_list.append(item.category_name.value)
+
         if self.image is not None:
+            scale_fx = scaled_width / self.width
+            scaled_height = int(self.height * scale_fx)
+            img = viz_handler.resize(self.image, scaled_width, scaled_height, "VIZ")
+
             if box_stack:
                 boxes = np.vstack(box_stack)
+                boxes = box_to_point4(boxes)
+                resizer = ResizeTransform(self.height, self.width, scaled_height, scaled_width, "VIZ")
+                boxes = resizer.apply_coords(boxes)
+                boxes = point4_to_box(boxes)
                 if show_words:
                     img = draw_boxes(
-                        self.image,
-                        boxes,
-                        category_names_list,
+                        np_image=img,
+                        boxes=boxes,
+                        category_names_list=category_names_list,
                         font_scale=1.0,
                         rectangle_thickness=4,
                     )
                 else:
-                    img = draw_boxes(self.image, boxes, category_names_list)
-                scale_fx, scale_fy = 1.3, 1.3
-                scaled_width, scaled_height = int(self.width * scale_fx), int(self.height * scale_fy)
-                img = viz_handler.resize(img, scaled_width, scaled_height, "VIZ")
-            else:
-                img = self.image
+                    img = draw_boxes(
+                        np_image=img, boxes=boxes, category_names_list=category_names_list, show_palette=False
+                    )
 
             if interactive:
                 interactive_imshow(img)
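`viz` now resizes the rendered page to `scaled_width` and maps all boxes through `ResizeTransform` instead of applying a fixed 1.3 scaling factor. Reusing the `page` object from the sketch above, a hedged example of the new switches (matplotlib is assumed for display):

np_image = page.viz(
    show_figures=True,           # new in 0.37
    show_residual_layouts=True,  # new in 0.37
    scaled_width=800,            # output width in pixels; boxes are rescaled to match
)

from matplotlib import pyplot as plt

plt.figure(figsize=(12, 17))
plt.axis("off")
plt.imshow(np_image)
plt.show()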
@@ -62,7 +62,7 @@ def dataflow_to_json(
     if highest_hierarchy_only:
 
         def _remove_hh(dp: Image) -> Image:
-            dp.remove_image_from_lower_hierachy()
+            dp.remove_image_from_lower_hierarchy()
            return dp
 
         df = MapData(df, _remove_hh)
@@ -71,8 +71,8 @@ https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeva
 
 
 def _summarize( # type: ignore
-    self, ap: int = 1, iouThr: float = 0.9, areaRng: str = "all", maxDets: int = 100
-) -> float:
+    self, ap: int = 1, iouThr: float = 0.9, areaRng: str = "all", maxDets: int = 100, per_category: bool = False
+) -> Union[float, list[float]]:
     # pylint: disable=C0103
     p = self.params
     iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}"
@@ -86,6 +86,36 @@ def _summarize( # type: ignore
 
     aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
     mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
+    if per_category:
+        if ap == 1:
+            s = self.eval["precision"]
+            num_classes = s.shape[2]
+            results_per_class = []
+            for idx in range(num_classes):
+                if iouThr is not None:
+                    s = self.eval["precision"]
+                    t = np.where(iouThr == p.iouThrs)[0]
+                    s = s[t]
+                precision = s[:, :, idx, aind, mind]
+                precision = precision[precision > -1]
+                res = np.mean(precision) if precision.size else float("nan")
+                results_per_class.append(float(res))
+                print(f"Precision for class {idx+1}: @[ IoU={iouStr} | area={areaRng} | maxDets={maxDets} ] = {res}")
+        else:
+            s = self.eval["recall"]
+            num_classes = s.shape[1]
+            results_per_class = []
+            for idx in range(num_classes):
+                if iouThr is not None:
+                    s = self.eval["recall"]
+                    t = np.where(iouThr == p.iouThrs)[0]
+                    s = s[t]
+                recall = s[:, idx, aind, mind]
+                recall = recall[recall > -1]
+                res = np.mean(recall) if recall.size else float("nan")
+                results_per_class.append(float(res))
+                print(f"Recall for class {idx+1}: @[ IoU={iouStr} | area={areaRng} | maxDets={maxDets} ] = {res}")
+        return results_per_class
     if ap == 1:
         # dimension of precision: [TxRxKxAxM]
         s = self.eval["precision"]
@@ -124,6 +154,7 @@ class CocoMetric(MetricBase):
     mapper = image_to_coco
     _f1_score = None
    _f1_iou = None
+    _per_category = False
     _params: dict[str, Union[list[int], list[list[int]]]] = {}
 
     @classmethod
@@ -176,18 +207,28 @@
 
         if cls._f1_score:
             summary_bbox = [
-                metric.summarize_f1(1, cls._f1_iou, maxDets=metric.params.maxDets[2]),
-                metric.summarize_f1(0, cls._f1_iou, maxDets=metric.params.maxDets[2]),
+                metric.summarize_f1(1, cls._f1_iou, maxDets=metric.params.maxDets[2], per_category=cls._per_category),
+                metric.summarize_f1(0, cls._f1_iou, maxDets=metric.params.maxDets[2], per_category=cls._per_category),
             ]
         else:
             metric.summarize()
             summary_bbox = metric.stats
 
         results = []
-        for params, value in zip(cls.get_summary_default_parameters(), summary_bbox):
+
+        default_parameters = cls.get_summary_default_parameters()
+        if cls._per_category:
+            default_parameters = default_parameters * len(summary_bbox[0])
+            summary_bbox = [item for pair in zip(*summary_bbox) for item in pair]
+        val = 0
+        for idx, (params, value) in enumerate(zip(default_parameters, summary_bbox)):
             params = copy(params)
             params["mode"] = "bbox"
             params["val"] = value
+            if cls._per_category:
+                if idx % 2 == 0:
+                    val += 1
+                params["category_id"] = val
             results.append(params)
 
         return results
@@ -201,15 +242,16 @@
         area range and maximum detections.
         """
         if cls._f1_score:
+            for el, idx in zip(_F1_DEFAULTS, [2, 2]):
+                if cls._params:
+                    if cls._params.get("maxDets") is not None:
+                        el["maxDets"] = cls._params["maxDets"][idx]
+                el["iouThr"] = cls._f1_iou
+            return _F1_DEFAULTS
+
+        for el, idx in zip(_COCOEVAL_DEFAULTS, _MAX_DET_INDEX):
             if cls._params:
                 if cls._params.get("maxDets") is not None:
-                    for el, idx in zip(_F1_DEFAULTS, [2, 2]):
-                        el["maxDets"] = cls._params["maxDets"][idx]
-                        el["iouThr"] = cls._f1_iou
-                    return _F1_DEFAULTS
-        if cls._params:
-            if cls._params.get("maxDets") is not None:
-                for el, idx in zip(_COCOEVAL_DEFAULTS, _MAX_DET_INDEX):
                     el["maxDets"] = cls._params["maxDets"][idx]
         return _COCOEVAL_DEFAULTS
 
@@ -220,13 +262,16 @@
         area_range: Optional[list[list[int]]] = None,
         f1_score: bool = False,
         f1_iou: float = 0.9,
+        per_category: bool = False,
     ) -> None:
         """
         Setting params for different coco metric modes.
 
         :param max_detections: The maximum number of detections to consider
         :param area_range: The area range to classify objects as "all", "small", "medium" and "large"
-        :param f1_score: Will use f1 score setting with default iouThr 0.9
+        :param f1_score: Will use f1 score setting with default iouThr 0.9. To be more precise it does not calculate
+                         the f1 score but the precision and recall for a given iou threshold. Use the harmonic mean to
+                         get the ultimate f1 score.
         :param f1_iou: Use with f1_score True and reset the f1 iou threshold
         """
         if max_detections is not None:
@@ -238,6 +283,7 @@
 
         cls._f1_score = f1_score
         cls._f1_iou = f1_iou
+        cls._per_category = per_category
 
     @classmethod
     def get_requirements(cls) -> list[Requirement]:
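The per-category switch threads from `set_params` through `summarize_f1` into the patched `_summarize`, which then reports one precision and one recall value per class. A sketch of the caller side; the top-level `dd.CocoMetric` re-export is an assumption, and the harmonic-mean step follows the hint in the `set_params` docstring rather than anything the metric computes itself:

import deepdoctection as dd

# report precision and recall per category at IoU 0.9 instead of one aggregate value
dd.CocoMetric.set_params(f1_score=True, f1_iou=0.9, per_category=True)


def f1(precision: float, recall: float) -> float:
    # harmonic mean of a precision/recall pair, as suggested by the docstring above
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0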
@@ -69,8 +69,7 @@ class ModelCategories:
         if self.init_categories:
             self._init_categories = MappingProxyType({key: get_type(val) for key, val in self.init_categories.items()})
         else:
-            if self._init_categories is None:
-                self._init_categories = MappingProxyType({})
+            self._init_categories = MappingProxyType({})
         self.categories = self._init_categories
 
     @overload
@@ -181,7 +180,7 @@ class NerModelCategories(ModelCategories):
             self._init_categories = self.merge_bio_semantics_categories(
                 self._categories_semantics, self._categories_bio
             )
-        super().__post_init__()
+        self.categories = self._init_categories
 
     @staticmethod
     def merge_bio_semantics_categories(
@@ -193,5 +193,7 @@ def match_anns_by_distance(
     child_anns = dp.get_annotation(annotation_ids=child_ann_ids, category_names=child_ann_category_names)
     child_centers = [block.get_bounding_box(dp.image_id).center for block in child_anns]
     parent_centers = [block.get_bounding_box(dp.image_id).center for block in parent_anns]
-    child_indices = distance.cdist(parent_centers, child_centers).argmin(axis=1)
-    return [(parent_anns[i], child_anns[j]) for i, j in enumerate(child_indices)]
+    if child_centers and parent_centers:
+        child_indices = distance.cdist(parent_centers, child_centers).argmin(axis=1)
+        return [(parent_anns[i], child_anns[j]) for i, j in enumerate(child_indices)]
+    return []
@@ -27,7 +27,7 @@ from typing import Mapping, Optional, Sequence, Union
 
 from lazy_imports import try_import
 
-from ..datapoint.convert import convert_pdf_bytes_to_np_array_v2
+from ..datapoint.convert import convert_bytes_to_np_array, convert_pdf_bytes_to_np_array_v2
 from ..datapoint.image import Image
 from ..utils.fs import get_load_image_func, load_image_from_file
 from ..utils.types import JsonDict
@@ -49,6 +49,7 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
 
     file_name: Optional[str]
     location: Optional[str]
+    image_bytes: Optional[bytes] = None
 
     if isinstance(dp, str):
         _, file_name = os.path.split(dp)
@@ -62,6 +63,7 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
         document_id = dp.get("document_id")
         if location == "":
             location = str(dp.get("path", ""))
+        image_bytes = dp.get("image_bytes")
     else:
         raise TypeError("datapoint not of expected type for converting to image")
 
@@ -76,6 +78,8 @@
     if dp_image.pdf_bytes is not None:
         if isinstance(dp_image.pdf_bytes, bytes):
             dp_image.image = convert_pdf_bytes_to_np_array_v2(dp_image.pdf_bytes, dpi=dpi)
+    elif image_bytes is not None:
+        dp_image.image = convert_bytes_to_np_array(image_bytes)
     else:
         dp_image.image = load_image_from_file(location)
 
@@ -23,31 +23,38 @@ import os
 from pathlib import Path
 from typing import List, Mapping, Optional, Sequence, Tuple, Union
 
-from ..dataflow import DataFlow, MapData
+from ..dataflow import CustomDataFromIterable, DataFlow, DataFromList, MapData
 from ..dataflow.custom_serialize import SerializerFiles, SerializerPdfDoc
 from ..datapoint.image import Image
 from ..datapoint.view import IMAGE_DEFAULTS
 from ..mapper.maputils import curry
 from ..mapper.misc import to_image
 from ..utils.fs import maybe_path_or_pdf
+from ..utils.identifier import get_uuid_from_str
 from ..utils.logger import LoggingRecord, logger
+from ..utils.pdf_utils import PDFStreamer
 from ..utils.types import PathLikeOrStr
+from ..utils.utils import is_file_extension
 from .base import Pipeline, PipelineComponent
 from .common import PageParsingService
 
 
 def _collect_from_kwargs(
-    **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
-) -> Tuple[Optional[str], Optional[str], bool, int, str, DataFlow]:
+    **kwargs: Union[Optional[str], bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
+) -> Tuple[Optional[str], Union[str, Sequence[str]], bool, int, str, DataFlow, Optional[bytes]]:
+    b_bytes = kwargs.get("bytes")
     dataset_dataflow = kwargs.get("dataset_dataflow")
     path = kwargs.get("path")
     if path is None and dataset_dataflow is None:
         raise ValueError("Pass either path or dataset_dataflow as argument")
+    if path is None and b_bytes:
+        raise ValueError("When passing bytes, a path to the source document must be provided")
 
     shuffle = kwargs.get("shuffle", False)
     if not isinstance(shuffle, bool):
         raise TypeError(f"shuffle must be of type bool but is of type {type(shuffle)}")
 
+    file_type = None
     doc_path = None
     if path:
         if not isinstance(path, (str, Path)):
@@ -56,15 +63,27 @@ def _collect_from_kwargs(
         if path_type == 2:
             doc_path = path
             path = None
+            file_type = ".pdf"
+        elif path_type == 3:
+            if is_file_extension(path, ".jpg"):
+                file_type = ".jpg"
+            if is_file_extension(path, ".png"):
+                file_type = ".png"
+            if is_file_extension(path, ".jpeg"):
+                file_type = ".jpeg"
+            if not b_bytes:
+                raise ValueError("When passing a path to a single image, bytes of the image must be passed")
         elif not path_type:
             raise ValueError("Pass only a path to a directory or to a pdf file")
 
-    file_type = kwargs.get("file_type", [".jpg", ".png", ".tif"])
+    file_type = kwargs.get(
+        "file_type", [".jpg", ".png", ".jpeg", ".tif"] if file_type is None else file_type  # type: ignore
+    )
 
     max_datapoints = kwargs.get("max_datapoints")
     if not isinstance(max_datapoints, (int, type(None))):
         raise TypeError(f"max_datapoints must be of type int, but is of type {type(max_datapoints)}")
-    return path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow  # type: ignore
+    return path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow, b_bytes  # type: ignore
 
 
 @curry
@@ -142,12 +161,18 @@ class DoctectionPipe(Pipeline):
 
         super().__init__(pipeline_component_list)
 
-    def _entry(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
-        path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow = _collect_from_kwargs(**kwargs)
+    def _entry(self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) \
+            -> DataFlow:
+        path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow, b_bytes = _collect_from_kwargs(**kwargs)
 
         df: DataFlow
 
-        if isinstance(path, (str, Path)):
+        if isinstance(b_bytes, bytes):
+            df = DoctectionPipe.bytes_to_dataflow(path=doc_path if path is None else path,
+                                                  b_bytes=b_bytes,
+                                                  file_type=file_type)
+
+        elif isinstance(path, (str, Path)):
             if not isinstance(file_type, (str, list)):
                 raise TypeError(f"file_type must be of type string or list, but is of type {type(file_type)}")
             df = DoctectionPipe.path_to_dataflow(path=path, file_type=file_type, shuffle=shuffle)
@@ -162,7 +187,7 @@
 
         df = MapData(df, _proto_process(path, doc_path))
         if dataset_dataflow is None:
-            df = MapData(df, _to_image(dpi=300))  # pylint: disable=E1120
+            df = MapData(df, _to_image(dpi=os.environ.get("DPI", 300)))  # pylint: disable=E1120
         return df
 
     @staticmethod
@@ -197,6 +222,44 @@
         """
         return _doc_to_dataflow(path, max_datapoints)
 
+    @staticmethod
+    def bytes_to_dataflow(
+        path: str, b_bytes: bytes, file_type: Union[str, Sequence[str]], max_datapoints: Optional[int] = None
+    ) -> DataFlow:
+        """
+        Converts a bytes object to a dataflow
+
+        :param path: path to directory or an image file
+        :param b_bytes: bytes object
+        :param file_type: e.g. ".pdf", ".jpg" or [".jpg", ".png", ".jpeg", ".tif"]
+        :param max_datapoints: max number of datapoints to consider
+        :return: DataFlow
+        """
+
+        file_name = os.path.split(path)[1]
+        if isinstance(file_type, str):
+            if file_type == ".pdf":
+                prefix, suffix = os.path.splitext(file_name)
+                df: DataFlow
+                df = CustomDataFromIterable(PDFStreamer(path_or_bytes=b_bytes), max_datapoints=max_datapoints)
+                df = MapData(
+                    df,
+                    lambda dp: {
+                        "path": path,
+                        "file_name": prefix + f"_{dp[1]}" + suffix,
+                        "pdf_bytes": dp[0],
+                        "page_number": dp[1],
+                        "document_id": get_uuid_from_str(prefix),
+                    },
+                )
+            else:
+                df = DataFromList(lst=[{"path": path, "file_name": file_name, "image_bytes": b_bytes}])
+            return df
+        raise ValueError(
+            f"pass: {path}, b_bytes: {b_bytes!r}, file_type: {file_type} and max_datapoints: {max_datapoints} "
+            f"not supported"
+        )
+
     def dataflow_to_page(self, df: DataFlow) -> DataFlow:
         """
         Converts a dataflow of images to a dataflow of pages
@@ -206,7 +269,9 @@
         """
         return self.page_parser.predict_dataflow(df)
 
-    def analyze(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
+    def analyze(
+        self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
+    ) -> DataFlow:
         """
         `kwargs key dataset_dataflow:` Transfer a dataflow of a dataset via its dataflow builder
 
@@ -215,6 +280,8 @@
                               only the first page is processed through the pipeline.
                               Alternatively, a path to a pdf document with multiple pages.
 
+        `kwargs key bytes:` A bytes object of an image
+
         `kwargs key file_type:` Selection of the file type, if: args:`file_type` is passed
 
         `kwargs key max_datapoints:` Stops processing as soon as max_datapoints images have been processed
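With the new `bytes` kwarg a document that is already in memory can be fed to the pipeline; the path is still required so that file name and type can be derived. A minimal sketch (the analyzer construction is assumed and not part of this diff):

from pathlib import Path

import deepdoctection as dd

analyzer = dd.get_dd_analyzer()

pdf_path = "sample.pdf"  # placeholder; used only to derive file name and type
pdf_bytes = Path(pdf_path).read_bytes()

df = analyzer.analyze(path=pdf_path, bytes=pdf_bytes)
df.reset_state()
for page in df:
    print(page.file_name, len(page.text))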
@@ -227,20 +227,21 @@ def get_load_image_func(
 
 def maybe_path_or_pdf(path: PathLikeOrStr) -> int:
     """
-    Checks if the path points to a directory or a pdf document. Returns 1 if the path points to a directory, 2
-    if the path points to a pdf doc or 0, if none of the previous is true.
+    Checks if the path points to a directory, a pdf document or a single image. Returns 1 if the path points to a
+    directory, 2 if the path points to a pdf doc and 3 if path points to either a PNG, JPG or JPEG or 0 if none of the
+    previous is true.
 
     :param path: A path
-    :return: A value of 0,1,2
+    :return: A value of 0,1,2,3
     """
 
-    is_dir = os.path.isdir(path)
-    if is_dir:
+    if os.path.isdir(path):
         return 1
     file_name = os.path.split(path)[1]
-    is_pdf = is_file_extension(file_name, ".pdf")
-    if is_pdf:
+    if is_file_extension(file_name, ".pdf"):
         return 2
+    if is_file_extension(file_name, [".png", ".jpeg", ".jpg", ".tif"]):
+        return 3
     return 0
 
 