deepdoctection 0.32__tar.gz → 0.33__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic.
- {deepdoctection-0.32 → deepdoctection-0.33}/PKG-INFO +4 -4
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/__init__.py +3 -23
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/analyzer/dd.py +47 -42
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/common.py +9 -5
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/custom.py +5 -5
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/custom_serialize.py +75 -18
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/parallel_map.py +3 -3
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/serialize.py +4 -4
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/stats.py +3 -3
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datapoint/annotation.py +39 -55
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datapoint/box.py +7 -7
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datapoint/convert.py +6 -6
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datapoint/image.py +43 -37
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datapoint/view.py +175 -151
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/adapter.py +30 -24
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/base.py +9 -9
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/dataflow_builder.py +3 -3
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/info.py +23 -25
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/doclaynet.py +48 -49
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/fintabnet.py +44 -45
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/funsd.py +23 -23
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/iiitar13k.py +8 -8
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/layouttest.py +2 -2
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/publaynet.py +3 -3
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/pubtables1m.py +18 -18
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/pubtabnet.py +30 -29
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/rvlcdip.py +28 -29
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/xfund.py +24 -25
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/save.py +6 -6
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/accmetric.py +32 -33
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/base.py +8 -9
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/cocometric.py +13 -12
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/eval.py +26 -26
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/tedsmetric.py +16 -12
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/tp_eval_callback.py +7 -16
- deepdoctection-0.33/deepdoctection/extern/base.py +644 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/d2detect.py +69 -89
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/deskew.py +11 -10
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/doctrocr.py +81 -64
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/fastlang.py +23 -16
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/hfdetr.py +53 -38
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/hflayoutlm.py +216 -155
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/hflm.py +35 -30
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/model.py +432 -255
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/pdftext.py +15 -15
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/pt/ptutils.py +4 -2
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tessocr.py +39 -38
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/texocr.py +14 -16
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tfutils.py +16 -2
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpcompat.py +11 -7
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tpdetect.py +40 -45
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/cats.py +27 -29
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/cocostruct.py +10 -10
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/d2struct.py +20 -21
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/hfstruct.py +7 -7
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/laylmstruct.py +22 -24
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/maputils.py +9 -10
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/match.py +2 -2
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/misc.py +5 -6
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/pascalstruct.py +4 -4
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/prodigystruct.py +5 -5
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/pubstruct.py +84 -92
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/tpstruct.py +3 -3
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/xfundstruct.py +33 -33
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/anngen.py +12 -14
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/base.py +52 -106
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/common.py +63 -52
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/concurrency.py +14 -10
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/doctectionpipe.py +24 -21
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/language.py +20 -25
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/layout.py +18 -16
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/lm.py +49 -47
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/order.py +63 -65
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/refine.py +102 -109
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/segment.py +156 -161
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/sub_layout.py +49 -39
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/text.py +37 -36
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/transform.py +19 -16
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/train/d2_frcnn_train.py +27 -25
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/train/hf_detr_train.py +22 -18
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/train/hf_layoutlm_train.py +49 -48
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/train/tp_frcnn_train.py +10 -11
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/concurrency.py +1 -1
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/context.py +13 -6
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/develop.py +4 -4
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/env_info.py +51 -13
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/file_utils.py +6 -11
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/fs.py +22 -18
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/identifier.py +2 -2
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/logger.py +15 -15
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/metacfg.py +7 -7
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/pdf_utils.py +11 -11
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/settings.py +185 -182
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/tqdm.py +1 -1
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/transform.py +14 -9
- deepdoctection-0.33/deepdoctection/utils/types.py +104 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/utils.py +7 -7
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/viz.py +70 -69
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection.egg-info/PKG-INFO +4 -4
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection.egg-info/SOURCES.txt +1 -1
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection.egg-info/requires.txt +3 -3
- {deepdoctection-0.32 → deepdoctection-0.33}/setup.cfg +3 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/setup.py +1 -1
- deepdoctection-0.32/deepdoctection/extern/base.py +0 -439
- deepdoctection-0.32/deepdoctection/utils/detection_types.py +0 -68
- {deepdoctection-0.32 → deepdoctection-0.33}/LICENSE +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/README.md +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/analyzer/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/configs/conf_dd_one.yaml +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/base.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datapoint/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/datasets/registry.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/eval/registry.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/pt/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/pt/nms.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/mapper/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/pipe/registry.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/py.typed +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/train/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/__init__.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/error.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/utils/mocks.py +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection.egg-info/top_level.txt +0 -0
- {deepdoctection-0.32 → deepdoctection-0.33}/tests/test_utils.py +0 -0
{deepdoctection-0.32 → deepdoctection-0.33}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepdoctection
-Version: 0.32
+Version: 0.33
 Summary: Repository for Document AI
 Home-page: https://github.com/deepdoctection/deepdoctection
 Author: Dr. Janis Meyer
@@ -23,7 +23,7 @@ Requires-Dist: jsonlines==3.1.0
 Requires-Dist: lazy-imports==0.3.1
 Requires-Dist: mock==4.0.3
 Requires-Dist: networkx>=2.7.1
-Requires-Dist: numpy
+Requires-Dist: numpy<2.0,>=1.21
 Requires-Dist: packaging>=20.0
 Requires-Dist: Pillow>=10.0.0
 Requires-Dist: pypdf>=3.16.0
@@ -40,7 +40,7 @@ Requires-Dist: jsonlines==3.1.0; extra == "tf"
 Requires-Dist: lazy-imports==0.3.1; extra == "tf"
 Requires-Dist: mock==4.0.3; extra == "tf"
 Requires-Dist: networkx>=2.7.1; extra == "tf"
-Requires-Dist: numpy; extra == "tf"
+Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
 Requires-Dist: packaging>=20.0; extra == "tf"
 Requires-Dist: Pillow>=10.0.0; extra == "tf"
 Requires-Dist: pypdf>=3.16.0; extra == "tf"
@@ -70,7 +70,7 @@ Requires-Dist: jsonlines==3.1.0; extra == "pt"
 Requires-Dist: lazy-imports==0.3.1; extra == "pt"
 Requires-Dist: mock==4.0.3; extra == "pt"
 Requires-Dist: networkx>=2.7.1; extra == "pt"
-Requires-Dist: numpy; extra == "pt"
+Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
 Requires-Dist: packaging>=20.0; extra == "pt"
 Requires-Dist: Pillow>=10.0.0; extra == "pt"
 Requires-Dist: pypdf>=3.16.0; extra == "pt"

{deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/__init__.py

@@ -25,7 +25,7 @@ from .utils.logger import LoggingRecord, logger
 
 # pylint: enable=wrong-import-position
 
-__version__ = 0.32
+__version__ = 0.33
 
 _IMPORT_STRUCTURE = {
     "analyzer": [
@@ -160,6 +160,8 @@ _IMPORT_STRUCTURE = {
         "EvalCallback",
     ],
     "extern": [
+        "ModelCategories",
+        "NerModelCategories",
         "PredictorBase",
         "DetectionResult",
         "ObjectDetector",
@@ -423,28 +425,6 @@ _IMPORT_STRUCTURE = {
 env_info = collect_env_info()
 logger.debug(LoggingRecord(msg=env_info))
 
-if os.environ.get("PYTORCH_AVAILABLE") and os.environ.get("DD_USE_TORCH") is None:
-    os.environ["DD_USE_TORCH"] = "1"
-    os.environ["USE_TORCH"] = "1"
-if os.environ.get("TENSORFLOW_AVAILABLE") and os.environ.get("DD_USE_TF") is None:
-    os.environ["DD_USE_TF"] = "1"
-    os.environ["USE_TF"] = "1"
-if os.environ.get("DD_USE_TORCH") and os.environ.get("DD_USE_TF"):
-    logger.warning(
-        "Both DD_USE_TORCH and DD_USE_TF are set. Defaulting to PyTorch. If you want a different "
-        "behaviour, set DD_USE_TORCH to None before importing deepdoctection."
-    )
-    os.environ.pop("DD_USE_TF")
-    os.environ.pop("USE_TF")
-
-if not os.environ.get("PYTORCH_AVAILABLE") and not os.environ.get("TENSORFLOW_AVAILABLE"):
-    logger.warning(
-        LoggingRecord(
-            msg="Neither Tensorflow or Pytorch are available. You will not be able to use any Deep Learning "
-            "model from the library."
-        )
-    )
-
 
 # Direct imports for type-checking
 if TYPE_CHECKING:

{deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/analyzer/dd.py

@@ -23,10 +23,12 @@ Module for **deep**doctection analyzer.
 -user factory with a reduced config setting
 """
 
+from __future__ import annotations
+
 import os
 from os import environ
 from shutil import copyfile
-from typing import
+from typing import Optional, Union
 
 from lazy_imports import try_import
 
@@ -50,7 +52,7 @@ from ..pipe.refine import TableSegmentationRefinementService
 from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
 from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
 from ..pipe.text import TextExtractionService
-from ..utils.
+from ..utils.env_info import ENV_VARS_TRUE
 from ..utils.error import DependencyError
 from ..utils.file_utils import detectron2_available, tensorpack_available
 from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
@@ -58,6 +60,7 @@ from ..utils.logger import LoggingRecord, logger
 from ..utils.metacfg import AttrDict, set_config_by_yaml
 from ..utils.settings import CellType, LayoutType
 from ..utils.transform import PadTransform
+from ..utils.types import PathLikeOrStr
 
 with try_import() as image_guard:
     from botocore.config import Config  # type: ignore
@@ -81,7 +84,7 @@ _TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"
 
 
 def maybe_copy_config_to_cache(
-    package_path:
+    package_path: PathLikeOrStr, configs_dir_path: PathLikeOrStr, file_name: str, force_copy: bool = True
 ) -> str:
     """
     Initial copying of various files
@@ -115,7 +118,7 @@ def config_sanity_checks(cfg: AttrDict) -> None:
 
 def build_detector(
     cfg: AttrDict, mode: str
-) -> Union[
+) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
     """Building a D2-Detector, a TP-Detector as Detr-Detector or a D2-Torch Tracing Detector according to
     the config
 
@@ -133,8 +136,8 @@ def build_detector(
     config_path = ModelCatalog.get_full_path_configs(weights)
     weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
     profile = ModelCatalog.get_profile(weights)
-    categories = profile.categories
-
+    categories = profile.categories if profile.categories is not None else {}
+
     if profile.model_wrapper in ("TPFrcnnDetector",):
         return TPFrcnnDetector(config_path, weights_path, categories, filter_categories=filter_categories)
     if profile.model_wrapper in ("D2FrcnnDetector",):
@@ -202,11 +205,13 @@ def build_sub_image_service(detector: ObjectDetector, cfg: AttrDict, mode: str)
     padder = None
     if mode == "ITEM":
         if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
-            exclude_category_ids.extend([
+            exclude_category_ids.extend([1, 3, 4, 5, 6])
             padder = build_padder(cfg, mode)
-    detect_result_generator = DetectResultGenerator(
+    detect_result_generator = DetectResultGenerator(
+        categories=detector.categories.categories, exclude_category_ids=exclude_category_ids
+    )
     return SubImageLayoutService(
-        detector, [LayoutType.
+        detector, [LayoutType.TABLE, LayoutType.TABLE_ROTATED], None, detect_result_generator, padder
     )
 
 
@@ -233,9 +238,9 @@ def build_ocr(cfg: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer,
         )
     if cfg.OCR.USE_TEXTRACT:
         credentials_kwargs = {
-            "aws_access_key_id": environ.get("ACCESS_KEY"),
-            "aws_secret_access_key": environ.get("SECRET_KEY"),
-            "config": Config(region_name=environ.get("REGION")),
+            "aws_access_key_id": environ.get("ACCESS_KEY", None),
+            "aws_secret_access_key": environ.get("SECRET_KEY", None),
+            "config": Config(region_name=environ.get("REGION", None)),
         }
         return TextractOcrDetector(**credentials_kwargs)
     raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")
@@ -260,7 +265,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
     :param cfg: A configuration
     :return: Analyzer pipeline
     """
-    pipe_component_list:
+    pipe_component_list: list[PipelineComponent] = []
 
    if cfg.USE_LAYOUT:
         d_layout = build_detector(cfg, "LAYOUT")
@@ -300,22 +305,22 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
             cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
             cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
             cfg.SEGMENTATION.CELL_CATEGORY_ID,
-            LayoutType.
+            LayoutType.TABLE,
             [
-                CellType.
-                CellType.
-                CellType.
-                CellType.
-                LayoutType.
+                CellType.SPANNING,
+                CellType.ROW_HEADER,
+                CellType.COLUMN_HEADER,
+                CellType.PROJECTED_ROW_HEADER,
+                LayoutType.CELL,
             ],
             [
-                CellType.
-                CellType.
-                CellType.
-                CellType.
+                CellType.SPANNING,
+                CellType.ROW_HEADER,
+                CellType.COLUMN_HEADER,
+                CellType.PROJECTED_ROW_HEADER,
             ],
-            [LayoutType.
-            [CellType.
+            [LayoutType.ROW, LayoutType.COLUMN],
+            [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
             stretch_rule=cfg.SEGMENTATION.STRETCH_RULE,
         )
         pipe_component_list.append(pubtables)
@@ -327,23 +332,23 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
             cfg.SEGMENTATION.FULL_TABLE_TILING,
             cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
             cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
-            LayoutType.
-            [CellType.
-            [LayoutType.
-            [CellType.
+            LayoutType.TABLE,
+            [CellType.HEADER, CellType.BODY, LayoutType.CELL],
+            [LayoutType.ROW, LayoutType.COLUMN],
+            [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
             cfg.SEGMENTATION.STRETCH_RULE,
         )
         pipe_component_list.append(table_segmentation)
 
     if cfg.USE_TABLE_REFINEMENT:
         table_segmentation_refinement = TableSegmentationRefinementService(
-            [LayoutType.
+            [LayoutType.TABLE, LayoutType.TABLE_ROTATED],
             [
-                LayoutType.
-                CellType.
-                CellType.
-                CellType.
-                CellType.
+                LayoutType.CELL,
+                CellType.COLUMN_HEADER,
+                CellType.PROJECTED_ROW_HEADER,
+                CellType.SPANNING,
+                CellType.ROW_HEADER,
             ],
         )
         pipe_component_list.append(table_segmentation_refinement)
@@ -363,7 +368,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
 
     ocr = build_ocr(cfg)
     skip_if_text_extracted = cfg.USE_PDF_MINER
-    extract_from_roi = LayoutType.
+    extract_from_roi = LayoutType.WORD if cfg.OCR.USE_DOCTR else None
     text = TextExtractionService(
         ocr, skip_if_text_extracted=skip_if_text_extracted, extract_from_roi=extract_from_roi
     )
@@ -372,7 +377,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
     if cfg.USE_PDF_MINER or cfg.USE_OCR:
         match = MatchingService(
             parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
-            child_categories=LayoutType.
+            child_categories=LayoutType.WORD,
             matching_rule=cfg.WORD_MATCHING.RULE,
             threshold=cfg.WORD_MATCHING.THRESHOLD,
             max_parent_only=cfg.WORD_MATCHING.MAX_PARENT_ONLY,
@@ -380,7 +385,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
         pipe_component_list.append(match)
 
         order = TextOrderService(
-            text_container=LayoutType.
+            text_container=LayoutType.WORD,
             text_block_categories=cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES,
             floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
             include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
@@ -392,7 +397,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
         pipe_component_list.append(order)
 
     page_parsing_service = PageParsingService(
-        text_container=LayoutType.
+        text_container=LayoutType.WORD,
         floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
         include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
     )
@@ -403,8 +408,8 @@
 
 def get_dd_analyzer(
     reset_config_file: bool = True,
-    config_overwrite: Optional[
-    path_config_file: Optional[
+    config_overwrite: Optional[list[str]] = None,
+    path_config_file: Optional[PathLikeOrStr] = None,
 ) -> DoctectionPipe:
     """
     Factory function for creating the built-in **deep**doctection analyzer.
@@ -431,7 +436,7 @@ def get_dd_analyzer(
     :return: A DoctectionPipe instance with given configs
     """
     config_overwrite = [] if config_overwrite is None else config_overwrite
-    lib = "TF" if os.environ.get("DD_USE_TF") else "PT"
+    lib = "TF" if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE else "PT"
     if lib == "TF":
         device = get_tf_device()
     elif lib == "PT":
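The last hunk above swaps a plain truthiness test for a membership test against `ENV_VARS_TRUE`. Any non-empty string is truthy in Python, so the old check treated even `DD_USE_TF=0` as a request for TensorFlow. A minimal sketch of the difference, assuming `ENV_VARS_TRUE` is a set of accepted truthy spellings (the real constant lives in `deepdoctection/utils/env_info.py`; its exact members are an assumption here):

```python
import os

# Assumed shape of the constant from deepdoctection.utils.env_info.
ENV_VARS_TRUE = {"1", "ON", "YES", "TRUE"}

os.environ["DD_USE_TF"] = "0"

# Old check: any non-empty value, including "0", selects TF.
lib_old = "TF" if os.environ.get("DD_USE_TF") else "PT"

# New check: only an explicit truthy spelling selects TF.
lib_new = "TF" if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE else "PT"

print(lib_old, lib_new)  # TF PT
```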

{deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/common.py

@@ -12,7 +12,7 @@ Some DataFlow classes for transforming and processing datapoints. Many classes h
 """
 import itertools
 from copy import copy
-from typing import Any, Callable, Iterator,
+from typing import Any, Callable, Iterator, Union
 
 import tqdm
 
@@ -164,6 +164,10 @@ class RepeatedData(ProxyDataFlow):
             Set to -1 to repeat ``ds`` infinite times.
         """
         self.num = num
+        if self.num != -1:
+            self.dfs = itertools.tee(df, self.num)
+        else:
+            self.dfs = ()
         super().__init__(df)
 
     def __len__(self) -> int:
@@ -180,8 +184,8 @@ class RepeatedData(ProxyDataFlow):
             while True:
                 yield from self.df
         else:
-            for
-                yield from
+            for df in self.dfs:
+                yield from df
 
 
 class ConcatData(DataFlow):
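The `RepeatedData` change above pre-splits the wrapped dataflow with `itertools.tee`, so a finite repetition no longer requires the source to be re-iterable. A standalone sketch of the mechanism with plain iterators (no `DataFlow` machinery):

```python
import itertools

def repeated(source, num):
    # Mirrors the pattern above: num == -1 repeats forever (and needs a
    # re-iterable source); otherwise tee the stream num times up front
    # and drain the independent copies one after another.
    if num == -1:
        while True:
            yield from source
    else:
        for replica in itertools.tee(source, num):
            yield from replica

one_shot = (x * x for x in range(3))  # a generator that can be consumed only once
print(list(repeated(one_shot, 2)))    # [0, 1, 4, 0, 1, 4]
```

Draining the tee copies sequentially forces `tee` to buffer the entire stream, so the design trades memory for the ability to repeat a one-shot source a known number of times.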
{deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/common.py (continued)

@@ -197,7 +201,7 @@ class ConcatData(DataFlow):
         df = ConcatData([df_1,df_2])
     """
 
-    def __init__(self, df_lists:
+    def __init__(self, df_lists: list[DataFlow]) -> None:
         """
         :param df_lists: a list of DataFlow.
         """
@@ -233,7 +237,7 @@ class JoinData(DataFlow):
     `JoinData` will stop once the first Dataflow throws a StopIteration
     """
 
-    def __init__(self, df_lists:
+    def __init__(self, df_lists: list[DataFlow]) -> None:
         """
         :param df_lists: a list of DataFlow. When these dataflows have different sizes, JoinData will stop when any
             of them is exhausted.

{deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/custom.py

@@ -21,7 +21,7 @@ from
 
 <https://github.com/tensorpack/dataflow/blob/master/dataflow/dataflow/common.py>
 """
-from typing import Any, Callable, Iterable, Iterator,
+from typing import Any, Callable, Iterable, Iterator, Optional
 
 import numpy as np
 
@@ -54,7 +54,7 @@ class CacheData(ProxyDataFlow):
         :param shuffle: whether to shuffle the cache before yielding from it.
         """
         self.shuffle = shuffle
-        self.buffer:
+        self.buffer: list[Any] = []
         self._guard: Optional[DataFlowReentrantGuard] = None
         self.rng = get_rng(self)
         super().__init__(df)
@@ -78,7 +78,7 @@ class CacheData(ProxyDataFlow):
             yield dp
             self.buffer.append(dp)
 
-    def get_cache(self) ->
+    def get_cache(self) -> list[Any]:
        """
         get the cache of the whole dataflow as a list
 
@@ -115,10 +115,10 @@ class CustomDataFromList(DataFromList):
 
     def __init__(
         self,
-        lst:
+        lst: list[Any],
         shuffle: bool = False,
         max_datapoints: Optional[int] = None,
-        rebalance_func: Optional[Callable[[
+        rebalance_func: Optional[Callable[[list[Any]], list[Any]]] = None,
     ):
         """
         :param lst: the input list. Each element represents a datapoint.
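`CustomDataFromList` now types `rebalance_func` as `Callable[[list[Any]], list[Any]]`: a hook that receives the full input list and returns a possibly filtered or re-weighted list before datapoints are served. A hypothetical example of such a hook, independent of the class itself:

```python
import random
from typing import Any

def undersample_majority(samples: list[Any]) -> list[Any]:
    # Hypothetical rebalance_func: cap every label at the size of the
    # smallest class so that all labels are equally represented.
    by_label: dict[str, list[Any]] = {}
    for sample in samples:
        by_label.setdefault(sample["label"], []).append(sample)
    cap = min(len(group) for group in by_label.values())
    balanced = [s for group in by_label.values() for s in random.sample(group, cap)]
    random.shuffle(balanced)
    return balanced

data = [{"label": "text"}] * 8 + [{"label": "table"}] * 2
print(len(undersample_majority(data)))  # 4 -> two datapoints per label
```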

{deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/custom_serialize.py

@@ -19,23 +19,25 @@
 Methods that convert incoming data to dataflows.
 """
 
+from __future__ import annotations
+
 import itertools
 import json
 import os
 from collections import defaultdict
 from pathlib import Path
-from typing import DefaultDict, Dict, List, Optional, Sequence, Union
+from typing import Any, DefaultDict, Dict, Iterator, List, Optional, Sequence, TextIO, Union
 
 from jsonlines import Reader, Writer
 from tabulate import tabulate
 from termcolor import colored
 
 from ..utils.context import timed_operation
-from ..utils.detection_types import JsonDict, Pathlike
 from ..utils.error import FileExtensionError
 from ..utils.identifier import get_uuid_from_str
 from ..utils.pdf_utils import PDFStreamer
 from ..utils.tqdm import get_tqdm
+from ..utils.types import JsonDict, PathLikeOrStr
 from ..utils.utils import is_file_extension
 from .base import DataFlow
 from .common import FlattenData, JoinData, MapData
@@ -53,6 +55,59 @@ def _reset_df_and_get_length(df: DataFlow) -> int:
     return length
 
 
+class FileClosingIterator:
+    """
+    A custom iterator that closes the file object once the iteration is complete.
+
+    This iterator is used to ensure that the file object is properly closed after
+    reading the data from it. It is used in the context of reading data from a file
+    in a streaming manner, where the data is not loaded into memory all at once.
+
+    **Example:**
+
+        file = open(path, "r")
+        iterator = Reader(file)
+        closing_iterator = FileClosingIterator(file, iter(iterator))
+
+        df = CustomDataFromIterable(closing_iterator, max_datapoints=max_datapoints)  # set up a dataflow
+
+    """
+
+    def __init__(self, file_obj: TextIO, iterator: Iterator[Any]):
+        """
+        Initializes the FileClosingIterator with a file object and its iterator.
+
+        :param file_obj (TextIO): The file object to read data from.
+        :param iterator (Iterator): The actual iterator of the file object.
+        """
+        self.file_obj = file_obj
+        self.iterator = iterator
+
+    def __iter__(self) -> FileClosingIterator:
+        """
+        Returns the iterator object itself.
+
+        :return: FileClosingIterator: The instance of the class itself.
+        """
+        return self
+
+    def __next__(self) -> Any:
+        """
+        Returns the next item from the file object's iterator.
+        Closes the file object if the iteration is finished.
+
+        :return: The next item from the file object's iterator.
+
+        Raises:
+            StopIteration: If there are no more items to return.
+        """
+        try:
+            return next(self.iterator)
+        except StopIteration as exc:
+            self.file_obj.close()
+            raise StopIteration from exc
+
+
 class SerializerJsonlines:
     """
     Serialize a dataflow from a jsonlines file. Alternatively, save a dataflow of JSON objects to a .jsonl file.
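The wrapper added above is self-contained, so the pattern can be exercised without any deepdoctection imports. A minimal sketch against a hypothetical `samples.jsonl`:

```python
import json

class FileClosingIterator:
    # Same pattern as the class added in the hunk above:
    # close the underlying file once iteration is exhausted.
    def __init__(self, file_obj, iterator):
        self.file_obj = file_obj
        self.iterator = iterator

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.iterator)
        except StopIteration:
            self.file_obj.close()
            raise

fh = open("samples.jsonl", "r", encoding="utf-8")  # hypothetical input file
records = FileClosingIterator(fh, (json.loads(line) for line in fh))
total = sum(1 for _ in records)  # stream once, record by record
print(total, fh.closed)          # the handle is closed automatically -> True
```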
{deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/custom_serialize.py (continued)

@@ -66,7 +121,7 @@ class SerializerJsonlines:
     """
 
     @staticmethod
-    def load(path:
+    def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> CustomDataFromIterable:
         """
         :param path: a path to a .jsonl file.
         :param max_datapoints: Will stop the iteration once max_datapoints have been streamed
@@ -75,10 +130,11 @@ class SerializerJsonlines:
         """
         file = open(path, "r")  # pylint: disable=W1514,R1732
         iterator = Reader(file)
-
+        closing_iterator = FileClosingIterator(file, iter(iterator))
+        return CustomDataFromIterable(closing_iterator, max_datapoints=max_datapoints)
 
     @staticmethod
-    def save(df: DataFlow, path:
+    def save(df: DataFlow, path: PathLikeOrStr, file_name: str, max_datapoints: Optional[int] = None) -> None:
         """
         Writes a dataflow iteratively to a .jsonl file. Every datapoint must be a dict where all items are serializable.
         As the length of the dataflow cannot be determined in every case max_datapoint prevents generating an
@@ -120,7 +176,7 @@ class SerializerTabsepFiles:
     """
 
     @staticmethod
-    def load(path:
+    def load(path: PathLikeOrStr, max_datapoins: Optional[int] = None) -> CustomDataFromList:
         """
         :param path: a path to a .txt file.
         :param max_datapoins: Will stop the iteration once max_datapoints have been streamed
@@ -133,7 +189,7 @@ class SerializerTabsepFiles:
         return CustomDataFromList(file_list, max_datapoints=max_datapoins)
 
     @staticmethod
-    def save(df: DataFlow, path:
+    def save(df: DataFlow, path: PathLikeOrStr, file_name: str, max_datapoints: Optional[int] = None) -> None:
         """
         Writes a dataflow iteratively to a .txt file. Every datapoint must be a string.
         As the length of the dataflow cannot be determined in every case max_datapoint prevents generating an
@@ -168,7 +224,7 @@ class SerializerFiles:
 
     @staticmethod
     def load(
-        path:
+        path: PathLikeOrStr,
         file_type: Union[str, Sequence[str]],
         max_datapoints: Optional[int] = None,
         shuffle: Optional[bool] = False,
@@ -190,15 +246,14 @@ class SerializerFiles:
         df2: DataFlow
         df3: DataFlow
 
-
-        path = Path(path)
+        path = Path(path)
         if not path.exists():
             raise NotADirectoryError(f"The path {path} to the directory or file does not exist")
 
         if shuffle:
             sort = False
-        it1 = os.walk(path, topdown=False)
-        it2 = os.walk(path, topdown=False)
+        it1 = os.walk(os.fspath(path), topdown=False)
+        it2 = os.walk(os.fspath(path), topdown=False)
         df1 = CustomDataFromIterable(it1)
         df2 = CustomDataFromIterable(it2)
         df1 = MapData(df1, lambda dp: None if len(dp[2]) == 0 else dp)
@@ -237,7 +292,7 @@ class CocoParser:
     :param annotation_file: location of annotation file
     """
 
-    def __init__(self, annotation_file: Optional[
+    def __init__(self, annotation_file: Optional[PathLikeOrStr] = None) -> None:
         self.dataset: JsonDict = {}
         self.anns: Dict[int, JsonDict] = {}
         self.cats: Dict[int, JsonDict] = {}
@@ -465,7 +520,7 @@ class SerializerCoco:
     """
 
     @staticmethod
-    def load(path:
+    def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
         """
         Loads a .json file and generates a dataflow.
 
@@ -478,7 +533,7 @@ class SerializerCoco:
 
         {'image':{'id',...},'annotations':[{'id':…,'bbox':...}]}
 
-        for each
+        for each image id. We use the type hint CocoDatapointDict to describe this dictionary
 
         :param max_datapoints: Will stop the iteration once max_datapoints have been streamed.
         :param path: a path to a .json file.
@@ -525,7 +580,7 @@ class SerializerPdfDoc:
     """
 
     @staticmethod
-    def load(path:
+    def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
         """
         Loads the document page wise and returns a dataflow accordingly.
 
@@ -552,14 +607,16 @@ class SerializerPdfDoc:
         return df
 
     @staticmethod
-    def save(path:
+    def save(path: PathLikeOrStr) -> None:
         """
         Not implemented
         """
         raise NotImplementedError()
 
     @staticmethod
-    def split(
+    def split(
+        path: PathLikeOrStr, path_target: Optional[PathLikeOrStr] = None, max_datapoint: Optional[int] = None
+    ) -> None:
         """
         Split a document into single pages.
         """

{deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/parallel_map.py

@@ -23,7 +23,7 @@ import uuid
 import weakref
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
-from typing import Any, Callable, Iterator,
+from typing import Any, Callable, Iterator, no_type_check
 
 import zmq
 
@@ -236,7 +236,7 @@ class MultiThreadMapData(_ParallelMapData):
         self._strict = strict
         self.num_thread = num_thread
         self.map_func = map_func
-        self._threads:
+        self._threads: list[Any] = []
         self._evt = None
 
     def reset_state(self) -> None:
@@ -284,7 +284,7 @@ class _MultiProcessZMQDataFlow(DataFlow, ABC):
         if os.name == "nt":
             raise EnvironmentError("ZMQ IPC doesn't support windows")
         self._reset_done = False
-        self._procs:
+        self._procs: list[Any] = []
         self.context = None
         self.socket = None
 

{deepdoctection-0.32 → deepdoctection-0.33}/deepdoctection/dataflow/serialize.py

@@ -12,7 +12,7 @@ Some DataFlow classes for serialization. Many classes have been taken from
 
 import pickle
 from copy import copy
-from typing import Any, Iterable, Iterator,
+from typing import Any, Iterable, Iterator, Optional, Union
 
 import numpy as np
 
@@ -23,7 +23,7 @@ from .base import DataFlow, RNGDataFlow
 class DataFromList(RNGDataFlow):
     """Wrap a list of datapoints to a DataFlow"""
 
-    def __init__(self, lst:
+    def __init__(self, lst: list[Any], shuffle: bool = True) -> None:
         """
         :param lst: input list. Each element is a datapoint.
         :param shuffle: shuffle data.
@@ -79,11 +79,11 @@ class FakeData(RNGDataFlow):
 
     def __init__(
         self,
-        shapes:
+        shapes: list[Union[list[Any], tuple[Any]]],
         size: int = 1000,
         random: bool = True,
         dtype: str = "float32",
-        domain:
+        domain: tuple[Union[float, int], Union[float, int]] = (0, 1),
     ):
         """
         :param shapes: a list of lists/tuples. Shapes of each component.