deepdoctection 0.38__py3-none-any.whl → 0.39.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

@@ -6,6 +6,7 @@ Init file for deepdoctection package. This file is used to import all submodules
6
6
  """
7
7
 
8
8
  import importlib.util
9
+ import os
9
10
 
10
11
  # Before doing anything else, check if the .env file exists and load it
11
12
  if importlib.util.find_spec("dotenv") is not None:
@@ -24,7 +25,7 @@ from .utils.logger import LoggingRecord, logger
24
25
 
25
26
  # pylint: enable=wrong-import-position
26
27
 
27
- __version__ = "0.38"
28
+ __version__ = "0.39.1"
28
29
 
29
30
  _IMPORT_STRUCTURE = {
30
31
  "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
@@ -423,6 +424,9 @@ _IMPORT_STRUCTURE = {
423
424
  env_info = collect_env_info()
424
425
  logger.debug(LoggingRecord(msg=env_info))
425
426
  auto_select_pdf_render_framework()
427
+ os.environ["DPI"] = "300"
428
+ os.environ["IMAGE_WIDTH"] = ""
429
+ os.environ["IMAGE_HEIGHT"] = ""
426
430
 
427
431
  # Direct imports for type-checking
428
432
  if TYPE_CHECKING:
@@ -32,7 +32,6 @@ from ..extern.pt.ptutils import get_torch_device
32
32
  from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
33
33
  from ..pipe.doctectionpipe import DoctectionPipe
34
34
  from ..utils.env_info import ENV_VARS_TRUE
35
- from ..utils.error import DependencyError
36
35
  from ..utils.file_utils import tensorpack_available
37
36
  from ..utils.fs import get_configs_dir_path, get_package_path, maybe_copy_config_to_cache
38
37
  from ..utils.logger import LoggingRecord, logger
@@ -118,13 +117,15 @@ def get_dd_analyzer(
118
117
  :return: A DoctectionPipe instance with given configs
119
118
  """
120
119
  config_overwrite = [] if config_overwrite is None else config_overwrite
121
- lib = "TF" if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE else "PT"
122
- if lib == "TF":
120
+ if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE:
121
+ lib = "TF"
123
122
  device = get_tf_device()
124
- elif lib == "PT":
123
+ elif os.environ.get("DD_USE_TORCH", "0") in ENV_VARS_TRUE:
124
+ lib = "PT"
125
125
  device = get_torch_device()
126
126
  else:
127
- raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
127
+ lib = None
128
+ device = None
128
129
  dd_one_config_path = maybe_copy_config_to_cache(
129
130
  get_package_path(), get_configs_dir_path() / "dd", _DD_ONE, reset_config_file
130
131
  )
@@ -48,6 +48,7 @@ from ..pipe.segment import PubtablesSegmentationService, TableSegmentationServic
48
48
  from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
49
49
  from ..pipe.text import TextExtractionService
50
50
  from ..pipe.transform import SimpleTransformService
51
+ from ..utils.error import DependencyError
51
52
  from ..utils.file_utils import detectron2_available
52
53
  from ..utils.fs import get_configs_dir_path
53
54
  from ..utils.metacfg import AttrDict
@@ -62,8 +63,6 @@ __all__ = [
62
63
  "ServiceFactory",
63
64
  ]
64
65
 
65
- # from ._config import cfg
66
-
67
66
 
68
67
  class ServiceFactory:
69
68
  """
@@ -94,6 +93,8 @@ class ServiceFactory:
94
93
  :param config: configuration object
95
94
  :param mode: either `LAYOUT`,`CELL` or `ITEM`
96
95
  """
96
+ if config.LIB is None:
97
+ raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
97
98
  weights = (
98
99
  getattr(config.TF, mode).WEIGHTS
99
100
  if config.LIB == "TF"
@@ -310,6 +311,8 @@ class ServiceFactory:
310
311
  config_overwrite=[f"LANGUAGES={config.LANGUAGE}"] if config.LANGUAGE is not None else None,
311
312
  )
312
313
  if config.OCR.USE_DOCTR:
314
+ if config.LIB is None:
315
+ raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
313
316
  weights = (
314
317
  config.OCR.WEIGHTS.DOCTR_RECOGNITION.TF
315
318
  if config.LIB == "TF"
@@ -353,6 +356,8 @@ class ServiceFactory:
353
356
  :param config: configuration object
354
357
  :return: DoctrTextlineDetector
355
358
  """
359
+ if config.LIB is None:
360
+ raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
356
361
  weights = config.OCR.WEIGHTS.DOCTR_WORD.TF if config.LIB == "TF" else config.OCR.WEIGHTS.DOCTR_WORD.PT
357
362
  weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
358
363
  profile = ModelCatalog.get_profile(weights)
@@ -154,7 +154,9 @@ def convert_pdf_bytes_to_np_array(pdf_bytes: bytes, dpi: Optional[int] = None) -
154
154
  return np_array.astype(uint8)
155
155
 
156
156
 
157
- def convert_pdf_bytes_to_np_array_v2(pdf_bytes: bytes, dpi: Optional[int] = 200) -> PixelValues:
157
+ def convert_pdf_bytes_to_np_array_v2(
158
+ pdf_bytes: bytes, dpi: Optional[int] = None, width: Optional[int] = None, height: Optional[int] = None
159
+ ) -> PixelValues:
158
160
  """
159
161
  Converts a pdf passed as bytes into a numpy array. We use poppler or pdfmium to convert the pdf to an image.
160
162
  If both is available you can steer the selection of the render engine with environment variables:
@@ -165,17 +167,21 @@ def convert_pdf_bytes_to_np_array_v2(pdf_bytes: bytes, dpi: Optional[int] = 200)
165
167
  :param pdf_bytes: A pdf as bytes object. A byte representation can from a pdf file can be generated e.g. with
166
168
  `utils.fs.load_bytes_from_pdf_file`
167
169
  :param dpi: The dpi value of the resulting output image. For high resolution set dpi=300.
170
+ :param width: The width of the resulting output image. This option does only work when using Poppler as
171
+ PDF renderer
172
+ :param height: The height of the resulting output image. This option does only work when using Poppler as
173
+ PDF renderer
168
174
  :return: Image as numpy array.
169
175
  """
170
176
 
171
- with BytesIO(pdf_bytes) as pdf_file:
172
- pdf = PdfReader(pdf_file).pages[0]
173
- shape = pdf.mediabox # pylint: disable=E1101
174
- height = shape[3] - shape[1]
175
- width = shape[2] - shape[0]
176
-
177
177
  if dpi is None:
178
- return pdf_to_np_array(pdf_bytes, size=(int(width), int(height)))
178
+ if width is None or height is None:
179
+ with BytesIO(pdf_bytes) as pdf_file:
180
+ pdf = PdfReader(pdf_file).pages[0]
181
+ shape = pdf.mediabox # pylint: disable=E1101
182
+ height = shape[3] - shape[1]
183
+ width = shape[2] - shape[0]
184
+ return pdf_to_np_array(pdf_bytes, size=(int(width), int(height))) # type: ignore
179
185
  return pdf_to_np_array(pdf_bytes, dpi=dpi)
180
186
 
181
187
 
@@ -153,7 +153,7 @@ class Image:
153
153
  self.set_width_height(self._image.shape[1], self._image.shape[0])
154
154
  self._self_embedding()
155
155
  elif isinstance(image, bytes):
156
- self._image = convert_pdf_bytes_to_np_array_v2(image, dpi=environ.get("DPI", 300)) # type: ignore
156
+ self._image = convert_pdf_bytes_to_np_array_v2(image, dpi=int(environ["DPI"]))
157
157
  self.set_width_height(self._image.shape[1], self._image.shape[0])
158
158
  self._self_embedding()
159
159
  else:
@@ -228,23 +228,33 @@ class Layout(ImageAnnotationBaseView):
228
228
 
229
229
  """
230
230
  words = self.get_ordered_words()
231
- characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
232
- *[
233
- (
234
- word.characters,
235
- word.annotation_id,
236
- word.token_class,
237
- word.token_tag,
238
- word.get_sub_category(WordType.TOKEN_CLASS).category_id
239
- if WordType.TOKEN_CLASS in word.sub_categories
240
- else None,
241
- word.get_sub_category(WordType.TOKEN_TAG).category_id
242
- if WordType.TOKEN_TAG in word.sub_categories
243
- else None,
244
- )
245
- for word in words
246
- ]
247
- )
231
+ if words:
232
+ characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
233
+ *[
234
+ (
235
+ word.characters,
236
+ word.annotation_id,
237
+ word.token_class,
238
+ word.token_tag,
239
+ word.get_sub_category(WordType.TOKEN_CLASS).category_id
240
+ if WordType.TOKEN_CLASS in word.sub_categories
241
+ else None,
242
+ word.get_sub_category(WordType.TOKEN_TAG).category_id
243
+ if WordType.TOKEN_TAG in word.sub_categories
244
+ else None,
245
+ )
246
+ for word in words
247
+ ]
248
+ )
249
+ else:
250
+ characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = (
251
+ [], # type: ignore
252
+ [], # type: ignore
253
+ [], # type: ignore
254
+ [], # type: ignore
255
+ [], # type: ignore
256
+ [], # type: ignore
257
+ )
248
258
  return {
249
259
  "text": " ".join(characters),
250
260
  "words": characters,
@@ -327,7 +337,7 @@ class Table(Layout):
327
337
  :return: A list of `Cell` objects that are row headers.
328
338
  """
329
339
  all_relation_ids = self.get_relationship(Relationships.CHILD)
330
- all_cells: list[Cell] = self.base_page.get_annotation( # type: ignore
340
+ all_cells: list[Cell] = self.base_page.get_annotation( # type: ignore
331
341
  category_names=[LayoutType.CELL, CellType.SPANNING], annotation_ids=all_relation_ids
332
342
  )
333
343
  row_header_cells = list(filter(lambda cell: CellType.ROW_HEADER in cell.sub_categories, all_cells))
@@ -363,18 +373,18 @@ class Table(Layout):
363
373
  category_names=[LayoutType.CELL, CellType.SPANNING], annotation_ids=all_relation_ids
364
374
  )
365
375
  row_cells = list(
366
- filter(
367
- lambda c: row_number in (c.row_number, c.row_number + c.row_span), all_cells # type: ignore
368
- )
376
+ filter(lambda c: row_number in (c.row_number, c.row_number + c.row_span), all_cells) # type: ignore
369
377
  )
370
- row_cells.sort(key=lambda c: c.column_number) # type: ignore
378
+ row_cells.sort(key=lambda c: c.column_number) # type: ignore
371
379
  column_header_cells = self.column_header_cells
372
380
 
373
381
  kv_dict: Mapping[str, str] = {}
374
382
  for cell in row_cells:
375
383
  for header in column_header_cells:
376
- if (cell.column_number == header.column_number and # type: ignore
377
- cell.annotation_id != header.annotation_id): # type: ignore
384
+ if (
385
+ cell.column_number == header.column_number # type: ignore
386
+ and cell.annotation_id != header.annotation_id # type: ignore
387
+ ):
378
388
  kv_dict[(header.column_number, header.text)] = cell.text # type: ignore
379
389
  break
380
390
  return kv_dict
@@ -24,7 +24,7 @@ from dataclasses import asdict, dataclass, field
24
24
  from typing import Any, Mapping, Optional, Union
25
25
 
26
26
  import jsonlines
27
- from huggingface_hub import cached_download, hf_hub_url # type: ignore
27
+ from huggingface_hub import hf_hub_download
28
28
  from tabulate import tabulate
29
29
  from termcolor import colored
30
30
 
@@ -136,51 +136,6 @@ class ModelCatalog:
136
136
  dl_library="TF",
137
137
  model_wrapper="TPFrcnnDetector",
138
138
  ),
139
- "item/model-1620000.data-00000-of-00001": ModelProfile(
140
- name="item/model-1620000.data-00000-of-00001",
141
- description="Tensorpack row/column detection model trained on Pubtabnet",
142
- config="dd/tp/conf_frcnn_rows.yaml",
143
- size=[823546048, 25787],
144
- tp_model=True,
145
- hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc",
146
- hf_model_name="model-1620000",
147
- hf_config_file=["conf_frcnn_rows.yaml"],
148
- categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
149
- dl_library="TF",
150
- model_wrapper="TPFrcnnDetector",
151
- ),
152
- "layout/model-800000.data-00000-of-00001": ModelProfile(
153
- name="layout/model-800000.data-00000-of-00001",
154
- description="Tensorpack layout detection model trained on Publaynet",
155
- config="dd/tp/conf_frcnn_layout.yaml",
156
- size=[823656748, 25796],
157
- tp_model=True,
158
- hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_publaynet",
159
- hf_model_name="model-800000",
160
- hf_config_file=["conf_frcnn_layout.yaml"],
161
- dl_library="TF",
162
- categories={
163
- 1: LayoutType.TEXT,
164
- 2: LayoutType.TITLE,
165
- 3: LayoutType.LIST,
166
- 4: LayoutType.TABLE,
167
- 5: LayoutType.FIGURE,
168
- },
169
- model_wrapper="TPFrcnnDetector",
170
- ),
171
- "cell/model-1800000.data-00000-of-00001": ModelProfile(
172
- name="cell/model-1800000.data-00000-of-00001",
173
- description="Tensorpack cell detection model trained on Pubtabnet",
174
- config="dd/tp/conf_frcnn_cell.yaml",
175
- size=[823509160, 25905],
176
- tp_model=True,
177
- hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c",
178
- hf_model_name="model-1800000",
179
- hf_config_file=["conf_frcnn_cell.yaml"],
180
- categories={1: LayoutType.CELL},
181
- dl_library="TF",
182
- model_wrapper="TPFrcnnDetector",
183
- ),
184
139
  "layout/d2_model_0829999_layout_inf_only.pt": ModelProfile(
185
140
  name="layout/d2_model_0829999_layout_inf_only.pt",
186
141
  description="Detectron2 layout detection model trained on Publaynet",
@@ -200,25 +155,6 @@ class ModelCatalog:
200
155
  dl_library="PT",
201
156
  model_wrapper="D2FrcnnDetector",
202
157
  ),
203
- "layout/d2_model_0829999_layout.pth": ModelProfile(
204
- name="layout/d2_model_0829999_layout.pth",
205
- description="Detectron2 layout detection model trained on Publaynet. Checkpoint for resuming training",
206
- config="dd/d2/layout/CASCADE_RCNN_R_50_FPN_GN.yaml",
207
- size=[548377327],
208
- tp_model=False,
209
- hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_publaynet_inference_only",
210
- hf_model_name="d2_model_0829999_layout.pth",
211
- hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
212
- categories={
213
- 1: LayoutType.TEXT,
214
- 2: LayoutType.TITLE,
215
- 3: LayoutType.LIST,
216
- 4: LayoutType.TABLE,
217
- 5: LayoutType.FIGURE,
218
- },
219
- dl_library="PT",
220
- model_wrapper="D2FrcnnDetector",
221
- ),
222
158
  "layout/d2_model_0829999_layout_inf_only.ts": ModelProfile(
223
159
  name="layout/d2_model_0829999_layout_inf_only.ts",
224
160
  description="Detectron2 layout detection model trained on Publaynet. Torchscript export",
@@ -264,32 +200,6 @@ class ModelCatalog:
264
200
  dl_library="PT",
265
201
  model_wrapper="D2FrcnnTracingDetector",
266
202
  ),
267
- "cell/d2_model_1849999_cell.pth": ModelProfile(
268
- name="cell/d2_model_1849999_cell.pth",
269
- description="Detectron2 cell detection inference only model trained on Pubtabnet",
270
- config="dd/d2/cell/CASCADE_RCNN_R_50_FPN_GN.yaml",
271
- size=[548279023],
272
- tp_model=False,
273
- hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only",
274
- hf_model_name="cell/d2_model_1849999_cell.pth",
275
- hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
276
- categories={1: LayoutType.CELL},
277
- dl_library="PT",
278
- model_wrapper="D2FrcnnDetector",
279
- ),
280
- "item/d2_model_1639999_item.pth": ModelProfile(
281
- name="item/d2_model_1639999_item.pth",
282
- description="Detectron2 item detection model trained on Pubtabnet",
283
- config="dd/d2/item/CASCADE_RCNN_R_50_FPN_GN.yaml",
284
- size=[548303599],
285
- tp_model=False,
286
- hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only",
287
- hf_model_name="d2_model_1639999_item.pth",
288
- hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
289
- categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
290
- dl_library="PT",
291
- model_wrapper="D2FrcnnDetector",
292
- ),
293
203
  "item/d2_model_1639999_item_inf_only.pt": ModelProfile(
294
204
  name="item/d2_model_1639999_item_inf_only.pt",
295
205
  description="Detectron2 item detection model inference only trained on Pubtabnet",
@@ -1232,20 +1142,19 @@ class ModelDownloadManager:
1232
1142
  def _load_from_hf_hub(
1233
1143
  repo_id: str, file_name: str, cache_directory: PathLikeOrStr, force_download: bool = False
1234
1144
  ) -> int:
1235
- url = hf_hub_url(repo_id=repo_id, filename=file_name)
1236
1145
  token = os.environ.get("HF_CREDENTIALS", None)
1237
- f_path = cached_download(
1238
- url,
1239
- cache_dir=cache_directory,
1146
+ f_path = hf_hub_download(
1147
+ repo_id,
1148
+ file_name,
1149
+ local_dir=cache_directory, # type: ignore
1240
1150
  force_filename=file_name,
1241
1151
  force_download=force_download,
1242
1152
  token=token,
1243
- legacy_cache_layout=True,
1244
1153
  )
1245
1154
  if f_path:
1246
1155
  stat_info = os.stat(f_path)
1247
1156
  size = stat_info.st_size
1248
1157
 
1249
- assert size > 0, f"Downloaded an empty file from {url}!"
1158
+ assert size > 0, f"Downloaded an empty file from {f_path}!"
1250
1159
  return size
1251
1160
  raise TypeError("Returned value from cached_download cannot be Null")
@@ -73,18 +73,21 @@ def re_assign_cat_ids(
73
73
  Annotations that are not in the dictionary provided will be removed.
74
74
 
75
75
  :param dp: Image
76
- :param categories_dict_name_as_key: e.g. `{LayoutType.word: '1'}`
76
+ :param categories_dict_name_as_key: e.g. `{LayoutType.word: 1}`
77
77
  :param cat_to_sub_cat_mapping: e.g. `{<LayoutType.word>:
78
78
  {<WordType.token_class>:
79
- {<FundsFirstPage.report_date>: '1',
80
- <FundsFirstPage.report_type>: '2',
81
- <FundsFirstPage.umbrella>: '3',
82
- <FundsFirstPage.fund_name>: '4',
83
- <TokenClasses.other>: '5'},
84
- <WordType.tag>:
85
- {<BioTag.inside>: '1',
86
- <BioTag.outside>: '2',
87
- <BioTag.begin>: '3'}}}`
79
+ {<FundsFirstPage.REPORT_DATE>: 1,
80
+ <FundsFirstPage.REPORT_TYPE>: 2,
81
+ <FundsFirstPage.UMBRELLA>: 3,
82
+ <FundsFirstPage.FUND_NAME>: 4,
83
+ <TokenClasses.OTHER>: 5},
84
+ <WordType.TAG>:
85
+ {<BioTag.INSIDE>: 1,
86
+ <BioTag.OUTSIDE>: 2,
87
+ <BioTag.BEGIN>: 3}}}`
88
+ To re-assign the category ids of an image summary, use the key 'default_type' for the default category, e.g.
89
+ `{DefaultType.DEFAULT_TYPE: {<PageType.DOCUMENT_TYPE>: {<DocumentType.INVOICE>:1,
90
+ <DocumentType.BANK_STATEMENT>:2}}}`
88
91
  :return: Image
89
92
  """
90
93
 
@@ -104,6 +107,14 @@ def re_assign_cat_ids(
104
107
  sub_category = ann.get_sub_category(key)
105
108
  sub_category.category_id = sub_cat_values_dict.get(sub_category.category_name, DEFAULT_CATEGORY_ID)
106
109
 
110
+ if cat_to_sub_cat_mapping:
111
+ if "default_type" in cat_to_sub_cat_mapping:
112
+ sub_cat_keys_to_sub_cat_values = cat_to_sub_cat_mapping[get_type("default_type")]
113
+ for key in sub_cat_keys_to_sub_cat_values:
114
+ sub_cat_values_dict = sub_cat_keys_to_sub_cat_values[key]
115
+ sub_category = dp.summary.get_sub_category(key)
116
+ sub_category.category_id = sub_cat_values_dict.get(sub_category.category_name, DEFAULT_CATEGORY_ID)
117
+
107
118
  dp.remove(annotation_ids=ann_ids_to_remove)
108
119
 
109
120
  return dp
@@ -101,17 +101,6 @@ def match_anns_by_intersection(
101
101
  ]
102
102
  )
103
103
 
104
- # second try, if ann has empty image
105
- n_dim = child_ann_boxes.ndim
106
- if n_dim != 2:
107
- child_ann_boxes = np.array(
108
- [
109
- ann.bounding_box.transform(dp.width, dp.height, absolute_coords=True).to_list(mode="xyxy")
110
- for ann in child_anns
111
- if ann.bounding_box is not None
112
- ]
113
- )
114
-
115
104
  parent_anns = dp.get_annotation(annotation_ids=parent_ann_ids, category_names=parent_ann_category_names)
116
105
  parent_ann_boxes = np.array(
117
106
  [
@@ -120,17 +109,6 @@ def match_anns_by_intersection(
120
109
  ]
121
110
  )
122
111
 
123
- # same for parent
124
- n_dim = parent_ann_boxes.ndim
125
- if n_dim != 2:
126
- parent_ann_boxes = np.array(
127
- [
128
- ann.bounding_box.transform(dp.width, dp.height, absolute_coords=True).to_list(mode="xyxy")
129
- for ann in parent_anns
130
- if ann.bounding_box is not None
131
- ]
132
- )
133
-
134
112
  if matching_rule in ["iou"] and parent_anns and child_anns:
135
113
  iou_matrix = iou(child_ann_boxes, parent_ann_boxes)
136
114
  output = iou_matrix > threshold
@@ -38,12 +38,20 @@ with try_import() as import_guard:
38
38
  from lxml import etree # pylint: disable=W0611
39
39
 
40
40
 
41
- def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int] = None) -> Optional[Image]:
41
+ def to_image(
42
+ dp: Union[str, Mapping[str, Union[str, bytes]]],
43
+ dpi: Optional[int] = None,
44
+ width: Optional[int] = None,
45
+ height: Optional[int] = None,
46
+ ) -> Optional[Image]:
42
47
  """
43
48
  Mapping an input from `dataflow.SerializerFiles` or similar to an Image
44
49
 
45
50
  :param dp: Image
46
51
  :param dpi: dot per inch definition for pdf resolution when converting to numpy array
52
+ :param width: target width of the image. This option does only work when using Poppler as PDF renderer
53
 + :param height: target height of the image. This option does only work when using Poppler as PDF renderer
47
55
  :return: Image
48
56
  """
49
57
 
@@ -77,7 +85,9 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
77
85
  dp_image.pdf_bytes = dp.get("pdf_bytes")
78
86
  if dp_image.pdf_bytes is not None:
79
87
  if isinstance(dp_image.pdf_bytes, bytes):
80
- dp_image.image = convert_pdf_bytes_to_np_array_v2(dp_image.pdf_bytes, dpi=dpi)
88
+ dp_image.image = convert_pdf_bytes_to_np_array_v2(
89
+ dp_image.pdf_bytes, dpi=dpi, width=width, height=height
90
+ )
81
91
  elif image_bytes is not None:
82
92
  dp_image.image = convert_bytes_to_np_array(image_bytes)
83
93
  else:
@@ -393,7 +393,7 @@ def pub_to_image_uncur( # pylint: disable=R0914
393
393
  np_image = load_image_from_file(dp["filename"])
394
394
  if is_file_extension(dp["filename"], ".pdf"):
395
395
  pdf_bytes = load_bytes_from_pdf_file(dp["filename"])
396
- np_image = convert_pdf_bytes_to_np_array_v2(pdf_bytes)
396
+ np_image = convert_pdf_bytes_to_np_array_v2(pdf_bytes, dpi=200)
397
397
  dp = _convert_boxes(dp, np_image.shape[0])
398
398
 
399
399
  if load_image and np_image is not None:
@@ -24,7 +24,7 @@ from __future__ import annotations
24
24
  from abc import ABC, abstractmethod
25
25
  from collections import defaultdict
26
26
  from dataclasses import dataclass, field
27
- from typing import Any, Mapping, Optional, Union
27
+ from typing import Any, Mapping, Optional, Union, Callable
28
28
  from uuid import uuid1
29
29
 
30
30
  from ..dataflow import DataFlow, MapData
@@ -33,6 +33,7 @@ from ..mapper.misc import curry
33
33
  from ..utils.context import timed_operation
34
34
  from ..utils.identifier import get_uuid_from_str
35
35
  from ..utils.settings import ObjectTypes
36
+ from ..utils.types import DP
36
37
  from .anngen import DatapointManager
37
38
 
38
39
 
@@ -76,6 +77,30 @@ class PipelineComponent(ABC):
76
77
  self.service_id = self.get_service_id()
77
78
  self.dp_manager = DatapointManager(self.service_id, model_id)
78
79
  self.timer_on = False
80
+ self.filter_func: Callable[[DP], bool] = lambda dp: False
81
+
82
+ def set_inbound_filter(self, filter_func: Callable[[DP], bool]) -> None:
83
+ """
84
 + Set a filter function to decide whether an image of the inbound dataflow should be passed to self.serve.
85
+ The filter function should return a boolean value. If the function returns True, the image will not be processed
86
+ by this pipeline component.
87
+
88
+ **Example:**
89
+
90
+ ```python
91
+ def do_not_process_tables(dp: Image) -> bool:
92
+ if "table" not in dp.get_categories_from_current_state():
93
+ return True
94
+ return False
95
+
96
+ layout_component = ImageLayoutService(...)
97
+ layout_component.set_inbound_filter(do_not_process_tables)
98
+ ```
99
+
100
+
101
+ :param filter_func: A function that takes an image datapoint and returns a boolean value
102
+ """
103
+ self.filter_func = filter_func # type: ignore
79
104
 
80
105
  @abstractmethod
81
106
  def serve(self, dp: Image) -> None:
@@ -92,6 +117,12 @@ class PipelineComponent(ABC):
92
117
  """
93
118
  raise NotImplementedError()
94
119
 
120
+ def _pass_datapoint(self, dp: Image) -> None:
121
+ self.dp_manager.datapoint = dp
122
+ if not self.filter_func(dp):
123
+ self.serve(dp)
124
+
125
+
95
126
  def pass_datapoint(self, dp: Image) -> Image:
96
127
  """
97
128
  Acceptance, handover to dp_manager, transformation and forwarding of dp. To measure the time, use
@@ -103,11 +134,9 @@ class PipelineComponent(ABC):
103
134
  """
104
135
  if self.timer_on:
105
136
  with timed_operation(self.__class__.__name__):
106
- self.dp_manager.datapoint = dp
107
- self.serve(dp)
137
+ self._pass_datapoint(dp)
108
138
  else:
109
- self.dp_manager.datapoint = dp
110
- self.serve(dp)
139
+ self._pass_datapoint(dp)
111
140
  return self.dp_manager.datapoint
112
141
 
113
142
  def predict_dataflow(self, df: DataFlow) -> DataFlow:
@@ -205,6 +234,7 @@ class Pipeline(ABC):
205
234
 
206
235
  **Example:**
207
236
 
237
+ ```python
208
238
  layout = LayoutPipeComponent(layout_detector ...)
209
239
  text = TextExtractPipeComponent(text_detector ...)
210
240
  simple_pipe = MyPipeline(pipeline_component = [layout, text])
@@ -212,6 +242,7 @@ class Pipeline(ABC):
212
242
 
213
243
  for page in doc_dataflow:
214
244
  print(page)
245
+ ```
215
246
 
216
247
  In doing so, page contains all document structures determined via the pipeline (either directly from the Image core
217
248
  model or already processed further).
@@ -225,10 +256,12 @@ class Pipeline(ABC):
225
256
 
226
257
  **Example:**
227
258
 
259
+ ```python
228
260
  pipe = MyPipeline(pipeline_component = [layout, text])
229
261
  pipe.set_session_id = True
230
262
 
231
263
  df = pipe.analyze(input = "path/to/dir") # session_id is generated automatically
264
+ ```
232
265
  """
233
266
 
234
267
  def __init__(self, pipeline_component_list: list[PipelineComponent]) -> None:
@@ -349,8 +349,8 @@ class AnnotationNmsService(PipelineComponent):
349
349
  def __init__(
350
350
  self,
351
351
  nms_pairs: Sequence[Sequence[TypeOrStr]],
352
- thresholds: Union[float, list[float]],
353
- priority: Optional[list[Union[Optional[TypeOrStr]]]] = None,
352
+ thresholds: Union[float, Sequence[float]],
353
+ priority: Optional[Sequence[Union[Optional[TypeOrStr]]]] = None,
354
354
  ):
355
355
  """
356
356
  :param nms_pairs: Groups of categories, either as string or by `ObjectType`.
@@ -362,7 +362,7 @@ class AnnotationNmsService(PipelineComponent):
362
362
  self.threshold = [thresholds for _ in self.nms_pairs]
363
363
  else:
364
364
  assert len(self.nms_pairs) == len(thresholds), "Sequences of nms_pairs and thresholds must have same length"
365
- self.threshold = thresholds
365
+ self.threshold = thresholds # type: ignore
366
366
  if priority:
367
367
  assert len(self.nms_pairs) == len(priority), "Sequences of nms_pairs and priority must have same length"
368
368
 
@@ -109,8 +109,13 @@ def _proto_process(
109
109
 
110
110
 
111
111
  @curry
112
- def _to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int] = None) -> Optional[Image]:
113
- return to_image(dp, dpi)
112
+ def _to_image(
113
+ dp: Union[str, Mapping[str, Union[str, bytes]]],
114
+ dpi: Optional[int] = None,
115
+ width: Optional[int] = None,
116
+ height: Optional[int] = None,
117
+ ) -> Optional[Image]:
118
+ return to_image(dp, dpi, width, height)
114
119
 
115
120
 
116
121
  def _doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
@@ -188,7 +193,19 @@ class DoctectionPipe(Pipeline):
188
193
 
189
194
  df = MapData(df, _proto_process(path, doc_path))
190
195
  if dataset_dataflow is None:
191
- df = MapData(df, _to_image(dpi=int(os.environ.get("DPI", 300)))) # pylint: disable=E1120
196
+ if dpi := os.environ["DPI"]:
197
+ df = MapData(df, _to_image(dpi=int(dpi))) # pylint: disable=E1120
198
+ else:
199
+ width, height = kwargs.get("width", ""), kwargs.get("height", "")
200
+ if not width or not height:
201
+ width = os.environ["IMAGE_WIDTH"]
202
+ height = os.environ["IMAGE_HEIGHT"]
203
+ if not width or not height:
204
+ raise ValueError(
205
+ "DPI, IMAGE_WIDTH and IMAGE_HEIGHT are all None, but "
206
+ "either DPI or IMAGE_WIDTH and IMAGE_HEIGHT must be set"
207
+ )
208
+ df = MapData(df, _to_image(width=int(width), height=int(height))) # pylint: disable=E1120
192
209
  return df
193
210
 
194
211
  @staticmethod
deepdoctection/pipe/lm.py CHANGED
@@ -24,6 +24,7 @@ from copy import copy
24
24
  from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Sequence, Union
25
25
 
26
26
  from ..datapoint.image import Image
27
+ from ..extern.base import SequenceClassResult
27
28
  from ..mapper.laylmstruct import image_to_layoutlm_features, image_to_lm_features
28
29
  from ..utils.settings import BioTag, LayoutType, ObjectTypes, PageType, TokenClasses, WordType
29
30
  from .base import MetaAnnotation, PipelineComponent
@@ -264,6 +265,7 @@ class LMSequenceClassifierService(PipelineComponent):
264
265
  padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
265
266
  truncation: bool = True,
266
267
  return_overflowing_tokens: bool = False,
268
+ use_other_as_default_category: bool = False
267
269
  ) -> None:
268
270
  """
269
271
  :param tokenizer: Tokenizer, typing allows currently anything. This will be changed in the future
@@ -279,11 +281,16 @@ class LMSequenceClassifierService(PipelineComponent):
279
281
  :param return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
280
282
  can be returned as an additional batch element. Note that in this case, the number of input
281
283
  batch samples will be smaller than the output batch samples.
284
+ :param use_other_as_default_category: When predicting document classes, it might be possible that some pages
285
+ do not get sent to the model because they are empty. If set to `True` it
286
+ will assign images with no features the category `TokenClasses.OTHER`.
287
+
282
288
  """
283
289
  self.language_model = language_model
284
290
  self.padding = padding
285
291
  self.truncation = truncation
286
292
  self.return_overflowing_tokens = return_overflowing_tokens
293
+ self.use_other_as_default_category = use_other_as_default_category
287
294
  self.tokenizer = tokenizer
288
295
  self.mapping_to_lm_input_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
289
296
  super().__init__(self._get_name(), self.language_model.model_id)
@@ -299,12 +306,20 @@ class LMSequenceClassifierService(PipelineComponent):
299
306
 
300
307
  def serve(self, dp: Image) -> None:
301
308
  lm_input = self.mapping_to_lm_input_func(**self.required_kwargs)(dp)
309
+ lm_output = None
302
310
  if lm_input is None:
303
- return
304
- lm_output = self.language_model.predict(**lm_input)
305
- self.dp_manager.set_summary_annotation(
306
- PageType.DOCUMENT_TYPE, lm_output.class_name, lm_output.class_id, None, lm_output.score
307
- )
311
+ if self.use_other_as_default_category:
312
+ class_id = self.language_model.categories.get_categories(as_dict=True,
313
+ name_as_key=True).get(TokenClasses.OTHER, 1)
314
+ lm_output = SequenceClassResult(class_name=TokenClasses.OTHER,
315
+ class_id = class_id,
316
+ score=-1.)
317
+ else:
318
+ lm_output = self.language_model.predict(**lm_input)
319
+ if lm_output:
320
+ self.dp_manager.set_summary_annotation(
321
+ PageType.DOCUMENT_TYPE, lm_output.class_name, lm_output.class_id, None, lm_output.score
322
+ )
308
323
 
309
324
  def clone(self) -> LMSequenceClassifierService:
310
325
  return self.__class__(
@@ -1191,17 +1191,13 @@ class PubtablesSegmentationService(PipelineComponent):
1191
1191
  if key[idx] == item_number:
1192
1192
  cell_ann = dp.get_annotation(annotation_ids=value)[0]
1193
1193
  self.dp_manager.set_category_annotation(
1194
- item_header_cell_name,
1195
- None,
1196
- item_header_cell_name,
1197
- cell_ann.annotation_id
1194
+ item_header_cell_name, None, item_header_cell_name, cell_ann.annotation_id
1198
1195
  )
1199
1196
  else:
1200
1197
  cell_ann = dp.get_annotation(annotation_ids=value)[0]
1201
- self.dp_manager.set_category_annotation(item_header_cell_name,
1202
- None,
1203
- CellType.BODY,
1204
- cell_ann.annotation_id)
1198
+ self.dp_manager.set_category_annotation(
1199
+ item_header_cell_name, None, CellType.BODY, cell_ann.annotation_id
1200
+ )
1205
1201
 
1206
1202
  # TODO: the summaries should be sub categories of the underlying ann
1207
1203
  self.dp_manager.set_summary_annotation(
@@ -73,7 +73,7 @@ class DetrDerivedTrainer(Trainer):
73
73
  model: Union[PreTrainedModel, nn.Module],
74
74
  args: TrainingArguments,
75
75
  data_collator: DetrDataCollator,
76
- train_dataset: Dataset[Any],
76
+ train_dataset: DatasetAdapter,
77
77
  ):
78
78
  self.evaluator: Optional[Evaluator] = None
79
79
  self.build_eval_kwargs: Optional[dict[str, Any]] = None
@@ -499,7 +499,9 @@ def train_hf_layoutlm(
499
499
  )
500
500
  pipeline_component_cls = pipeline_component_registry.get(pipeline_component_name)
501
501
  if dataset_type == DatasetType.SEQUENCE_CLASSIFICATION:
502
- pipeline_component = pipeline_component_cls(tokenizer_fast, dd_model)
502
+ pipeline_component = pipeline_component_cls(tokenizer_fast,
503
+ dd_model,
504
+ use_other_as_default_category=True)
503
505
  else:
504
506
  pipeline_component = pipeline_component_cls(
505
507
  tokenizer_fast,
@@ -181,8 +181,6 @@ class PDFStreamer:
181
181
 
182
182
  streamer.close() # Do not forget to close the streamer, otherwise the file will never be closed and might
183
183
  # cause memory leaks if you open many files.
184
-
185
-
186
184
  """
187
185
 
188
186
  def __init__(self, path_or_bytes: Union[PathLikeOrStr, bytes]) -> None:
@@ -223,7 +221,10 @@ class PDFStreamer:
223
221
 
224
222
 
225
223
  def _input_to_cli_str(
226
- input_file_name: PathLikeOrStr, output_file_name: PathLikeOrStr, dpi: int, size: Optional[tuple[int, int]] = None
224
+ input_file_name: PathLikeOrStr,
225
+ output_file_name: PathLikeOrStr,
226
+ dpi: Optional[int] = None,
227
+ size: Optional[tuple[int, int]] = None,
227
228
  ) -> list[str]:
228
229
  cmd_args: list[str] = []
229
230
 
@@ -237,7 +238,10 @@ def _input_to_cli_str(
237
238
  if platform.system() == "Windows":
238
239
  command = command + ".exe"
239
240
  cmd_args.append(command)
240
- cmd_args.extend(["-r", str(dpi), str(input_file_name)])
241
+
242
+ if dpi:
243
+ cmd_args.extend(["-r", str(dpi)])
244
+ cmd_args.append(str(input_file_name))
241
245
  cmd_args.append("-png")
242
246
  cmd_args.append(str(output_file_name))
243
247
 
@@ -275,7 +279,9 @@ def _run_poppler(poppler_args: list[str]) -> None:
275
279
  raise PopplerError(status=proc.returncode, message="Syntax Error: PDF cannot be read with Poppler")
276
280
 
277
281
 
278
- def pdf_to_np_array_poppler(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
282
+ def pdf_to_np_array_poppler(
283
+ pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: Optional[int] = None
284
+ ) -> PixelValues:
279
285
  """
280
286
  Convert a single pdf page from its byte representation to a numpy array. This function will save the pdf as to a tmp
281
287
  file and then call poppler via `pdftoppm` resp. `pdftocairo` if the former is not available.
@@ -285,7 +291,8 @@ def pdf_to_np_array_poppler(pdf_bytes: bytes, size: Optional[tuple[int, int]] =
285
291
  :param dpi: Image quality in DPI/dots-per-inch (default 200)
286
292
  :return: numpy array
287
293
  """
288
-
294
+ if dpi is None and size is None:
295
+ raise ValueError("Either dpi or size must be provided.")
289
296
  with save_tmp_file(pdf_bytes, "pdf_") as (tmp_name, input_file_name):
290
297
  _run_poppler(_input_to_cli_str(input_file_name, tmp_name, dpi, size))
291
298
  image = viz_handler.read_image(tmp_name + "-1.png")
@@ -293,7 +300,7 @@ def pdf_to_np_array_poppler(pdf_bytes: bytes, size: Optional[tuple[int, int]] =
293
300
  return image.astype(uint8)
294
301
 
295
302
 
296
- def pdf_to_np_array_pdfmium(pdf_bytes: bytes, dpi: int = 200) -> PixelValues:
303
+ def pdf_to_np_array_pdfmium(pdf_bytes: bytes, dpi: Optional[int] = None) -> PixelValues:
297
304
  """
298
305
  Convert a single pdf page from its byte representation to a numpy array using pdfium.
299
306
 
@@ -301,12 +308,13 @@ def pdf_to_np_array_pdfmium(pdf_bytes: bytes, dpi: int = 200) -> PixelValues:
301
308
  :param dpi: Image quality in DPI/dots-per-inch (default 200)
302
309
  :return: numpy array
303
310
  """
304
-
311
+ if dpi is None:
312
+ raise ValueError("dpi must be provided.")
305
313
  page = pypdfium2.PdfDocument(pdf_bytes)[0]
306
314
  return page.render(scale=dpi * 1 / 72).to_numpy().astype(uint8)
307
315
 
308
316
 
309
- def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
317
+ def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: Optional[int] = None) -> PixelValues:
310
318
  """
311
319
  Convert a single pdf page from its byte representation to a numpy array. This function will either use Poppler or
312
320
  pdfium to render the pdf.
@@ -101,7 +101,6 @@ class DocumentType(ObjectTypes):
101
101
  GOVERNMENT_TENDERS = "government_tenders"
102
102
  MANUALS = "manuals"
103
103
  PATENTS = "patents"
104
- MARK = "mark"
105
104
 
106
105
 
107
106
  @object_types_registry.register("LayoutType")
@@ -132,6 +131,7 @@ class LayoutType(ObjectTypes):
132
131
  PAGE_NUMBER = "page_number"
133
132
  KEY_VALUE_AREA = "key_value_area"
134
133
  LIST_ITEM = "list_item"
134
+ MARK = "mark"
135
135
 
136
136
 
137
137
  @object_types_registry.register("TableType")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: deepdoctection
3
- Version: 0.38
3
+ Version: 0.39.1
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -17,7 +17,7 @@ Requires-Python: >=3.9
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
19
  Requires-Dist: catalogue==2.0.10
20
- Requires-Dist: huggingface_hub<0.26,>=0.12.0
20
+ Requires-Dist: huggingface_hub>=0.26.0
21
21
  Requires-Dist: importlib-metadata>=5.0.0
22
22
  Requires-Dist: jsonlines==3.1.0
23
23
  Requires-Dist: lazy-imports==0.3.1
@@ -36,7 +36,7 @@ Requires-Dist: tabulate>=0.7.7
36
36
  Requires-Dist: tqdm==4.64.0
37
37
  Provides-Extra: tf
38
38
  Requires-Dist: catalogue==2.0.10; extra == "tf"
39
- Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "tf"
39
+ Requires-Dist: huggingface_hub>=0.26.0; extra == "tf"
40
40
  Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
41
41
  Requires-Dist: jsonlines==3.1.0; extra == "tf"
42
42
  Requires-Dist: lazy-imports==0.3.1; extra == "tf"
@@ -61,14 +61,14 @@ Requires-Dist: python-doctr==0.8.1; extra == "tf"
61
61
  Requires-Dist: pycocotools>=2.0.2; extra == "tf"
62
62
  Requires-Dist: boto3==1.34.102; extra == "tf"
63
63
  Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
64
- Requires-Dist: fasttext==0.9.2; extra == "tf"
64
+ Requires-Dist: fasttext-wheel; extra == "tf"
65
65
  Requires-Dist: jdeskew>=0.2.2; extra == "tf"
66
66
  Requires-Dist: apted==1.0.3; extra == "tf"
67
67
  Requires-Dist: distance==0.1.3; extra == "tf"
68
68
  Requires-Dist: lxml>=4.9.1; extra == "tf"
69
69
  Provides-Extra: pt
70
70
  Requires-Dist: catalogue==2.0.10; extra == "pt"
71
- Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "pt"
71
+ Requires-Dist: huggingface_hub>=0.26.0; extra == "pt"
72
72
  Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
73
73
  Requires-Dist: jsonlines==3.1.0; extra == "pt"
74
74
  Requires-Dist: lazy-imports==0.3.1; extra == "pt"
@@ -86,12 +86,12 @@ Requires-Dist: termcolor>=1.1; extra == "pt"
86
86
  Requires-Dist: tabulate>=0.7.7; extra == "pt"
87
87
  Requires-Dist: tqdm==4.64.0; extra == "pt"
88
88
  Requires-Dist: timm>=0.9.16; extra == "pt"
89
- Requires-Dist: transformers>=4.36.0; extra == "pt"
89
+ Requires-Dist: transformers>=4.48.0; extra == "pt"
90
90
  Requires-Dist: accelerate>=0.29.1; extra == "pt"
91
91
  Requires-Dist: python-doctr==0.8.1; extra == "pt"
92
92
  Requires-Dist: boto3==1.34.102; extra == "pt"
93
93
  Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
94
- Requires-Dist: fasttext==0.9.2; extra == "pt"
94
+ Requires-Dist: fasttext-wheel; extra == "pt"
95
95
  Requires-Dist: jdeskew>=0.2.2; extra == "pt"
96
96
  Requires-Dist: apted==1.0.3; extra == "pt"
97
97
  Requires-Dist: distance==0.1.3; extra == "pt"
@@ -99,7 +99,7 @@ Requires-Dist: lxml>=4.9.1; extra == "pt"
99
99
  Provides-Extra: docs
100
100
  Requires-Dist: tensorpack==0.11; extra == "docs"
101
101
  Requires-Dist: boto3==1.34.102; extra == "docs"
102
- Requires-Dist: transformers>=4.36.0; extra == "docs"
102
+ Requires-Dist: transformers>=4.48.0; extra == "docs"
103
103
  Requires-Dist: accelerate>=0.29.1; extra == "docs"
104
104
  Requires-Dist: pdfplumber>=0.11.0; extra == "docs"
105
105
  Requires-Dist: lxml>=4.9.1; extra == "docs"
@@ -1,9 +1,9 @@
1
- deepdoctection/__init__.py,sha256=EpkATv3al-4H6AomNHcSpFPChv5KqFdZJBzg97FVOWo,12653
1
+ deepdoctection/__init__.py,sha256=uDowNayqaYZGYaqnGzPSz6pVuHQhtDVRAN_bvPq85Ko,12754
2
2
  deepdoctection/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  deepdoctection/analyzer/__init__.py,sha256=icClxrd20XutD6LxLgEPIWceSs4j_QfI3szCE-9BL2w,729
4
4
  deepdoctection/analyzer/_config.py,sha256=OZMOPlyFv4gcyabPG6KO08EYx-0tUH82Ehs9YDv2B1Q,5027
5
- deepdoctection/analyzer/dd.py,sha256=DUOhOtwipHw5nabYqn3WGR9aZcgP0ma_bi_tjf9xscw,5973
6
- deepdoctection/analyzer/factory.py,sha256=idvIMuohtvyECBcAVBtUFGouNpMZ_DrXBbizSxieZWI,31899
5
+ deepdoctection/analyzer/dd.py,sha256=bfR7e1JV7BwUNDRLu0jYZU7qQXnyA_vbRAJl2Ylrq5o,5905
6
+ deepdoctection/analyzer/factory.py,sha256=Kf3Ztv5FEcF5yJf6i4I557aOIUHybuxIP0moHryguTQ,32344
7
7
  deepdoctection/configs/__init__.py,sha256=TX_P6tqDOF1LK1mi9ruAl7x0mtv1Asm8cYWCz3Pe2dk,646
8
8
  deepdoctection/configs/conf_dd_one.yaml,sha256=qnrDAST1PHBtdIKE_hdkZexW22FqVvNTI-PEo9wvinM,3025
9
9
  deepdoctection/configs/conf_tesseract.yaml,sha256=oF6szDyoi15FHvq7yFUNIEjfA_jNLhGxoowiRsz_zY4,35
@@ -18,9 +18,9 @@ deepdoctection/dataflow/stats.py,sha256=Bsr6v7lcesKXUYtO9wjqlzx_Yq_uyIF3Lel-tQ0i
18
18
  deepdoctection/datapoint/__init__.py,sha256=3K406GbOPhoEp8koVaSbMocmSsmWifnSZ1SPb7C1lOY,1643
19
19
  deepdoctection/datapoint/annotation.py,sha256=FEgz4COxVDfjic0gG7kS6iHnWLBIgFnquQ63Cbj2a4Y,22531
20
20
  deepdoctection/datapoint/box.py,sha256=UAdSnLexvFyg4KK1u9kXdJxhaWTwRxTU-cnQcvl37Q8,23410
21
- deepdoctection/datapoint/convert.py,sha256=O7920pIomyEkzXwxpFsrzfhn7Pl6UzVGhNzv90VcuKU,7099
22
- deepdoctection/datapoint/image.py,sha256=0ipkaF5k5sCe-qVQsWA8FOYF90UBAbAVLfAFwtq_sLg,33639
23
- deepdoctection/datapoint/view.py,sha256=AgSEZlKK-cm1erQ872ZWGUN3gomNpsQ39LkTR8Cg0BQ,49019
21
+ deepdoctection/datapoint/convert.py,sha256=gJbHY2V8nlMht1N5VdxTmWSsOeydpFPTJsaJHp6XGgE,7516
22
+ deepdoctection/datapoint/image.py,sha256=S6yfsIRQgMCl6HYAcHYJSBcbfdYKKtebtkEkkkrXsMQ,33619
23
+ deepdoctection/datapoint/view.py,sha256=srMyPQGsK4OSiorxkyG6UAIgpViM6Ks1CI3b5k97cjY,49452
24
24
  deepdoctection/datasets/__init__.py,sha256=-A3aR90aDsHPmVM35JavfnQ2itYSCn3ujl4krRni1QU,1076
25
25
  deepdoctection/datasets/adapter.py,sha256=Ly_vbOAgVI73V41FUccnSX1ECTOyesW_qsuvQuvOZbw,7796
26
26
  deepdoctection/datasets/base.py,sha256=DT4i-d74sIEiUNC6UspIHNJuHSK0t1dBv7qwadg4rLw,22341
@@ -58,7 +58,7 @@ deepdoctection/extern/fastlang.py,sha256=F4gK-SEwcCujjxH327ZDzMGWToJ49xS_dCKcePQ
58
58
  deepdoctection/extern/hfdetr.py,sha256=JzHrrTyzS9qh6T2TsvKboAGZkIhno2txmSoLQ5Vd-lo,12077
59
59
  deepdoctection/extern/hflayoutlm.py,sha256=tFaf90FRbZzhSycdp8rGkeiPywQa6UcTEEwbayIXkr0,57023
60
60
  deepdoctection/extern/hflm.py,sha256=kwS6kcSlY_2m9u0RzBLTRq-UMM7c1PhyUaDTvSdejus,9217
61
- deepdoctection/extern/model.py,sha256=ViHHKPvbGmLCPw7ZESv_rmjlkA90UiBU6oZiHOMqNSw,59869
61
+ deepdoctection/extern/model.py,sha256=lbVwDa3vD6VwCD_dsozcI8b4xDZs4KJ1628SxaDdtHQ,55378
62
62
  deepdoctection/extern/pdftext.py,sha256=KS_t27SUiYn_IOS_J2lF9lSSo22vLagxmxvYCY3CqXA,7228
63
63
  deepdoctection/extern/tessocr.py,sha256=tG7etMvZ-jHFdq-jJAHYMJii3ujDjMfAFYUsjBp3nKI,17444
64
64
  deepdoctection/extern/texocr.py,sha256=yMt5ZzKtsjd7ogrcNXba7zccGGGF9LXK194EtER6YNQ,5804
@@ -88,39 +88,39 @@ deepdoctection/extern/tp/tpfrcnn/utils/__init__.py,sha256=kiPlXxHlTGN9eI7YE9Bgwt
88
88
  deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py,sha256=aBLqPg_ApaiimtBRaOsLKTZZFIBh87vVtqjLPMaX9fQ,2379
89
89
  deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py,sha256=O-q1GQiOEd1lN1MQDsJvHwD2OmBO-qHNeqJ1Qnec93g,3539
90
90
  deepdoctection/mapper/__init__.py,sha256=Xqb34aCjslZDQnqQgCSvnloL5DbdT9eHhn-StpVPbzE,1130
91
- deepdoctection/mapper/cats.py,sha256=EsYdUw8LAPsyqAfGfhNa6gAPVpUhP8GGOchSIKto_e0,15741
91
+ deepdoctection/mapper/cats.py,sha256=s73JzONV2UQ71szfljurk7H1-UjDBWsW4oNLs5xePUk,16474
92
92
  deepdoctection/mapper/cocostruct.py,sha256=GcbUpPFUg67pcOHQluWBFOFcGaYnlZcTmwBDERBVgCA,5978
93
93
  deepdoctection/mapper/d2struct.py,sha256=Dx-YnycsIQH4a5-9Gn_yMhiQ-gOFgMueNeH3rhXjuCU,8555
94
94
  deepdoctection/mapper/hfstruct.py,sha256=2PjGKsYturVJBimLT1CahYh09KSRAFEHz_QNtC162kQ,5551
95
95
  deepdoctection/mapper/laylmstruct.py,sha256=abMZkYU2W0e_VcCm_c0ZXNFuv-lfMFWcTedcZS5EYvE,42935
96
96
  deepdoctection/mapper/maputils.py,sha256=eI6ZcDg9W5uB6xQNBZpMIdEd86HlCxTtkJuyROdTqiw,8146
97
- deepdoctection/mapper/match.py,sha256=pCWZpz2R8JahiKXCw7dxKRTLiPgJXeVDgkddDPLy_c0,9643
98
- deepdoctection/mapper/misc.py,sha256=NLSSgk066Tkrrdi075HkqV7cP-iqT9fv_MtyAJ-8gOg,6743
97
+ deepdoctection/mapper/match.py,sha256=Ed9FsuVPNp_faaW5PKnvUHZoEXcRcrO-muduTMzjp1s,8937
98
+ deepdoctection/mapper/misc.py,sha256=vX-fV420Te00eD-cqTiWBV2twHqdBcBV2_7rAFRgPRg,7164
99
99
  deepdoctection/mapper/pascalstruct.py,sha256=TzVU1p0oiw0nOuxTFFbEB9vXJxH1v6VUvTJ7MD0manU,3828
100
100
  deepdoctection/mapper/prodigystruct.py,sha256=Re4Sd_zAp6qOvbXZLmMJeG0IGEfMQxebuyDeZgMcTa8,6827
101
- deepdoctection/mapper/pubstruct.py,sha256=YxsrZ-E0pD45Mm_VCPQB9yEgHsTPkw4htt-3DwCRX1k,23361
101
+ deepdoctection/mapper/pubstruct.py,sha256=PAJ2N1HSPNS6F2ZrIwlD7PiBhIM-rJscK_Ti8OR_IGs,23370
102
102
  deepdoctection/mapper/tpstruct.py,sha256=YNABRibvcISD5Lavg3jouoE4FMdqXEJoM-hNoB_rnww,4481
103
103
  deepdoctection/mapper/xfundstruct.py,sha256=_3r3c0K82fnF2h1HxA85h-9ETYrHwcERa6MNc6Ko6Z8,8807
104
104
  deepdoctection/pipe/__init__.py,sha256=ywTVoetftdL6plXg2YlBzMfmqBZupq7yXblSVyvvkcQ,1127
105
105
  deepdoctection/pipe/anngen.py,sha256=3319l4aaXzcY4w6ItVBNPX8LGS5fHFDVtyVY9KMefac,16393
106
- deepdoctection/pipe/base.py,sha256=ynNg5SSRuUVxN69VWOO3Oi7WSeGrYwn3A56NQMBJDvw,14222
107
- deepdoctection/pipe/common.py,sha256=haOb4v0jLX3r41BSC8cVseX2E320_HkSrGlZsQiKE2g,17728
106
+ deepdoctection/pipe/base.py,sha256=F4NusbZ-xYc6wuO-XAngmC8uzahT2ubsu2g9NO8PpVw,15390
107
+ deepdoctection/pipe/common.py,sha256=vlWzvwn8wl7baPbK-917HUWujEGJEkHur_-ilkweKjk,17751
108
108
  deepdoctection/pipe/concurrency.py,sha256=AAKRsVgaBEYNluntbDa46SBF1JZ_XqnWLDSWrNvAzEo,9657
109
- deepdoctection/pipe/doctectionpipe.py,sha256=xrDK2_84tVUMsRG7bzqGKiOCsoO-49tweTOK2Je1fls,11770
109
+ deepdoctection/pipe/doctectionpipe.py,sha256=bGW3ugky-fb-nEe-3bvO6Oc_4_6w82cQboGM_6p2eIo,12530
110
110
  deepdoctection/pipe/language.py,sha256=5zI0UQC6Fh12_r2pfVL42HoCGz2hpHrOhpXAn5m-rYw,5451
111
111
  deepdoctection/pipe/layout.py,sha256=xIhnJpyUSbvLbhTXyAKXY1hmG9352jihGYFSclTH_1g,5567
112
- deepdoctection/pipe/lm.py,sha256=tLuCtML-S_kTEYcDAEtM3NBYmR7Aovv9p5TcXYL_AAg,16693
112
+ deepdoctection/pipe/lm.py,sha256=Ygj6MmBvBZ7l4RGCwBuhmMsOM0Ep3LWteNg7bzh-UmI,17703
113
113
  deepdoctection/pipe/order.py,sha256=PnJZiCnxFluJiECXLTZT0c1Rr66vIRBFraa_G41UA2k,40121
114
114
  deepdoctection/pipe/refine.py,sha256=dTfI396xydPdbzpfo4yqFcuxl3UAB1y-WbSQn1o76ec,22367
115
115
  deepdoctection/pipe/registry.py,sha256=aFx-Tn0xhVA5l5H18duNW5QoTNKQltybsEUEzsMgUfg,902
116
- deepdoctection/pipe/segment.py,sha256=WhIi-m6Wwm9JjHOBomw9q5XUUzmt7-BFNpdcU1m2LH8,59386
116
+ deepdoctection/pipe/segment.py,sha256=CR83HQMW0hrRG8W6pFuB0YibxQMWpqI7_LaUIcJcQwo,59116
117
117
  deepdoctection/pipe/sub_layout.py,sha256=N1RcID-boORcwsW_j0l64HpUu3rff0ge5qEanudLYgk,13838
118
118
  deepdoctection/pipe/text.py,sha256=h9q6d3HFOs7LOg-iwdLUPiQxrPqgunBVNmtYMBrfRQE,11180
119
119
  deepdoctection/pipe/transform.py,sha256=9Om7X7hJeL4jgUwHM1CHa4sb5v7Qo1PtVG0ls_3nI7w,3798
120
120
  deepdoctection/train/__init__.py,sha256=YFTRAZF1F7cEAKTdAIi1BLyYb6rSRcwq09Ui5Lu8d6E,1071
121
121
  deepdoctection/train/d2_frcnn_train.py,sha256=sFc_G-mEpaM8d1CCE0_6Gl4nBh11X2RYRBA3p_ylFJQ,16000
122
- deepdoctection/train/hf_detr_train.py,sha256=8ydysxzOPE_IPoNFGaHb7PbKr9Nbl41rcY4lbylQavU,10783
123
- deepdoctection/train/hf_layoutlm_train.py,sha256=BNjPgPAvxm4beHULqzo58u-gW7GcTGiZAk2rF6TootM,22532
122
+ deepdoctection/train/hf_detr_train.py,sha256=eHSdI11U8oGy93noZxAISfukhRBElj4dBerJ4Xcercw,10785
123
+ deepdoctection/train/hf_layoutlm_train.py,sha256=irSg-IpbVoSlaw1-vZCej2mCZcctONtXr5Z2NQAc_a4,22680
124
124
  deepdoctection/train/tp_frcnn_train.py,sha256=pEpXokSVGveqo82pRnhnAmHPmjQ_8wQWpqM4ZyNHJgs,13049
125
125
  deepdoctection/utils/__init__.py,sha256=brBceRWeov9WXMiJTjyJOF2rHMP8trGGRRjhMdZ61nI,2371
126
126
  deepdoctection/utils/concurrency.py,sha256=nIhpkSncmv0LBB8PtcOLY-BsRGlfcDpz7foVdgzZd20,4598
@@ -134,15 +134,15 @@ deepdoctection/utils/identifier.py,sha256=QkNaGGqPynHwDPnd3_m8iur4Cv64rcQa7qolCE
134
134
  deepdoctection/utils/logger.py,sha256=J0OVKiXP_2A82MWbbJoOeMEJ-75aZu5npgaS_yI6mVA,10003
135
135
  deepdoctection/utils/metacfg.py,sha256=hD76KQ_RnD_5B02qLI2Zxf3WfnsnXhEI_KUTKpw91RI,5711
136
136
  deepdoctection/utils/mocks.py,sha256=IkN3-IzAl4eX0ibgKIHg8IY7ykVw6BnpF6XnxKnKaZI,2389
137
- deepdoctection/utils/pdf_utils.py,sha256=G0m8kUn2HwwyZWH_BcrDkm-m3MP9GN9SWHj5VhB7swY,12845
138
- deepdoctection/utils/settings.py,sha256=k6OyuWbj-IPeaO9zT9RZ-5Yad1wNhWGYqGLZdtgXAZY,12464
137
+ deepdoctection/utils/pdf_utils.py,sha256=Fi0eZ2GbnO7N61Rd8b8YRKRff4dalHAzkcn3zpGPoic,13119
138
+ deepdoctection/utils/settings.py,sha256=hDD6yDX_4pQXwR5ILVwJIj6hb7NXA0-ifnC25ldcUjA,12464
139
139
  deepdoctection/utils/tqdm.py,sha256=cBUtR0L1x0KMeYrLP2rrzyzCamCjpQAKroHXLv81_pk,1820
140
140
  deepdoctection/utils/transform.py,sha256=3kCgsEeRkG1efCdkfvj7tUFMs-e2jbjbflq826F2GPU,8502
141
141
  deepdoctection/utils/types.py,sha256=_3dmPdCIZNLbgU5QP5k_c5phDf18xLe1kYL6t2nM45s,2953
142
142
  deepdoctection/utils/utils.py,sha256=csVs_VvCq4QBETPoE2JdTTL4MFYnD4xh-Js5vRb612g,6492
143
143
  deepdoctection/utils/viz.py,sha256=Jf8ePNYWlpuyaS6SeTYQ4OyA3eNhtgjvAQZnGNdgHC0,27051
144
- deepdoctection-0.38.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
145
- deepdoctection-0.38.dist-info/METADATA,sha256=WoWX8R8jC04bj81VPQWYzBJgB9mREE5Ng7LCZtqGylc,19759
146
- deepdoctection-0.38.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
147
- deepdoctection-0.38.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
148
- deepdoctection-0.38.dist-info/RECORD,,
144
+ deepdoctection-0.39.1.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
145
+ deepdoctection-0.39.1.dist-info/METADATA,sha256=NBN2dqFMUiXkcJ28xJDwyN6eNP-MmFw64F7dm3kUWTA,19741
146
+ deepdoctection-0.39.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
147
+ deepdoctection-0.39.1.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
148
+ deepdoctection-0.39.1.dist-info/RECORD,,