PyPI - deepdoctection - Version diff - deepdoctection-0.37.3-py3-none-any.whl → deepdoctection-0.39-py3-none-any.whl - Mend

deepdoctection-0.37.3-py3-none-any.whl → deepdoctection-0.39-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (27)

deepdoctection/__init__.py +5 -1
deepdoctection/analyzer/_config.py +2 -1
deepdoctection/analyzer/dd.py +6 -5
deepdoctection/analyzer/factory.py +16 -6
deepdoctection/configs/conf_dd_one.yaml +126 -85
deepdoctection/datapoint/box.py +2 -4
deepdoctection/datapoint/convert.py +14 -8
deepdoctection/datapoint/image.py +12 -5
deepdoctection/datapoint/view.py +151 -53
deepdoctection/extern/hfdetr.py +4 -3
deepdoctection/extern/model.py +6 -97
deepdoctection/mapper/cats.py +21 -10
deepdoctection/mapper/match.py +0 -22
deepdoctection/mapper/misc.py +12 -2
deepdoctection/mapper/pubstruct.py +1 -1
deepdoctection/pipe/doctectionpipe.py +20 -3
deepdoctection/pipe/lm.py +20 -5
deepdoctection/pipe/refine.py +6 -13
deepdoctection/pipe/segment.py +225 -46
deepdoctection/pipe/sub_layout.py +40 -22
deepdoctection/train/hf_layoutlm_train.py +3 -1
deepdoctection/utils/pdf_utils.py +17 -9
{deepdoctection-0.37.3.dist-info → deepdoctection-0.39.dist-info}/METADATA +15 -5
{deepdoctection-0.37.3.dist-info → deepdoctection-0.39.dist-info}/RECORD +27 -27
{deepdoctection-0.37.3.dist-info → deepdoctection-0.39.dist-info}/WHEEL +1 -1
{deepdoctection-0.37.3.dist-info → deepdoctection-0.39.dist-info}/LICENSE +0 -0
{deepdoctection-0.37.3.dist-info → deepdoctection-0.39.dist-info}/top_level.txt +0 -0

deepdoctection/datapoint/view.py CHANGED Viewed

@@ -28,7 +28,7 @@ import numpy as np
 from typing_extensions import LiteralString
 from ..utils.error import AnnotationError, ImageError
-from ..utils.logger import LoggingRecord, logger
+from ..utils.logger import LoggingRecord, log_once, logger
 from ..utils.settings import (
     CellType,
     LayoutType,
@@ -228,23 +228,33 @@ class Layout(ImageAnnotationBaseView):
         """
         words = self.get_ordered_words()
-        characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
-            *[
-                (
-                    word.characters,
-                    word.annotation_id,
-                    word.token_class,
-                    word.token_tag,
-                    word.get_sub_category(WordType.TOKEN_CLASS).category_id
-                    if WordType.TOKEN_CLASS in word.sub_categories
-                    else None,
-                    word.get_sub_category(WordType.TOKEN_TAG).category_id
-                    if WordType.TOKEN_TAG in word.sub_categories
-                    else None,
-                )
-                for word in words
-            ]
-        )
+        if words:
+            characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
+                *[
+                    (
+                        word.characters,
+                        word.annotation_id,
+                        word.token_class,
+                        word.token_tag,
+                        word.get_sub_category(WordType.TOKEN_CLASS).category_id
+                        if WordType.TOKEN_CLASS in word.sub_categories
+                        else None,
+                        word.get_sub_category(WordType.TOKEN_TAG).category_id
+                        if WordType.TOKEN_TAG in word.sub_categories
+                        else None,
+                    )
+                    for word in words
+                ]
+            )
+        else:
+            characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = (
+                [],  # type: ignore
+                [],  # type: ignore
+                [],  # type: ignore
+                [],  # type: ignore
+                [],  # type: ignore
+                [],  # type: ignore
+            )
         return {
             "text": " ".join(characters),
             "words": characters,
@@ -282,25 +292,103 @@ class Table(Layout):
     """
     @property
-    def cells(self) -> list[ImageAnnotationBaseView]:
+    def cells(self) -> list[Cell]:
         """
         A list of a table cells.
         """
         all_relation_ids = self.get_relationship(Relationships.CHILD)
-        cell_anns = self.base_page.get_annotation(
+        cell_anns: list[Cell] = self.base_page.get_annotation(  # type: ignore
             annotation_ids=all_relation_ids,
             category_names=[
                 LayoutType.CELL,
                 CellType.HEADER,
                 CellType.BODY,
-                CellType.PROJECTED_ROW_HEADER,
                 CellType.SPANNING,
-                CellType.ROW_HEADER,
-                CellType.COLUMN_HEADER,
             ],
         )
         return cell_anns
+    @property
+    def column_header_cells(self) -> list[Cell]:
+        """
+        Retrieve a list of cells that are column headers in the table.
+        This property filters and sorts the cells in the table to return only those that are column headers.
+        The cells are sorted by their column number.
+        :return: A list of `Cell` objects that are column headers.
+        """
+        all_relation_ids = self.get_relationship(Relationships.CHILD)
+        all_cells: list[Cell] = self.base_page.get_annotation(  # type: ignore
+            category_names=[LayoutType.CELL, CellType.SPANNING], annotation_ids=all_relation_ids
+        )
+        headers = list(filter(lambda cell: CellType.COLUMN_HEADER in cell.sub_categories, all_cells))
+        headers.sort(key=lambda x: x.column_number)  # type: ignore
+        return headers
+    @property
+    def row_header_cells(self) -> list[Cell]:
+        """
+        Retrieve a list of cells that are row headers in the table.
+        This property filters and sorts the cells in the table to return only those that are row headers.
+        The cells are sorted by their column number.
+        :return: A list of `Cell` objects that are row headers.
+        """
+        all_relation_ids = self.get_relationship(Relationships.CHILD)
+        all_cells: list[Cell] = self.base_page.get_annotation(  # type: ignore
+            category_names=[LayoutType.CELL, CellType.SPANNING], annotation_ids=all_relation_ids
+        )
+        row_header_cells = list(filter(lambda cell: CellType.ROW_HEADER in cell.sub_categories, all_cells))
+        row_header_cells.sort(key=lambda x: x.column_number)  # type: ignore
+        return row_header_cells
+    def kv_header_rows(self, row_number: int) -> Mapping[str, str]:
+        """
+        For a given row number, returns a dictionary mapping column headers to cell values in that row.
+        This method retrieves all cells in the specified row and matches them with their corresponding column headers.
+        It then creates a key-value pair where the key is a tuple containing the column number and header text,
+        and the value is the cell text.
+        :param row_number: The row number for which to retrieve the key-value pairs.
+        :return: A dictionary where keys are tuples of (column number, header text) and values are cell texts.
+        Example:
+        If the table has the following structure:
+        | Header1 | Header2 |
+        |---------|---------|
+        | Value1  | Value2  |
+        | Value3  | Value4  |
+        Calling kv_header_rows(1) would return:
+        {
+            (1, 'Header1'): 'Value1',
+            (2, 'Header2'): 'Value2'
+        }
+        """
+        all_relation_ids = self.get_relationship(Relationships.CHILD)
+        all_cells = self.base_page.get_annotation(
+            category_names=[LayoutType.CELL, CellType.SPANNING], annotation_ids=all_relation_ids
+        )
+        row_cells = list(
+            filter(lambda c: row_number in (c.row_number, c.row_number + c.row_span), all_cells)  # type: ignore
+        )
+        row_cells.sort(key=lambda c: c.column_number)  # type: ignore
+        column_header_cells = self.column_header_cells
+        kv_dict: Mapping[str, str] = {}
+        for cell in row_cells:
+            for header in column_header_cells:
+                if (
+                    cell.column_number == header.column_number  # type: ignore
+                    and cell.annotation_id != header.annotation_id  # type: ignore
+                ):
+                    kv_dict[(header.column_number, header.text)] = cell.text  # type: ignore
+                    break
+        return kv_dict
     @property
     def rows(self) -> list[ImageAnnotationBaseView]:
         """
@@ -335,7 +423,7 @@ class Table(Layout):
             try:
                 html_index = html_list.index(cell.annotation_id)
                 html_list.pop(html_index)
-                html_list.insert(html_index, cell.text)  # type: ignore
+                html_list.insert(html_index, cell.text)
             except ValueError:
                 logger.warning(LoggingRecord("html construction not possible", {"annotation_id": cell.annotation_id}))
@@ -357,6 +445,12 @@ class Table(Layout):
         cells = self.cells
         table_list = [["" for _ in range(self.number_of_columns)] for _ in range(self.number_of_rows)]  # type: ignore
         for cell in cells:
+            if cell.category_name == CellType.SPANNING:
+                log_once(
+                    "Table has spanning cells. This implies, that the .csv output will not be correct."
+                    "To prevent spanning cell table creation set PT.ITEM.FILTER=['table','spanning'] ",
+                    "error",
+                )
             table_list[cell.row_number - 1][cell.column_number - 1] = (  # type: ignore
                 table_list[cell.row_number - 1][cell.column_number - 1] + cell.text + " "  # type: ignore
             )
@@ -386,13 +480,13 @@ class Table(Layout):
         token_class_ids: list[str] = []
         token_tag_ids: list[str] = []
         for cell in cells:
-            text.extend(cell.text_["text"])  # type: ignore
-            words.extend(cell.text_["words"])  # type: ignore
-            ann_ids.extend(cell.text_["ann_ids"])  # type: ignore
-            token_classes.extend(cell.text_["token_classes"])  # type: ignore
-            token_tags.extend(cell.text_["token_tags"])  # type: ignore
-            token_class_ids.extend(cell.text_["token_class_ids"])  # type: ignore
-            token_tag_ids.extend(cell.text_["token_tag_ids"])  # type: ignore
+            text.extend(cell.text_["text"])
+            words.extend(cell.text_["words"])
+            ann_ids.extend(cell.text_["ann_ids"])
+            token_classes.extend(cell.text_["token_classes"])
+            token_tags.extend(cell.text_["token_tags"])
+            token_class_ids.extend(cell.text_["token_class_ids"])
+            token_tag_ids.extend(cell.text_["token_tag_ids"])
         return {
             "text": " ".join(text),
             "words": words,
@@ -414,7 +508,7 @@ class Table(Layout):
         if not cells:
             return super().words
         for cell in cells:
-            all_words.extend(cell.words)  # type: ignore
+            all_words.extend(cell.words)
         return all_words
     def get_ordered_words(self) -> list[ImageAnnotationBaseView]:
@@ -424,7 +518,7 @@ class Table(Layout):
             all_words = []
             cells.sort(key=lambda x: (x.ROW_NUMBER, x.COLUMN_NUMBER))
             for cell in cells:
-                all_words.extend(cell.get_ordered_words())  # type: ignore
+                all_words.extend(cell.get_ordered_words())
             return all_words
         except (TypeError, AnnotationError):
             return super().get_ordered_words()
@@ -436,10 +530,10 @@ IMAGE_ANNOTATION_TO_LAYOUTS: dict[ObjectTypes, Type[Union[Layout, Table, Word]]]
     LayoutType.TABLE_ROTATED: Table,
     LayoutType.WORD: Word,
     LayoutType.CELL: Cell,
-    CellType.PROJECTED_ROW_HEADER: Cell,
     CellType.SPANNING: Cell,
     CellType.ROW_HEADER: Cell,
     CellType.COLUMN_HEADER: Cell,
+    CellType.PROJECTED_ROW_HEADER: Cell,
 }
@@ -465,10 +559,7 @@ IMAGE_DEFAULTS: ImageDefaults = {
         LayoutType.LIST,
         LayoutType.CELL,
         LayoutType.FIGURE,
-        CellType.COLUMN_HEADER,
-        CellType.PROJECTED_ROW_HEADER,
         CellType.SPANNING,
-        CellType.ROW_HEADER,
     ),
 }
@@ -851,6 +942,16 @@ class Page(Image):
         """
         return self._make_text(False)
+    def _ann_viz_bbox(self, ann: ImageAnnotationBaseView) -> list[float]:
+        """
+        Get the bounding box as list and in absolute coordinates of the base page.
+        """
+        bounding_box = ann.get_bounding_box(self.image_id)
+        if not bounding_box.absolute_coords:
+            bounding_box = bounding_box.transform(self.width, self.height, absolute_coords=True)
+        return bounding_box.to_list(mode="xyxy")
     @no_type_check
     def viz(
         self,
@@ -886,6 +987,7 @@ class Page(Image):
         :param show_tables: Will display all tables boxes as well as cells, rows and columns
         :param show_layouts: Will display all other layout components.
         :param show_figures: Will display all figures
+        :param show_residual_layouts: Will display all residual layouts
         :param show_cells: Will display cells within tables. (Only available if `show_tables=True`)
         :param show_table_structure: Will display rows and columns
         :param show_words: Will display bounding boxes around words labeled with token class and bio tag (experimental)
@@ -910,50 +1012,46 @@ class Page(Image):
         if debug_kwargs:
             anns = self.get_annotation(category_names=list(debug_kwargs.keys()))
             for ann in anns:
-                box_stack.append(ann.bbox)
+                box_stack.append(self._ann_viz_bbox(ann))
                 category_names_list.append(str(getattr(ann, debug_kwargs[ann.category_name])))
         if show_layouts and not debug_kwargs:
             for item in self.layouts:
-                box_stack.append(item.bbox)
+                box_stack.append(self._ann_viz_bbox(item))
                 category_names_list.append(item.category_name.value)
         if show_figures and not debug_kwargs:
             for item in self.figures:
-                box_stack.append(item.bbox)
+                box_stack.append(self._ann_viz_bbox(item))
                 category_names_list.append(item.category_name.value)
         if show_tables and not debug_kwargs:
             for table in self.tables:
-                box_stack.append(table.bbox)
+                box_stack.append(self._ann_viz_bbox(table))
                 category_names_list.append(LayoutType.TABLE.value)
                 if show_cells:
                     for cell in table.cells:
                         if cell.category_name in {
                             LayoutType.CELL,
-                            CellType.PROJECTED_ROW_HEADER,
                             CellType.SPANNING,
-                            CellType.ROW_HEADER,
-                            CellType.COLUMN_HEADER,
                         }:
                             cells_found = True
-                            box_stack.append(cell.bbox)
+                            box_stack.append(self._ann_viz_bbox(cell))
                             category_names_list.append(None)
                 if show_table_structure:
                     rows = table.rows
                     cols = table.columns
                     for row in rows:
-                        box_stack.append(row.bbox)
+                        box_stack.append(self._ann_viz_bbox(row))
                         category_names_list.append(None)
                     for col in cols:
-                        box_stack.append(col.bbox)
+                        box_stack.append(self._ann_viz_bbox(col))
                         category_names_list.append(None)
         if show_cells and not cells_found and not debug_kwargs:
-            for ann in self.annotations:
-                if isinstance(ann, Cell) and ann.active:
-                    box_stack.append(ann.bbox)
-                    category_names_list.append(None)
+            for ann in self.get_annotation(category_names=[LayoutType.CELL, CellType.SPANNING]):
+                box_stack.append(self._ann_viz_bbox(ann))
+                category_names_list.append(None)
         if show_words and not debug_kwargs:
             all_words = []
@@ -965,7 +1063,7 @@ class Page(Image):
                 all_words = self.get_annotation(category_names=LayoutType.WORD)
             if not ignore_default_token_class:
                 for word in all_words:
-                    box_stack.append(word.bbox)
+                    box_stack.append(self._ann_viz_bbox(word))
                     if show_token_class:
                         category_names_list.append(word.token_class.value if word.token_class is not None else None)
                     else:
@@ -973,7 +1071,7 @@ class Page(Image):
             else:
                 for word in all_words:
                     if word.token_class is not None and word.token_class != TokenClasses.OTHER:
-                        box_stack.append(word.bbox)
+                        box_stack.append(self._ann_viz_bbox(word))
                         if show_token_class:
                             category_names_list.append(word.token_class.value if word.token_class is not None else None)
                         else:

deepdoctection/extern/hfdetr.py CHANGED Viewed

@@ -41,6 +41,7 @@ with try_import() as tr_import_guard:
     from transformers import (  # pylint: disable=W0611
         AutoFeatureExtractor,
         DetrFeatureExtractor,
+        DetrImageProcessor,
         PretrainedConfig,
         TableTransformerForObjectDetection,
     )
@@ -55,7 +56,7 @@ def _detr_post_processing(
 def detr_predict_image(
     np_img: PixelValues,
     predictor: TableTransformerForObjectDetection,
-    feature_extractor: DetrFeatureExtractor,
+    feature_extractor: DetrImageProcessor,
     device: torch.device,
     threshold: float,
     nms_threshold: float,
@@ -224,13 +225,13 @@ class HFDetrDerivedDetector(HFDetrDerivedDetectorMixin):
         )
     @staticmethod
-    def get_pre_processor(path_feature_extractor_config: PathLikeOrStr) -> DetrFeatureExtractor:
+    def get_pre_processor(path_feature_extractor_config: PathLikeOrStr) -> DetrImageProcessor:
         """
         Builds the feature extractor
         :return: DetrFeatureExtractor
         """
-        return AutoFeatureExtractor.from_pretrained(
+        return DetrImageProcessor.from_pretrained(
             pretrained_model_name_or_path=os.fspath(path_feature_extractor_config)
         )

deepdoctection/extern/model.py CHANGED Viewed

@@ -24,7 +24,7 @@ from dataclasses import asdict, dataclass, field
 from typing import Any, Mapping, Optional, Union
 import jsonlines
-from huggingface_hub import cached_download, hf_hub_url  # type: ignore
+from huggingface_hub import hf_hub_download
 from tabulate import tabulate
 from termcolor import colored
@@ -136,51 +136,6 @@ class ModelCatalog:
             dl_library="TF",
             model_wrapper="TPFrcnnDetector",
         ),
-        "item/model-1620000.data-00000-of-00001": ModelProfile(
-            name="item/model-1620000.data-00000-of-00001",
-            description="Tensorpack row/column detection model trained on Pubtabnet",
-            config="dd/tp/conf_frcnn_rows.yaml",
-            size=[823546048, 25787],
-            tp_model=True,
-            hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc",
-            hf_model_name="model-1620000",
-            hf_config_file=["conf_frcnn_rows.yaml"],
-            categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
-            dl_library="TF",
-            model_wrapper="TPFrcnnDetector",
-        ),
-        "layout/model-800000.data-00000-of-00001": ModelProfile(
-            name="layout/model-800000.data-00000-of-00001",
-            description="Tensorpack layout detection model trained on Publaynet",
-            config="dd/tp/conf_frcnn_layout.yaml",
-            size=[823656748, 25796],
-            tp_model=True,
-            hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_publaynet",
-            hf_model_name="model-800000",
-            hf_config_file=["conf_frcnn_layout.yaml"],
-            dl_library="TF",
-            categories={
-                1: LayoutType.TEXT,
-                2: LayoutType.TITLE,
-                3: LayoutType.LIST,
-                4: LayoutType.TABLE,
-                5: LayoutType.FIGURE,
-            },
-            model_wrapper="TPFrcnnDetector",
-        ),
-        "cell/model-1800000.data-00000-of-00001": ModelProfile(
-            name="cell/model-1800000.data-00000-of-00001",
-            description="Tensorpack cell detection model trained on Pubtabnet",
-            config="dd/tp/conf_frcnn_cell.yaml",
-            size=[823509160, 25905],
-            tp_model=True,
-            hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c",
-            hf_model_name="model-1800000",
-            hf_config_file=["conf_frcnn_cell.yaml"],
-            categories={1: LayoutType.CELL},
-            dl_library="TF",
-            model_wrapper="TPFrcnnDetector",
-        ),
         "layout/d2_model_0829999_layout_inf_only.pt": ModelProfile(
             name="layout/d2_model_0829999_layout_inf_only.pt",
             description="Detectron2 layout detection model trained on Publaynet",
@@ -200,25 +155,6 @@ class ModelCatalog:
             dl_library="PT",
             model_wrapper="D2FrcnnDetector",
         ),
-        "layout/d2_model_0829999_layout.pth": ModelProfile(
-            name="layout/d2_model_0829999_layout.pth",
-            description="Detectron2 layout detection model trained on Publaynet. Checkpoint for resuming training",
-            config="dd/d2/layout/CASCADE_RCNN_R_50_FPN_GN.yaml",
-            size=[548377327],
-            tp_model=False,
-            hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_publaynet_inference_only",
-            hf_model_name="d2_model_0829999_layout.pth",
-            hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
-            categories={
-                1: LayoutType.TEXT,
-                2: LayoutType.TITLE,
-                3: LayoutType.LIST,
-                4: LayoutType.TABLE,
-                5: LayoutType.FIGURE,
-            },
-            dl_library="PT",
-            model_wrapper="D2FrcnnDetector",
-        ),
         "layout/d2_model_0829999_layout_inf_only.ts": ModelProfile(
             name="layout/d2_model_0829999_layout_inf_only.ts",
             description="Detectron2 layout detection model trained on Publaynet. Torchscript export",
@@ -264,32 +200,6 @@ class ModelCatalog:
             dl_library="PT",
             model_wrapper="D2FrcnnTracingDetector",
         ),
-        "cell/d2_model_1849999_cell.pth": ModelProfile(
-            name="cell/d2_model_1849999_cell.pth",
-            description="Detectron2 cell detection inference only model trained on Pubtabnet",
-            config="dd/d2/cell/CASCADE_RCNN_R_50_FPN_GN.yaml",
-            size=[548279023],
-            tp_model=False,
-            hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only",
-            hf_model_name="cell/d2_model_1849999_cell.pth",
-            hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
-            categories={1: LayoutType.CELL},
-            dl_library="PT",
-            model_wrapper="D2FrcnnDetector",
-        ),
-        "item/d2_model_1639999_item.pth": ModelProfile(
-            name="item/d2_model_1639999_item.pth",
-            description="Detectron2 item detection model trained on Pubtabnet",
-            config="dd/d2/item/CASCADE_RCNN_R_50_FPN_GN.yaml",
-            size=[548303599],
-            tp_model=False,
-            hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only",
-            hf_model_name="d2_model_1639999_item.pth",
-            hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
-            categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
-            dl_library="PT",
-            model_wrapper="D2FrcnnDetector",
-        ),
         "item/d2_model_1639999_item_inf_only.pt": ModelProfile(
             name="item/d2_model_1639999_item_inf_only.pt",
             description="Detectron2 item detection model inference only trained on Pubtabnet",
@@ -1232,20 +1142,19 @@ class ModelDownloadManager:
     def _load_from_hf_hub(
         repo_id: str, file_name: str, cache_directory: PathLikeOrStr, force_download: bool = False
     ) -> int:
-        url = hf_hub_url(repo_id=repo_id, filename=file_name)
         token = os.environ.get("HF_CREDENTIALS", None)
-        f_path = cached_download(
-            url,
-            cache_dir=cache_directory,
+        f_path = hf_hub_download(
+            repo_id,
+            file_name,
+            local_dir=cache_directory,  # type: ignore
             force_filename=file_name,
             force_download=force_download,
             token=token,
-            legacy_cache_layout=True,
         )
         if f_path:
             stat_info = os.stat(f_path)
             size = stat_info.st_size
-            assert size > 0, f"Downloaded an empty file from {url}!"
+            assert size > 0, f"Downloaded an empty file from {f_path}!"
             return size
         raise TypeError("Returned value from cached_download cannot be Null")

deepdoctection/mapper/cats.py CHANGED Viewed

@@ -73,18 +73,21 @@ def re_assign_cat_ids(
     Annotations that are not in the dictionary provided will be removed.
     :param dp: Image
-    :param categories_dict_name_as_key: e.g. `{LayoutType.word: '1'}`
+    :param categories_dict_name_as_key: e.g. `{LayoutType.word: 1}`
     :param cat_to_sub_cat_mapping: e.g. `{<LayoutType.word>:
         {<WordType.token_class>:
-            {<FundsFirstPage.report_date>: '1',
-            <FundsFirstPage.report_type>: '2',
-            <FundsFirstPage.umbrella>: '3',
-            <FundsFirstPage.fund_name>: '4',
-            <TokenClasses.other>: '5'},
-            <WordType.tag>:
-            {<BioTag.inside>: '1',
-            <BioTag.outside>: '2',
-            <BioTag.begin>: '3'}}}`
+            {<FundsFirstPage.REPORT_DATE>: 1,
+            <FundsFirstPage.REPORT_TYPE>: 2,
+            <FundsFirstPage.UMBRELLA>: 3,
+            <FundsFirstPage.FUND_NAME>: 4,
+            <TokenClasses.OTHER>: 5},
+            <WordType.TAG>:
+            {<BioTag.INSIDE>: 1,
+            <BioTag.OUTSIDE>: 2,
+            <BioTag.BEGIN>: 3}}}`
+            To re-assign the category ids of an image summary, use the key 'default_type' for the default category, e.g.
+            `{DefaultType.DEFAULT_TYPE: {<PageType.DOCUMENT_TYPE>: {<DocumentType.INVOICE>:1,
+            <DocumentType.BANK_STATEMENT>:2}}}`
     :return: Image
     """
@@ -104,6 +107,14 @@ def re_assign_cat_ids(
                     sub_category = ann.get_sub_category(key)
                     sub_category.category_id = sub_cat_values_dict.get(sub_category.category_name, DEFAULT_CATEGORY_ID)
+    if cat_to_sub_cat_mapping:
+        if "default_type" in cat_to_sub_cat_mapping:
+            sub_cat_keys_to_sub_cat_values = cat_to_sub_cat_mapping[get_type("default_type")]
+            for key in sub_cat_keys_to_sub_cat_values:
+                sub_cat_values_dict = sub_cat_keys_to_sub_cat_values[key]
+                sub_category = dp.summary.get_sub_category(key)
+                sub_category.category_id = sub_cat_values_dict.get(sub_category.category_name, DEFAULT_CATEGORY_ID)
     dp.remove(annotation_ids=ann_ids_to_remove)
     return dp

deepdoctection/mapper/match.py CHANGED Viewed

@@ -101,17 +101,6 @@ def match_anns_by_intersection(
         ]
     )
-    # second try, if ann has empty image
-    n_dim = child_ann_boxes.ndim
-    if n_dim != 2:
-        child_ann_boxes = np.array(
-            [
-                ann.bounding_box.transform(dp.width, dp.height, absolute_coords=True).to_list(mode="xyxy")
-                for ann in child_anns
-                if ann.bounding_box is not None
-            ]
-        )
     parent_anns = dp.get_annotation(annotation_ids=parent_ann_ids, category_names=parent_ann_category_names)
     parent_ann_boxes = np.array(
         [
@@ -120,17 +109,6 @@ def match_anns_by_intersection(
         ]
     )
-    # same for parent
-    n_dim = parent_ann_boxes.ndim
-    if n_dim != 2:
-        parent_ann_boxes = np.array(
-            [
-                ann.bounding_box.transform(dp.width, dp.height, absolute_coords=True).to_list(mode="xyxy")
-                for ann in parent_anns
-                if ann.bounding_box is not None
-            ]
-        )
     if matching_rule in ["iou"] and parent_anns and child_anns:
         iou_matrix = iou(child_ann_boxes, parent_ann_boxes)
         output = iou_matrix > threshold

deepdoctection/mapper/misc.py CHANGED Viewed

@@ -38,12 +38,20 @@ with try_import() as import_guard:
     from lxml import etree  # pylint: disable=W0611
-def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int] = None) -> Optional[Image]:
+def to_image(
+    dp: Union[str, Mapping[str, Union[str, bytes]]],
+    dpi: Optional[int] = None,
+    width: Optional[int] = None,
+    height: Optional[int] = None,
+) -> Optional[Image]:
     """
     Mapping an input from `dataflow.SerializerFiles` or similar to an Image
     :param dp: Image
     :param dpi: dot per inch definition for pdf resolution when converting to numpy array
+    :param width: target width of the image. This option does only work when using Poppler as PDF renderer
+    :param height: target width of the image. This option does only work when using Poppler as PDF renderer
+    :param height: target height of the image
     :return: Image
     """
@@ -77,7 +85,9 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
                 dp_image.pdf_bytes = dp.get("pdf_bytes")
                 if dp_image.pdf_bytes is not None:
                     if isinstance(dp_image.pdf_bytes, bytes):
-                        dp_image.image = convert_pdf_bytes_to_np_array_v2(dp_image.pdf_bytes, dpi=dpi)
+                        dp_image.image = convert_pdf_bytes_to_np_array_v2(
+                            dp_image.pdf_bytes, dpi=dpi, width=width, height=height
+                        )
             elif image_bytes is not None:
                 dp_image.image = convert_bytes_to_np_array(image_bytes)
             else:

deepdoctection/mapper/pubstruct.py CHANGED Viewed

@@ -393,7 +393,7 @@ def pub_to_image_uncur(  # pylint: disable=R0914
             np_image = load_image_from_file(dp["filename"])
         if is_file_extension(dp["filename"], ".pdf"):
             pdf_bytes = load_bytes_from_pdf_file(dp["filename"])
-            np_image = convert_pdf_bytes_to_np_array_v2(pdf_bytes)
+            np_image = convert_pdf_bytes_to_np_array_v2(pdf_bytes, dpi=200)
             dp = _convert_boxes(dp, np_image.shape[0])
         if load_image and np_image is not None:

deepdoctection-0.37.3-py3-none-any.whl → deepdoctection-0.39-py3-none-any.whl

Potentially problematic release.

deepdoctection-0.37.3-py3-none-any.whl → deepdoctection-0.39-py3-none-any.whl