PyPI - deepdoctection - Versions diffs - 0.37.3__py3-none-any.whl → 0.39__py3-none-any.whl - Mend

deepdoctection 0.37.3__py3-none-any.whl → 0.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (27) hide show

deepdoctection/__init__.py +5 -1
deepdoctection/analyzer/_config.py +2 -1
deepdoctection/analyzer/dd.py +6 -5
deepdoctection/analyzer/factory.py +16 -6
deepdoctection/configs/conf_dd_one.yaml +126 -85
deepdoctection/datapoint/box.py +2 -4
deepdoctection/datapoint/convert.py +14 -8
deepdoctection/datapoint/image.py +12 -5
deepdoctection/datapoint/view.py +151 -53
deepdoctection/extern/hfdetr.py +4 -3
deepdoctection/extern/model.py +6 -97
deepdoctection/mapper/cats.py +21 -10
deepdoctection/mapper/match.py +0 -22
deepdoctection/mapper/misc.py +12 -2
deepdoctection/mapper/pubstruct.py +1 -1
deepdoctection/pipe/doctectionpipe.py +20 -3
deepdoctection/pipe/lm.py +20 -5
deepdoctection/pipe/refine.py +6 -13
deepdoctection/pipe/segment.py +225 -46
deepdoctection/pipe/sub_layout.py +40 -22
deepdoctection/train/hf_layoutlm_train.py +3 -1
deepdoctection/utils/pdf_utils.py +17 -9
{deepdoctection-0.37.3.dist-info → deepdoctection-0.39.dist-info}/METADATA +15 -5
{deepdoctection-0.37.3.dist-info → deepdoctection-0.39.dist-info}/RECORD +27 -27
{deepdoctection-0.37.3.dist-info → deepdoctection-0.39.dist-info}/WHEEL +1 -1
{deepdoctection-0.37.3.dist-info → deepdoctection-0.39.dist-info}/LICENSE +0 -0
{deepdoctection-0.37.3.dist-info → deepdoctection-0.39.dist-info}/top_level.txt +0 -0

deepdoctection/pipe/doctectionpipe.py CHANGED Viewed

@@ -109,8 +109,13 @@ def _proto_process(
 @curry
-def _to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int] = None) -> Optional[Image]:
-    return to_image(dp, dpi)
+def _to_image(
+    dp: Union[str, Mapping[str, Union[str, bytes]]],
+    dpi: Optional[int] = None,
+    width: Optional[int] = None,
+    height: Optional[int] = None,
+) -> Optional[Image]:
+    return to_image(dp, dpi, width, height)
 def _doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
@@ -188,7 +193,19 @@ class DoctectionPipe(Pipeline):
         df = MapData(df, _proto_process(path, doc_path))
         if dataset_dataflow is None:
-            df = MapData(df, _to_image(dpi=os.environ.get("DPI", 300)))  # pylint: disable=E1120
+            if dpi := os.environ["DPI"]:
+                df = MapData(df, _to_image(dpi=int(dpi)))  # pylint: disable=E1120
+            else:
+                width, height = kwargs.get("width", ""), kwargs.get("height", "")
+                if not width or not height:
+                    width = os.environ["IMAGE_WIDTH"]
+                    height = os.environ["IMAGE_HEIGHT"]
+                    if not width or not height:
+                        raise ValueError(
+                            "DPI, IMAGE_WIDTH and IMAGE_HEIGHT are all None, but "
+                            "either DPI or IMAGE_WIDTH and IMAGE_HEIGHT must be set"
+                        )
+                df = MapData(df, _to_image(width=int(width), height=int(height)))  # pylint: disable=E1120
         return df
     @staticmethod

deepdoctection/pipe/lm.py CHANGED Viewed

@@ -24,6 +24,7 @@ from copy import copy
 from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Sequence, Union
 from ..datapoint.image import Image
+from ..extern.base import SequenceClassResult
 from ..mapper.laylmstruct import image_to_layoutlm_features, image_to_lm_features
 from ..utils.settings import BioTag, LayoutType, ObjectTypes, PageType, TokenClasses, WordType
 from .base import MetaAnnotation, PipelineComponent
@@ -264,6 +265,7 @@ class LMSequenceClassifierService(PipelineComponent):
         padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
         truncation: bool = True,
         return_overflowing_tokens: bool = False,
+        use_other_as_default_category: bool = False
     ) -> None:
         """
         :param tokenizer: Tokenizer, typing allows currently anything. This will be changed in the future
@@ -279,11 +281,16 @@ class LMSequenceClassifierService(PipelineComponent):
         :param return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
                            can be returned as an additional batch element. Note that in this case, the number of input
                            batch samples will be smaller than the output batch samples.
+        :param use_other_as_default_category: When predicting document classes, it might be possible that some pages
+                           do not get sent to the model because they are empty. If set to `True` it
+                           will assign images with no features the category `TokenClasses.OTHER`.
         """
         self.language_model = language_model
         self.padding = padding
         self.truncation = truncation
         self.return_overflowing_tokens = return_overflowing_tokens
+        self.use_other_as_default_category = use_other_as_default_category
         self.tokenizer = tokenizer
         self.mapping_to_lm_input_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
         super().__init__(self._get_name(), self.language_model.model_id)
@@ -299,12 +306,20 @@ class LMSequenceClassifierService(PipelineComponent):
     def serve(self, dp: Image) -> None:
         lm_input = self.mapping_to_lm_input_func(**self.required_kwargs)(dp)
+        lm_output = None
         if lm_input is None:
-            return
-        lm_output = self.language_model.predict(**lm_input)
-        self.dp_manager.set_summary_annotation(
-            PageType.DOCUMENT_TYPE, lm_output.class_name, lm_output.class_id, None, lm_output.score
-        )
+            if self.use_other_as_default_category:
+                class_id = self.language_model.categories.get_categories(as_dict=True,
+                                                                         name_as_key=True).get(TokenClasses.OTHER, 1)
+                lm_output = SequenceClassResult(class_name=TokenClasses.OTHER,
+                                                class_id = class_id,
+                                                score=-1.)
+        else:
+            lm_output = self.language_model.predict(**lm_input)
+        if lm_output:
+            self.dp_manager.set_summary_annotation(
+                PageType.DOCUMENT_TYPE, lm_output.class_name, lm_output.class_id, None, lm_output.score
+            )
     def clone(self) -> LMSequenceClassifierService:
         return self.__class__(

deepdoctection/pipe/refine.py CHANGED Viewed

@@ -295,28 +295,21 @@ def _html_table(
     return html
-def generate_html_string(table: ImageAnnotation) -> list[str]:
+def generate_html_string(table: ImageAnnotation, cell_names: Sequence[ObjectTypes]) -> list[str]:
     """
     Takes the table segmentation by using table cells row number, column numbers etc. and generates a html
     representation.
     :param table: An annotation that has a not None image and fully segmented cell annotation.
+    :param cell_names: List of cell names that are used for the table segmentation. Note: It must be ensured that
+                      all cells have a row number, column number, row span and column span and that the dissection
+                      by rows and columns is completely covered by cells.
     :return: HTML representation of the table
     """
     if table.image is None:
         raise ImageError("table.image cannot be None")
     table_image = table.image
-    cells = table_image.get_annotation(
-        category_names=[
-            LayoutType.CELL,
-            CellType.HEADER,
-            CellType.BODY,
-            CellType.SPANNING,
-            CellType.ROW_HEADER,
-            CellType.COLUMN_HEADER,
-            CellType.PROJECTED_ROW_HEADER,
-        ]
-    )
+    cells = table_image.get_annotation(category_names=cell_names)
     number_of_rows = table_image.summary.get_sub_category(TableType.NUMBER_OF_ROWS).category_id
     number_of_cols = table_image.summary.get_sub_category(TableType.NUMBER_OF_COLUMNS).category_id
     table_list = []
@@ -485,7 +478,7 @@ class TableSegmentationRefinementService(PipelineComponent):
             self.dp_manager.set_summary_annotation(
                 TableType.MAX_COL_SPAN, TableType.MAX_COL_SPAN, max_col_span, annotation_id=table.annotation_id
             )
-            html = generate_html_string(table)
+            html = generate_html_string(table, self.cell_names)
             self.dp_manager.set_container_annotation(TableType.HTML, -1, TableType.HTML, table.annotation_id, html)
     def clone(self) -> TableSegmentationRefinementService:

deepdoctection/pipe/segment.py CHANGED Viewed

@@ -28,13 +28,13 @@ from typing import Literal, Optional, Sequence, Union
 import numpy as np
 from ..datapoint.annotation import ImageAnnotation
-from ..datapoint.box import BoundingBox, global_to_local_coords, intersection_boxes, iou
+from ..datapoint.box import BoundingBox, global_to_local_coords, intersection_box, intersection_boxes, iou, merge_boxes
 from ..datapoint.image import Image
 from ..extern.base import DetectionResult
 from ..mapper.maputils import MappingContextManager
 from ..mapper.match import match_anns_by_intersection
 from ..utils.error import ImageError
-from ..utils.settings import CellType, LayoutType, ObjectTypes, Relationships, TableType
+from ..utils.settings import CellType, LayoutType, ObjectTypes, Relationships, TableType, TypeOrStr, get_type
 from .base import MetaAnnotation, PipelineComponent
 from .refine import generate_html_string
 from .registry import pipeline_component_registry
@@ -55,6 +55,15 @@ class SegmentationResult:
     cs: int
+@dataclass
+class ItemHeaderResult:
+    """
+    Simple mutable storage for item header results
+    """
+    annotation_id: str
 def choose_items_by_iou(
     dp: Image,
     item_proposals: list[ImageAnnotation],
@@ -314,7 +323,7 @@ def _tile_by_stretching_rows_leftwise_column_downwise(
 def tile_tables_with_items_per_table(
-    dp: Image, table: ImageAnnotation, item_name: str, stretch_rule: Literal["left", "equal"] = "left"
+    dp: Image, table: ImageAnnotation, item_name: ObjectTypes, stretch_rule: Literal["left", "equal"] = "left"
 ) -> Image:
     """
     Tiling a table with items (i.e. rows or columns). To ensure that every position in a table can be assigned to a row
@@ -355,9 +364,9 @@ def tile_tables_with_items_per_table(
 def stretch_items(
     dp: Image,
-    table_name: str,
-    row_name: str,
-    col_name: str,
+    table_name: ObjectTypes,
+    row_name: ObjectTypes,
+    col_name: ObjectTypes,
     remove_iou_threshold_rows: float,
     remove_iou_threshold_cols: float,
 ) -> Image:
@@ -491,7 +500,7 @@ def create_intersection_cells(
     cols: Sequence[ImageAnnotation],
     table_annotation_id: str,
     cell_class_id: int,
-    sub_item_names: Sequence[CellType],
+    sub_item_names: Sequence[ObjectTypes],
 ) -> tuple[Sequence[DetectionResult], Sequence[SegmentationResult]]:
     """
     Given rows and columns with row- and column number sub categories, create a list of `DetectionResult` and
@@ -511,6 +520,7 @@ def create_intersection_cells(
     detect_result_cells = []
     segment_result_cells = []
     idx = 0
+    break_outer_loop = False
     for row in rows:
         for col in cols:
             detect_result_cells.append(
@@ -531,17 +541,59 @@ def create_intersection_cells(
                 )
             )
             idx += 1
-            # it is possible to have less intersection boxes, e.g. if one cell has height/width 0
+            # it is possible to have fewer intersection boxes, e.g. if one cell has height/width 0. We need to break both
+            # loops.
             if idx >= len(boxes_cells):
+                break_outer_loop = True
                 break
+        if break_outer_loop:
+            break
     return detect_result_cells, segment_result_cells
+def header_cell_to_item_detect_result(
+    dp: Image,
+    table: ImageAnnotation,
+    item_name: ObjectTypes,
+    item_header_name: ObjectTypes,
+    segment_rule: Literal["iou", "ioa"],
+    threshold: float,
+) -> list[ItemHeaderResult]:
+    """
+    Match header cells to items (rows or columns) based on intersection-over-union (iou) or intersection-over-area (ioa)
+    and return a list of ItemHeaderResult.
+    :param dp: The image containing the table and items.
+    :param table: The table image annotation.
+    :param item_name: The type of items (e.g., rows or columns) to match with header cells.
+    :param item_header_name: The type of header cells to match with items.
+    :param segment_rule: The rule to use for matching, either 'iou' or 'ioa'.
+    :param threshold: The iou/ioa threshold for matching header cells with items.
+    :return: A list of ItemHeaderResult containing the matched header cells.
+    """
+    child_ann_ids = table.get_relationship(Relationships.CHILD)
+    item_index, _, items, _ = match_anns_by_intersection(
+        dp,
+        item_header_name,
+        item_name,
+        segment_rule,
+        threshold,
+        True,
+        child_ann_ids,
+        child_ann_ids,
+    )
+    item_headers = []
+    for idx, item in enumerate(items):
+        if idx in item_index:
+            item_headers.append(ItemHeaderResult(annotation_id=item.annotation_id))
+    return item_headers
 def segment_pubtables(
     dp: Image,
     table: ImageAnnotation,
-    item_names: Sequence[LayoutType],
-    spanning_cell_names: Sequence[Union[LayoutType, CellType]],
+    item_names: Sequence[ObjectTypes],
+    spanning_cell_names: Sequence[ObjectTypes],
     segment_rule: Literal["iou", "ioa"],
     threshold_rows: float,
     threshold_cols: float,
@@ -553,7 +605,7 @@ def segment_pubtables(
     Row and column positions as well as row and column lengths are determined for all types of spanning cells.
     All simple cells that are covered by a spanning cell as well in the table position (double allocation) are then
-    removed.
+    replaced by the spanning cell and deactivated.
     :param dp: Image
     :param table: table ImageAnnotation
@@ -566,6 +618,7 @@ def segment_pubtables(
                                to the column.
     :return: A list of len(number of cells) of SegmentationResult for spanning cells
     """
     child_ann_ids = table.get_relationship(Relationships.CHILD)
     cell_index_rows, row_index, _, _ = match_anns_by_intersection(
         dp,
@@ -600,29 +653,77 @@ def segment_pubtables(
         for idx, cell in enumerate(spanning_cells):
             cell_positions_rows = cell_index_rows == idx
             rows_of_cell = [rows[k] for k in row_index[cell_positions_rows]]
-            rs = (
-                max(row.get_sub_category(CellType.ROW_NUMBER).category_id for row in rows_of_cell)
-                - min(row.get_sub_category(CellType.ROW_NUMBER).category_id for row in rows_of_cell)
-                + 1
-            )
-            if len(rows_of_cell):
-                row_number = min(row.get_sub_category(CellType.ROW_NUMBER).category_id for row in rows_of_cell)
+            if rows_of_cell:
+                min_row_cell = min(rows_of_cell, key=lambda row: row.get_sub_category(CellType.ROW_NUMBER).category_id)
+                max_row_cell = max(rows_of_cell, key=lambda row: row.get_sub_category(CellType.ROW_NUMBER).category_id)
+                max_row = max_row_cell.get_sub_category(CellType.ROW_NUMBER).category_id
+                min_row = min_row_cell.get_sub_category(CellType.ROW_NUMBER).category_id
+                rs = max_row - min_row + 1
+                row_number = min_row
             else:
+                rs = 0
                 row_number = 0
             cell_positions_cols = cell_index_cols == idx
             cols_of_cell = [columns[k] for k in col_index[cell_positions_cols]]
-            cs = (
-                max(col.get_sub_category(CellType.COLUMN_NUMBER).category_id for col in cols_of_cell)
-                - min(col.get_sub_category(CellType.COLUMN_NUMBER).category_id for col in cols_of_cell)
-                + 1
-            )
-            if len(cols_of_cell):
-                col_number = min(col.get_sub_category(CellType.COLUMN_NUMBER).category_id for col in cols_of_cell)
+            if cols_of_cell:
+                min_col_cell = min(
+                    cols_of_cell, key=lambda col: col.get_sub_category(CellType.COLUMN_NUMBER).category_id
+                )
+                max_col_cell = max(
+                    cols_of_cell, key=lambda col: col.get_sub_category(CellType.COLUMN_NUMBER).category_id
+                )
+                max_col = max_col_cell.get_sub_category(CellType.COLUMN_NUMBER).category_id
+                min_col = min_col_cell.get_sub_category(CellType.COLUMN_NUMBER).category_id
+                cs = max_col - min_col + 1
+                col_number = min_col
             else:
+                cs = 0
                 col_number = 0
+            if rows_of_cell and cols_of_cell:
+                # We resize all bounding boxes of spanning cells so that they match with the grid structure, determined
+                # by the rows and columns.
+                merge_box_image_row = merge_boxes(
+                    *[min_row_cell.get_bounding_box(dp.image_id), max_row_cell.get_bounding_box(dp.image_id)]
+                )
+                merge_box_image_column = merge_boxes(
+                    *[min_col_cell.get_bounding_box(dp.image_id), max_col_cell.get_bounding_box(dp.image_id)]
+                )
+                merge_box_image = intersection_box(merge_box_image_row, merge_box_image_column)
+                merge_box_table_row = merge_boxes(
+                    *[
+                        min_row_cell.get_bounding_box(table.annotation_id),
+                        max_row_cell.get_bounding_box(table.annotation_id),
+                    ]
+                )
+                merge_box_table_column = merge_boxes(
+                    *[
+                        min_col_cell.get_bounding_box(table.annotation_id),
+                        max_col_cell.get_bounding_box(table.annotation_id),
+                    ]
+                )
+                merge_box_table = intersection_box(merge_box_table_row, merge_box_table_column)
+                merge_box_spanning_cell_row = merge_boxes(
+                    *[
+                        min_row_cell.get_bounding_box(min_row_cell.annotation_id),
+                        max_row_cell.get_bounding_box(max_row_cell.annotation_id),
+                    ]
+                )
+                merge_box_spanning_cell_column = merge_boxes(
+                    *[
+                        min_col_cell.get_bounding_box(min_col_cell.annotation_id),
+                        max_col_cell.get_bounding_box(max_col_cell.annotation_id),
+                    ]
+                )
+                merge_box_spanning_cell = intersection_box(merge_box_spanning_cell_row, merge_box_spanning_cell_column)
+                if cell.image is None:
+                    raise ImageError("cell.image cannot be None")
+                cell.image.set_embedding(dp.image_id, merge_box_image)
+                cell.image.set_embedding(table.annotation_id, merge_box_table)
+                cell.image.set_embedding(cell.annotation_id, merge_box_spanning_cell)
             raw_table_segments.append(
                 SegmentationResult(
                     annotation_id=cell.annotation_id,
@@ -674,10 +775,10 @@ class TableSegmentationService(PipelineComponent):
         tile_table_with_items: bool,
         remove_iou_threshold_rows: float,
         remove_iou_threshold_cols: float,
-        table_name: ObjectTypes,
-        cell_names: Sequence[ObjectTypes],
-        item_names: Sequence[ObjectTypes],
-        sub_item_names: Sequence[ObjectTypes],
+        table_name: TypeOrStr,
+        cell_names: Sequence[TypeOrStr],
+        item_names: Sequence[TypeOrStr],
+        sub_item_names: Sequence[TypeOrStr],
         stretch_rule: Literal["left", "equal"] = "left",
     ):
         """
@@ -705,10 +806,10 @@ class TableSegmentationService(PipelineComponent):
         self.tile_table = tile_table_with_items
         self.remove_iou_threshold_rows = remove_iou_threshold_rows
         self.remove_iou_threshold_cols = remove_iou_threshold_cols
-        self.table_name = table_name
-        self.cell_names = cell_names
-        self.item_names = item_names  # row names must be before column name
-        self.sub_item_names = sub_item_names
+        self.table_name = get_type(table_name)
+        self.cell_names = [get_type(cell_name) for cell_name in cell_names]
+        self.item_names = [get_type(item_name) for item_name in item_names]  # row names must be before column name
+        self.sub_item_names = [get_type(sub_item_name) for sub_item_name in sub_item_names]
         self.stretch_rule = stretch_rule
         self.item_iou_threshold = 0.0001
         super().__init__("table_segment")
@@ -876,11 +977,13 @@ class PubtablesSegmentationService(PipelineComponent):
         remove_iou_threshold_rows: float,
         remove_iou_threshold_cols: float,
         cell_class_id: int,
-        table_name: ObjectTypes,
-        cell_names: Sequence[Union[LayoutType, CellType]],
-        spanning_cell_names: Sequence[Union[LayoutType, CellType]],
-        item_names: Sequence[LayoutType],
-        sub_item_names: Sequence[CellType],
+        table_name: TypeOrStr,
+        cell_names: Sequence[TypeOrStr],
+        spanning_cell_names: Sequence[TypeOrStr],
+        item_names: Sequence[TypeOrStr],
+        sub_item_names: Sequence[TypeOrStr],
+        item_header_cell_names: Sequence[TypeOrStr],
+        item_header_thresholds: Sequence[float],
         cell_to_image: bool = True,
         crop_cell_image: bool = False,
         stretch_rule: Literal["left", "equal"] = "left",
@@ -900,6 +1003,11 @@ class PubtablesSegmentationService(PipelineComponent):
         :param spanning_cell_names: layout type of spanning cells
         :param item_names: layout type of items (e.g. row and column)
         :param sub_item_names: layout type of sub items (e.g. row number and column number)
+        :param item_header_cell_names: layout type of item header cells (e.g. CellType.COLUMN_HEADER,
+        CellType.ROW_HEADER). Note that column header, resp. row header will be first assigned to rows, resp. columns
+        and then transferred to cells.
+        :param item_header_thresholds: iou/ioa threshold for matching header cells with items. The first threshold
+        corresponds to matching the first entry of item_names.
         :param cell_to_image: If set to 'True' it will create an 'Image' for LayoutType.cell
         :param crop_cell_image: If set to 'True' it will crop a numpy array image for LayoutType.cell.
                                 Requires 'cell_to_image=True'
@@ -909,17 +1017,20 @@ class PubtablesSegmentationService(PipelineComponent):
         self.threshold_rows = threshold_rows
         self.threshold_cols = threshold_cols
         self.tile_table = tile_table_with_items
-        self.table_name = table_name
-        self.cell_names = cell_names
-        self.spanning_cell_names = spanning_cell_names
+        self.table_name = get_type(table_name)
+        self.cell_names = [get_type(cell_name) for cell_name in cell_names]
+        self.spanning_cell_names = [get_type(cell_name) for cell_name in spanning_cell_names]
         self.remove_iou_threshold_rows = remove_iou_threshold_rows
         self.remove_iou_threshold_cols = remove_iou_threshold_cols
         self.cell_class_id = cell_class_id
         self.cell_to_image = cell_to_image
         self.crop_cell_image = crop_cell_image
-        self.item_names = item_names  # row names must be before column name
-        self.sub_item_names = sub_item_names
+        self.item_names = [get_type(item_name) for item_name in item_names]  # row names must be before column name
+        self.sub_item_names = [get_type(item_name) for item_name in sub_item_names]
         self.stretch_rule = stretch_rule
+        self.item_header_cell_names = [get_type(item_name) for item_name in item_header_cell_names]
+        self.item_header_thresholds = item_header_thresholds
         super().__init__("table_transformer_segment")
     def serve(self, dp: Image) -> None:
@@ -932,10 +1043,18 @@ class PubtablesSegmentationService(PipelineComponent):
             self.remove_iou_threshold_cols,
         )
         table_anns = dp.get_annotation(category_names=self.table_name)
+        has_item_headers = {item: False for item in self.item_names}
         for table in table_anns:
             item_ann_ids = table.get_relationship(Relationships.CHILD)
-            for item_sub_item_name in zip(self.item_names, self.sub_item_names):  # one pass for rows and one for cols
-                item_name, sub_item_name = item_sub_item_name[0], item_sub_item_name[1]
+            for item_sub_item_name in zip(
+                self.item_names, self.sub_item_names, self.item_header_cell_names, self.item_header_thresholds
+            ):  # one pass for rows and one for cols
+                item_name, sub_item_name, item_header_cell_name, item_header_threshold = (
+                    item_sub_item_name[0],
+                    item_sub_item_name[1],
+                    item_sub_item_name[2],
+                    item_sub_item_name[3],
+                )
                 if self.tile_table:
                     dp = tile_tables_with_items_per_table(dp, table, item_name, self.stretch_rule)
                 items = dp.get_annotation(category_names=item_name, annotation_ids=item_ann_ids)
@@ -949,10 +1068,24 @@ class PubtablesSegmentationService(PipelineComponent):
                     )
                 )
+                item_headers_detect_results = header_cell_to_item_detect_result(
+                    dp, table, item_name, item_header_cell_name, self.segment_rule, item_header_threshold
+                )
+                if item_headers_detect_results:
+                    has_item_headers[item_name] = True
                 for item_number, item in enumerate(items, 1):
                     self.dp_manager.set_category_annotation(
                         sub_item_name, item_number, sub_item_name, item.annotation_id
                     )
+                for item_header_detect_result in item_headers_detect_results:
+                    self.dp_manager.set_category_annotation(
+                        category_name=item_header_cell_name,
+                        category_id=None,
+                        sub_cat_key=item_header_cell_name,
+                        annotation_id=item_header_detect_result.annotation_id,
+                    )
             rows = dp.get_annotation(category_names=self.item_names[0], annotation_ids=item_ann_ids)
             columns = dp.get_annotation(category_names=self.item_names[1], annotation_ids=item_ann_ids)
             detect_result_cells, segment_result_cells = create_intersection_cells(
@@ -979,6 +1112,7 @@ class PubtablesSegmentationService(PipelineComponent):
                     CellType.COLUMN_SPAN, segment_result.cs, CellType.COLUMN_SPAN, segment_result.annotation_id
                 )
                 cell_rn_cn_to_ann_id[(segment_result.row_num, segment_result.col_num)] = segment_result.annotation_id
             spanning_cell_raw_segments = segment_pubtables(
                 dp,
                 table,
@@ -988,7 +1122,15 @@ class PubtablesSegmentationService(PipelineComponent):
                 self.threshold_rows,
                 self.threshold_cols,
             )
             for segment_result in spanning_cell_raw_segments:
+                if (
+                    (segment_result.rs == 1 and segment_result.cs == 1)
+                    or segment_result.rs == 0
+                    or segment_result.cs == 0
+                ):
+                    self.dp_manager.deactivate_annotation(segment_result.annotation_id)
+                    continue
                 self.dp_manager.set_category_annotation(
                     CellType.ROW_NUMBER, segment_result.row_num, CellType.ROW_NUMBER, segment_result.annotation_id
                 )
@@ -1009,6 +1151,19 @@ class PubtablesSegmentationService(PipelineComponent):
                     cell_ann_id = cell_rn_cn_to_ann_id[cell_position]
                     self.dp_manager.deactivate_annotation(cell_ann_id)
+            for segment_result in spanning_cell_raw_segments:
+                if (
+                    (segment_result.rs == 1 and segment_result.cs == 1)
+                    or segment_result.rs == 0
+                    or segment_result.cs == 0
+                ):
+                    continue
+                for rs in range(segment_result.rs):
+                    for cs in range(segment_result.cs):
+                        cell_rn_cn_to_ann_id[
+                            (segment_result.row_num + rs, segment_result.col_num + cs)
+                        ] = segment_result.annotation_id
             cells = []
             if table.image:
                 cells = table.image.get_annotation(category_names=self.cell_names)
@@ -1022,6 +1177,28 @@ class PubtablesSegmentationService(PipelineComponent):
                 number_of_cols = 0
                 max_row_span = 0
                 max_col_span = 0
+            for idx, item_vals in enumerate(zip(self.item_names, self.item_header_cell_names, self.sub_item_names)):
+                item_obj_type, item_header_cell_name, sub_item_name = item_vals[0], item_vals[1], item_vals[2]
+                if has_item_headers[item_obj_type]:
+                    items = dp.get_annotation(category_names=item_obj_type)
+                    for item_ann in items:
+                        if item_header_cell_name in item_ann.sub_categories:
+                            item_number = item_ann.get_sub_category(sub_item_name).category_id
+                            for key, value in cell_rn_cn_to_ann_id.items():
+                                if key[idx] == item_number:
+                                    cell_ann = dp.get_annotation(annotation_ids=value)[0]
+                                    self.dp_manager.set_category_annotation(
+                                        item_header_cell_name, None, item_header_cell_name, cell_ann.annotation_id
+                                    )
+                                else:
+                                    cell_ann = dp.get_annotation(annotation_ids=value)[0]
+                                    self.dp_manager.set_category_annotation(
+                                        item_header_cell_name, None, CellType.BODY, cell_ann.annotation_id
+                                    )
             # TODO: the summaries should be sub categories of the underlying ann
             self.dp_manager.set_summary_annotation(
                 TableType.NUMBER_OF_ROWS, TableType.NUMBER_OF_ROWS, number_of_rows, annotation_id=table.annotation_id
@@ -1038,7 +1215,7 @@ class PubtablesSegmentationService(PipelineComponent):
             self.dp_manager.set_summary_annotation(
                 TableType.MAX_COL_SPAN, TableType.MAX_COL_SPAN, max_col_span, annotation_id=table.annotation_id
             )
-            html = generate_html_string(table)
+            html = generate_html_string(table, self.cell_names + self.spanning_cell_names)
             self.dp_manager.set_container_annotation(TableType.HTML, -1, TableType.HTML, table.annotation_id, html)
     def clone(self) -> PubtablesSegmentationService:
@@ -1055,6 +1232,8 @@ class PubtablesSegmentationService(PipelineComponent):
             self.spanning_cell_names,
             self.item_names,
             self.sub_item_names,
+            self.item_header_cell_names,
+            self.item_header_thresholds,
             self.cell_to_image,
             self.crop_cell_image,
             self.stretch_rule,

deepdoctection 0.37.3__py3-none-any.whl → 0.39__py3-none-any.whl

Potentially problematic release.

deepdoctection 0.37.3__py3-none-any.whl → 0.39__py3-none-any.whl