PyPI - deepdoctection - Versions diffs - 1.2.5__tar.gz → 1.2.7__tar.gz - Mend

deepdoctection 1.2.5.tar.gz → 1.2.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58) hide show

{deepdoctection-1.2.5 → deepdoctection-1.2.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deepdoctection
-Version: 1.2.5
+Version: 1.2.7
 Summary: Repository for Document AI - server/inference core package
 Author: Dr. Janis Meyer
 License: Apache License 2.0

{deepdoctection-1.2.5 → deepdoctection-1.2.7}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "deepdoctection"
-version = "1.2.5"
+version = "1.2.7"
 authors = [
     {name = "Dr. Janis Meyer"}
 ]

{deepdoctection-1.2.5 → deepdoctection-1.2.7}/src/deepdoctection/__init__.py RENAMED Viewed

@@ -12,7 +12,7 @@ from dd_core.utils.env_info import collect_env_info
 from dd_core.utils.file_utils import _LazyModule
 from dd_core.utils.logger import LoggingRecord, logger
-__version__ = "1.2.5"
+__version__ = "1.2.7"
 _IMPORT_STRUCTURE = {
     "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory", "update_cfg_from_defaults"],
     "eval": [
@@ -108,7 +108,7 @@ _IMPORT_STRUCTURE = {
         "TextLineService",
         "TextOrderService",
         "TableSegmentationRefinementService",
-        "generate_html_string",
+        "generate_html_payload",
         "pipeline_component_registry",
         "TableSegmentationService",
         "PubtablesSegmentationService",

{deepdoctection-1.2.5 → deepdoctection-1.2.7}/src/deepdoctection/pipe/anngen.py RENAMED Viewed

@@ -25,7 +25,13 @@ from typing import Any, Optional, Sequence, Union
 import numpy as np
-from dd_core.datapoint.annotation import DEFAULT_CATEGORY_ID, CategoryAnnotation, ContainerAnnotation, ImageAnnotation
+from dd_core.datapoint.annotation import (
+    DEFAULT_CATEGORY_ID,
+    CategoryAnnotation,
+    ContainerAnnotation,
+    ImageAnnotation,
+    ReferencePayload,
+)
 from dd_core.datapoint.box import BoundingBox, local_to_global_coords, rescale_coords
 from dd_core.datapoint.image import Image
 from dd_core.mapper.maputils import MappingContextManager
@@ -89,9 +95,15 @@ def _set_image_keys_to_none(d: Any) -> None:
             _set_image_keys_to_none(item)
-def _image_to_cache_dict(image: Image) -> dict[str, Any]:
+def image_to_cache_dict(image: Image) -> dict[str, Any]:
+    """
+    Converting image to dict, by adding extras and removing higher hierarchies
+    Args:
+        image (Image): The image object to serialize and store.
+    """
     image.remove_image_from_lower_hierarchy()
-    export_dict = image.as_dict()
+    export_dict = image.as_dict(add_extras=True)
     _set_image_keys_to_none(export_dict)
     return export_dict
@@ -152,7 +164,7 @@ class LocalDataPointCacheStore(DataPointCacheStore):
         if pages is None:
             pages = {}
             self._pages[cache_key] = pages
-        pages[page_number] = _image_to_cache_dict(image)
+        pages[page_number] = image_to_cache_dict(image)
         if self._max_pages > 0 and len(pages) > self._max_pages:
             for k in sorted(pages.keys())[: -self._max_pages]:
                 pages.pop(k, None)
@@ -446,7 +458,7 @@ class DatapointManager:
         category_id: Optional[int],
         sub_cat_key: ObjectTypes,
         annotation_id: str,
-        value: Union[str, list[str]],
+        value: Union[str, list[str], ReferencePayload],
         score: Optional[float] = None,
     ) -> Optional[str]:
         """

{deepdoctection-1.2.5 → deepdoctection-1.2.7}/src/deepdoctection/pipe/refine.py RENAMED Viewed

@@ -25,11 +25,11 @@ from collections import defaultdict
 from copy import copy
 from dataclasses import asdict
 from itertools import chain, product
-from typing import DefaultDict, Optional, Sequence, Union
+from typing import DefaultDict, Optional, Sequence
 from lazy_imports import try_import
-from dd_core.datapoint.annotation import ImageAnnotation
+from dd_core.datapoint.annotation import AnnotationRef, ImageAnnotation, ReferencePayload
 from dd_core.datapoint.box import merge_boxes
 from dd_core.datapoint.image import Image, MetaAnnotation
 from dd_core.mapper.maputils import MappingContextManager
@@ -45,7 +45,7 @@ with try_import() as import_guard:
     import networkx as nx  # type: ignore
-__all__ = ["TableSegmentationRefinementService", "generate_html_string"]
+__all__ = ["TableSegmentationRefinementService", "generate_html_payload"]
 def tiles_to_cells(
@@ -233,8 +233,9 @@ def _tiling_to_cell_position(inputs: set[tuple[int, int]]) -> tuple[int, int, in
 def _html_cell(
-    cell_position: Union[tuple[int, int, int, int], tuple[()]], position_filled_list: list[tuple[int, int]]
-) -> list[str]:
+    cell_position: tuple[int, int, int, int] | tuple[()],
+    position_filled_list: list[tuple[int, int]],
+) -> list[str | AnnotationRef]:
     """
     Generates an HTML table cell string.
@@ -263,7 +264,7 @@ def _html_cell(
             )
     html.append(">")
     str_html = "".join(html)
-    html_list = [str_html, "</td>"]
+    html_list: list[str | AnnotationRef] = [str_html, "</td>"]
     return html_list
@@ -273,7 +274,8 @@ def _html_row(
     this_row: int,
     number_of_cols: int,
     row_ann_id_list: list[str],
-) -> list[str]:
+    image_id: str | None = None,
+) -> list[str | AnnotationRef]:
     """
     Generates an HTML table row string.
@@ -283,16 +285,17 @@ def _html_row(
         this_row: The current row number.
         number_of_cols: The total number of columns.
         row_ann_id_list: List of annotation ids for the row.
+        image_id: Image id of the table image.
     Returns:
-        List of HTML strings representing the row.
+        List of HTML strings and AnnotationRef objects representing the row.
     """
-    html = ["<tr>"]
+    html: list[str | AnnotationRef] = ["<tr>"]
     for idx in range(1, number_of_cols + 1):
         position_filled_this_row = list(filter(lambda x: x[0] == this_row, position_filled_list))
         column_filled_this_row = list(zip(*position_filled_this_row))
         column_filled_this_row = (
-            [column_filled_this_row, column_filled_this_row]  # type:ignore
+            [column_filled_this_row, column_filled_this_row]  # type: ignore
             if not column_filled_this_row
             else column_filled_this_row
         )
@@ -305,7 +308,7 @@ def _html_row(
                 cell_position = cell_position_list[0]
                 cell_id = row_ann_id_list.pop(0)
                 ret_html = _html_cell(cell_position, position_filled_list)
-                ret_html.insert(1, cell_id)
+                ret_html.insert(1, AnnotationRef(image_id=image_id, annotation_id=cell_id))
             else:
                 cell_position = ()  # type: ignore
                 ret_html = _html_cell(cell_position, position_filled_list)
@@ -319,62 +322,63 @@ def _html_table(
     cells_ann_list: list[tuple[int, list[str]]],
     number_of_rows: int,
     number_of_cols: int,
-) -> list[str]:
+    image_id: str | None = None,
+) -> list[str | AnnotationRef]:
     """
-    Generates an HTML table string.
+    Generates an HTML table representation with unresolved AnnotationRef placeholders.
     Args:
         table_list: List of tuples with row number and list of cell position tuples.
         cells_ann_list: List of tuples with row number and list of annotation ids.
         number_of_rows: The total number of rows.
         number_of_cols: The total number of columns.
+        image_id: Image id of the table image.
     Returns:
-        List of HTML strings representing the table.
+        List of HTML strings and AnnotationRef objects representing the table.
     """
-    html = ["<table>"]
+    html: list[str | AnnotationRef] = ["<table>"]
     position_filled: list[tuple[int, int]] = []
     for idx in range(1, number_of_rows + 1):
         row_idx = list(filter(lambda x: x[0] == idx, table_list))[0][1]  # pylint:disable=W0640
         row_ann_ids = list(filter(lambda x: x[0] == idx, cells_ann_list))[0][1]  # pylint:disable=W0640
-        ret_html = _html_row(row_idx, position_filled, idx, number_of_cols, row_ann_ids)
+        ret_html = _html_row(row_idx, position_filled, idx, number_of_cols, row_ann_ids, image_id)
         html.extend(ret_html)
     html.append("</table>")
     return html
-def generate_html_string(table: ImageAnnotation, cell_names: Sequence[ObjectTypes]) -> list[str]:
+def generate_html_payload(table: ImageAnnotation, cell_names: Sequence[ObjectTypes]) -> ReferencePayload:
     """
-    Generates an HTML representation of a table using table segmentation by row number, column number, etc.
-    Note:
-        It must be ensured that all cells have a row number, column number, row span, and column span, and that the
-        dissection by rows and columns is completely covered by cells.
+    Generates an unresolved HTML representation of a table using AnnotationRef placeholders.
     Args:
         table: An annotation that has a not None image and fully segmented cell annotation.
         cell_names: List of cell names that are used for the table segmentation.
     Returns:
-        HTML representation of the table.
+        ReferencePayload with HTML fragments and AnnotationRef placeholders.
     Raises:
-        `ImageError`: If `table.image` is None.
+        ImageError: If table.image is None.
     """
     if table.image is None:
         raise ImageError("table.image cannot be None")
     table_image = table.image
     cells = table_image.get_annotation(category_names=cell_names)
     number_of_rows = table_image.summary.get_sub_category(TableKey.NUMBER_OF_ROWS).category_id
     number_of_cols = table_image.summary.get_sub_category(TableKey.NUMBER_OF_COLUMNS).category_id
     table_list = []
     cells_ann_list = []
     for row_number in range(1, number_of_rows + 1):
         cells_of_row = list(
             sorted(
                 filter(
                     lambda cell: cell.get_sub_category(CellKey.ROW_NUMBER).category_id
-                    == row_number,  # pylint: disable=W0640
+                    == row_number,  # pylint:disable=W0640
                     cells,
                 ),
                 key=lambda cell: cell.get_sub_category(CellKey.COLUMN_NUMBER).category_id,
@@ -392,7 +396,16 @@ def generate_html_string(table: ImageAnnotation, cell_names: Sequence[ObjectType
         ann_list = [cell.annotation_id for cell in cells_of_row]
         table_list.append((row_number, row_list))
         cells_ann_list.append((row_number, ann_list))
-    return _html_table(table_list, cells_ann_list, number_of_rows, number_of_cols)
+    html_fragments = _html_table(
+        table_list=table_list,
+        cells_ann_list=cells_ann_list,
+        number_of_rows=number_of_rows,
+        number_of_cols=number_of_cols,
+        image_id=None,
+    )
+    return ReferencePayload(content=html_fragments)
 @pipeline_component_registry.register("TableSegmentationRefinementService")
@@ -523,10 +536,10 @@ class TableSegmentationRefinementService(PipelineComponent):
                 and TableKey.MAX_ROW_SPAN in table.image.summary.sub_categories
                 and TableKey.MAX_COL_SPAN in table.image.summary.sub_categories
             ):
-                table.image.summary.remove_sub_category(TableKey.NUMBER_OF_ROWS)
-                table.image.summary.remove_sub_category(TableKey.NUMBER_OF_COLUMNS)
-                table.image.summary.remove_sub_category(TableKey.MAX_ROW_SPAN)
-                table.image.summary.remove_sub_category(TableKey.MAX_COL_SPAN)
+                table.image.summary.pop_sub_category(TableKey.NUMBER_OF_ROWS)
+                table.image.summary.pop_sub_category(TableKey.NUMBER_OF_COLUMNS)
+                table.image.summary.pop_sub_category(TableKey.MAX_ROW_SPAN)
+                table.image.summary.pop_sub_category(TableKey.MAX_COL_SPAN)
             self.dp_manager.set_summary_annotation(
                 TableKey.NUMBER_OF_ROWS, TableKey.NUMBER_OF_ROWS, number_of_rows, annotation_id=table.annotation_id
@@ -543,7 +556,7 @@ class TableSegmentationRefinementService(PipelineComponent):
             self.dp_manager.set_summary_annotation(
                 TableKey.MAX_COL_SPAN, TableKey.MAX_COL_SPAN, max_col_span, annotation_id=table.annotation_id
             )
-            html = generate_html_string(table, self.cell_names)
+            html = generate_html_payload(table, self.cell_names)
             self.dp_manager.set_container_annotation(TableKey.HTML, -1, TableKey.HTML, table.annotation_id, html)
     def clone(self) -> TableSegmentationRefinementService:

{deepdoctection-1.2.5 → deepdoctection-1.2.7}/src/deepdoctection/pipe/segment.py RENAMED Viewed

@@ -53,7 +53,7 @@ from dd_core.utils.object_types import (
 from ..extern.base import DetectionResult
 from .base import PipelineComponent
-from .refine import generate_html_string
+from .refine import generate_html_payload
 from .registry import pipeline_component_registry
 __all__ = ["TableSegmentationService", "SegmentationResult", "PubtablesSegmentationService"]
@@ -1350,7 +1350,7 @@ class PubtablesSegmentationService(PipelineComponent):
             self.dp_manager.set_summary_annotation(
                 TableKey.MAX_COL_SPAN, TableKey.MAX_COL_SPAN, max_col_span, annotation_id=table.annotation_id
             )
-            html = generate_html_string(table, self.cell_names + self.spanning_cell_names)
+            html = generate_html_payload(table, self.cell_names + self.spanning_cell_names)
             self.dp_manager.set_container_annotation(TableKey.HTML, -1, TableKey.HTML, table.annotation_id, html)
     def clone(self) -> PubtablesSegmentationService:

{deepdoctection-1.2.5 → deepdoctection-1.2.7}/src/deepdoctection.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deepdoctection
-Version: 1.2.5
+Version: 1.2.7
 Summary: Repository for Document AI - server/inference core package
 Author: Dr. Janis Meyer
 License: Apache License 2.0