PyPI - deepdoctection - Versions diffs - 1.2.1__tar.gz → 1.2.3__tar.gz - Mend

deepdoctection 1.2.1 (tar.gz) → 1.2.3 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58)

{deepdoctection-1.2.1 → deepdoctection-1.2.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deepdoctection
-Version: 1.2.1
+Version: 1.2.3
 Summary: Repository for Document AI - server/inference core package
 Author: Dr. Janis Meyer
 License: Apache License 2.0

{deepdoctection-1.2.1 → deepdoctection-1.2.3}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "deepdoctection"
-version = "1.2.1"
+version = "1.2.3"
 authors = [
     {name = "Dr. Janis Meyer"}
 ]

{deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/__init__.py RENAMED Viewed

@@ -12,7 +12,7 @@ from dd_core.utils.env_info import collect_env_info
 from dd_core.utils.file_utils import _LazyModule
 from dd_core.utils.logger import LoggingRecord, logger
-__version__ = "1.2.1"
+__version__ = "1.2.3"
 _IMPORT_STRUCTURE = {
     "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory", "update_cfg_from_defaults"],
     "eval": [

{deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/anngen.py RENAMED Viewed

@@ -45,28 +45,31 @@ class DataPointCacheStore(ABC):
     """
     @abstractmethod
-    def put_datapoint(self, document_id: str, image_id: str, page_number: int, image: Image) -> None:
+    def put_datapoint(self, document_id: str, page_number: int, image: Image, job_id: str | None = None) -> None:
         """
         Persist a datapoint (image) for a specific document and page number.
         Args:
             document_id (str): The identifier of the document the image belongs to.
-            image_id (str): The unique identifier of the image.
             page_number (int): The 0-based page number inside the document.
             image (Image): The image object to store (may be serialized by the store).
+            job_id (str | None): Optional job identifier to distinguish between different processing runs.
+                If None, caching key remains unchanged (backward compatible).
         Returns:
             None
         """
     @abstractmethod
-    def get_datapoints(self, document_id: str, last_d: int) -> tuple[Image, ...]:
+    def get_datapoints(self, document_id: str, last_d: int, job_id: str | None = None) -> tuple[Image, ...]:
         """
         Retrieve up to `last_d` most recently stored datapoints for the given document.
         Args:
             document_id (str): The identifier of the document to retrieve datapoints for.
             last_d (int): Maximum number of most recent datapoints to return. Must be >= 0.
+            job_id (str | None): Optional job identifier to retrieve datapoints from a specific processing run.
+                If None, retrieves datapoints for the document without job distinction (backward compatible).
         Returns:
             tuple[Image, ...]: A tuple of reconstructed :class:`Image` objects ordered from
@@ -115,7 +118,23 @@ class LocalDataPointCacheStore(DataPointCacheStore):
         self._max_pages = max_pages
         self._pages: dict[str, dict[int, dict[str, Any]]] = {}
-    def put_datapoint(self, document_id: str, image_id: str, page_number: int, image: Image) -> None:
+    def _get_cache_key(self, document_id: str, job_id: str | None) -> str:
+        """
+        Generate cache key, distinguishing by job_id if present.
+        Args:
+            document_id (str): The document identifier.
+            job_id (str | None): Optional job identifier.
+        Returns:
+            str: Cache key. If job_id is None, returns document_id unchanged.
+                 If job_id is provided, returns "document_id::job_id".
+        """
+        if job_id is None:
+            return document_id
+        return f"{document_id}::{job_id}"
+    def put_datapoint(self, document_id: str, page_number: int, image: Image, job_id: str | None = None) -> None:
         """
         Store a serialized version of ``image`` for ``document_id`` at ``page_number``.
@@ -124,27 +143,28 @@ class LocalDataPointCacheStore(DataPointCacheStore):
         Args:
             document_id (str): Document identifier the image belongs to.
-            image_id (str): Image identifier (not directly used by this store but included for API
-                compatibility with other stores).
             page_number (int): 0-based page number of the image.
             image (Image): The Image object to serialize and store.
+            job_id (str | None): Optional job identifier to distinguish between different processing runs.
         """
-        pages = self._pages.get(document_id)
+        cache_key = self._get_cache_key(document_id, job_id)
+        pages = self._pages.get(cache_key)
         if pages is None:
             pages = {}
-            self._pages[document_id] = pages
+            self._pages[cache_key] = pages
         pages[page_number] = _image_to_cache_dict(image)
         if self._max_pages > 0 and len(pages) > self._max_pages:
             for k in sorted(pages.keys())[: -self._max_pages]:
                 pages.pop(k, None)
-    def get_datapoints(self, document_id: str, last_d: int) -> tuple[Image, ...]:
+    def get_datapoints(self, document_id: str, last_d: int, job_id: str | None = None) -> tuple[Image, ...]:
         """
         Retrieve up to ``last_d`` most recent datapoints for a document.
         Args:
             document_id (str): Document identifier to retrieve pages for.
             last_d (int): Maximum number of pages to return. If <= 0, an empty tuple is returned.
+            job_id (str | None): Optional job identifier to retrieve datapoints from a specific processing run.
         Returns:
             tuple[Image, ...]: Tuple of :class:`Image` instances reconstructed from the stored
@@ -152,7 +172,8 @@ class LocalDataPointCacheStore(DataPointCacheStore):
         """
         if last_d <= 0:
             return ()
-        pages = self._pages.get(document_id) or {}
+        cache_key = self._get_cache_key(document_id, job_id)
+        pages = self._pages.get(cache_key) or {}
         keys = sorted(pages.keys(), reverse=True)[:last_d]
         return tuple(Image(**pages[k]) for k in keys)
@@ -194,7 +215,6 @@ class DatapointManager:
         self.datapoint_is_passed: bool = False
         self.service_id = service_id
         self.model_id = model_id
-        self.session_id: Optional[str] = None
         if num_cached_datapoints < 0:
             raise ValueError("num_cached_datapoints must be >= 0")
@@ -203,7 +223,17 @@ class DatapointManager:
         self._cache_store = cache_store or LocalDataPointCacheStore(max_pages=num_cached_datapoints)
-    def _maybe_cache_datapoint(self, image: Optional[Image]) -> None:
+    def maybe_cache_datapoint(self, image: Optional[Image], job_id: str | None = None) -> None:
+        """
+        Cache the given datapoint if caching is enabled.
+        This should be called when a datapoint leaves the component to ensure it is cached.
+        Args:
+            image: The image datapoint to cache, or None to skip caching.
+            job_id: Optional job identifier to distinguish caches between different processing runs.
+                If None, caching key remains unchanged (backward compatible).
+        """
         if image is None:
             return
         if self.num_cached_datapoints <= 0:
@@ -214,9 +244,9 @@ class DatapointManager:
         self._cache_store.put_datapoint(
             document_id=image.document_id,
-            image_id=image.image_id,
             page_number=image.page_number,
             image=image,
+            job_id=job_id,
         )
     @property
@@ -242,7 +272,6 @@ class DatapointManager:
         Args:
             dp: The datapoint to set.
         """
-        self._maybe_cache_datapoint(self._datapoint)
         self._datapoint = dp
         self._cache_anns = {ann.annotation_id: ann for ann in dp.get_annotation()}
         self.datapoint_is_passed = True
@@ -329,7 +358,6 @@ class DatapointManager:
                 score=detect_result.score,
                 service_id=self.service_id,
                 model_id=self.model_id,
-                session_id=self.session_id,
             )
             if to_annotation_id is not None:
                 parent_ann = self._cache_anns[to_annotation_id]
@@ -406,7 +434,6 @@ class DatapointManager:
                 score=score,
                 service_id=self.service_id,
                 model_id=self.model_id,
-                session_id=self.session_id,
             )
             self._cache_anns[annotation_id].dump_sub_category(sub_cat_key, cat_ann)
         if annotation_context.context_error:
@@ -454,7 +481,6 @@ class DatapointManager:
                 score=score,
                 service_id=self.service_id,
                 model_id=self.model_id,
-                session_id=self.session_id,
             )
             self._cache_anns[annotation_id].dump_sub_category(sub_cat_key, cont_ann)
         if annotation_context.context_error:
@@ -542,7 +568,6 @@ class DatapointManager:
                     score=summary_score,
                     service_id=self.service_id,
                     model_id=self.model_id,
-                    session_id=self.session_id,
                 )
             else:
                 ann = CategoryAnnotation(
@@ -551,7 +576,6 @@ class DatapointManager:
                     score=summary_score,
                     service_id=self.service_id,
                     model_id=self.model_id,
-                    session_id=self.session_id,
                 )
             image.summary.dump_sub_category(summary_key, ann, image.image_id)

{deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/base.py RENAMED Viewed

@@ -24,7 +24,6 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from typing import Any, Callable, Mapping, Optional, Union
-from uuid import uuid1
 from dd_core.dataflow import DataFlow, MapData
 from dd_core.datapoint.image import Image, MetaAnnotation
@@ -126,7 +125,7 @@ class PipelineComponent(ABC):
         if not self.filter_func(dp):
             self.serve(dp)
-    def pass_datapoint(self, dp: Image) -> Image:
+    def pass_datapoint(self, dp: Image, job_id: str | None = None) -> Image:
         """
         Acceptance, handover to `dp_manager`, transformation and forwarding of `dp`.
@@ -134,6 +133,8 @@ class PipelineComponent(ABC):
         Args:
             dp: Datapoint.
+            job_id: Optional job identifier to distinguish caches between different processing runs.
+                When None, caching behavior is backward compatible (no job distinction).
         Returns:
             Datapoint.
@@ -143,6 +144,9 @@ class PipelineComponent(ABC):
                 self._pass_datapoint(dp)
         else:
             self._pass_datapoint(dp)
+        self.dp_manager.maybe_cache_datapoint(self.dp_manager.datapoint, job_id=job_id)
         return self.dp_manager.datapoint
     def predict_dataflow(self, df: DataFlow) -> DataFlow:
@@ -255,6 +259,85 @@ class PipelineComponent(ABC):
         return MapData(df, self._undo)
+def get_meta_annotation(pipeline_component_list: list[PipelineComponent]) -> MetaAnnotation:
+    """
+    Collects meta annotations from all pipeline components and summarizes the returned results.
+    Returns:
+        Meta annotations with information about image annotations (list), sub categories (dict with category
+        names and generated sub categories), relationships (dict with category names and generated relationships)
+        as well as summaries (list with sub categories).
+    """
+    image_annotations: list[ObjectTypes] = []
+    sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = {}
+    relationships = defaultdict(set[ObjectTypes])  # type: ignore
+    summaries: list[ObjectTypes] = []
+    for component in pipeline_component_list:
+        meta_anns = component.get_meta_annotation()
+        image_annotations.extend(meta_anns.image_annotations)
+        for key, value in meta_anns.sub_categories.items():
+            sub_dict = meta_anns.sub_categories[key]
+            for sub_cat, sub_cat_value in value.items():
+                if sub_cat in sub_dict:
+                    sub_dict[sub_cat].update(sub_cat_value)
+                else:
+                    sub_dict[sub_cat] = {sub_cat_value}  # type: ignore
+            if key in sub_categories:
+                sub_categories[key].update(sub_dict)
+            else:
+                sub_categories[key] = sub_dict
+        for key, value in meta_anns.relationships.items():  # type: ignore
+            relationships[key].update(value)
+        summaries.extend(meta_anns.summaries)
+    return MetaAnnotation(
+        image_annotations=tuple(image_annotations),
+        sub_categories=sub_categories,
+        relationships=relationships,
+        summaries=tuple(summaries),
+    )
+def get_service_id_to_meta_annotation(pipeline_component_list: list[PipelineComponent]) -> Mapping[str, MetaAnnotation]:
+    """
+    Collects meta annotations from all pipeline components and return a dict of service id to its meta annotation.
+    Returns:
+        `service_id` to `MetaAnnotation` with information about image annotations (list), sub categories (dict with
+        category names and generated sub categories), relationships (dict with category names and generated
+        relationships) as well as summaries (list with sub categories).
+    """
+    service_id_to_meta_annotation = {}
+    for component in pipeline_component_list:
+        meta_anns = component.get_meta_annotation()
+        service_id_to_meta_annotation[component.service_id] = meta_anns
+    return service_id_to_meta_annotation
+def get_pipeline_info(
+    pipeline_component_list: list[PipelineComponent], service_id: str | None = None, name: str | None = None
+) -> Union[str, Mapping[str, str]]:
+    """
+    Get pipeline information.
+    Returns a dictionary with a description of each pipeline component.
+    Args:
+        service_id: Service id of the pipeline component to search for.
+        name: Name of the pipeline component to search for.
+    Returns:
+        Either a full dictionary with position and name of all pipeline components or the name, if
+        the position has been passed or the position if the name has been passed.
+    """
+    comp_info = {comp.service_id: comp.name for comp in pipeline_component_list}
+    comp_info_name_as_key = {value: key for key, value in comp_info.items()}
+    if service_id is not None:
+        return comp_info[service_id]
+    if name is not None:
+        return comp_info_name_as_key[name]
+    return comp_info
 class Pipeline(ABC):
     """
     Abstract base class for creating pipelines.
@@ -286,19 +369,6 @@ class Pipeline(ABC):
      core model or already processed further).
     In addition to `analyze`, the internal `_entry` is used to bundle preprocessing steps.
-    It is possible to set a session id for the pipeline. This is useful for logging purposes. The session id can be
-     either passed to the pipeline via the `analyze` method or generated automatically.
-    To generate a `session_id` automatically:
-    Example:
-        ```python
-        pipe = MyPipeline(pipeline_component = [layout, text])
-        pipe.set_session_id = True
-        df = pipe.analyze(input = "path/to/dir") # session_id is generated automatically
-        ```
     """
     def __init__(self, pipeline_component_list: list[PipelineComponent]) -> None:
@@ -309,7 +379,6 @@ class Pipeline(ABC):
             pipeline_component_list: A list of pipeline components.
         """
         self.pipe_component_list = pipeline_component_list
-        self.set_session_id = False
     @abstractmethod
     def _entry(self, **kwargs: Any) -> DataFlow:
@@ -380,22 +449,18 @@ class Pipeline(ABC):
         """
         raise NotImplementedError()
-    def _build_pipe(self, df: DataFlow, session_id: Optional[str] = None) -> DataFlow:
+    def _build_pipe(self, df: DataFlow) -> DataFlow:
         """
         Composition of the backbone.
         Args:
             df: The input dataflow.
-            session_id: Optional session id.
         Returns:
             The processed dataflow.
         """
-        if session_id is None and self.set_session_id:
-            session_id = self.get_session_id()
         for component in self.pipe_component_list:
             component.timer_on = True
-            component.dp_manager.session_id = session_id
             df = component.predict_dataflow(df)
         return df
@@ -408,33 +473,7 @@ class Pipeline(ABC):
             names and generated sub categories), relationships (dict with category names and generated relationships)
             as well as summaries (list with sub categories).
         """
-        image_annotations: list[ObjectTypes] = []
-        sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = {}
-        relationships = defaultdict(set[ObjectTypes])  # type: ignore
-        summaries: list[ObjectTypes] = []
-        for component in self.pipe_component_list:
-            meta_anns = component.get_meta_annotation()
-            image_annotations.extend(meta_anns.image_annotations)
-            for key, value in meta_anns.sub_categories.items():
-                sub_dict = meta_anns.sub_categories[key]
-                for sub_cat, sub_cat_value in value.items():
-                    if sub_cat in sub_dict:
-                        sub_dict[sub_cat].update(sub_cat_value)
-                    else:
-                        sub_dict[sub_cat] = {sub_cat_value}  # type: ignore
-                if key in sub_categories:
-                    sub_categories[key].update(sub_dict)
-                else:
-                    sub_categories[key] = sub_dict
-            for key, value in meta_anns.relationships.items():  # type: ignore
-                relationships[key].update(value)
-            summaries.extend(meta_anns.summaries)
-        return MetaAnnotation(
-            image_annotations=tuple(image_annotations),
-            sub_categories=dict(sub_categories),
-            relationships=dict(relationships),
-            summaries=tuple(summaries),
-        )
+        return get_meta_annotation(self.pipe_component_list)
     def get_service_id_to_meta_annotation(self) -> Mapping[str, MetaAnnotation]:
         """
@@ -445,11 +484,7 @@ class Pipeline(ABC):
             category names and generated sub categories), relationships (dict with category names and generated
             relationships) as well as summaries (list with sub categories).
         """
-        service_id_to_meta_annotation = {}
-        for component in self.pipe_component_list:
-            meta_anns = component.get_meta_annotation()
-            service_id_to_meta_annotation[component.service_id] = meta_anns
-        return service_id_to_meta_annotation
+        return get_service_id_to_meta_annotation(self.pipe_component_list)
     def get_pipeline_info(
         self, service_id: Optional[str] = None, name: Optional[str] = None
@@ -467,13 +502,7 @@ class Pipeline(ABC):
             Either a full dictionary with position and name of all pipeline components or the name, if
             the position has been passed or the position if the name has been passed.
         """
-        comp_info = {comp.service_id: comp.name for comp in self.pipe_component_list}
-        comp_info_name_as_key = {value: key for key, value in comp_info.items()}
-        if service_id is not None:
-            return comp_info[service_id]
-        if name is not None:
-            return comp_info_name_as_key[name]
-        return comp_info
+        return get_pipeline_info(self.pipe_component_list, service_id=service_id, name=name)
     def get_pipeline_component(self, service_id: Optional[str] = None, name: Optional[str] = None) -> PipelineComponent:
         """
@@ -490,13 +519,3 @@ class Pipeline(ABC):
             if comp.service_id == service_id or comp.name == name:
                 return comp
         raise ValueError(f"Pipeline component not found with service_id={service_id} or name={name}")
-    @staticmethod
-    def get_session_id() -> str:
-        """
-        Get the generating a session id.
-        Returns:
-            The session id as a string.
-        """
-        return str(uuid1())[:8]

{deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/common.py RENAMED Viewed

@@ -406,7 +406,7 @@ class PageParsingService(PipelineComponent):
     def serve(self, dp: Image) -> None:
         raise NotImplementedError("PageParsingService is not meant to be used in serve method")
-    def pass_datapoint(self, dp: Image) -> Page:  # type:ignore
+    def pass_datapoint(self, dp: Image, job_id: str | None = None) -> Page:  # type:ignore
         """
         Converts `Image` to `Page`.

{deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/doctectionpipe.py RENAMED Viewed

@@ -386,10 +386,9 @@ class DoctectionPipe(Pipeline):
         """
         output = kwargs.get("output", "page")
-        session_id = kwargs.get("session_id")
         assert output in ("page", "image", "dict"), "output must be either page image or dict"
         df = self._entry(**kwargs)
-        df = self._build_pipe(df, session_id=session_id)  # type: ignore
+        df = self._build_pipe(df)
         if output == "page":
             df = self.dataflow_to_page(df)
         elif output == "dict":

{deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deepdoctection
-Version: 1.2.1
+Version: 1.2.3
 Summary: Repository for Document AI - server/inference core package
 Author: Dr. Janis Meyer
 License: Apache License 2.0