deepdoctection 1.2.0__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/PKG-INFO +1 -1
  2. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/pyproject.toml +1 -1
  3. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/__init__.py +1 -1
  4. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/anngen.py +137 -11
  5. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection.egg-info/PKG-INFO +1 -1
  6. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/README.md +0 -0
  7. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/setup.cfg +0 -0
  8. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/analyzer/__init__.py +0 -0
  9. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/analyzer/config.py +0 -0
  10. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/analyzer/dd.py +0 -0
  11. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/analyzer/factory.py +0 -0
  12. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/configs/__init__.py +0 -0
  13. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/configs/conf_dd_one.yaml +0 -0
  14. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/configs/conf_tesseract.yaml +0 -0
  15. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/configs/profiles.jsonl +0 -0
  16. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/eval/__init__.py +0 -0
  17. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/eval/accmetric.py +0 -0
  18. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/eval/base.py +0 -0
  19. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/eval/cocometric.py +0 -0
  20. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/eval/eval.py +0 -0
  21. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/eval/registry.py +0 -0
  22. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/eval/tedsmetric.py +0 -0
  23. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/__init__.py +0 -0
  24. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/base.py +0 -0
  25. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/d2detect.py +0 -0
  26. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/deskew.py +0 -0
  27. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/doctrocr.py +0 -0
  28. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/hfdetr.py +0 -0
  29. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/hflayoutlm.py +0 -0
  30. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/hflm.py +0 -0
  31. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/model.py +0 -0
  32. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/pdftext.py +0 -0
  33. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/tessocr.py +0 -0
  34. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/texocr.py +0 -0
  35. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/__init__.py +0 -0
  36. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/base.py +0 -0
  37. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/common.py +0 -0
  38. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/concurrency.py +0 -0
  39. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/doctectionpipe.py +0 -0
  40. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/language.py +0 -0
  41. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/layout.py +0 -0
  42. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/lm.py +0 -0
  43. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/order.py +0 -0
  44. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/refine.py +0 -0
  45. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/registry.py +0 -0
  46. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/segment.py +0 -0
  47. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/sub_layout.py +0 -0
  48. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/text.py +0 -0
  49. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/transform.py +0 -0
  50. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/py.typed +0 -0
  51. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/train/__init__.py +0 -0
  52. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/train/d2_frcnn_train.py +0 -0
  53. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/train/hf_detr_train.py +0 -0
  54. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/train/hf_layoutlm_train.py +0 -0
  55. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection.egg-info/SOURCES.txt +0 -0
  56. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection.egg-info/dependency_links.txt +0 -0
  57. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection.egg-info/requires.txt +0 -0
  58. {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deepdoctection
3
- Version: 1.2.0
3
+ Version: 1.2.1
4
4
  Summary: Repository for Document AI - server/inference core package
5
5
  Author: Dr. Janis Meyer
6
6
  License: Apache License 2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "deepdoctection"
7
- version = "1.2.0"
7
+ version = "1.2.1"
8
8
  authors = [
9
9
  {name = "Dr. Janis Meyer"}
10
10
  ]
@@ -12,7 +12,7 @@ from dd_core.utils.env_info import collect_env_info
12
12
  from dd_core.utils.file_utils import _LazyModule
13
13
  from dd_core.utils.logger import LoggingRecord, logger
14
14
 
15
- __version__ = "1.2.0"
15
+ __version__ = "1.2.1"
16
16
  _IMPORT_STRUCTURE = {
17
17
  "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory", "update_cfg_from_defaults"],
18
18
  "eval": [
@@ -18,7 +18,8 @@
18
18
  """
19
19
  Datapoint manager
20
20
  """
21
- from collections import deque
21
+
22
+ from abc import ABC, abstractmethod
22
23
  from dataclasses import asdict
23
24
  from typing import Any, Optional, Sequence, Union
24
25
 
@@ -33,6 +34,129 @@ from dd_core.utils.object_types import ObjectTypes, RelationshipKey
33
34
  from ..extern.base import DetectionResult
34
35
 
35
36
 
37
+ class DataPointCacheStore(ABC):
38
+ """
39
+ Abstract interface for a datapoint cache store.
40
+
41
+ Implementations are expected to provide a mechanism to persist and retrieve recently
42
+ used image datapoints (pages) for a given document. This is used by the
43
+ :class:`DatapointManager` to keep a bounded FIFO cache of previously seen
44
+ datapoints.
45
+ """
46
+
47
+ @abstractmethod
48
+ def put_datapoint(self, document_id: str, image_id: str, page_number: int, image: Image) -> None:
49
+ """
50
+ Persist a datapoint (image) for a specific document and page number.
51
+
52
+ Args:
53
+ document_id (str): The identifier of the document the image belongs to.
54
+ image_id (str): The unique identifier of the image.
55
+ page_number (int): The 0-based page number inside the document.
56
+ image (Image): The image object to store (may be serialized by the store).
57
+
58
+ Returns:
59
+ None
60
+ """
61
+
62
+ @abstractmethod
63
+ def get_datapoints(self, document_id: str, last_d: int) -> tuple[Image, ...]:
64
+ """
65
+ Retrieve up to `last_d` most recently stored datapoints for the given document.
66
+
67
+ Args:
68
+ document_id (str): The identifier of the document to retrieve datapoints for.
69
+ last_d (int): Maximum number of most recent datapoints to return. Must be >= 0.
70
+
71
+ Returns:
72
+ tuple[Image, ...]: A tuple of reconstructed :class:`Image` objects ordered from
73
+ newest to oldest (or an empty tuple if none exist).
74
+ """
75
+
76
+
77
+ def _set_image_keys_to_none(d: Any) -> None:
78
+ if isinstance(d, dict):
79
+ for key, value in d.items():
80
+ if key == "_image":
81
+ d[key] = None
82
+ else:
83
+ _set_image_keys_to_none(value)
84
+ elif isinstance(d, list):
85
+ for item in d:
86
+ _set_image_keys_to_none(item)
87
+
88
+
89
+ def _image_to_cache_dict(image: Image) -> dict[str, Any]:
90
+ image.remove_image_from_lower_hierarchy()
91
+ export_dict = image.as_dict()
92
+ _set_image_keys_to_none(export_dict)
93
+ return export_dict
94
+
95
+
96
+ class LocalDataPointCacheStore(DataPointCacheStore):
97
+ """
98
+ In-memory implementation of :class:`DataPointCacheStore`.
99
+
100
+ This simple store keeps a small per-document mapping of page-number -> serialized image
101
+ dictionaries and enforces a FIFO eviction policy based on ``max_pages``.
102
+
103
+ Args:
104
+ max_pages (int): Maximum number of pages to keep per document. If <= 0 caching
105
+ is effectively disabled. Defaults to 3.
106
+ """
107
+
108
+ def __init__(self, max_pages: int = 3) -> None:
109
+ """
110
+ Initialize the in-memory cache store.
111
+
112
+ Args:
113
+ max_pages (int): Maximum number of pages to keep per document.
114
+ """
115
+ self._max_pages = max_pages
116
+ self._pages: dict[str, dict[int, dict[str, Any]]] = {}
117
+
118
+ def put_datapoint(self, document_id: str, image_id: str, page_number: int, image: Image) -> None:
119
+ """
120
+ Store a serialized version of ``image`` for ``document_id`` at ``page_number``.
121
+
122
+ If the number of stored pages for the document exceeds ``self._max_pages`` an eviction
123
+ of the oldest pages (lowest page numbers) will be performed.
124
+
125
+ Args:
126
+ document_id (str): Document identifier the image belongs to.
127
+ image_id (str): Image identifier (not directly used by this store but included for API
128
+ compatibility with other stores).
129
+ page_number (int): 0-based page number of the image.
130
+ image (Image): The Image object to serialize and store.
131
+ """
132
+ pages = self._pages.get(document_id)
133
+ if pages is None:
134
+ pages = {}
135
+ self._pages[document_id] = pages
136
+ pages[page_number] = _image_to_cache_dict(image)
137
+ if self._max_pages > 0 and len(pages) > self._max_pages:
138
+ for k in sorted(pages.keys())[: -self._max_pages]:
139
+ pages.pop(k, None)
140
+
141
+ def get_datapoints(self, document_id: str, last_d: int) -> tuple[Image, ...]:
142
+ """
143
+ Retrieve up to ``last_d`` most recent datapoints for a document.
144
+
145
+ Args:
146
+ document_id (str): Document identifier to retrieve pages for.
147
+ last_d (int): Maximum number of pages to return. If <= 0, an empty tuple is returned.
148
+
149
+ Returns:
150
+ tuple[Image, ...]: Tuple of :class:`Image` instances reconstructed from the stored
151
+ serialized dicts ordered from newest -> oldest.
152
+ """
153
+ if last_d <= 0:
154
+ return ()
155
+ pages = self._pages.get(document_id) or {}
156
+ keys = sorted(pages.keys(), reverse=True)[:last_d]
157
+ return tuple(Image(**pages[k]) for k in keys)
158
+
159
+
36
160
  class DatapointManager:
37
161
  """
38
162
  This class provides an API for manipulating image datapoints. This includes the creation and storage of
@@ -63,6 +187,7 @@ class DatapointManager:
63
187
  model_id: Optional[str] = None,
64
188
  num_cached_datapoints: int = 0,
65
189
  remove_pixel_values_from_cache: bool = True,
190
+ cache_store: LocalDataPointCacheStore | None = None,
66
191
  ) -> None:
67
192
  self._datapoint: Optional[Image] = None
68
193
  self._cache_anns: dict[str, ImageAnnotation] = {}
@@ -75,7 +200,8 @@ class DatapointManager:
75
200
  raise ValueError("num_cached_datapoints must be >= 0")
76
201
  self.num_cached_datapoints = num_cached_datapoints
77
202
  self.remove_pixel_values_from_cache = remove_pixel_values_from_cache
78
- self._cached_datapoints: deque[Image] = deque()
203
+
204
+ self._cache_store = cache_store or LocalDataPointCacheStore(max_pages=num_cached_datapoints)
79
205
 
80
206
  def _maybe_cache_datapoint(self, image: Optional[Image]) -> None:
81
207
  if image is None:
@@ -85,12 +211,13 @@ class DatapointManager:
85
211
 
86
212
  if self.remove_pixel_values_from_cache:
87
213
  image.clear_image()
88
- image.remove_image_from_lower_hierarchy(pixel_values_only=True)
89
-
90
- self._cached_datapoints.append(image)
91
214
 
92
- while len(self._cached_datapoints) > self.num_cached_datapoints:
93
- self._cached_datapoints.popleft()
215
+ self._cache_store.put_datapoint(
216
+ document_id=image.document_id,
217
+ image_id=image.image_id,
218
+ page_number=image.page_number,
219
+ image=image,
220
+ )
94
221
 
95
222
  @property
96
223
  def datapoint(self) -> Image:
@@ -116,7 +243,6 @@ class DatapointManager:
116
243
  dp: The datapoint to set.
117
244
  """
118
245
  self._maybe_cache_datapoint(self._datapoint)
119
-
120
246
  self._datapoint = dp
121
247
  self._cache_anns = {ann.annotation_id: ann for ann in dp.get_annotation()}
122
248
  self.datapoint_is_passed = True
@@ -481,8 +607,8 @@ class DatapointManager:
481
607
  """
482
608
  if last_k < 0:
483
609
  raise ValueError("last_k must be >= 0")
484
- if last_k == 0 or not self._cached_datapoints:
610
+ if last_k == 0:
485
611
  return tuple()
486
612
 
487
- k = min(last_k, len(self._cached_datapoints))
488
- return tuple(list(self._cached_datapoints)[-k:])
613
+ doc_id = self.datapoint.document_id
614
+ return self._cache_store.get_datapoints(document_id=doc_id, last_d=last_k)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deepdoctection
3
- Version: 1.2.0
3
+ Version: 1.2.1
4
4
  Summary: Repository for Document AI - server/inference core package
5
5
  Author: Dr. Janis Meyer
6
6
  License: Apache License 2.0
File without changes
File without changes