deepdoctection 1.2.0__tar.gz → 1.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/PKG-INFO +1 -1
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/pyproject.toml +1 -1
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/__init__.py +1 -1
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/anngen.py +137 -11
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection.egg-info/PKG-INFO +1 -1
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/README.md +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/setup.cfg +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/analyzer/__init__.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/analyzer/config.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/analyzer/dd.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/analyzer/factory.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/configs/conf_dd_one.yaml +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/configs/profiles.jsonl +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/eval/__init__.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/eval/accmetric.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/eval/base.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/eval/cocometric.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/eval/eval.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/eval/registry.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/eval/tedsmetric.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/__init__.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/base.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/d2detect.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/deskew.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/doctrocr.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/hfdetr.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/hflayoutlm.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/hflm.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/model.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/pdftext.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/tessocr.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/extern/texocr.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/__init__.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/base.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/common.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/concurrency.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/doctectionpipe.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/language.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/layout.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/lm.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/order.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/refine.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/registry.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/segment.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/sub_layout.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/text.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/pipe/transform.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/py.typed +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/train/__init__.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/train/d2_frcnn_train.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/train/hf_detr_train.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/train/hf_layoutlm_train.py +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection.egg-info/SOURCES.txt +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection.egg-info/requires.txt +0 -0
- {deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection.egg-info/top_level.txt +0 -0
|
@@ -12,7 +12,7 @@ from dd_core.utils.env_info import collect_env_info
|
|
|
12
12
|
from dd_core.utils.file_utils import _LazyModule
|
|
13
13
|
from dd_core.utils.logger import LoggingRecord, logger
|
|
14
14
|
|
|
15
|
-
__version__ = "1.2.0"
|
|
15
|
+
__version__ = "1.2.1"
|
|
16
16
|
_IMPORT_STRUCTURE = {
|
|
17
17
|
"analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory", "update_cfg_from_defaults"],
|
|
18
18
|
"eval": [
|
|
@@ -18,7 +18,8 @@
|
|
|
18
18
|
"""
|
|
19
19
|
Datapoint manager
|
|
20
20
|
"""
|
|
21
|
-
|
|
21
|
+
|
|
22
|
+
from abc import ABC, abstractmethod
|
|
22
23
|
from dataclasses import asdict
|
|
23
24
|
from typing import Any, Optional, Sequence, Union
|
|
24
25
|
|
|
@@ -33,6 +34,129 @@ from dd_core.utils.object_types import ObjectTypes, RelationshipKey
|
|
|
33
34
|
from ..extern.base import DetectionResult
|
|
34
35
|
|
|
35
36
|
|
|
37
|
+
class DataPointCacheStore(ABC):
|
|
38
|
+
"""
|
|
39
|
+
Abstract interface for a datapoint cache store.
|
|
40
|
+
|
|
41
|
+
Implementations are expected to provide a mechanism to persist and retrieve recently
|
|
42
|
+
used image datapoints (pages) for a given document. This is used by the
|
|
43
|
+
:class:`DatapointManager` to keep a bounded FIFO cache of previously seen
|
|
44
|
+
datapoints.
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
@abstractmethod
|
|
48
|
+
def put_datapoint(self, document_id: str, image_id: str, page_number: int, image: Image) -> None:
|
|
49
|
+
"""
|
|
50
|
+
Persist a datapoint (image) for a specific document and page number.
|
|
51
|
+
|
|
52
|
+
Args:
|
|
53
|
+
document_id (str): The identifier of the document the image belongs to.
|
|
54
|
+
image_id (str): The unique identifier of the image.
|
|
55
|
+
page_number (int): The 0-based page number inside the document.
|
|
56
|
+
image (Image): The image object to store (may be serialized by the store).
|
|
57
|
+
|
|
58
|
+
Returns:
|
|
59
|
+
None
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
@abstractmethod
|
|
63
|
+
def get_datapoints(self, document_id: str, last_d: int) -> tuple[Image, ...]:
|
|
64
|
+
"""
|
|
65
|
+
Retrieve up to `last_d` most recently stored datapoints for the given document.
|
|
66
|
+
|
|
67
|
+
Args:
|
|
68
|
+
document_id (str): The identifier of the document to retrieve datapoints for.
|
|
69
|
+
last_d (int): Maximum number of most recent datapoints to return. Must be >= 0.
|
|
70
|
+
|
|
71
|
+
Returns:
|
|
72
|
+
tuple[Image, ...]: A tuple of reconstructed :class:`Image` objects ordered from
|
|
73
|
+
newest to oldest (or an empty tuple if none exist).
|
|
74
|
+
"""
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _set_image_keys_to_none(d: Any) -> None:
|
|
78
|
+
if isinstance(d, dict):
|
|
79
|
+
for key, value in d.items():
|
|
80
|
+
if key == "_image":
|
|
81
|
+
d[key] = None
|
|
82
|
+
else:
|
|
83
|
+
_set_image_keys_to_none(value)
|
|
84
|
+
elif isinstance(d, list):
|
|
85
|
+
for item in d:
|
|
86
|
+
_set_image_keys_to_none(item)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _image_to_cache_dict(image: Image) -> dict[str, Any]:
|
|
90
|
+
image.remove_image_from_lower_hierarchy()
|
|
91
|
+
export_dict = image.as_dict()
|
|
92
|
+
_set_image_keys_to_none(export_dict)
|
|
93
|
+
return export_dict
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class LocalDataPointCacheStore(DataPointCacheStore):
|
|
97
|
+
"""
|
|
98
|
+
In-memory implementation of :class:`DataPointCacheStore`.
|
|
99
|
+
|
|
100
|
+
This simple store keeps a small per-document mapping of page-number -> serialized image
|
|
101
|
+
dictionaries and enforces a FIFO eviction policy based on ``max_pages``.
|
|
102
|
+
|
|
103
|
+
Args:
|
|
104
|
+
max_pages (int): Maximum number of pages to keep per document. If <= 0 caching
|
|
105
|
+
is effectively disabled. Defaults to 3.
|
|
106
|
+
"""
|
|
107
|
+
|
|
108
|
+
def __init__(self, max_pages: int = 3) -> None:
|
|
109
|
+
"""
|
|
110
|
+
Initialize the in-memory cache store.
|
|
111
|
+
|
|
112
|
+
Args:
|
|
113
|
+
max_pages (int): Maximum number of pages to keep per document.
|
|
114
|
+
"""
|
|
115
|
+
self._max_pages = max_pages
|
|
116
|
+
self._pages: dict[str, dict[int, dict[str, Any]]] = {}
|
|
117
|
+
|
|
118
|
+
def put_datapoint(self, document_id: str, image_id: str, page_number: int, image: Image) -> None:
|
|
119
|
+
"""
|
|
120
|
+
Store a serialized version of ``image`` for ``document_id`` at ``page_number``.
|
|
121
|
+
|
|
122
|
+
If the number of stored pages for the document exceeds ``self._max_pages`` an eviction
|
|
123
|
+
of the oldest pages (lowest page numbers) will be performed.
|
|
124
|
+
|
|
125
|
+
Args:
|
|
126
|
+
document_id (str): Document identifier the image belongs to.
|
|
127
|
+
image_id (str): Image identifier (not directly used by this store but included for API
|
|
128
|
+
compatibility with other stores).
|
|
129
|
+
page_number (int): 0-based page number of the image.
|
|
130
|
+
image (Image): The Image object to serialize and store.
|
|
131
|
+
"""
|
|
132
|
+
pages = self._pages.get(document_id)
|
|
133
|
+
if pages is None:
|
|
134
|
+
pages = {}
|
|
135
|
+
self._pages[document_id] = pages
|
|
136
|
+
pages[page_number] = _image_to_cache_dict(image)
|
|
137
|
+
if self._max_pages > 0 and len(pages) > self._max_pages:
|
|
138
|
+
for k in sorted(pages.keys())[: -self._max_pages]:
|
|
139
|
+
pages.pop(k, None)
|
|
140
|
+
|
|
141
|
+
def get_datapoints(self, document_id: str, last_d: int) -> tuple[Image, ...]:
|
|
142
|
+
"""
|
|
143
|
+
Retrieve up to ``last_d`` most recent datapoints for a document.
|
|
144
|
+
|
|
145
|
+
Args:
|
|
146
|
+
document_id (str): Document identifier to retrieve pages for.
|
|
147
|
+
last_d (int): Maximum number of pages to return. If <= 0, an empty tuple is returned.
|
|
148
|
+
|
|
149
|
+
Returns:
|
|
150
|
+
tuple[Image, ...]: Tuple of :class:`Image` instances reconstructed from the stored
|
|
151
|
+
serialized dicts ordered from newest -> oldest.
|
|
152
|
+
"""
|
|
153
|
+
if last_d <= 0:
|
|
154
|
+
return ()
|
|
155
|
+
pages = self._pages.get(document_id) or {}
|
|
156
|
+
keys = sorted(pages.keys(), reverse=True)[:last_d]
|
|
157
|
+
return tuple(Image(**pages[k]) for k in keys)
|
|
158
|
+
|
|
159
|
+
|
|
36
160
|
class DatapointManager:
|
|
37
161
|
"""
|
|
38
162
|
This class provides an API for manipulating image datapoints. This includes the creation and storage of
|
|
@@ -63,6 +187,7 @@ class DatapointManager:
|
|
|
63
187
|
model_id: Optional[str] = None,
|
|
64
188
|
num_cached_datapoints: int = 0,
|
|
65
189
|
remove_pixel_values_from_cache: bool = True,
|
|
190
|
+
cache_store: LocalDataPointCacheStore | None = None,
|
|
66
191
|
) -> None:
|
|
67
192
|
self._datapoint: Optional[Image] = None
|
|
68
193
|
self._cache_anns: dict[str, ImageAnnotation] = {}
|
|
@@ -75,7 +200,8 @@ class DatapointManager:
|
|
|
75
200
|
raise ValueError("num_cached_datapoints must be >= 0")
|
|
76
201
|
self.num_cached_datapoints = num_cached_datapoints
|
|
77
202
|
self.remove_pixel_values_from_cache = remove_pixel_values_from_cache
|
|
78
|
-
|
|
203
|
+
|
|
204
|
+
self._cache_store = cache_store or LocalDataPointCacheStore(max_pages=num_cached_datapoints)
|
|
79
205
|
|
|
80
206
|
def _maybe_cache_datapoint(self, image: Optional[Image]) -> None:
|
|
81
207
|
if image is None:
|
|
@@ -85,12 +211,13 @@ class DatapointManager:
|
|
|
85
211
|
|
|
86
212
|
if self.remove_pixel_values_from_cache:
|
|
87
213
|
image.clear_image()
|
|
88
|
-
image.remove_image_from_lower_hierarchy(pixel_values_only=True)
|
|
89
|
-
|
|
90
|
-
self._cached_datapoints.append(image)
|
|
91
214
|
|
|
92
|
-
|
|
93
|
-
|
|
215
|
+
self._cache_store.put_datapoint(
|
|
216
|
+
document_id=image.document_id,
|
|
217
|
+
image_id=image.image_id,
|
|
218
|
+
page_number=image.page_number,
|
|
219
|
+
image=image,
|
|
220
|
+
)
|
|
94
221
|
|
|
95
222
|
@property
|
|
96
223
|
def datapoint(self) -> Image:
|
|
@@ -116,7 +243,6 @@ class DatapointManager:
|
|
|
116
243
|
dp: The datapoint to set.
|
|
117
244
|
"""
|
|
118
245
|
self._maybe_cache_datapoint(self._datapoint)
|
|
119
|
-
|
|
120
246
|
self._datapoint = dp
|
|
121
247
|
self._cache_anns = {ann.annotation_id: ann for ann in dp.get_annotation()}
|
|
122
248
|
self.datapoint_is_passed = True
|
|
@@ -481,8 +607,8 @@ class DatapointManager:
|
|
|
481
607
|
"""
|
|
482
608
|
if last_k < 0:
|
|
483
609
|
raise ValueError("last_k must be >= 0")
|
|
484
|
-
if last_k == 0
|
|
610
|
+
if last_k == 0:
|
|
485
611
|
return tuple()
|
|
486
612
|
|
|
487
|
-
|
|
488
|
-
return
|
|
613
|
+
doc_id = self.datapoint.document_id
|
|
614
|
+
return self._cache_store.get_datapoints(document_id=doc_id, last_d=last_k)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection/configs/conf_tesseract.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{deepdoctection-1.2.0 → deepdoctection-1.2.1}/src/deepdoctection.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|