deepdoctection 1.2.1__tar.gz → 1.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/PKG-INFO +1 -1
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/pyproject.toml +1 -1
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/__init__.py +1 -1
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/anngen.py +43 -19
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/base.py +89 -70
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/common.py +1 -1
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/doctectionpipe.py +1 -2
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection.egg-info/PKG-INFO +1 -1
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/README.md +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/setup.cfg +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/analyzer/__init__.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/analyzer/config.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/analyzer/dd.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/analyzer/factory.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/configs/conf_dd_one.yaml +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/configs/profiles.jsonl +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/eval/__init__.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/eval/accmetric.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/eval/base.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/eval/cocometric.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/eval/eval.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/eval/registry.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/eval/tedsmetric.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/__init__.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/base.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/d2detect.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/deskew.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/doctrocr.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/hfdetr.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/hflayoutlm.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/hflm.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/model.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/pdftext.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/tessocr.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/texocr.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/__init__.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/concurrency.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/language.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/layout.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/lm.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/order.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/refine.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/registry.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/segment.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/sub_layout.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/text.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/transform.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/py.typed +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/train/__init__.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/train/d2_frcnn_train.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/train/hf_detr_train.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/train/hf_layoutlm_train.py +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection.egg-info/SOURCES.txt +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection.egg-info/requires.txt +0 -0
- {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection.egg-info/top_level.txt +0 -0
|
@@ -12,7 +12,7 @@ from dd_core.utils.env_info import collect_env_info
|
|
|
12
12
|
from dd_core.utils.file_utils import _LazyModule
|
|
13
13
|
from dd_core.utils.logger import LoggingRecord, logger
|
|
14
14
|
|
|
15
|
-
__version__ = "1.2.1"
|
|
15
|
+
__version__ = "1.2.3"
|
|
16
16
|
_IMPORT_STRUCTURE = {
|
|
17
17
|
"analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory", "update_cfg_from_defaults"],
|
|
18
18
|
"eval": [
|
|
@@ -45,28 +45,31 @@ class DataPointCacheStore(ABC):
|
|
|
45
45
|
"""
|
|
46
46
|
|
|
47
47
|
@abstractmethod
|
|
48
|
-
def put_datapoint(self, document_id: str, image_id: str, page_number: int, image: Image) -> None:
|
|
48
|
+
def put_datapoint(self, document_id: str, page_number: int, image: Image, job_id: str | None = None) -> None:
|
|
49
49
|
"""
|
|
50
50
|
Persist a datapoint (image) for a specific document and page number.
|
|
51
51
|
|
|
52
52
|
Args:
|
|
53
53
|
document_id (str): The identifier of the document the image belongs to.
|
|
54
|
-
image_id (str): The unique identifier of the image.
|
|
55
54
|
page_number (int): The 0-based page number inside the document.
|
|
56
55
|
image (Image): The image object to store (may be serialized by the store).
|
|
56
|
+
job_id (str | None): Optional job identifier to distinguish between different processing runs.
|
|
57
|
+
If None, caching key remains unchanged (backward compatible).
|
|
57
58
|
|
|
58
59
|
Returns:
|
|
59
60
|
None
|
|
60
61
|
"""
|
|
61
62
|
|
|
62
63
|
@abstractmethod
|
|
63
|
-
def get_datapoints(self, document_id: str, last_d: int) -> tuple[Image, ...]:
|
|
64
|
+
def get_datapoints(self, document_id: str, last_d: int, job_id: str | None = None) -> tuple[Image, ...]:
|
|
64
65
|
"""
|
|
65
66
|
Retrieve up to `last_d` most recently stored datapoints for the given document.
|
|
66
67
|
|
|
67
68
|
Args:
|
|
68
69
|
document_id (str): The identifier of the document to retrieve datapoints for.
|
|
69
70
|
last_d (int): Maximum number of most recent datapoints to return. Must be >= 0.
|
|
71
|
+
job_id (str | None): Optional job identifier to retrieve datapoints from a specific processing run.
|
|
72
|
+
If None, retrieves datapoints for the document without job distinction (backward compatible).
|
|
70
73
|
|
|
71
74
|
Returns:
|
|
72
75
|
tuple[Image, ...]: A tuple of reconstructed :class:`Image` objects ordered from
|
|
@@ -115,7 +118,23 @@ class LocalDataPointCacheStore(DataPointCacheStore):
|
|
|
115
118
|
self._max_pages = max_pages
|
|
116
119
|
self._pages: dict[str, dict[int, dict[str, Any]]] = {}
|
|
117
120
|
|
|
118
|
-
def put_datapoint(self, document_id: str, image_id: str, page_number: int, image: Image) -> None:
|
|
121
|
+
def _get_cache_key(self, document_id: str, job_id: str | None) -> str:
|
|
122
|
+
"""
|
|
123
|
+
Generate cache key, distinguishing by job_id if present.
|
|
124
|
+
|
|
125
|
+
Args:
|
|
126
|
+
document_id (str): The document identifier.
|
|
127
|
+
job_id (str | None): Optional job identifier.
|
|
128
|
+
|
|
129
|
+
Returns:
|
|
130
|
+
str: Cache key. If job_id is None, returns document_id unchanged.
|
|
131
|
+
If job_id is provided, returns "document_id::job_id".
|
|
132
|
+
"""
|
|
133
|
+
if job_id is None:
|
|
134
|
+
return document_id
|
|
135
|
+
return f"{document_id}::{job_id}"
|
|
136
|
+
|
|
137
|
+
def put_datapoint(self, document_id: str, page_number: int, image: Image, job_id: str | None = None) -> None:
|
|
119
138
|
"""
|
|
120
139
|
Store a serialized version of ``image`` for ``document_id`` at ``page_number``.
|
|
121
140
|
|
|
@@ -124,27 +143,28 @@ class LocalDataPointCacheStore(DataPointCacheStore):
|
|
|
124
143
|
|
|
125
144
|
Args:
|
|
126
145
|
document_id (str): Document identifier the image belongs to.
|
|
127
|
-
image_id (str): Image identifier (not directly used by this store but included for API
|
|
128
|
-
compatibility with other stores).
|
|
129
146
|
page_number (int): 0-based page number of the image.
|
|
130
147
|
image (Image): The Image object to serialize and store.
|
|
148
|
+
job_id (str | None): Optional job identifier to distinguish between different processing runs.
|
|
131
149
|
"""
|
|
132
|
-
|
|
150
|
+
cache_key = self._get_cache_key(document_id, job_id)
|
|
151
|
+
pages = self._pages.get(cache_key)
|
|
133
152
|
if pages is None:
|
|
134
153
|
pages = {}
|
|
135
|
-
self._pages[document_id] = pages
|
|
154
|
+
self._pages[cache_key] = pages
|
|
136
155
|
pages[page_number] = _image_to_cache_dict(image)
|
|
137
156
|
if self._max_pages > 0 and len(pages) > self._max_pages:
|
|
138
157
|
for k in sorted(pages.keys())[: -self._max_pages]:
|
|
139
158
|
pages.pop(k, None)
|
|
140
159
|
|
|
141
|
-
def get_datapoints(self, document_id: str, last_d: int) -> tuple[Image, ...]:
|
|
160
|
+
def get_datapoints(self, document_id: str, last_d: int, job_id: str | None = None) -> tuple[Image, ...]:
|
|
142
161
|
"""
|
|
143
162
|
Retrieve up to ``last_d`` most recent datapoints for a document.
|
|
144
163
|
|
|
145
164
|
Args:
|
|
146
165
|
document_id (str): Document identifier to retrieve pages for.
|
|
147
166
|
last_d (int): Maximum number of pages to return. If <= 0, an empty tuple is returned.
|
|
167
|
+
job_id (str | None): Optional job identifier to retrieve datapoints from a specific processing run.
|
|
148
168
|
|
|
149
169
|
Returns:
|
|
150
170
|
tuple[Image, ...]: Tuple of :class:`Image` instances reconstructed from the stored
|
|
@@ -152,7 +172,8 @@ class LocalDataPointCacheStore(DataPointCacheStore):
|
|
|
152
172
|
"""
|
|
153
173
|
if last_d <= 0:
|
|
154
174
|
return ()
|
|
155
|
-
|
|
175
|
+
cache_key = self._get_cache_key(document_id, job_id)
|
|
176
|
+
pages = self._pages.get(cache_key) or {}
|
|
156
177
|
keys = sorted(pages.keys(), reverse=True)[:last_d]
|
|
157
178
|
return tuple(Image(**pages[k]) for k in keys)
|
|
158
179
|
|
|
@@ -194,7 +215,6 @@ class DatapointManager:
|
|
|
194
215
|
self.datapoint_is_passed: bool = False
|
|
195
216
|
self.service_id = service_id
|
|
196
217
|
self.model_id = model_id
|
|
197
|
-
self.session_id: Optional[str] = None
|
|
198
218
|
|
|
199
219
|
if num_cached_datapoints < 0:
|
|
200
220
|
raise ValueError("num_cached_datapoints must be >= 0")
|
|
@@ -203,7 +223,17 @@ class DatapointManager:
|
|
|
203
223
|
|
|
204
224
|
self._cache_store = cache_store or LocalDataPointCacheStore(max_pages=num_cached_datapoints)
|
|
205
225
|
|
|
206
|
-
def _maybe_cache_datapoint(self, image: Optional[Image]) -> None:
|
|
226
|
+
def maybe_cache_datapoint(self, image: Optional[Image], job_id: str | None = None) -> None:
|
|
227
|
+
"""
|
|
228
|
+
Cache the given datapoint if caching is enabled.
|
|
229
|
+
|
|
230
|
+
This should be called when a datapoint leaves the component to ensure it is cached.
|
|
231
|
+
|
|
232
|
+
Args:
|
|
233
|
+
image: The image datapoint to cache, or None to skip caching.
|
|
234
|
+
job_id: Optional job identifier to distinguish caches between different processing runs.
|
|
235
|
+
If None, caching key remains unchanged (backward compatible).
|
|
236
|
+
"""
|
|
207
237
|
if image is None:
|
|
208
238
|
return
|
|
209
239
|
if self.num_cached_datapoints <= 0:
|
|
@@ -214,9 +244,9 @@ class DatapointManager:
|
|
|
214
244
|
|
|
215
245
|
self._cache_store.put_datapoint(
|
|
216
246
|
document_id=image.document_id,
|
|
217
|
-
image_id=image.image_id,
|
|
218
247
|
page_number=image.page_number,
|
|
219
248
|
image=image,
|
|
249
|
+
job_id=job_id,
|
|
220
250
|
)
|
|
221
251
|
|
|
222
252
|
@property
|
|
@@ -242,7 +272,6 @@ class DatapointManager:
|
|
|
242
272
|
Args:
|
|
243
273
|
dp: The datapoint to set.
|
|
244
274
|
"""
|
|
245
|
-
self._maybe_cache_datapoint(self._datapoint)
|
|
246
275
|
self._datapoint = dp
|
|
247
276
|
self._cache_anns = {ann.annotation_id: ann for ann in dp.get_annotation()}
|
|
248
277
|
self.datapoint_is_passed = True
|
|
@@ -329,7 +358,6 @@ class DatapointManager:
|
|
|
329
358
|
score=detect_result.score,
|
|
330
359
|
service_id=self.service_id,
|
|
331
360
|
model_id=self.model_id,
|
|
332
|
-
session_id=self.session_id,
|
|
333
361
|
)
|
|
334
362
|
if to_annotation_id is not None:
|
|
335
363
|
parent_ann = self._cache_anns[to_annotation_id]
|
|
@@ -406,7 +434,6 @@ class DatapointManager:
|
|
|
406
434
|
score=score,
|
|
407
435
|
service_id=self.service_id,
|
|
408
436
|
model_id=self.model_id,
|
|
409
|
-
session_id=self.session_id,
|
|
410
437
|
)
|
|
411
438
|
self._cache_anns[annotation_id].dump_sub_category(sub_cat_key, cat_ann)
|
|
412
439
|
if annotation_context.context_error:
|
|
@@ -454,7 +481,6 @@ class DatapointManager:
|
|
|
454
481
|
score=score,
|
|
455
482
|
service_id=self.service_id,
|
|
456
483
|
model_id=self.model_id,
|
|
457
|
-
session_id=self.session_id,
|
|
458
484
|
)
|
|
459
485
|
self._cache_anns[annotation_id].dump_sub_category(sub_cat_key, cont_ann)
|
|
460
486
|
if annotation_context.context_error:
|
|
@@ -542,7 +568,6 @@ class DatapointManager:
|
|
|
542
568
|
score=summary_score,
|
|
543
569
|
service_id=self.service_id,
|
|
544
570
|
model_id=self.model_id,
|
|
545
|
-
session_id=self.session_id,
|
|
546
571
|
)
|
|
547
572
|
else:
|
|
548
573
|
ann = CategoryAnnotation(
|
|
@@ -551,7 +576,6 @@ class DatapointManager:
|
|
|
551
576
|
score=summary_score,
|
|
552
577
|
service_id=self.service_id,
|
|
553
578
|
model_id=self.model_id,
|
|
554
|
-
session_id=self.session_id,
|
|
555
579
|
)
|
|
556
580
|
image.summary.dump_sub_category(summary_key, ann, image.image_id)
|
|
557
581
|
|
|
@@ -24,7 +24,6 @@ from __future__ import annotations
|
|
|
24
24
|
from abc import ABC, abstractmethod
|
|
25
25
|
from collections import defaultdict
|
|
26
26
|
from typing import Any, Callable, Mapping, Optional, Union
|
|
27
|
-
from uuid import uuid1
|
|
28
27
|
|
|
29
28
|
from dd_core.dataflow import DataFlow, MapData
|
|
30
29
|
from dd_core.datapoint.image import Image, MetaAnnotation
|
|
@@ -126,7 +125,7 @@ class PipelineComponent(ABC):
|
|
|
126
125
|
if not self.filter_func(dp):
|
|
127
126
|
self.serve(dp)
|
|
128
127
|
|
|
129
|
-
def pass_datapoint(self, dp: Image) -> Image:
|
|
128
|
+
def pass_datapoint(self, dp: Image, job_id: str | None = None) -> Image:
|
|
130
129
|
"""
|
|
131
130
|
Acceptance, handover to `dp_manager`, transformation and forwarding of `dp`.
|
|
132
131
|
|
|
@@ -134,6 +133,8 @@ class PipelineComponent(ABC):
|
|
|
134
133
|
|
|
135
134
|
Args:
|
|
136
135
|
dp: Datapoint.
|
|
136
|
+
job_id: Optional job identifier to distinguish caches between different processing runs.
|
|
137
|
+
When None, caching behavior is backward compatible (no job distinction).
|
|
137
138
|
|
|
138
139
|
Returns:
|
|
139
140
|
Datapoint.
|
|
@@ -143,6 +144,9 @@ class PipelineComponent(ABC):
|
|
|
143
144
|
self._pass_datapoint(dp)
|
|
144
145
|
else:
|
|
145
146
|
self._pass_datapoint(dp)
|
|
147
|
+
|
|
148
|
+
self.dp_manager.maybe_cache_datapoint(self.dp_manager.datapoint, job_id=job_id)
|
|
149
|
+
|
|
146
150
|
return self.dp_manager.datapoint
|
|
147
151
|
|
|
148
152
|
def predict_dataflow(self, df: DataFlow) -> DataFlow:
|
|
@@ -255,6 +259,85 @@ class PipelineComponent(ABC):
|
|
|
255
259
|
return MapData(df, self._undo)
|
|
256
260
|
|
|
257
261
|
|
|
262
|
+
def get_meta_annotation(pipeline_component_list: list[PipelineComponent]) -> MetaAnnotation:
|
|
263
|
+
"""
|
|
264
|
+
Collects meta annotations from all pipeline components and summarizes the returned results.
|
|
265
|
+
|
|
266
|
+
Returns:
|
|
267
|
+
Meta annotations with information about image annotations (list), sub categories (dict with category
|
|
268
|
+
names and generated sub categories), relationships (dict with category names and generated relationships)
|
|
269
|
+
as well as summaries (list with sub categories).
|
|
270
|
+
"""
|
|
271
|
+
image_annotations: list[ObjectTypes] = []
|
|
272
|
+
sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = {}
|
|
273
|
+
relationships = defaultdict(set[ObjectTypes]) # type: ignore
|
|
274
|
+
summaries: list[ObjectTypes] = []
|
|
275
|
+
for component in pipeline_component_list:
|
|
276
|
+
meta_anns = component.get_meta_annotation()
|
|
277
|
+
image_annotations.extend(meta_anns.image_annotations)
|
|
278
|
+
for key, value in meta_anns.sub_categories.items():
|
|
279
|
+
sub_dict = meta_anns.sub_categories[key]
|
|
280
|
+
for sub_cat, sub_cat_value in value.items():
|
|
281
|
+
if sub_cat in sub_dict:
|
|
282
|
+
sub_dict[sub_cat].update(sub_cat_value)
|
|
283
|
+
else:
|
|
284
|
+
sub_dict[sub_cat] = {sub_cat_value} # type: ignore
|
|
285
|
+
if key in sub_categories:
|
|
286
|
+
sub_categories[key].update(sub_dict)
|
|
287
|
+
else:
|
|
288
|
+
sub_categories[key] = sub_dict
|
|
289
|
+
for key, value in meta_anns.relationships.items(): # type: ignore
|
|
290
|
+
relationships[key].update(value)
|
|
291
|
+
summaries.extend(meta_anns.summaries)
|
|
292
|
+
return MetaAnnotation(
|
|
293
|
+
image_annotations=tuple(image_annotations),
|
|
294
|
+
sub_categories=sub_categories,
|
|
295
|
+
relationships=relationships,
|
|
296
|
+
summaries=tuple(summaries),
|
|
297
|
+
)
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def get_service_id_to_meta_annotation(pipeline_component_list: list[PipelineComponent]) -> Mapping[str, MetaAnnotation]:
|
|
301
|
+
"""
|
|
302
|
+
Collects meta annotations from all pipeline components and return a dict of service id to its meta annotation.
|
|
303
|
+
|
|
304
|
+
Returns:
|
|
305
|
+
`service_id` to `MetaAnnotation` with information about image annotations (list), sub categories (dict with
|
|
306
|
+
category names and generated sub categories), relationships (dict with category names and generated
|
|
307
|
+
relationships) as well as summaries (list with sub categories).
|
|
308
|
+
"""
|
|
309
|
+
service_id_to_meta_annotation = {}
|
|
310
|
+
for component in pipeline_component_list:
|
|
311
|
+
meta_anns = component.get_meta_annotation()
|
|
312
|
+
service_id_to_meta_annotation[component.service_id] = meta_anns
|
|
313
|
+
return service_id_to_meta_annotation
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def get_pipeline_info(
|
|
317
|
+
pipeline_component_list: list[PipelineComponent], service_id: str | None = None, name: str | None = None
|
|
318
|
+
) -> Union[str, Mapping[str, str]]:
|
|
319
|
+
"""
|
|
320
|
+
Get pipeline information.
|
|
321
|
+
|
|
322
|
+
Returns a dictionary with a description of each pipeline component.
|
|
323
|
+
|
|
324
|
+
Args:
|
|
325
|
+
service_id: Service id of the pipeline component to search for.
|
|
326
|
+
name: Name of the pipeline component to search for.
|
|
327
|
+
|
|
328
|
+
Returns:
|
|
329
|
+
Either a full dictionary with position and name of all pipeline components or the name, if
|
|
330
|
+
the position has been passed or the position if the name has been passed.
|
|
331
|
+
"""
|
|
332
|
+
comp_info = {comp.service_id: comp.name for comp in pipeline_component_list}
|
|
333
|
+
comp_info_name_as_key = {value: key for key, value in comp_info.items()}
|
|
334
|
+
if service_id is not None:
|
|
335
|
+
return comp_info[service_id]
|
|
336
|
+
if name is not None:
|
|
337
|
+
return comp_info_name_as_key[name]
|
|
338
|
+
return comp_info
|
|
339
|
+
|
|
340
|
+
|
|
258
341
|
class Pipeline(ABC):
|
|
259
342
|
"""
|
|
260
343
|
Abstract base class for creating pipelines.
|
|
@@ -286,19 +369,6 @@ class Pipeline(ABC):
|
|
|
286
369
|
core model or already processed further).
|
|
287
370
|
|
|
288
371
|
In addition to `analyze`, the internal `_entry` is used to bundle preprocessing steps.
|
|
289
|
-
|
|
290
|
-
It is possible to set a session id for the pipeline. This is useful for logging purposes. The session id can be
|
|
291
|
-
either passed to the pipeline via the `analyze` method or generated automatically.
|
|
292
|
-
|
|
293
|
-
To generate a `session_id` automatically:
|
|
294
|
-
|
|
295
|
-
Example:
|
|
296
|
-
```python
|
|
297
|
-
pipe = MyPipeline(pipeline_component = [layout, text])
|
|
298
|
-
pipe.set_session_id = True
|
|
299
|
-
|
|
300
|
-
df = pipe.analyze(input = "path/to/dir") # session_id is generated automatically
|
|
301
|
-
```
|
|
302
372
|
"""
|
|
303
373
|
|
|
304
374
|
def __init__(self, pipeline_component_list: list[PipelineComponent]) -> None:
|
|
@@ -309,7 +379,6 @@ class Pipeline(ABC):
|
|
|
309
379
|
pipeline_component_list: A list of pipeline components.
|
|
310
380
|
"""
|
|
311
381
|
self.pipe_component_list = pipeline_component_list
|
|
312
|
-
self.set_session_id = False
|
|
313
382
|
|
|
314
383
|
@abstractmethod
|
|
315
384
|
def _entry(self, **kwargs: Any) -> DataFlow:
|
|
@@ -380,22 +449,18 @@ class Pipeline(ABC):
|
|
|
380
449
|
"""
|
|
381
450
|
raise NotImplementedError()
|
|
382
451
|
|
|
383
|
-
def _build_pipe(self, df: DataFlow, session_id: Optional[str] = None) -> DataFlow:
|
|
452
|
+
def _build_pipe(self, df: DataFlow) -> DataFlow:
|
|
384
453
|
"""
|
|
385
454
|
Composition of the backbone.
|
|
386
455
|
|
|
387
456
|
Args:
|
|
388
457
|
df: The input dataflow.
|
|
389
|
-
session_id: Optional session id.
|
|
390
458
|
|
|
391
459
|
Returns:
|
|
392
460
|
The processed dataflow.
|
|
393
461
|
"""
|
|
394
|
-
if session_id is None and self.set_session_id:
|
|
395
|
-
session_id = self.get_session_id()
|
|
396
462
|
for component in self.pipe_component_list:
|
|
397
463
|
component.timer_on = True
|
|
398
|
-
component.dp_manager.session_id = session_id
|
|
399
464
|
df = component.predict_dataflow(df)
|
|
400
465
|
return df
|
|
401
466
|
|
|
@@ -408,33 +473,7 @@ class Pipeline(ABC):
|
|
|
408
473
|
names and generated sub categories), relationships (dict with category names and generated relationships)
|
|
409
474
|
as well as summaries (list with sub categories).
|
|
410
475
|
"""
|
|
411
|
-
|
|
412
|
-
sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = {}
|
|
413
|
-
relationships = defaultdict(set[ObjectTypes]) # type: ignore
|
|
414
|
-
summaries: list[ObjectTypes] = []
|
|
415
|
-
for component in self.pipe_component_list:
|
|
416
|
-
meta_anns = component.get_meta_annotation()
|
|
417
|
-
image_annotations.extend(meta_anns.image_annotations)
|
|
418
|
-
for key, value in meta_anns.sub_categories.items():
|
|
419
|
-
sub_dict = meta_anns.sub_categories[key]
|
|
420
|
-
for sub_cat, sub_cat_value in value.items():
|
|
421
|
-
if sub_cat in sub_dict:
|
|
422
|
-
sub_dict[sub_cat].update(sub_cat_value)
|
|
423
|
-
else:
|
|
424
|
-
sub_dict[sub_cat] = {sub_cat_value} # type: ignore
|
|
425
|
-
if key in sub_categories:
|
|
426
|
-
sub_categories[key].update(sub_dict)
|
|
427
|
-
else:
|
|
428
|
-
sub_categories[key] = sub_dict
|
|
429
|
-
for key, value in meta_anns.relationships.items(): # type: ignore
|
|
430
|
-
relationships[key].update(value)
|
|
431
|
-
summaries.extend(meta_anns.summaries)
|
|
432
|
-
return MetaAnnotation(
|
|
433
|
-
image_annotations=tuple(image_annotations),
|
|
434
|
-
sub_categories=dict(sub_categories),
|
|
435
|
-
relationships=dict(relationships),
|
|
436
|
-
summaries=tuple(summaries),
|
|
437
|
-
)
|
|
476
|
+
return get_meta_annotation(self.pipe_component_list)
|
|
438
477
|
|
|
439
478
|
def get_service_id_to_meta_annotation(self) -> Mapping[str, MetaAnnotation]:
|
|
440
479
|
"""
|
|
@@ -445,11 +484,7 @@ class Pipeline(ABC):
|
|
|
445
484
|
category names and generated sub categories), relationships (dict with category names and generated
|
|
446
485
|
relationships) as well as summaries (list with sub categories).
|
|
447
486
|
"""
|
|
448
|
-
|
|
449
|
-
for component in self.pipe_component_list:
|
|
450
|
-
meta_anns = component.get_meta_annotation()
|
|
451
|
-
service_id_to_meta_annotation[component.service_id] = meta_anns
|
|
452
|
-
return service_id_to_meta_annotation
|
|
487
|
+
return get_service_id_to_meta_annotation(self.pipe_component_list)
|
|
453
488
|
|
|
454
489
|
def get_pipeline_info(
|
|
455
490
|
self, service_id: Optional[str] = None, name: Optional[str] = None
|
|
@@ -467,13 +502,7 @@ class Pipeline(ABC):
|
|
|
467
502
|
Either a full dictionary with position and name of all pipeline components or the name, if
|
|
468
503
|
the position has been passed or the position if the name has been passed.
|
|
469
504
|
"""
|
|
470
|
-
|
|
471
|
-
comp_info_name_as_key = {value: key for key, value in comp_info.items()}
|
|
472
|
-
if service_id is not None:
|
|
473
|
-
return comp_info[service_id]
|
|
474
|
-
if name is not None:
|
|
475
|
-
return comp_info_name_as_key[name]
|
|
476
|
-
return comp_info
|
|
505
|
+
return get_pipeline_info(self.pipe_component_list, service_id=service_id, name=name)
|
|
477
506
|
|
|
478
507
|
def get_pipeline_component(self, service_id: Optional[str] = None, name: Optional[str] = None) -> PipelineComponent:
|
|
479
508
|
"""
|
|
@@ -490,13 +519,3 @@ class Pipeline(ABC):
|
|
|
490
519
|
if comp.service_id == service_id or comp.name == name:
|
|
491
520
|
return comp
|
|
492
521
|
raise ValueError(f"Pipeline component not found with service_id={service_id} or name={name}")
|
|
493
|
-
|
|
494
|
-
@staticmethod
|
|
495
|
-
def get_session_id() -> str:
|
|
496
|
-
"""
|
|
497
|
-
Get the generating a session id.
|
|
498
|
-
|
|
499
|
-
Returns:
|
|
500
|
-
The session id as a string.
|
|
501
|
-
"""
|
|
502
|
-
return str(uuid1())[:8]
|
|
@@ -406,7 +406,7 @@ class PageParsingService(PipelineComponent):
|
|
|
406
406
|
def serve(self, dp: Image) -> None:
|
|
407
407
|
raise NotImplementedError("PageParsingService is not meant to be used in serve method")
|
|
408
408
|
|
|
409
|
-
def pass_datapoint(self, dp: Image) -> Page: # type:ignore
|
|
409
|
+
def pass_datapoint(self, dp: Image, job_id: str | None = None) -> Page: # type:ignore
|
|
410
410
|
"""
|
|
411
411
|
Converts `Image` to `Page`.
|
|
412
412
|
|
|
@@ -386,10 +386,9 @@ class DoctectionPipe(Pipeline):
|
|
|
386
386
|
"""
|
|
387
387
|
|
|
388
388
|
output = kwargs.get("output", "page")
|
|
389
|
-
session_id = kwargs.get("session_id")
|
|
390
389
|
assert output in ("page", "image", "dict"), "output must be either page image or dict"
|
|
391
390
|
df = self._entry(**kwargs)
|
|
392
|
-
df = self._build_pipe(df, session_id=session_id)
|
|
391
|
+
df = self._build_pipe(df)
|
|
393
392
|
if output == "page":
|
|
394
393
|
df = self.dataflow_to_page(df)
|
|
395
394
|
elif output == "dict":
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/configs/conf_tesseract.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|