deepdoctection 1.2.1__tar.gz → 1.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/PKG-INFO +1 -1
  2. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/pyproject.toml +1 -1
  3. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/__init__.py +1 -1
  4. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/anngen.py +43 -19
  5. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/base.py +89 -70
  6. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/common.py +1 -1
  7. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/doctectionpipe.py +1 -2
  8. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection.egg-info/PKG-INFO +1 -1
  9. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/README.md +0 -0
  10. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/setup.cfg +0 -0
  11. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/analyzer/__init__.py +0 -0
  12. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/analyzer/config.py +0 -0
  13. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/analyzer/dd.py +0 -0
  14. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/analyzer/factory.py +0 -0
  15. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/configs/__init__.py +0 -0
  16. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/configs/conf_dd_one.yaml +0 -0
  17. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/configs/conf_tesseract.yaml +0 -0
  18. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/configs/profiles.jsonl +0 -0
  19. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/eval/__init__.py +0 -0
  20. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/eval/accmetric.py +0 -0
  21. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/eval/base.py +0 -0
  22. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/eval/cocometric.py +0 -0
  23. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/eval/eval.py +0 -0
  24. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/eval/registry.py +0 -0
  25. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/eval/tedsmetric.py +0 -0
  26. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/__init__.py +0 -0
  27. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/base.py +0 -0
  28. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/d2detect.py +0 -0
  29. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/deskew.py +0 -0
  30. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/doctrocr.py +0 -0
  31. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/hfdetr.py +0 -0
  32. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/hflayoutlm.py +0 -0
  33. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/hflm.py +0 -0
  34. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/model.py +0 -0
  35. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/pdftext.py +0 -0
  36. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/tessocr.py +0 -0
  37. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/extern/texocr.py +0 -0
  38. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/__init__.py +0 -0
  39. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/concurrency.py +0 -0
  40. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/language.py +0 -0
  41. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/layout.py +0 -0
  42. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/lm.py +0 -0
  43. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/order.py +0 -0
  44. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/refine.py +0 -0
  45. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/registry.py +0 -0
  46. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/segment.py +0 -0
  47. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/sub_layout.py +0 -0
  48. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/text.py +0 -0
  49. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/pipe/transform.py +0 -0
  50. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/py.typed +0 -0
  51. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/train/__init__.py +0 -0
  52. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/train/d2_frcnn_train.py +0 -0
  53. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/train/hf_detr_train.py +0 -0
  54. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection/train/hf_layoutlm_train.py +0 -0
  55. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection.egg-info/SOURCES.txt +0 -0
  56. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection.egg-info/dependency_links.txt +0 -0
  57. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection.egg-info/requires.txt +0 -0
  58. {deepdoctection-1.2.1 → deepdoctection-1.2.3}/src/deepdoctection.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deepdoctection
3
- Version: 1.2.1
3
+ Version: 1.2.3
4
4
  Summary: Repository for Document AI - server/inference core package
5
5
  Author: Dr. Janis Meyer
6
6
  License: Apache License 2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "deepdoctection"
7
- version = "1.2.1"
7
+ version = "1.2.3"
8
8
  authors = [
9
9
  {name = "Dr. Janis Meyer"}
10
10
  ]
@@ -12,7 +12,7 @@ from dd_core.utils.env_info import collect_env_info
12
12
  from dd_core.utils.file_utils import _LazyModule
13
13
  from dd_core.utils.logger import LoggingRecord, logger
14
14
 
15
- __version__ = "1.2.1"
15
+ __version__ = "1.2.3"
16
16
  _IMPORT_STRUCTURE = {
17
17
  "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory", "update_cfg_from_defaults"],
18
18
  "eval": [
@@ -45,28 +45,31 @@ class DataPointCacheStore(ABC):
45
45
  """
46
46
 
47
47
  @abstractmethod
48
- def put_datapoint(self, document_id: str, image_id: str, page_number: int, image: Image) -> None:
48
+ def put_datapoint(self, document_id: str, page_number: int, image: Image, job_id: str | None = None) -> None:
49
49
  """
50
50
  Persist a datapoint (image) for a specific document and page number.
51
51
 
52
52
  Args:
53
53
  document_id (str): The identifier of the document the image belongs to.
54
- image_id (str): The unique identifier of the image.
55
54
  page_number (int): The 0-based page number inside the document.
56
55
  image (Image): The image object to store (may be serialized by the store).
56
+ job_id (str | None): Optional job identifier to distinguish between different processing runs.
57
+ If None, caching key remains unchanged (backward compatible).
57
58
 
58
59
  Returns:
59
60
  None
60
61
  """
61
62
 
62
63
  @abstractmethod
63
- def get_datapoints(self, document_id: str, last_d: int) -> tuple[Image, ...]:
64
+ def get_datapoints(self, document_id: str, last_d: int, job_id: str | None = None) -> tuple[Image, ...]:
64
65
  """
65
66
  Retrieve up to `last_d` most recently stored datapoints for the given document.
66
67
 
67
68
  Args:
68
69
  document_id (str): The identifier of the document to retrieve datapoints for.
69
70
  last_d (int): Maximum number of most recent datapoints to return. Must be >= 0.
71
+ job_id (str | None): Optional job identifier to retrieve datapoints from a specific processing run.
72
+ If None, retrieves datapoints for the document without job distinction (backward compatible).
70
73
 
71
74
  Returns:
72
75
  tuple[Image, ...]: A tuple of reconstructed :class:`Image` objects ordered from
@@ -115,7 +118,23 @@ class LocalDataPointCacheStore(DataPointCacheStore):
115
118
  self._max_pages = max_pages
116
119
  self._pages: dict[str, dict[int, dict[str, Any]]] = {}
117
120
 
118
- def put_datapoint(self, document_id: str, image_id: str, page_number: int, image: Image) -> None:
121
+ def _get_cache_key(self, document_id: str, job_id: str | None) -> str:
122
+ """
123
+ Generate cache key, distinguishing by job_id if present.
124
+
125
+ Args:
126
+ document_id (str): The document identifier.
127
+ job_id (str | None): Optional job identifier.
128
+
129
+ Returns:
130
+ str: Cache key. If job_id is None, returns document_id unchanged.
131
+ If job_id is provided, returns "document_id::job_id".
132
+ """
133
+ if job_id is None:
134
+ return document_id
135
+ return f"{document_id}::{job_id}"
136
+
137
+ def put_datapoint(self, document_id: str, page_number: int, image: Image, job_id: str | None = None) -> None:
119
138
  """
120
139
  Store a serialized version of ``image`` for ``document_id`` at ``page_number``.
121
140
 
@@ -124,27 +143,28 @@ class LocalDataPointCacheStore(DataPointCacheStore):
124
143
 
125
144
  Args:
126
145
  document_id (str): Document identifier the image belongs to.
127
- image_id (str): Image identifier (not directly used by this store but included for API
128
- compatibility with other stores).
129
146
  page_number (int): 0-based page number of the image.
130
147
  image (Image): The Image object to serialize and store.
148
+ job_id (str | None): Optional job identifier to distinguish between different processing runs.
131
149
  """
132
- pages = self._pages.get(document_id)
150
+ cache_key = self._get_cache_key(document_id, job_id)
151
+ pages = self._pages.get(cache_key)
133
152
  if pages is None:
134
153
  pages = {}
135
- self._pages[document_id] = pages
154
+ self._pages[cache_key] = pages
136
155
  pages[page_number] = _image_to_cache_dict(image)
137
156
  if self._max_pages > 0 and len(pages) > self._max_pages:
138
157
  for k in sorted(pages.keys())[: -self._max_pages]:
139
158
  pages.pop(k, None)
140
159
 
141
- def get_datapoints(self, document_id: str, last_d: int) -> tuple[Image, ...]:
160
+ def get_datapoints(self, document_id: str, last_d: int, job_id: str | None = None) -> tuple[Image, ...]:
142
161
  """
143
162
  Retrieve up to ``last_d`` most recent datapoints for a document.
144
163
 
145
164
  Args:
146
165
  document_id (str): Document identifier to retrieve pages for.
147
166
  last_d (int): Maximum number of pages to return. If <= 0, an empty tuple is returned.
167
+ job_id (str | None): Optional job identifier to retrieve datapoints from a specific processing run.
148
168
 
149
169
  Returns:
150
170
  tuple[Image, ...]: Tuple of :class:`Image` instances reconstructed from the stored
@@ -152,7 +172,8 @@ class LocalDataPointCacheStore(DataPointCacheStore):
152
172
  """
153
173
  if last_d <= 0:
154
174
  return ()
155
- pages = self._pages.get(document_id) or {}
175
+ cache_key = self._get_cache_key(document_id, job_id)
176
+ pages = self._pages.get(cache_key) or {}
156
177
  keys = sorted(pages.keys(), reverse=True)[:last_d]
157
178
  return tuple(Image(**pages[k]) for k in keys)
158
179
 
@@ -194,7 +215,6 @@ class DatapointManager:
194
215
  self.datapoint_is_passed: bool = False
195
216
  self.service_id = service_id
196
217
  self.model_id = model_id
197
- self.session_id: Optional[str] = None
198
218
 
199
219
  if num_cached_datapoints < 0:
200
220
  raise ValueError("num_cached_datapoints must be >= 0")
@@ -203,7 +223,17 @@ class DatapointManager:
203
223
 
204
224
  self._cache_store = cache_store or LocalDataPointCacheStore(max_pages=num_cached_datapoints)
205
225
 
206
- def _maybe_cache_datapoint(self, image: Optional[Image]) -> None:
226
+ def maybe_cache_datapoint(self, image: Optional[Image], job_id: str | None = None) -> None:
227
+ """
228
+ Cache the given datapoint if caching is enabled.
229
+
230
+ This should be called when a datapoint leaves the component to ensure it is cached.
231
+
232
+ Args:
233
+ image: The image datapoint to cache, or None to skip caching.
234
+ job_id: Optional job identifier to distinguish caches between different processing runs.
235
+ If None, caching key remains unchanged (backward compatible).
236
+ """
207
237
  if image is None:
208
238
  return
209
239
  if self.num_cached_datapoints <= 0:
@@ -214,9 +244,9 @@ class DatapointManager:
214
244
 
215
245
  self._cache_store.put_datapoint(
216
246
  document_id=image.document_id,
217
- image_id=image.image_id,
218
247
  page_number=image.page_number,
219
248
  image=image,
249
+ job_id=job_id,
220
250
  )
221
251
 
222
252
  @property
@@ -242,7 +272,6 @@ class DatapointManager:
242
272
  Args:
243
273
  dp: The datapoint to set.
244
274
  """
245
- self._maybe_cache_datapoint(self._datapoint)
246
275
  self._datapoint = dp
247
276
  self._cache_anns = {ann.annotation_id: ann for ann in dp.get_annotation()}
248
277
  self.datapoint_is_passed = True
@@ -329,7 +358,6 @@ class DatapointManager:
329
358
  score=detect_result.score,
330
359
  service_id=self.service_id,
331
360
  model_id=self.model_id,
332
- session_id=self.session_id,
333
361
  )
334
362
  if to_annotation_id is not None:
335
363
  parent_ann = self._cache_anns[to_annotation_id]
@@ -406,7 +434,6 @@ class DatapointManager:
406
434
  score=score,
407
435
  service_id=self.service_id,
408
436
  model_id=self.model_id,
409
- session_id=self.session_id,
410
437
  )
411
438
  self._cache_anns[annotation_id].dump_sub_category(sub_cat_key, cat_ann)
412
439
  if annotation_context.context_error:
@@ -454,7 +481,6 @@ class DatapointManager:
454
481
  score=score,
455
482
  service_id=self.service_id,
456
483
  model_id=self.model_id,
457
- session_id=self.session_id,
458
484
  )
459
485
  self._cache_anns[annotation_id].dump_sub_category(sub_cat_key, cont_ann)
460
486
  if annotation_context.context_error:
@@ -542,7 +568,6 @@ class DatapointManager:
542
568
  score=summary_score,
543
569
  service_id=self.service_id,
544
570
  model_id=self.model_id,
545
- session_id=self.session_id,
546
571
  )
547
572
  else:
548
573
  ann = CategoryAnnotation(
@@ -551,7 +576,6 @@ class DatapointManager:
551
576
  score=summary_score,
552
577
  service_id=self.service_id,
553
578
  model_id=self.model_id,
554
- session_id=self.session_id,
555
579
  )
556
580
  image.summary.dump_sub_category(summary_key, ann, image.image_id)
557
581
 
@@ -24,7 +24,6 @@ from __future__ import annotations
24
24
  from abc import ABC, abstractmethod
25
25
  from collections import defaultdict
26
26
  from typing import Any, Callable, Mapping, Optional, Union
27
- from uuid import uuid1
28
27
 
29
28
  from dd_core.dataflow import DataFlow, MapData
30
29
  from dd_core.datapoint.image import Image, MetaAnnotation
@@ -126,7 +125,7 @@ class PipelineComponent(ABC):
126
125
  if not self.filter_func(dp):
127
126
  self.serve(dp)
128
127
 
129
- def pass_datapoint(self, dp: Image) -> Image:
128
+ def pass_datapoint(self, dp: Image, job_id: str | None = None) -> Image:
130
129
  """
131
130
  Acceptance, handover to `dp_manager`, transformation and forwarding of `dp`.
132
131
 
@@ -134,6 +133,8 @@ class PipelineComponent(ABC):
134
133
 
135
134
  Args:
136
135
  dp: Datapoint.
136
+ job_id: Optional job identifier to distinguish caches between different processing runs.
137
+ When None, caching behavior is backward compatible (no job distinction).
137
138
 
138
139
  Returns:
139
140
  Datapoint.
@@ -143,6 +144,9 @@ class PipelineComponent(ABC):
143
144
  self._pass_datapoint(dp)
144
145
  else:
145
146
  self._pass_datapoint(dp)
147
+
148
+ self.dp_manager.maybe_cache_datapoint(self.dp_manager.datapoint, job_id=job_id)
149
+
146
150
  return self.dp_manager.datapoint
147
151
 
148
152
  def predict_dataflow(self, df: DataFlow) -> DataFlow:
@@ -255,6 +259,85 @@ class PipelineComponent(ABC):
255
259
  return MapData(df, self._undo)
256
260
 
257
261
 
262
+ def get_meta_annotation(pipeline_component_list: list[PipelineComponent]) -> MetaAnnotation:
263
+ """
264
+ Collects meta annotations from all pipeline components and summarizes the returned results.
265
+
266
+ Returns:
267
+ Meta annotations with information about image annotations (list), sub categories (dict with category
268
+ names and generated sub categories), relationships (dict with category names and generated relationships)
269
+ as well as summaries (list with sub categories).
270
+ """
271
+ image_annotations: list[ObjectTypes] = []
272
+ sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = {}
273
+ relationships = defaultdict(set[ObjectTypes]) # type: ignore
274
+ summaries: list[ObjectTypes] = []
275
+ for component in pipeline_component_list:
276
+ meta_anns = component.get_meta_annotation()
277
+ image_annotations.extend(meta_anns.image_annotations)
278
+ for key, value in meta_anns.sub_categories.items():
279
+ sub_dict = meta_anns.sub_categories[key]
280
+ for sub_cat, sub_cat_value in value.items():
281
+ if sub_cat in sub_dict:
282
+ sub_dict[sub_cat].update(sub_cat_value)
283
+ else:
284
+ sub_dict[sub_cat] = {sub_cat_value} # type: ignore
285
+ if key in sub_categories:
286
+ sub_categories[key].update(sub_dict)
287
+ else:
288
+ sub_categories[key] = sub_dict
289
+ for key, value in meta_anns.relationships.items(): # type: ignore
290
+ relationships[key].update(value)
291
+ summaries.extend(meta_anns.summaries)
292
+ return MetaAnnotation(
293
+ image_annotations=tuple(image_annotations),
294
+ sub_categories=sub_categories,
295
+ relationships=relationships,
296
+ summaries=tuple(summaries),
297
+ )
298
+
299
+
300
+ def get_service_id_to_meta_annotation(pipeline_component_list: list[PipelineComponent]) -> Mapping[str, MetaAnnotation]:
301
+ """
302
+ Collects meta annotations from all pipeline components and return a dict of service id to its meta annotation.
303
+
304
+ Returns:
305
+ `service_id` to `MetaAnnotation` with information about image annotations (list), sub categories (dict with
306
+ category names and generated sub categories), relationships (dict with category names and generated
307
+ relationships) as well as summaries (list with sub categories).
308
+ """
309
+ service_id_to_meta_annotation = {}
310
+ for component in pipeline_component_list:
311
+ meta_anns = component.get_meta_annotation()
312
+ service_id_to_meta_annotation[component.service_id] = meta_anns
313
+ return service_id_to_meta_annotation
314
+
315
+
316
+ def get_pipeline_info(
317
+ pipeline_component_list: list[PipelineComponent], service_id: str | None = None, name: str | None = None
318
+ ) -> Union[str, Mapping[str, str]]:
319
+ """
320
+ Get pipeline information.
321
+
322
+ Returns a dictionary with a description of each pipeline component.
323
+
324
+ Args:
325
+ service_id: Service id of the pipeline component to search for.
326
+ name: Name of the pipeline component to search for.
327
+
328
+ Returns:
329
+ Either a full dictionary with position and name of all pipeline components or the name, if
330
+ the position has been passed or the position if the name has been passed.
331
+ """
332
+ comp_info = {comp.service_id: comp.name for comp in pipeline_component_list}
333
+ comp_info_name_as_key = {value: key for key, value in comp_info.items()}
334
+ if service_id is not None:
335
+ return comp_info[service_id]
336
+ if name is not None:
337
+ return comp_info_name_as_key[name]
338
+ return comp_info
339
+
340
+
258
341
  class Pipeline(ABC):
259
342
  """
260
343
  Abstract base class for creating pipelines.
@@ -286,19 +369,6 @@ class Pipeline(ABC):
286
369
  core model or already processed further).
287
370
 
288
371
  In addition to `analyze`, the internal `_entry` is used to bundle preprocessing steps.
289
-
290
- It is possible to set a session id for the pipeline. This is useful for logging purposes. The session id can be
291
- either passed to the pipeline via the `analyze` method or generated automatically.
292
-
293
- To generate a `session_id` automatically:
294
-
295
- Example:
296
- ```python
297
- pipe = MyPipeline(pipeline_component = [layout, text])
298
- pipe.set_session_id = True
299
-
300
- df = pipe.analyze(input = "path/to/dir") # session_id is generated automatically
301
- ```
302
372
  """
303
373
 
304
374
  def __init__(self, pipeline_component_list: list[PipelineComponent]) -> None:
@@ -309,7 +379,6 @@ class Pipeline(ABC):
309
379
  pipeline_component_list: A list of pipeline components.
310
380
  """
311
381
  self.pipe_component_list = pipeline_component_list
312
- self.set_session_id = False
313
382
 
314
383
  @abstractmethod
315
384
  def _entry(self, **kwargs: Any) -> DataFlow:
@@ -380,22 +449,18 @@ class Pipeline(ABC):
380
449
  """
381
450
  raise NotImplementedError()
382
451
 
383
- def _build_pipe(self, df: DataFlow, session_id: Optional[str] = None) -> DataFlow:
452
+ def _build_pipe(self, df: DataFlow) -> DataFlow:
384
453
  """
385
454
  Composition of the backbone.
386
455
 
387
456
  Args:
388
457
  df: The input dataflow.
389
- session_id: Optional session id.
390
458
 
391
459
  Returns:
392
460
  The processed dataflow.
393
461
  """
394
- if session_id is None and self.set_session_id:
395
- session_id = self.get_session_id()
396
462
  for component in self.pipe_component_list:
397
463
  component.timer_on = True
398
- component.dp_manager.session_id = session_id
399
464
  df = component.predict_dataflow(df)
400
465
  return df
401
466
 
@@ -408,33 +473,7 @@ class Pipeline(ABC):
408
473
  names and generated sub categories), relationships (dict with category names and generated relationships)
409
474
  as well as summaries (list with sub categories).
410
475
  """
411
- image_annotations: list[ObjectTypes] = []
412
- sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = {}
413
- relationships = defaultdict(set[ObjectTypes]) # type: ignore
414
- summaries: list[ObjectTypes] = []
415
- for component in self.pipe_component_list:
416
- meta_anns = component.get_meta_annotation()
417
- image_annotations.extend(meta_anns.image_annotations)
418
- for key, value in meta_anns.sub_categories.items():
419
- sub_dict = meta_anns.sub_categories[key]
420
- for sub_cat, sub_cat_value in value.items():
421
- if sub_cat in sub_dict:
422
- sub_dict[sub_cat].update(sub_cat_value)
423
- else:
424
- sub_dict[sub_cat] = {sub_cat_value} # type: ignore
425
- if key in sub_categories:
426
- sub_categories[key].update(sub_dict)
427
- else:
428
- sub_categories[key] = sub_dict
429
- for key, value in meta_anns.relationships.items(): # type: ignore
430
- relationships[key].update(value)
431
- summaries.extend(meta_anns.summaries)
432
- return MetaAnnotation(
433
- image_annotations=tuple(image_annotations),
434
- sub_categories=dict(sub_categories),
435
- relationships=dict(relationships),
436
- summaries=tuple(summaries),
437
- )
476
+ return get_meta_annotation(self.pipe_component_list)
438
477
 
439
478
  def get_service_id_to_meta_annotation(self) -> Mapping[str, MetaAnnotation]:
440
479
  """
@@ -445,11 +484,7 @@ class Pipeline(ABC):
445
484
  category names and generated sub categories), relationships (dict with category names and generated
446
485
  relationships) as well as summaries (list with sub categories).
447
486
  """
448
- service_id_to_meta_annotation = {}
449
- for component in self.pipe_component_list:
450
- meta_anns = component.get_meta_annotation()
451
- service_id_to_meta_annotation[component.service_id] = meta_anns
452
- return service_id_to_meta_annotation
487
+ return get_service_id_to_meta_annotation(self.pipe_component_list)
453
488
 
454
489
  def get_pipeline_info(
455
490
  self, service_id: Optional[str] = None, name: Optional[str] = None
@@ -467,13 +502,7 @@ class Pipeline(ABC):
467
502
  Either a full dictionary with position and name of all pipeline components or the name, if
468
503
  the position has been passed or the position if the name has been passed.
469
504
  """
470
- comp_info = {comp.service_id: comp.name for comp in self.pipe_component_list}
471
- comp_info_name_as_key = {value: key for key, value in comp_info.items()}
472
- if service_id is not None:
473
- return comp_info[service_id]
474
- if name is not None:
475
- return comp_info_name_as_key[name]
476
- return comp_info
505
+ return get_pipeline_info(self.pipe_component_list, service_id=service_id, name=name)
477
506
 
478
507
  def get_pipeline_component(self, service_id: Optional[str] = None, name: Optional[str] = None) -> PipelineComponent:
479
508
  """
@@ -490,13 +519,3 @@ class Pipeline(ABC):
490
519
  if comp.service_id == service_id or comp.name == name:
491
520
  return comp
492
521
  raise ValueError(f"Pipeline component not found with service_id={service_id} or name={name}")
493
-
494
- @staticmethod
495
- def get_session_id() -> str:
496
- """
497
- Get the generating a session id.
498
-
499
- Returns:
500
- The session id as a string.
501
- """
502
- return str(uuid1())[:8]
@@ -406,7 +406,7 @@ class PageParsingService(PipelineComponent):
406
406
  def serve(self, dp: Image) -> None:
407
407
  raise NotImplementedError("PageParsingService is not meant to be used in serve method")
408
408
 
409
- def pass_datapoint(self, dp: Image) -> Page: # type:ignore
409
+ def pass_datapoint(self, dp: Image, job_id: str | None = None) -> Page: # type:ignore
410
410
  """
411
411
  Converts `Image` to `Page`.
412
412
 
@@ -386,10 +386,9 @@ class DoctectionPipe(Pipeline):
386
386
  """
387
387
 
388
388
  output = kwargs.get("output", "page")
389
- session_id = kwargs.get("session_id")
390
389
  assert output in ("page", "image", "dict"), "output must be either page image or dict"
391
390
  df = self._entry(**kwargs)
392
- df = self._build_pipe(df, session_id=session_id) # type: ignore
391
+ df = self._build_pipe(df)
393
392
  if output == "page":
394
393
  df = self.dataflow_to_page(df)
395
394
  elif output == "dict":
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deepdoctection
3
- Version: 1.2.1
3
+ Version: 1.2.3
4
4
  Summary: Repository for Document AI - server/inference core package
5
5
  Author: Dr. Janis Meyer
6
6
  License: Apache License 2.0
File without changes
File without changes