deepdoctection 1.2.2__tar.gz → 1.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/PKG-INFO +1 -1
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/pyproject.toml +1 -1
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/__init__.py +1 -1
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/pipe/anngen.py +2 -10
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/pipe/base.py +82 -39
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection.egg-info/PKG-INFO +1 -1
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/README.md +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/setup.cfg +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/analyzer/__init__.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/analyzer/config.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/analyzer/dd.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/analyzer/factory.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/configs/conf_dd_one.yaml +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/configs/profiles.jsonl +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/eval/__init__.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/eval/accmetric.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/eval/base.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/eval/cocometric.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/eval/eval.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/eval/registry.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/eval/tedsmetric.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/extern/__init__.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/extern/base.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/extern/d2detect.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/extern/deskew.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/extern/doctrocr.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/extern/hfdetr.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/extern/hflayoutlm.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/extern/hflm.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/extern/model.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/extern/pdftext.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/extern/tessocr.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/extern/texocr.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/pipe/__init__.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/pipe/common.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/pipe/concurrency.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/pipe/doctectionpipe.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/pipe/language.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/pipe/layout.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/pipe/lm.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/pipe/order.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/pipe/refine.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/pipe/registry.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/pipe/segment.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/pipe/sub_layout.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/pipe/text.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/pipe/transform.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/py.typed +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/train/__init__.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/train/d2_frcnn_train.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/train/hf_detr_train.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/train/hf_layoutlm_train.py +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection.egg-info/SOURCES.txt +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection.egg-info/requires.txt +0 -0
- {deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection.egg-info/top_level.txt +0 -0
|
@@ -12,7 +12,7 @@ from dd_core.utils.env_info import collect_env_info
|
|
|
12
12
|
from dd_core.utils.file_utils import _LazyModule
|
|
13
13
|
from dd_core.utils.logger import LoggingRecord, logger
|
|
14
14
|
|
|
15
|
-
__version__ = "1.2.2"
|
|
15
|
+
__version__ = "1.2.4"
|
|
16
16
|
_IMPORT_STRUCTURE = {
|
|
17
17
|
"analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory", "update_cfg_from_defaults"],
|
|
18
18
|
"eval": [
|
|
@@ -45,15 +45,12 @@ class DataPointCacheStore(ABC):
|
|
|
45
45
|
"""
|
|
46
46
|
|
|
47
47
|
@abstractmethod
|
|
48
|
-
def put_datapoint(
|
|
49
|
-
self, document_id: str, image_id: str, page_number: int, image: Image, job_id: str | None = None
|
|
50
|
-
) -> None:
|
|
48
|
+
def put_datapoint(self, document_id: str, page_number: int, image: Image, job_id: str | None = None) -> None:
|
|
51
49
|
"""
|
|
52
50
|
Persist a datapoint (image) for a specific document and page number.
|
|
53
51
|
|
|
54
52
|
Args:
|
|
55
53
|
document_id (str): The identifier of the document the image belongs to.
|
|
56
|
-
image_id (str): The unique identifier of the image.
|
|
57
54
|
page_number (int): The 0-based page number inside the document.
|
|
58
55
|
image (Image): The image object to store (may be serialized by the store).
|
|
59
56
|
job_id (str | None): Optional job identifier to distinguish between different processing runs.
|
|
@@ -137,9 +134,7 @@ class LocalDataPointCacheStore(DataPointCacheStore):
|
|
|
137
134
|
return document_id
|
|
138
135
|
return f"{document_id}::{job_id}"
|
|
139
136
|
|
|
140
|
-
def put_datapoint(
|
|
141
|
-
self, document_id: str, image_id: str, page_number: int, image: Image, job_id: str | None = None
|
|
142
|
-
) -> None:
|
|
137
|
+
def put_datapoint(self, document_id: str, page_number: int, image: Image, job_id: str | None = None) -> None:
|
|
143
138
|
"""
|
|
144
139
|
Store a serialized version of ``image`` for ``document_id`` at ``page_number``.
|
|
145
140
|
|
|
@@ -148,8 +143,6 @@ class LocalDataPointCacheStore(DataPointCacheStore):
|
|
|
148
143
|
|
|
149
144
|
Args:
|
|
150
145
|
document_id (str): Document identifier the image belongs to.
|
|
151
|
-
image_id (str): Image identifier (not directly used by this store but included for API
|
|
152
|
-
compatibility with other stores).
|
|
153
146
|
page_number (int): 0-based page number of the image.
|
|
154
147
|
image (Image): The Image object to serialize and store.
|
|
155
148
|
job_id (str | None): Optional job identifier to distinguish between different processing runs.
|
|
@@ -251,7 +244,6 @@ class DatapointManager:
|
|
|
251
244
|
|
|
252
245
|
self._cache_store.put_datapoint(
|
|
253
246
|
document_id=image.document_id,
|
|
254
|
-
image_id=image.image_id,
|
|
255
247
|
page_number=image.page_number,
|
|
256
248
|
image=image,
|
|
257
249
|
job_id=job_id,
|
|
@@ -259,6 +259,85 @@ class PipelineComponent(ABC):
|
|
|
259
259
|
return MapData(df, self._undo)
|
|
260
260
|
|
|
261
261
|
|
|
262
|
+
def get_meta_annotation(pipeline_component_list: list[PipelineComponent]) -> MetaAnnotation:
|
|
263
|
+
"""
|
|
264
|
+
Collects meta annotations from all pipeline components and summarizes the returned results.
|
|
265
|
+
|
|
266
|
+
Returns:
|
|
267
|
+
Meta annotations with information about image annotations (list), sub categories (dict with category
|
|
268
|
+
names and generated sub categories), relationships (dict with category names and generated relationships)
|
|
269
|
+
as well as summaries (list with sub categories).
|
|
270
|
+
"""
|
|
271
|
+
image_annotations: list[ObjectTypes] = []
|
|
272
|
+
sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = {}
|
|
273
|
+
relationships = defaultdict(set[ObjectTypes]) # type: ignore
|
|
274
|
+
summaries: list[ObjectTypes] = []
|
|
275
|
+
for component in pipeline_component_list:
|
|
276
|
+
meta_anns = component.get_meta_annotation()
|
|
277
|
+
image_annotations.extend(meta_anns.image_annotations)
|
|
278
|
+
for key, value in meta_anns.sub_categories.items():
|
|
279
|
+
sub_dict = meta_anns.sub_categories[key]
|
|
280
|
+
for sub_cat, sub_cat_value in value.items():
|
|
281
|
+
if sub_cat in sub_dict:
|
|
282
|
+
sub_dict[sub_cat].update(sub_cat_value)
|
|
283
|
+
else:
|
|
284
|
+
sub_dict[sub_cat] = {sub_cat_value} # type: ignore
|
|
285
|
+
if key in sub_categories:
|
|
286
|
+
sub_categories[key].update(sub_dict)
|
|
287
|
+
else:
|
|
288
|
+
sub_categories[key] = sub_dict
|
|
289
|
+
for key, value in meta_anns.relationships.items(): # type: ignore
|
|
290
|
+
relationships[key].update(value)
|
|
291
|
+
summaries.extend(meta_anns.summaries)
|
|
292
|
+
return MetaAnnotation(
|
|
293
|
+
image_annotations=tuple(image_annotations),
|
|
294
|
+
sub_categories=sub_categories,
|
|
295
|
+
relationships=relationships,
|
|
296
|
+
summaries=tuple(summaries),
|
|
297
|
+
)
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def get_service_id_to_meta_annotation(pipeline_component_list: list[PipelineComponent]) -> Mapping[str, MetaAnnotation]:
|
|
301
|
+
"""
|
|
302
|
+
Collects meta annotations from all pipeline components and return a dict of service id to its meta annotation.
|
|
303
|
+
|
|
304
|
+
Returns:
|
|
305
|
+
`service_id` to `MetaAnnotation` with information about image annotations (list), sub categories (dict with
|
|
306
|
+
category names and generated sub categories), relationships (dict with category names and generated
|
|
307
|
+
relationships) as well as summaries (list with sub categories).
|
|
308
|
+
"""
|
|
309
|
+
service_id_to_meta_annotation = {}
|
|
310
|
+
for component in pipeline_component_list:
|
|
311
|
+
meta_anns = component.get_meta_annotation()
|
|
312
|
+
service_id_to_meta_annotation[component.service_id] = meta_anns
|
|
313
|
+
return service_id_to_meta_annotation
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def get_pipeline_info(
|
|
317
|
+
pipeline_component_list: list[PipelineComponent], service_id: str | None = None, name: str | None = None
|
|
318
|
+
) -> Union[str, Mapping[str, str]]:
|
|
319
|
+
"""
|
|
320
|
+
Get pipeline information.
|
|
321
|
+
|
|
322
|
+
Returns a dictionary with a description of each pipeline component.
|
|
323
|
+
|
|
324
|
+
Args:
|
|
325
|
+
service_id: Service id of the pipeline component to search for.
|
|
326
|
+
name: Name of the pipeline component to search for.
|
|
327
|
+
|
|
328
|
+
Returns:
|
|
329
|
+
Either a full dictionary with position and name of all pipeline components or the name, if
|
|
330
|
+
the position has been passed or the position if the name has been passed.
|
|
331
|
+
"""
|
|
332
|
+
comp_info = {comp.service_id: comp.name for comp in pipeline_component_list}
|
|
333
|
+
comp_info_name_as_key = {value: key for key, value in comp_info.items()}
|
|
334
|
+
if service_id is not None:
|
|
335
|
+
return comp_info[service_id]
|
|
336
|
+
if name is not None:
|
|
337
|
+
return comp_info_name_as_key[name]
|
|
338
|
+
return comp_info
|
|
339
|
+
|
|
340
|
+
|
|
262
341
|
class Pipeline(ABC):
|
|
263
342
|
"""
|
|
264
343
|
Abstract base class for creating pipelines.
|
|
@@ -394,33 +473,7 @@ class Pipeline(ABC):
|
|
|
394
473
|
names and generated sub categories), relationships (dict with category names and generated relationships)
|
|
395
474
|
as well as summaries (list with sub categories).
|
|
396
475
|
"""
|
|
397
|
-
image_annotations: list[ObjectTypes] = []
|
|
398
|
-
sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = {}
|
|
399
|
-
relationships = defaultdict(set[ObjectTypes]) # type: ignore
|
|
400
|
-
summaries: list[ObjectTypes] = []
|
|
401
|
-
for component in self.pipe_component_list:
|
|
402
|
-
meta_anns = component.get_meta_annotation()
|
|
403
|
-
image_annotations.extend(meta_anns.image_annotations)
|
|
404
|
-
for key, value in meta_anns.sub_categories.items():
|
|
405
|
-
sub_dict = meta_anns.sub_categories[key]
|
|
406
|
-
for sub_cat, sub_cat_value in value.items():
|
|
407
|
-
if sub_cat in sub_dict:
|
|
408
|
-
sub_dict[sub_cat].update(sub_cat_value)
|
|
409
|
-
else:
|
|
410
|
-
sub_dict[sub_cat] = {sub_cat_value} # type: ignore
|
|
411
|
-
if key in sub_categories:
|
|
412
|
-
sub_categories[key].update(sub_dict)
|
|
413
|
-
else:
|
|
414
|
-
sub_categories[key] = sub_dict
|
|
415
|
-
for key, value in meta_anns.relationships.items(): # type: ignore
|
|
416
|
-
relationships[key].update(value)
|
|
417
|
-
summaries.extend(meta_anns.summaries)
|
|
418
|
-
return MetaAnnotation(
|
|
419
|
-
image_annotations=tuple(image_annotations),
|
|
420
|
-
sub_categories=dict(sub_categories),
|
|
421
|
-
relationships=dict(relationships),
|
|
422
|
-
summaries=tuple(summaries),
|
|
423
|
-
)
|
|
476
|
+
return get_meta_annotation(self.pipe_component_list)
|
|
424
477
|
|
|
425
478
|
def get_service_id_to_meta_annotation(self) -> Mapping[str, MetaAnnotation]:
|
|
426
479
|
"""
|
|
@@ -431,11 +484,7 @@ class Pipeline(ABC):
|
|
|
431
484
|
category names and generated sub categories), relationships (dict with category names and generated
|
|
432
485
|
relationships) as well as summaries (list with sub categories).
|
|
433
486
|
"""
|
|
434
|
-
service_id_to_meta_annotation = {}
|
|
435
|
-
for component in self.pipe_component_list:
|
|
436
|
-
meta_anns = component.get_meta_annotation()
|
|
437
|
-
service_id_to_meta_annotation[component.service_id] = meta_anns
|
|
438
|
-
return service_id_to_meta_annotation
|
|
487
|
+
return get_service_id_to_meta_annotation(self.pipe_component_list)
|
|
439
488
|
|
|
440
489
|
def get_pipeline_info(
|
|
441
490
|
self, service_id: Optional[str] = None, name: Optional[str] = None
|
|
@@ -453,13 +502,7 @@ class Pipeline(ABC):
|
|
|
453
502
|
Either a full dictionary with position and name of all pipeline components or the name, if
|
|
454
503
|
the position has been passed or the position if the name has been passed.
|
|
455
504
|
"""
|
|
456
|
-
comp_info = {comp.service_id: comp.name for comp in self.pipe_component_list}
|
|
457
|
-
comp_info_name_as_key = {value: key for key, value in comp_info.items()}
|
|
458
|
-
if service_id is not None:
|
|
459
|
-
return comp_info[service_id]
|
|
460
|
-
if name is not None:
|
|
461
|
-
return comp_info_name_as_key[name]
|
|
462
|
-
return comp_info
|
|
505
|
+
return get_pipeline_info(self.pipe_component_list, service_id=service_id, name=name)
|
|
463
506
|
|
|
464
507
|
def get_pipeline_component(self, service_id: Optional[str] = None, name: Optional[str] = None) -> PipelineComponent:
|
|
465
508
|
"""
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection/configs/conf_tesseract.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{deepdoctection-1.2.2 → deepdoctection-1.2.4}/src/deepdoctection.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|