deepdoctection 0.43.6__py3-none-any.whl → 0.44.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of deepdoctection might be problematic.

@@ -29,7 +29,8 @@ from typing import Any, Mapping, Union
 
  from lazy_imports import try_import
 
- from ..utils.file_utils import Requirement, get_fasttext_requirement
+ from ..utils.develop import deprecated
+ from ..utils.file_utils import Requirement, get_fasttext_requirement, get_numpy_v1_requirement
  from ..utils.settings import TypeOrStr, get_type
  from ..utils.types import PathLikeOrStr
  from .base import DetectionResult, LanguageDetector, ModelCategories
@@ -69,6 +70,7 @@ class FasttextLangDetectorMixin(LanguageDetector, ABC):
  return "fasttext_" + "_".join(Path(path_weights).parts[-2:])
 
 
+ @deprecated("As FastText archived, it will be deprecated in the near future.", "2025-08-17")
  class FasttextLangDetector(FasttextLangDetectorMixin):
  """
  Fasttext language detector wrapper. Two models provided in the fasttext library can be used to identify languages.
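
Note on the hunk above: `@deprecated` comes from `..utils.develop` (see the import change in the first hunk) and flags `FasttextLangDetector` for future removal, since the upstream fastText project has been archived. As a rough orientation only, a two-argument decorator with this call signature (message plus date) can be sketched as follows; the actual implementation in deepdoctection.utils.develop may differ:

    # Hypothetical sketch only, not the code shipped in deepdoctection.utils.develop.
    import warnings

    def deprecated(reason: str, deprecated_since: str):
        def wrap(obj):
            def inner(*args, **kwargs):
                # emit a DeprecationWarning, then delegate to the wrapped class or function
                warnings.warn(
                    f"{getattr(obj, '__name__', obj)} is deprecated ({deprecated_since}): {reason}",
                    DeprecationWarning,
                    stacklevel=2,
                )
                return obj(*args, **kwargs)
            return inner
        return wrap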
@@ -114,7 +116,7 @@ class FasttextLangDetector(FasttextLangDetectorMixin):
 
  @classmethod
  def get_requirements(cls) -> list[Requirement]:
- return [get_fasttext_requirement()]
+ return [get_numpy_v1_requirement(), get_fasttext_requirement()]
 
  def clone(self) -> FasttextLangDetector:
  return self.__class__(self.path_weights, self.categories.get_categories(), self.categories_orig)
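
Note on the hunk above: `get_requirements` now also reports the NumPy v1 requirement. A `Requirement` is a `(name, available, error_message)` triple (see the docstring of `get_numpy_v1_requirement` further down in this diff), so callers can gate instantiation on it. A minimal consumption sketch, not code from the package:

    # Minimal sketch: fail fast if any requirement of a detector class is unmet.
    def assert_requirements(detector_cls) -> None:
        for name, available, err_msg in detector_cls.get_requirements():
            if not available:
                raise ImportError(f"{name}: {err_msg}")

    # usage (hypothetical): assert_requirements(FasttextLangDetector)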
@@ -806,17 +806,17 @@ def image_to_raw_lm_features(
  raw_features["image_id"] = page.image_id
  raw_features["width"] = page.width
  raw_features["height"] = page.height
- raw_features["ann_ids"] = text_["ann_ids"]
- raw_features["words"] = text_["words"]
+ raw_features["ann_ids"] = text_.ann_ids
+ raw_features["words"] = text_.words
  # We use a dummy bounding box for all bounding boxes so that we can pass the raw features to
  # raw_features_to_layoutlm_features
- raw_features["bbox"] = [_CLS_BOX] * len(text_["words"])
+ raw_features["bbox"] = [_CLS_BOX] * len(text_.words)
  raw_features["dataset_type"] = dataset_type
 
- if use_token_tag and text_["token_tags"]:
- raw_features["labels"] = text_["token_tags"]
- elif text_["token_classes"]:
- raw_features["labels"] = text_["token_classes"]
+ if use_token_tag and text_.token_tags:
+ raw_features["labels"] = text_.token_tags
+ elif text_.token_classes:
+ raw_features["labels"] = text_.token_classes
  elif page.document_type is not None:
  document_type_id = page.image_orig.summary.get_sub_category(PageType.DOCUMENT_TYPE).category_id - 1
  raw_features["labels"] = [document_type_id]
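
Note on the hunk above: `text_` is now read through attributes (`ann_ids`, `words`, `token_tags`, `token_classes`) instead of dictionary keys, which matches the removal of the `Text_: TypeAlias = dict[str, Any]` alias near the end of this diff. The concrete return type is not shown here; purely as a hypothetical illustration, a container with that surface could look like:

    # Hypothetical sketch only; the actual class used in 0.44.1 is not part of this diff.
    from dataclasses import dataclass, field

    @dataclass
    class TextChunk:
        ann_ids: list[str] = field(default_factory=list)
        words: list[str] = field(default_factory=list)
        token_tags: list[str] = field(default_factory=list)
        token_classes: list[str] = field(default_factory=list)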
@@ -23,12 +23,11 @@ from __future__ import annotations
 
  from abc import ABC, abstractmethod
  from collections import defaultdict
- from dataclasses import dataclass, field
  from typing import Any, Callable, Mapping, Optional, Union
  from uuid import uuid1
 
  from ..dataflow import DataFlow, MapData
- from ..datapoint.image import Image
+ from ..datapoint.image import Image, MetaAnnotation
  from ..mapper.misc import curry
  from ..utils.context import timed_operation
  from ..utils.identifier import get_uuid_from_str
@@ -37,25 +36,6 @@ from ..utils.types import DP
  from .anngen import DatapointManager
 
 
- @dataclass(frozen=True)
- class MetaAnnotation:
- """
- A immutable dataclass that stores information about what `Image` are being
- modified through a pipeline component.
-
- Attributes:
- image_annotations: Tuple of `ObjectTypes` representing image annotations.
- sub_categories: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for sub-categories.
- relationships: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for relationships.
- summaries: Tuple of `ObjectTypes` representing summaries.
- """
-
- image_annotations: tuple[ObjectTypes, ...] = field(default=())
- sub_categories: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
- relationships: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
- summaries: tuple[ObjectTypes, ...] = field(default=())
-
-
  class PipelineComponent(ABC):
  """
  Base class for pipeline components.
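
Note on the hunk above: the `MetaAnnotation` dataclass has been removed from `deepdoctection.pipe.base` and is now imported from `deepdoctection.datapoint.image` (all the `from .base import MetaAnnotation, ...` rewrites below follow from this move). In addition, the `sub_categories` field is nested one level deeper in 0.44.1, mapping each annotation type to a dict of sub-category keys and their possible values (see the annotation `dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]]` in the next hunk). A minimal construction sketch under that assumption:

    # Minimal sketch (not from the package): building a MetaAnnotation with the
    # relocated import path and the nested sub_categories shape used in this diff.
    from deepdoctection.datapoint.image import MetaAnnotation
    from deepdoctection.utils.settings import CellType, LayoutType

    meta = MetaAnnotation(
        image_annotations=(LayoutType.CELL,),
        sub_categories={LayoutType.CELL: {CellType.ROW_NUMBER: {CellType.ROW_NUMBER}}},
        relationships={},
        summaries=(),
    )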
@@ -427,15 +407,24 @@ class Pipeline(ABC):
  as well as summaries (list with sub categories).
  """
  image_annotations: list[ObjectTypes] = []
- sub_categories = defaultdict(set)
- relationships = defaultdict(set)
+ sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = {}
+ relationships = defaultdict(set[ObjectTypes]) # type: ignore
  summaries: list[ObjectTypes] = []
  for component in self.pipe_component_list:
  meta_anns = component.get_meta_annotation()
  image_annotations.extend(meta_anns.image_annotations)
  for key, value in meta_anns.sub_categories.items():
- sub_categories[key].update(value)
- for key, value in meta_anns.relationships.items():
+ sub_dict = meta_anns.sub_categories[key]
+ for sub_cat, sub_cat_value in value.items():
+ if sub_cat in sub_dict:
+ sub_dict[sub_cat].update(sub_cat_value)
+ else:
+ sub_dict[sub_cat] = {sub_cat_value} # type: ignore
+ if key in sub_categories:
+ sub_categories[key].update(sub_dict)
+ else:
+ sub_categories[key] = sub_dict
+ for key, value in meta_anns.relationships.items(): # type: ignore
  relationships[key].update(value)
  summaries.extend(meta_anns.summaries)
  return MetaAnnotation(
@@ -445,6 +434,21 @@ class Pipeline(ABC):
  summaries=tuple(summaries),
  )
 
+ def get_service_id_to_meta_annotation(self) -> Mapping[str, MetaAnnotation]:
+ """
+ Collects meta annotations from all pipeline components and return a dict of service id to its meta annotation.
+
+ Returns:
+ `service_id` to `MetaAnnotation` with information about image annotations (list), sub categories (dict with
+ category names and generated sub categories), relationships (dict with category names and generated
+ relationships) as well as summaries (list with sub categories).
+ """
+ service_id_to_meta_annotation = {}
+ for component in self.pipe_component_list:
+ meta_anns = component.get_meta_annotation()
+ service_id_to_meta_annotation[component.service_id] = meta_anns
+ return service_id_to_meta_annotation
+
  def get_pipeline_info(
  self, service_id: Optional[str] = None, name: Optional[str] = None
  ) -> Union[str, Mapping[str, str]]:
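
Note on the hunk above: the new `get_service_id_to_meta_annotation` maps each component's `service_id` to its `MetaAnnotation`. A minimal usage sketch, where `pipeline` stands for any concrete `Pipeline` instance:

    # Minimal usage sketch (not part of the diff).
    for service_id, meta in pipeline.get_service_id_to_meta_annotation().items():
        print(service_id, meta.image_annotations, list(meta.sub_categories))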
@@ -28,13 +28,13 @@ from typing import Literal, Mapping, Optional, Sequence, Union
  import numpy as np
 
  from ..dataflow import DataFlow, MapData
- from ..datapoint.image import Image
+ from ..datapoint.image import Image, MetaAnnotation
  from ..datapoint.view import IMAGE_DEFAULTS, Page
  from ..extern.base import DetectionResult
  from ..mapper.match import match_anns_by_distance, match_anns_by_intersection
  from ..mapper.misc import to_image
  from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
- from .base import MetaAnnotation, PipelineComponent
+ from .base import PipelineComponent
  from .registry import pipeline_component_registry
 
  if os.environ.get("DD_USE_TORCH"):
@@ -29,11 +29,11 @@ from typing import Callable, Optional, Sequence, Union
  import tqdm
 
  from ..dataflow import DataFlow, MapData
- from ..datapoint.image import Image
+ from ..datapoint.image import Image, MetaAnnotation
  from ..utils.context import timed_operation
  from ..utils.tqdm import get_tqdm
  from ..utils.types import QueueType, TqdmType
- from .base import MetaAnnotation, PipelineComponent
+ from .base import PipelineComponent
  from .common import ImageParsingService, PageParsingService
  from .registry import pipeline_component_registry
 
@@ -20,12 +20,12 @@ Module for language detection pipeline component
  """
  from typing import Optional, Sequence
 
- from ..datapoint.image import Image
+ from ..datapoint.image import Image, MetaAnnotation
  from ..datapoint.view import ImageDefaults, Page
  from ..extern.base import LanguageDetector, ObjectDetector
  from ..utils.error import ImageError
  from ..utils.settings import PageType, TypeOrStr, get_type
- from .base import MetaAnnotation, PipelineComponent
+ from .base import PipelineComponent
  from .registry import pipeline_component_registry
 
 
@@ -24,13 +24,13 @@ from typing import Optional, Sequence, Union
 
  import numpy as np
 
- from ..datapoint.image import Image
+ from ..datapoint.image import Image, MetaAnnotation
  from ..extern.base import ObjectDetector, PdfMiner
  from ..mapper.misc import curry
  from ..utils.error import ImageError
  from ..utils.settings import ObjectTypes
  from ..utils.transform import PadTransform
- from .base import MetaAnnotation, PipelineComponent
+ from .base import PipelineComponent
  from .registry import pipeline_component_registry
 
 
deepdoctection/pipe/lm.py CHANGED
@@ -23,11 +23,11 @@ from __future__ import annotations
  from copy import copy
  from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Sequence, Union
 
- from ..datapoint.image import Image
+ from ..datapoint.image import Image, MetaAnnotation
  from ..extern.base import SequenceClassResult
  from ..mapper.laylmstruct import image_to_layoutlm_features, image_to_lm_features
  from ..utils.settings import BioTag, LayoutType, ObjectTypes, PageType, TokenClasses, WordType
- from .base import MetaAnnotation, PipelineComponent
+ from .base import PipelineComponent
  from .registry import pipeline_component_registry
 
  if TYPE_CHECKING:
@@ -246,7 +246,17 @@ class LMTokenClassifierService(PipelineComponent):
  def get_meta_annotation(self) -> MetaAnnotation:
  return MetaAnnotation(
  image_annotations=(),
- sub_categories={LayoutType.WORD: {WordType.TOKEN_CLASS, WordType.TAG, WordType.TOKEN_TAG}},
+ sub_categories={
+ LayoutType.WORD: {
+ WordType.TOKEN_CLASS: set(self.language_model.categories.categories_semantics) # type: ignore
+ if self.language_model.categories.categories_semantics
+ else [],
+ WordType.TAG: set(self.language_model.categories.categories_bio) # type: ignore
+ if self.language_model.categories.categories_bio
+ else [],
+ WordType.TOKEN_TAG: set(self.language_model.categories.get_categories(as_dict=False)),
+ }
+ },
  relationships={},
  summaries=(),
  )
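
Note on the hunk above: the `LayoutType.WORD` entry no longer lists bare sub-category keys; each key now carries the set of concrete category values taken from the language model (`categories_semantics`, `categories_bio`, `get_categories(as_dict=False)`). The resulting shape, sketched with empty placeholder sets:

    # Minimal sketch (not from the package): the nested mapping returned for LayoutType.WORD.
    from deepdoctection.utils.settings import LayoutType, WordType

    example_sub_categories = {
        LayoutType.WORD: {
            WordType.TOKEN_CLASS: set(),  # filled from categories_semantics at runtime
            WordType.TAG: set(),          # filled from categories_bio at runtime
            WordType.TOKEN_TAG: set(),    # filled from get_categories(as_dict=False)
        }
    }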
@@ -31,11 +31,11 @@ import numpy as np
 
  from ..datapoint.annotation import ImageAnnotation
  from ..datapoint.box import BoundingBox, merge_boxes
- from ..datapoint.image import Image
+ from ..datapoint.image import Image, MetaAnnotation
  from ..datapoint.view import IMAGE_DEFAULTS
  from ..extern.base import DetectionResult
  from ..extern.tp.tpfrcnn.utils.np_box_ops import ioa as np_ioa
- from ..pipe.base import MetaAnnotation, PipelineComponent
+ from ..pipe.base import PipelineComponent
  from ..pipe.registry import pipeline_component_registry
  from ..utils.logger import LoggingRecord, logger
  from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
@@ -611,8 +611,8 @@ class TextLineService(TextLineServiceMixin):
  """
  return MetaAnnotation(
  image_annotations=(LayoutType.LINE,),
- sub_categories={LayoutType.LINE: {Relationships.CHILD}},
- relationships={},
+ sub_categories={},
+ relationships={LayoutType.LINE: {Relationships.CHILD}},
  summaries=(),
  )
 
@@ -818,7 +818,11 @@ class TextOrderService(TextLineServiceMixin):
  anns_with_reading_order = list(copy(self.floating_text_block_categories)) + add_category
  return MetaAnnotation(
  image_annotations=tuple(image_annotations),
- sub_categories={category: {Relationships.READING_ORDER} for category in anns_with_reading_order},
+ sub_categories={ # type: ignore
+ category: {Relationships.READING_ORDER: {Relationships.READING_ORDER}}
+ for category in anns_with_reading_order
+ }
+ | {self.text_container: {Relationships.READING_ORDER: {Relationships.READING_ORDER}}},
  relationships={},
  summaries=(),
  )
@@ -31,12 +31,12 @@ import networkx as nx # type: ignore
 
  from ..datapoint.annotation import ImageAnnotation
  from ..datapoint.box import merge_boxes
- from ..datapoint.image import Image
+ from ..datapoint.image import Image, MetaAnnotation
  from ..extern.base import DetectionResult
  from ..mapper.maputils import MappingContextManager
  from ..utils.error import ImageError
  from ..utils.settings import CellType, LayoutType, ObjectTypes, Relationships, TableType, get_type
- from .base import MetaAnnotation, PipelineComponent
+ from .base import PipelineComponent
  from .registry import pipeline_component_registry
 
  __all__ = ["TableSegmentationRefinementService", "generate_html_string"]
@@ -537,12 +537,12 @@ class TableSegmentationRefinementService(PipelineComponent):
  image_annotations=(),
  sub_categories={
  LayoutType.CELL: {
- CellType.ROW_NUMBER,
- CellType.COLUMN_NUMBER,
- CellType.ROW_SPAN,
- CellType.COLUMN_SPAN,
+ CellType.ROW_NUMBER: {CellType.ROW_NUMBER},
+ CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER},
+ CellType.ROW_SPAN: {CellType.ROW_SPAN},
+ CellType.COLUMN_SPAN: {CellType.COLUMN_SPAN},
  },
- LayoutType.TABLE: {TableType.HTML},
+ LayoutType.TABLE: {TableType.HTML: {TableType.HTML}},
  },
  relationships={},
  summaries=(),
@@ -29,13 +29,13 @@ import numpy as np
 
  from ..datapoint.annotation import ImageAnnotation
  from ..datapoint.box import BoundingBox, global_to_local_coords, intersection_box, intersection_boxes, iou, merge_boxes
- from ..datapoint.image import Image
+ from ..datapoint.image import Image, MetaAnnotation
  from ..extern.base import DetectionResult
  from ..mapper.maputils import MappingContextManager
  from ..mapper.match import match_anns_by_intersection
  from ..utils.error import ImageError
  from ..utils.settings import CellType, LayoutType, ObjectTypes, Relationships, TableType, TypeOrStr, get_type
- from .base import MetaAnnotation, PipelineComponent
+ from .base import PipelineComponent
  from .refine import generate_html_string
  from .registry import pipeline_component_registry
 
@@ -974,13 +974,13 @@ class TableSegmentationService(PipelineComponent):
  image_annotations=(),
  sub_categories={
  LayoutType.CELL: {
- CellType.ROW_NUMBER,
- CellType.COLUMN_NUMBER,
- CellType.ROW_SPAN,
- CellType.COLUMN_SPAN,
+ CellType.ROW_NUMBER: {CellType.ROW_NUMBER},
+ CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER},
+ CellType.ROW_SPAN: {CellType.ROW_SPAN},
+ CellType.COLUMN_SPAN: {CellType.COLUMN_SPAN},
  },
- LayoutType.ROW: {CellType.ROW_NUMBER},
- LayoutType.COLUMN: {CellType.COLUMN_NUMBER},
+ LayoutType.ROW: {CellType.ROW_NUMBER: {CellType.ROW_NUMBER}},
+ LayoutType.COLUMN: {CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER}},
  },
  relationships={},
  summaries=(),
@@ -1314,37 +1314,37 @@ class PubtablesSegmentationService(PipelineComponent):
  image_annotations=(),
  sub_categories={
  LayoutType.CELL: {
- CellType.ROW_NUMBER,
- CellType.COLUMN_NUMBER,
- CellType.ROW_SPAN,
- CellType.COLUMN_SPAN,
+ CellType.ROW_NUMBER: {CellType.ROW_NUMBER},
+ CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER},
+ CellType.ROW_SPAN: {CellType.ROW_SPAN},
+ CellType.COLUMN_SPAN: {CellType.COLUMN_SPAN},
  },
  CellType.SPANNING: {
- CellType.ROW_NUMBER,
- CellType.COLUMN_NUMBER,
- CellType.ROW_SPAN,
- CellType.COLUMN_SPAN,
+ CellType.ROW_NUMBER: {CellType.ROW_NUMBER},
+ CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER},
+ CellType.ROW_SPAN: {CellType.ROW_SPAN},
+ CellType.COLUMN_SPAN: {CellType.COLUMN_SPAN},
  },
  CellType.ROW_HEADER: {
- CellType.ROW_NUMBER,
- CellType.COLUMN_NUMBER,
- CellType.ROW_SPAN,
- CellType.COLUMN_SPAN,
+ CellType.ROW_NUMBER: {CellType.ROW_NUMBER},
+ CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER},
+ CellType.ROW_SPAN: {CellType.ROW_SPAN},
+ CellType.COLUMN_SPAN: {CellType.COLUMN_SPAN},
  },
  CellType.COLUMN_HEADER: {
- CellType.ROW_NUMBER,
- CellType.COLUMN_NUMBER,
- CellType.ROW_SPAN,
- CellType.COLUMN_SPAN,
+ CellType.ROW_NUMBER: {CellType.ROW_NUMBER},
+ CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER},
+ CellType.ROW_SPAN: {CellType.ROW_SPAN},
+ CellType.COLUMN_SPAN: {CellType.COLUMN_SPAN},
  },
  CellType.PROJECTED_ROW_HEADER: {
- CellType.ROW_NUMBER,
- CellType.COLUMN_NUMBER,
- CellType.ROW_SPAN,
- CellType.COLUMN_SPAN,
+ CellType.ROW_NUMBER: {CellType.ROW_NUMBER},
+ CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER},
+ CellType.ROW_SPAN: {CellType.ROW_SPAN},
+ CellType.COLUMN_SPAN: {CellType.COLUMN_SPAN},
  },
- LayoutType.ROW: {CellType.ROW_NUMBER},
- LayoutType.COLUMN: {CellType.COLUMN_NUMBER},
+ LayoutType.ROW: {CellType.ROW_NUMBER: {CellType.ROW_NUMBER}},
+ LayoutType.COLUMN: {CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER}},
  },
  relationships={},
  summaries=(),
@@ -28,12 +28,12 @@ import numpy as np
 
  from ..datapoint.annotation import ImageAnnotation
  from ..datapoint.box import crop_box_from_image
- from ..datapoint.image import Image
+ from ..datapoint.image import Image, MetaAnnotation
  from ..extern.base import DetectionResult, ObjectDetector, PdfMiner
  from ..utils.settings import ObjectTypes, Relationships, TypeOrStr, get_type
  from ..utils.transform import PadTransform
  from ..utils.types import PixelValues
- from .base import MetaAnnotation, PipelineComponent
+ from .base import PipelineComponent
  from .registry import pipeline_component_registry
 
 
@@ -25,13 +25,13 @@ from copy import deepcopy
  from typing import Optional, Sequence, Union
 
  from ..datapoint.annotation import ImageAnnotation
- from ..datapoint.image import Image
+ from ..datapoint.image import Image, MetaAnnotation
  from ..extern.base import ObjectDetector, PdfMiner, TextRecognizer
  from ..extern.tessocr import TesseractOcrDetector
  from ..utils.error import ImageError
  from ..utils.settings import ObjectTypes, PageType, TypeOrStr, WordType, get_type
  from ..utils.types import PixelValues
- from .base import MetaAnnotation, PipelineComponent
+ from .base import PipelineComponent
  from .registry import pipeline_component_registry
 
  __all__ = ["TextExtractionService"]
@@ -202,16 +202,21 @@ class TextExtractionService(PipelineComponent):
  return 1
 
  def get_meta_annotation(self) -> MetaAnnotation:
- sub_cat_dict: dict[ObjectTypes, set[ObjectTypes]]
+ sub_cat_dict: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]]
  if self.extract_from_category:
- sub_cat_dict = {category: {WordType.CHARACTERS} for category in self.extract_from_category}
+ sub_cat_dict = {
+ category: {WordType.CHARACTERS: {WordType.CHARACTERS}} for category in self.extract_from_category
+ }
  else:
  if not isinstance(self.predictor, (ObjectDetector, PdfMiner)):
  raise TypeError(
  f"self.predictor must be of type ObjectDetector or PdfMiner but is of type "
  f"{type(self.predictor)}"
  )
- sub_cat_dict = {category: {WordType.CHARACTERS} for category in self.predictor.get_category_names()}
+ sub_cat_dict = {
+ category: {WordType.CHARACTERS: {WordType.CHARACTERS}}
+ for category in self.predictor.get_category_names()
+ }
  return MetaAnnotation(
  image_annotations=self.predictor.get_category_names()
  if isinstance(self.predictor, (ObjectDetector, PdfMiner))
@@ -22,9 +22,9 @@ Transform style pipeline components.
  from __future__ import annotations
 
  from .. import DetectionResult
- from ..datapoint.image import Image
+ from ..datapoint.image import Image, MetaAnnotation
  from ..extern.base import ImageTransformer
- from .base import MetaAnnotation, PipelineComponent
+ from .base import PipelineComponent
  from .registry import pipeline_component_registry
 
 
@@ -83,8 +83,6 @@ class SimpleTransformService(PipelineComponent):
  for detect_result in output_detect_results:
  ann = dp.get_annotation(annotation_ids=detect_result.uuid)[0]
  transformed_ann_id = self.dp_manager.set_image_annotation(detect_result)
- if transformed_ann_id is None:
- print("here")
  transformed_ann = self.dp_manager.datapoint.get_annotation(annotation_ids=transformed_ann_id)[0]
 
  for key, sub_ann in ann.sub_categories.items():
@@ -18,6 +18,7 @@ from types import ModuleType
  from typing import Any, Union, no_type_check
 
  import importlib_metadata
+ import numpy as np
  from packaging import version
 
  from .error import DependencyError
@@ -249,6 +250,39 @@ def get_distance_requirement() -> Requirement:
  return "distance", distance_available(), _DISTANCE_ERR_MSG
 
 
+ _NUMPY_V1_ERR_MSG = "numpy v1 must be installed."
+
+
+ def numpy_v1_available() -> bool:
+ """
+ Check if the installed NumPy version is version 1.
+
+ This helper function determines whether the currently installed version
+ of NumPy is version 1 by inspecting its major version number.
+
+ Returns:
+ True if the installed NumPy version is 1, otherwise False
+ """
+ major_version = np.__version__.split('.', maxsplit=1)[0]
+ print(f"major version: {major_version}")
+ if major_version in (1, "1"):
+ return True
+ return False
+
+
+ def get_numpy_v1_requirement() -> Requirement:
+ """
+ Retrieves the requirement details for numpy version 1.
+
+ Returns:
+ A tuple containing three elements:
+ - The requirement name for numpy version 1.
+ - A Boolean value indicating whether numpy version 1 is available.
+ - An error message in case numpy version 1 is not available.
+ """
+ return "numpy v1", numpy_v1_available(), _NUMPY_V1_ERR_MSG
+
+
  # Transformers
  _TRANSFORMERS_AVAILABLE = importlib.util.find_spec("transformers") is not None
  _TRANSFORMERS_ERR_MSG = f"transformers must be installed. {_GENERIC_ERR_MSG}"
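
Note on the hunk above: `numpy_v1_available()` inspects only the major component of `np.__version__`, and `get_numpy_v1_requirement()` wraps the result into the usual `(name, available, error_message)` triple consumed by `get_requirements` implementations such as the FastText change at the top of this diff. For illustration only, an equivalent check using `packaging` (already imported in this module) could read:

    # Illustrative alternative, not the package's code: the same major-version test via packaging.
    import numpy as np
    from packaging import version

    def is_numpy_v1() -> bool:
        return version.parse(np.__version__).major == 1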
@@ -70,7 +70,6 @@ AnnotationDict: TypeAlias = dict[str, Any]
  ImageDict: TypeAlias = dict[str, Any]
 
  # We use these types for output types of the Page object
- Text_: TypeAlias = dict[str, Any]
  HTML: TypeAlias = str
  csv: TypeAlias = list[list[str]]
  Chunks: TypeAlias = list[tuple[str, str, int, str, str, str, str]]
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: deepdoctection
- Version: 0.43.6
+ Version: 0.44.1
  Summary: Repository for Document AI
  Home-page: https://github.com/deepdoctection/deepdoctection
  Author: Dr. Janis Meyer
@@ -27,7 +27,7 @@ Requires-Dist: networkx>=2.7.1
  Requires-Dist: numpy<2.0,>=1.21
  Requires-Dist: packaging>=20.0
  Requires-Dist: Pillow>=10.0.0
- Requires-Dist: pypdf>=3.16.0
+ Requires-Dist: pypdf>=6.0.0
  Requires-Dist: pypdfium2>=4.30.0
  Requires-Dist: pyyaml>=6.0.1
  Requires-Dist: pyzmq>=16
@@ -46,7 +46,7 @@ Requires-Dist: networkx>=2.7.1; extra == "tf"
  Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
  Requires-Dist: packaging>=20.0; extra == "tf"
  Requires-Dist: Pillow>=10.0.0; extra == "tf"
- Requires-Dist: pypdf>=3.16.0; extra == "tf"
+ Requires-Dist: pypdf>=6.0.0; extra == "tf"
  Requires-Dist: pypdfium2>=4.30.0; extra == "tf"
  Requires-Dist: pyyaml>=6.0.1; extra == "tf"
  Requires-Dist: pyzmq>=16; extra == "tf"
@@ -78,7 +78,7 @@ Requires-Dist: networkx>=2.7.1; extra == "pt"
  Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
  Requires-Dist: packaging>=20.0; extra == "pt"
  Requires-Dist: Pillow>=10.0.0; extra == "pt"
- Requires-Dist: pypdf>=3.16.0; extra == "pt"
+ Requires-Dist: pypdf>=6.0.0; extra == "pt"
  Requires-Dist: pypdfium2>=4.30.0; extra == "pt"
  Requires-Dist: pyyaml>=6.0.1; extra == "pt"
  Requires-Dist: pyzmq>=16; extra == "pt"