deepdoctection 0.44.1__py3-none-any.whl → 0.46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (33) hide show
  1. deepdoctection/__init__.py +7 -3
  2. deepdoctection/analyzer/config.py +44 -0
  3. deepdoctection/analyzer/factory.py +264 -7
  4. deepdoctection/configs/profiles.jsonl +2 -1
  5. deepdoctection/dataflow/parallel_map.py +7 -1
  6. deepdoctection/datapoint/box.py +5 -5
  7. deepdoctection/datapoint/image.py +5 -5
  8. deepdoctection/datapoint/view.py +73 -52
  9. deepdoctection/eval/cocometric.py +1 -0
  10. deepdoctection/extern/__init__.py +1 -0
  11. deepdoctection/extern/base.py +8 -1
  12. deepdoctection/extern/d2detect.py +1 -1
  13. deepdoctection/extern/doctrocr.py +18 -2
  14. deepdoctection/extern/fastlang.py +2 -2
  15. deepdoctection/extern/hflayoutlm.py +17 -10
  16. deepdoctection/extern/hflm.py +432 -7
  17. deepdoctection/extern/tessocr.py +17 -1
  18. deepdoctection/pipe/language.py +4 -4
  19. deepdoctection/pipe/lm.py +7 -3
  20. deepdoctection/pipe/order.py +12 -6
  21. deepdoctection/pipe/refine.py +10 -1
  22. deepdoctection/pipe/text.py +6 -0
  23. deepdoctection/pipe/transform.py +3 -0
  24. deepdoctection/utils/file_utils.py +34 -5
  25. deepdoctection/utils/logger.py +38 -1
  26. deepdoctection/utils/settings.py +2 -0
  27. deepdoctection/utils/transform.py +43 -18
  28. deepdoctection/utils/viz.py +24 -15
  29. {deepdoctection-0.44.1.dist-info → deepdoctection-0.46.dist-info}/METADATA +16 -21
  30. {deepdoctection-0.44.1.dist-info → deepdoctection-0.46.dist-info}/RECORD +33 -33
  31. {deepdoctection-0.44.1.dist-info → deepdoctection-0.46.dist-info}/WHEEL +0 -0
  32. {deepdoctection-0.44.1.dist-info → deepdoctection-0.46.dist-info}/licenses/LICENSE +0 -0
  33. {deepdoctection-0.44.1.dist-info → deepdoctection-0.46.dist-info}/top_level.txt +0 -0
@@ -25,11 +25,10 @@ from .utils.logger import LoggingRecord, logger
25
25
 
26
26
  # pylint: enable=wrong-import-position
27
27
 
28
- __version__ = "0.44.1"
28
+ __version__ = "0.46"
29
29
 
30
30
  _IMPORT_STRUCTURE = {
31
- "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
32
- "configs": ["update_cfg_from_defaults"],
31
+ "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory", "update_cfg_from_defaults"],
33
32
  "dataflow": [
34
33
  "DataFlowTerminated",
35
34
  "DataFlowResetStateNotCalled",
@@ -186,7 +185,9 @@ _IMPORT_STRUCTURE = {
186
185
  "HFLayoutLmv3SequenceClassifier",
187
186
  "HFLiltTokenClassifier",
188
187
  "HFLiltSequenceClassifier",
188
+ "HFLmTokenClassifier",
189
189
  "HFLmSequenceClassifier",
190
+ "HFLmLanguageDetector",
190
191
  "ModelProfile",
191
192
  "ModelCatalog",
192
193
  "print_model_infos",
@@ -270,6 +271,7 @@ _IMPORT_STRUCTURE = {
270
271
  "MultiThreadPipelineComponent",
271
272
  "DoctectionPipe",
272
273
  "LanguageDetectionService",
274
+ "skip_if_category_or_service_extracted",
273
275
  "ImageLayoutService",
274
276
  "LMTokenClassifierService",
275
277
  "LMSequenceClassifierService",
@@ -309,12 +311,14 @@ _IMPORT_STRUCTURE = {
309
311
  "get_tensorpack_requirement",
310
312
  "pytorch_available",
311
313
  "get_pytorch_requirement",
314
+ "pyzmq_available",
312
315
  "lxml_available",
313
316
  "get_lxml_requirement",
314
317
  "apted_available",
315
318
  "get_apted_requirement",
316
319
  "distance_available",
317
320
  "get_distance_requirement",
321
+ "networkx_available",
318
322
  "numpy_v1_available",
319
323
  "get_numpy_v1_requirement",
320
324
  "transformers_available",
@@ -520,6 +520,16 @@ cfg.USE_LAYOUT_LINK = False
520
520
  # (e.g., by grouping orphan text containers). Only applicable if list items were previously grouped.
521
521
  cfg.USE_LINE_MATCHER = False
522
522
 
523
+ # Enables a sequence classification pipeline component, e.g. a LayoutLM or a Bert-like model.
524
+ cfg.USE_LM_SEQUENCE_CLASS = False
525
+
526
+ # Enables a token classification pipeline component, e.g. a LayoutLM or Bert-like model
527
+ cfg.USE_LM_TOKEN_CLASS = False
528
+
529
+ # Specifies the selection of the rotation model. There are two models available: A rotation estimator
530
+ # based on Tesseract ('tesseract'), and a rotation estimator based on DocTr ('doctr').
531
+ cfg.ROTATOR.MODEL = "tesseract"
532
+
523
533
  # Relevant when LIB = TF. Specifies the layout detection model.
524
534
  # This model should detect multiple or single objects across an entire page.
525
535
  # Currently, only one default model is supported.
@@ -899,6 +909,40 @@ cfg.LAYOUT_LINK.PARENTAL_CATEGORIES = [LayoutType.FIGURE, LayoutType.TABLE]
899
909
  # These are typically smaller or subordinate elements (e.g., captions).
900
910
  cfg.LAYOUT_LINK.CHILD_CATEGORIES = [LayoutType.CAPTION]
901
911
 
912
+
913
+ # Weights configuration for sequence classifier. This will be a fine-tuned version of a LayoutLM, LayoutLMv2,
914
+ # LayoutXLM, LayoutLMv3, LiLT or Roberta base model for sequence classification.
915
+ cfg.LM_SEQUENCE_CLASS.WEIGHTS = None
916
+
917
+ # When predicting document classes, it might be possible that some pages are empty or do not contain any text, in
918
+ # which case the model will be unable to predict anything. If set to `True` it will
919
+ # assign images with no features the category `TokenClasses.OTHER`.
920
+ cfg.LM_SEQUENCE_CLASS.USE_OTHER_AS_DEFAULT_CATEGORY = False
921
+
922
+ # Weights configuration for token classifier. This will be a fine-tuned version of a LayoutLM, LayoutLMv2,
923
+ # LayoutXLM, LayoutLMv3, LiLT or Roberta base model for token classification.
924
+ cfg.LM_TOKEN_CLASS.WEIGHTS = None
925
+
926
+ # When predicting token classes, it might be possible that some words might not get sent to the model because they are
927
+ # categorized as ineligible tokens (e.g. empty strings). If set to `True`, it will assign the category
928
+ # `TokenClasses.OTHER` to all words without a token.
929
+ cfg.LM_TOKEN_CLASS.USE_OTHER_AS_DEFAULT_CATEGORY = False
930
+
931
+ # Using bounding boxes of segments instead of words might improve model accuracy
932
+ # for models that have been trained on segments rather than words (e.g. LiLT, LayoutLMv3).
933
+ # Choose a single or a sequence of layout segments to use their bounding boxes. Note,
934
+ # that the layout segments need to have a child-relationship with words. If a word
935
+ # does not appear as child, it will use the word bounding box.
936
+ cfg.LM_TOKEN_CLASS.SEGMENT_POSITIONS = None
937
+
938
+ # If the output of the `tokenizer` exceeds the `max_length` sequence length, a
939
+ # sliding window will be created with each window having `max_length` sequence
940
+ # input. When using `SLIDING_WINDOW_STRIDE=0` no strides will be created,
941
+ # otherwise it will create slides with windows shifted `SLIDING_WINDOW_STRIDE` to
942
+ # the right.
943
+ cfg.LM_TOKEN_CLASS.SLIDING_WINDOW_STRIDE = 0
944
+
945
+
902
946
  # Freezes the configuration to make it immutable.
903
947
  # This prevents accidental modification at runtime.
904
948
  cfg.freeze()
@@ -19,16 +19,29 @@
19
19
  `ServiceFactory` for building analyzers
20
20
  """
21
21
 
22
+ from __future__ import annotations
22
23
 
23
24
  from os import environ
24
- from typing import Union
25
+ from typing import TYPE_CHECKING, Literal, Union
25
26
 
26
27
  from lazy_imports import try_import
27
28
 
28
29
  from ..extern.base import ImageTransformer, ObjectDetector, PdfMiner
29
30
  from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
30
- from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
31
+ from ..extern.doctrocr import DocTrRotationTransformer, DoctrTextlineDetector, DoctrTextRecognizer
31
32
  from ..extern.hfdetr import HFDetrDerivedDetector
33
+ from ..extern.hflayoutlm import (
34
+ HFLayoutLmSequenceClassifier,
35
+ HFLayoutLmTokenClassifier,
36
+ HFLayoutLmv2SequenceClassifier,
37
+ HFLayoutLmv2TokenClassifier,
38
+ HFLayoutLmv3SequenceClassifier,
39
+ HFLayoutLmv3TokenClassifier,
40
+ HFLiltSequenceClassifier,
41
+ HFLiltTokenClassifier,
42
+ get_tokenizer_from_model_class,
43
+ )
44
+ from ..extern.hflm import HFLmSequenceClassifier, HFLmTokenClassifier
32
45
  from ..extern.model import ModelCatalog, ModelDownloadManager
33
46
  from ..extern.pdftext import PdfPlumberTextDetector
34
47
  from ..extern.tessocr import TesseractOcrDetector, TesseractRotationTransformer
@@ -45,6 +58,7 @@ from ..pipe.common import (
45
58
  )
46
59
  from ..pipe.doctectionpipe import DoctectionPipe
47
60
  from ..pipe.layout import ImageLayoutService, skip_if_category_or_service_extracted
61
+ from ..pipe.lm import LMSequenceClassifierService, LMTokenClassifierService
48
62
  from ..pipe.order import TextOrderService
49
63
  from ..pipe.refine import TableSegmentationRefinementService
50
64
  from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
@@ -60,6 +74,11 @@ from ..utils.transform import PadTransform
60
74
  with try_import() as image_guard:
61
75
  from botocore.config import Config # type: ignore
62
76
 
77
+ if TYPE_CHECKING:
78
+ from ..extern.hflayoutlm import LayoutSequenceModels, LayoutTokenModels
79
+ from ..extern.hflm import LmSequenceModels, LmTokenModels
80
+
81
+ RotationTransformer = Union[TesseractRotationTransformer, DocTrRotationTransformer]
63
82
 
64
83
  __all__ = [
65
84
  "ServiceFactory",
@@ -172,24 +191,32 @@ class ServiceFactory:
172
191
  return ServiceFactory._build_layout_detector(config, mode)
173
192
 
174
193
  @staticmethod
175
- def _build_rotation_detector() -> TesseractRotationTransformer:
194
+ def _build_rotation_detector(rotator_name: Literal["tesseract", "doctr"]) -> RotationTransformer:
176
195
  """
177
196
  Building a rotation detector.
178
197
 
179
198
  Returns:
180
199
  RotationTransformer: Rotation detector instance (Tesseract- or DocTr-based, selected by `rotator_name`).
181
200
  """
182
- return TesseractRotationTransformer()
201
+
202
+ if rotator_name == "tesseract":
203
+ return TesseractRotationTransformer()
204
+ if rotator_name == "doctr":
205
+ return DocTrRotationTransformer()
206
+ raise ValueError(
207
+ f"You have chosen rotator_name: {rotator_name} which is not allowed. Only tesseract or "
208
+ f"doctr are allowed."
209
+ )
183
210
 
184
211
  @staticmethod
185
- def build_rotation_detector() -> TesseractRotationTransformer:
212
+ def build_rotation_detector(rotator_name: Literal["tesseract", "doctr"]) -> RotationTransformer:
186
213
  """
187
214
  Building a rotation detector.
188
215
 
189
216
  Returns:
190
217
  RotationTransformer: Rotation detector instance (Tesseract- or DocTr-based, selected by `rotator_name`).
191
218
  """
192
- return ServiceFactory._build_rotation_detector()
219
+ return ServiceFactory._build_rotation_detector(rotator_name)
193
220
 
194
221
  @staticmethod
195
222
  def _build_transform_service(transform_predictor: ImageTransformer) -> SimpleTransformService:
@@ -841,6 +868,226 @@ class ServiceFactory:
841
868
  """
842
869
  return ServiceFactory._build_text_order_service(config)
843
870
 
871
+ @staticmethod
872
+ def _build_sequence_classifier(config: AttrDict) -> Union[LayoutSequenceModels, LmSequenceModels]:
873
+ """
874
+ Builds and returns a sequence classifier instance.
875
+
876
+ Args:
877
+ config: Configuration object that determines the type of sequence classifier to construct.
878
+
879
+ Returns:
880
+ A sequence classifier instance constructed according to the specified configuration.
881
+ """
882
+ config_path = ModelCatalog.get_full_path_configs(config.LM_SEQUENCE_CLASS.WEIGHTS)
883
+ weights_path = ModelDownloadManager.maybe_download_weights_and_configs(config.LM_SEQUENCE_CLASS.WEIGHTS)
884
+ profile = ModelCatalog.get_profile(config.LM_SEQUENCE_CLASS.WEIGHTS)
885
+ categories = profile.categories if profile.categories is not None else {}
886
+ use_xlm_tokenizer = "xlm_tokenizer" == profile.architecture
887
+
888
+ if profile.model_wrapper in ("HFLayoutLmSequenceClassifier",):
889
+ return HFLayoutLmSequenceClassifier(
890
+ path_config_json=config_path,
891
+ path_weights=weights_path,
892
+ categories=categories,
893
+ device=config.DEVICE,
894
+ use_xlm_tokenizer=use_xlm_tokenizer,
895
+ )
896
+ if profile.model_wrapper in ("HFLayoutLmv2SequenceClassifier",):
897
+ return HFLayoutLmv2SequenceClassifier(
898
+ path_config_json=config_path,
899
+ path_weights=weights_path,
900
+ categories=categories,
901
+ device=config.DEVICE,
902
+ use_xlm_tokenizer=use_xlm_tokenizer,
903
+ )
904
+ if profile.model_wrapper in ("HFLayoutLmv3SequenceClassifier",):
905
+ return HFLayoutLmv3SequenceClassifier(
906
+ path_config_json=config_path,
907
+ path_weights=weights_path,
908
+ categories=categories,
909
+ device=config.DEVICE,
910
+ use_xlm_tokenizer=use_xlm_tokenizer,
911
+ )
912
+ if profile.model_wrapper in ("HFLiltSequenceClassifier",):
913
+ return HFLiltSequenceClassifier(
914
+ path_config_json=config_path,
915
+ path_weights=weights_path,
916
+ categories=categories,
917
+ device=config.DEVICE,
918
+ use_xlm_tokenizer=use_xlm_tokenizer,
919
+ )
920
+ if profile.model_wrapper in ("HFLmSequenceClassifier",):
921
+ return HFLmSequenceClassifier(
922
+ path_config_json=config_path,
923
+ path_weights=weights_path,
924
+ categories=categories,
925
+ device=config.DEVICE,
926
+ use_xlm_tokenizer=use_xlm_tokenizer,
927
+ )
928
+ raise ValueError(f"Unsupported model wrapper: {profile.model_wrapper}")
929
+
930
+ @staticmethod
931
+ def build_sequence_classifier(config: AttrDict) -> Union[LayoutSequenceModels, LmSequenceModels]:
932
+ """
933
+ Builds and returns a sequence classifier instance.
934
+
935
+ Args:
936
+ config: Configuration object that determines the type of sequence classifier to construct.
937
+
938
+ Returns:
939
+ A sequence classifier instance constructed according to the specified configuration.
940
+ """
941
+ return ServiceFactory._build_sequence_classifier(config)
942
+
943
+ @staticmethod
944
+ def _build_sequence_classifier_service(
945
+ config: AttrDict, sequence_classifier: Union[LayoutSequenceModels, LmSequenceModels]
946
+ ) -> LMSequenceClassifierService:
947
+ """
948
+ Building a sequence classifier service.
949
+
950
+ Args:
951
+ config: Configuration object.
952
+ sequence_classifier: Sequence classifier instance.
953
+
954
+ Returns:
955
+ LMSequenceClassifierService: Sequence classifier service instance.
956
+ """
957
+ tokenizer_fast = get_tokenizer_from_model_class(
958
+ sequence_classifier.model.__class__.__name__, sequence_classifier.use_xlm_tokenizer
959
+ )
960
+
961
+ return LMSequenceClassifierService(
962
+ tokenizer=tokenizer_fast,
963
+ language_model=sequence_classifier,
964
+ use_other_as_default_category=config.LM_SEQUENCE_CLASS.USE_OTHER_AS_DEFAULT_CATEGORY,
965
+ )
966
+
967
+ @staticmethod
968
+ def build_sequence_classifier_service(
969
+ config: AttrDict, sequence_classifier: Union[LayoutSequenceModels, LmSequenceModels]
970
+ ) -> LMSequenceClassifierService:
971
+ """
972
+ Building a sequence classifier service.
973
+
974
+ Args:
975
+ config: Configuration object.
976
+ sequence_classifier: Sequence classifier instance.
977
+
978
+ Returns:
979
+ LMSequenceClassifierService: Sequence classifier service instance.
980
+ """
981
+ return ServiceFactory._build_sequence_classifier_service(config, sequence_classifier)
982
+
983
+ @staticmethod
984
+ def _build_token_classifier(config: AttrDict) -> Union[LayoutTokenModels, LmTokenModels]:
985
+ """
986
+ Builds and returns a token classifier model.
987
+
988
+ Args:
989
+ config: Configuration object.
990
+
991
+ Returns:
992
+ The instantiated token classifier model.
993
+ """
994
+ config_path = ModelCatalog.get_full_path_configs(config.LM_TOKEN_CLASS.WEIGHTS)
995
+ weights_path = ModelDownloadManager.maybe_download_weights_and_configs(config.LM_TOKEN_CLASS.WEIGHTS)
996
+ profile = ModelCatalog.get_profile(config.LM_TOKEN_CLASS.WEIGHTS)
997
+ categories = profile.categories if profile.categories is not None else {}
998
+ use_xlm_tokenizer = "xlm_tokenizer" == profile.architecture
999
+ if profile.model_wrapper in ("HFLayoutLmTokenClassifier",):
1000
+ return HFLayoutLmTokenClassifier(
1001
+ path_config_json=config_path,
1002
+ path_weights=weights_path,
1003
+ categories=categories,
1004
+ device=config.DEVICE,
1005
+ use_xlm_tokenizer=use_xlm_tokenizer,
1006
+ )
1007
+ if profile.model_wrapper in ("HFLayoutLmv2TokenClassifier",):
1008
+ return HFLayoutLmv2TokenClassifier(
1009
+ path_config_json=config_path,
1010
+ path_weights=weights_path,
1011
+ categories=categories,
1012
+ device=config.DEVICE,
1013
+ )
1014
+ if profile.model_wrapper in ("HFLayoutLmv3TokenClassifier",):
1015
+ return HFLayoutLmv3TokenClassifier(
1016
+ path_config_json=config_path,
1017
+ path_weights=weights_path,
1018
+ categories=categories,
1019
+ device=config.DEVICE,
1020
+ )
1021
+ if profile.model_wrapper in ("HFLiltTokenClassifier",):
1022
+ return HFLiltTokenClassifier(
1023
+ path_config_json=config_path,
1024
+ path_weights=weights_path,
1025
+ categories=categories,
1026
+ device=config.DEVICE,
1027
+ )
1028
+ if profile.model_wrapper in ("HFLmTokenClassifier",):
1029
+ return HFLmTokenClassifier(
1030
+ path_config_json=config_path,
1031
+ path_weights=weights_path,
1032
+ categories=categories,
1033
+ )
1034
+ raise ValueError(f"Unsupported model wrapper: {profile.model_wrapper}")
1035
+
1036
+ @staticmethod
1037
+ def build_token_classifier(config: AttrDict) -> Union[LayoutTokenModels, LmTokenModels]:
1038
+ """
1039
+ Builds and returns a token classifier model.
1040
+
1041
+ Args:
1042
+ config: Configuration object.
1043
+
1044
+ Returns:
1045
+ The instantiated token classifier model.
1046
+ """
1047
+ return ServiceFactory._build_token_classifier(config)
1048
+
1049
+ @staticmethod
1050
+ def _build_token_classifier_service(
1051
+ config: AttrDict, token_classifier: Union[LayoutTokenModels, LmTokenModels]
1052
+ ) -> LMTokenClassifierService:
1053
+ """
1054
+ Building a token classifier service.
1055
+
1056
+ Args:
1057
+ config: Configuration object.
1058
+ token_classifier: Token classifier instance.
1059
+
1060
+ Returns:
1061
+ A LMTokenClassifierService instance.
1062
+ """
1063
+ tokenizer_fast = get_tokenizer_from_model_class(
1064
+ token_classifier.model.__class__.__name__, token_classifier.use_xlm_tokenizer
1065
+ )
1066
+
1067
+ return LMTokenClassifierService(
1068
+ tokenizer=tokenizer_fast,
1069
+ language_model=token_classifier,
1070
+ use_other_as_default_category=config.LM_TOKEN_CLASS.USE_OTHER_AS_DEFAULT_CATEGORY,
1071
+ segment_positions=config.LM_TOKEN_CLASS.SEGMENT_POSITIONS,
1072
+ sliding_window_stride=config.LM_TOKEN_CLASS.SLIDING_WINDOW_STRIDE,
1073
+ )
1074
+
1075
+ @staticmethod
1076
+ def build_token_classifier_service(
1077
+ config: AttrDict, token_classifier: Union[LayoutTokenModels, LmTokenModels]
1078
+ ) -> LMTokenClassifierService:
1079
+ """
1080
+ Building a token classifier service.
1081
+
1082
+ Args:
1083
+ config: Configuration object.
1084
+ token_classifier: Token classifier instance.
1085
+
1086
+ Returns:
1087
+ A LMTokenClassifierService instance.
1088
+ """
1089
+ return ServiceFactory._build_token_classifier_service(config, token_classifier)
1090
+
844
1091
  @staticmethod
845
1092
  def _build_page_parsing_service(config: AttrDict) -> PageParsingService:
846
1093
  """
@@ -885,7 +1132,7 @@ class ServiceFactory:
885
1132
  pipe_component_list: list[PipelineComponent] = []
886
1133
 
887
1134
  if config.USE_ROTATOR:
888
- rotation_detector = ServiceFactory.build_rotation_detector()
1135
+ rotation_detector = ServiceFactory.build_rotation_detector(config.ROTATOR.MODEL)
889
1136
  transform_service = ServiceFactory.build_transform_service(transform_predictor=rotation_detector)
890
1137
  pipe_component_list.append(transform_service)
891
1138
 
@@ -955,6 +1202,16 @@ class ServiceFactory:
955
1202
  line_list_matching_service = ServiceFactory.build_line_matching_service(config)
956
1203
  pipe_component_list.append(line_list_matching_service)
957
1204
 
1205
+ if config.USE_LM_SEQUENCE_CLASS:
1206
+ sequence_classifier = ServiceFactory.build_sequence_classifier(config)
1207
+ sequence_classifier_service = ServiceFactory.build_sequence_classifier_service(config, sequence_classifier)
1208
+ pipe_component_list.append(sequence_classifier_service)
1209
+
1210
+ if config.USE_LM_TOKEN_CLASS:
1211
+ token_classifier = ServiceFactory.build_token_classifier(config)
1212
+ token_classifier_service = ServiceFactory.build_token_classifier_service(config, token_classifier)
1213
+ pipe_component_list.append(token_classifier_service)
1214
+
958
1215
  page_parsing_service = ServiceFactory.build_page_parsing_service(config)
959
1216
 
960
1217
  return DoctectionPipe(pipeline_component_list=pipe_component_list, page_parsing_service=page_parsing_service)
@@ -30,4 +30,5 @@
30
30
  {"name": "Felix92/doctr-torch-parseq-multilingual-v1/pytorch_model.bin", "description": "", "size": [63286381], "tp_model": false, "config": "Felix92/doctr-torch-parseq-multilingual-v1/config.json", "preprocessor_config": null, "hf_repo_id": "Felix92/doctr-torch-parseq-multilingual-v1", "hf_model_name": "pytorch_model.bin", "hf_config_file": ["config.json"], "urls": null, "categories": {}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "DoctrTextRecognizer", "architecture": "parseq", "padding": null}
31
31
  {"name": "doctr/crnn_vgg16_bn/pt/master-fde31e4a.pt", "description": "MASTER", "size": [63286381], "tp_model": false, "config": null, "preprocessor_config": null, "hf_repo_id": null, "hf_model_name": null, "hf_config_file": null, "urls": ["https://doctr-static.mindee.com/models?id=v0.7.0/master-fde31e4a.pt&src=0"], "categories": {}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "DoctrTextRecognizer", "architecture": "master", "padding": null}
32
32
  {"name": "Aryn/deformable-detr-DocLayNet/model.safetensors", "description": "Deformable DEtection TRansformer (DETR), trained on DocLayNet (including 80k annotated pages in 11 classes).", "size": [115511753], "tp_model": false, "config": "Aryn/deformable-detr-DocLayNet/config.json", "preprocessor_config": "Aryn/deformable-detr-DocLayNet/preprocessor_config.json", "hf_repo_id": "Aryn/deformable-detr-DocLayNet", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "default_type", "2": "caption", "11": "text", "12": "title", "3": "footnote", "4": "formula", "5": "list_item", "6": "page_footer", "7": "page_header", "8": "figure", "9": "section_header", "10": "table"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
33
- {"name": "deepdoctection/tatr_tab_struct_v2/model.safetensors", "description": "Table Transformer (DETR) model trained on PubTables1M. It was introduced in the paper Aligning benchmark datasets for table structure recognition by Smock et al. This model is devoted to table structure recognition and assumes to receive a slightly croppedtable as input. It will predict rows, column and spanning cells. Use a padding of around 5 pixels. This artefact has been converted from deepdoctection/tatr_tab_struct_v2/pytorch_model.bin and should be used to reduce security issues", "size": [115511753], "tp_model": false, "config": "deepdoctection/tatr_tab_struct_v2/config.json", "preprocessor_config": "deepdoctection/tatr_tab_struct_v2/preprocessor_config.json", "hf_repo_id": "deepdoctection/tatr_tab_struct_v2", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "table", "2": "column", "3": "row", "4": "column_header", "5": "projected_row_header", "6": "spanning"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
33
+ {"name": "deepdoctection/tatr_tab_struct_v2/model.safetensors", "description": "Table Transformer (DETR) model trained on PubTables1M. It was introduced in the paper Aligning benchmark datasets for table structure recognition by Smock et al. This model is devoted to table structure recognition and assumes to receive a slightly croppedtable as input. It will predict rows, column and spanning cells. Use a padding of around 5 pixels. This artefact has been converted from deepdoctection/tatr_tab_struct_v2/pytorch_model.bin and should be used to reduce security issues", "size": [115511753], "tp_model": false, "config": "deepdoctection/tatr_tab_struct_v2/config.json", "preprocessor_config": "deepdoctection/tatr_tab_struct_v2/preprocessor_config.json", "hf_repo_id": "deepdoctection/tatr_tab_struct_v2", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "table", "2": "column", "3": "row", "4": "column_header", "5": "projected_row_header", "6": "spanning"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
34
+ {"name": "papluca/xlm-roberta-base-language-detection/model.safetensors", "description": "This model is an XLM-RoBERTa transformer model with a classification head on top (i.e. a linear layer on top of the pooled output). For additional information please refer to the xlm-roberta-base model card or to the paper Unsupervised Cross-lingual Representation Learning at Scale by Conneau et al.", "size": [101971449], "tp_model": false, "config": "papluca/xlm-roberta-base-language-detection/config.json", "preprocessor_config": null, "hf_repo_id": "papluca/xlm-roberta-base-language-detection", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json"], "urls": null, "categories": {"1": "jpn", "2": "dut", "3": "ara", "4": "pol", "5": "deu", "6": "ita", "7": "por", "8": "tur", "9": "spa", "10": "hin", "11": "gre", "12": "urd", "13": "bul", "14": "eng", "15": "fre", "16": "chi", "17": "rus", "18": "tha", "19": "swa", "20": "vie"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFLmLanguageDetector", "architecture": null, "padding": null}
@@ -24,15 +24,19 @@ from abc import ABC, abstractmethod
24
24
  from contextlib import contextmanager
25
25
  from typing import Any, Callable, Iterator, no_type_check
26
26
 
27
- import zmq
27
+ from lazy_imports import try_import
28
28
 
29
29
  from ..utils.concurrency import StoppableThread, enable_death_signal, start_proc_mask_signal
30
30
  from ..utils.error import DataFlowTerminatedError
31
+ from ..utils.file_utils import pyzmq_available
31
32
  from ..utils.logger import LoggingRecord, logger
32
33
  from .base import DataFlow, DataFlowReentrantGuard, ProxyDataFlow
33
34
  from .common import RepeatedData
34
35
  from .serialize import PickleSerializer
35
36
 
37
+ with try_import() as import_guard:
38
+ import zmq
39
+
36
40
 
37
41
  @no_type_check
38
42
  def del_weakref(x):
@@ -77,6 +81,8 @@ def _get_pipe_name(name):
77
81
 
78
82
  class _ParallelMapData(ProxyDataFlow, ABC):
79
83
  def __init__(self, df: DataFlow, buffer_size: int, strict: bool = False) -> None:
84
+ if not pyzmq_available():
85
+ raise ModuleNotFoundError("pyzmq is required for running parallel dataflows (multiprocess/multithread).")
80
86
  super().__init__(df)
81
87
  if buffer_size <= 0:
82
88
  raise ValueError(f"buffer_size must be a positive number, got {buffer_size}")
@@ -284,7 +284,7 @@ class BoundingBox:
284
284
  raise BoundingBoxError(
285
285
  f"bounding box must have height and width >0. Check coords "
286
286
  f"ulx: {self.ulx}, uly: {self.uly}, lrx: {self.lrx}, "
287
- f"lry: {self.lry}."
287
+ f"lry: {self.lry}, absolute_coords: {self.absolute_coords}"
288
288
  )
289
289
  if not self.absolute_coords and not (
290
290
  0 <= self.ulx <= 1 and 0 <= self.uly <= 1 and 0 <= self.lrx <= 1 and 0 <= self.lry <= 1
@@ -505,10 +505,10 @@ class BoundingBox:
505
505
  if self.absolute_coords:
506
506
  transformed_box = BoundingBox(
507
507
  absolute_coords=not self.absolute_coords,
508
- ulx=max(self.ulx / image_width, 0.0),
509
- uly=max(self.uly / image_height, 0.0),
510
- lrx=min(self.lrx / image_width, 1.0),
511
- lry=min(self.lry / image_height, 1.0),
508
+ ulx=min(max(self.ulx / image_width, 0.0), 1.0),
509
+ uly=min(max(self.uly / image_height, 0.0), 1.0),
510
+ lrx=max(min(self.lrx / image_width, 1.0), 0.0),
511
+ lry=max(min(self.lry / image_height, 1.0), 0.0),
512
512
  )
513
513
  else:
514
514
  transformed_box = BoundingBox(
@@ -36,7 +36,7 @@ from ..utils.logger import LoggingRecord, logger
36
36
  from ..utils.settings import ObjectTypes, SummaryType, get_type
37
37
  from ..utils.types import ImageDict, PathLikeOrStr, PixelValues
38
38
  from .annotation import Annotation, AnnotationMap, BoundingBox, CategoryAnnotation, ImageAnnotation
39
- from .box import crop_box_from_image, global_to_local_coords, intersection_box
39
+ from .box import BoxCoordinate, crop_box_from_image, global_to_local_coords, intersection_box
40
40
  from .convert import as_dict, convert_b64_to_np_array, convert_np_array_to_b64, convert_pdf_bytes_to_np_array_v2
41
41
 
42
42
 
@@ -318,7 +318,7 @@ class Image:
318
318
  return _Img(self.image)
319
319
 
320
320
  @property
321
- def width(self) -> float:
321
+ def width(self) -> BoxCoordinate:
322
322
  """
323
323
  `width`
324
324
  """
@@ -327,7 +327,7 @@ class Image:
327
327
  return self._bbox.width
328
328
 
329
329
  @property
330
- def height(self) -> float:
330
+ def height(self) -> BoxCoordinate:
331
331
  """
332
332
  `height`
333
333
  """
@@ -335,7 +335,7 @@ class Image:
335
335
  raise ImageError("Height not available. Call set_width_height first")
336
336
  return self._bbox.height
337
337
 
338
- def set_width_height(self, width: float, height: float) -> None:
338
+ def set_width_height(self, width: BoxCoordinate, height: BoxCoordinate) -> None:
339
339
  """
340
340
  Defines bounding box of the image if not already set. Use this, if you do not want to keep the image separated
341
341
  for memory reasons.
@@ -345,7 +345,7 @@ class Image:
345
345
  height: height of image
346
346
  """
347
347
  if self._bbox is None:
348
- self._bbox = BoundingBox(ulx=0.0, uly=0.0, height=height, width=width, absolute_coords=True)
348
+ self._bbox = BoundingBox(ulx=0, uly=0, height=height, width=width, absolute_coords=True)
349
349
  self._self_embedding()
350
350
 
351
351
  def set_embedding(self, image_id: str, bounding_box: BoundingBox) -> None: