deepdoctection 0.43__py3-none-any.whl → 0.43.1__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

@@ -25,11 +25,11 @@ from .utils.logger import LoggingRecord, logger
25
25
 
26
26
  # pylint: enable=wrong-import-position
27
27
 
28
- __version__ = "0.43"
28
+ __version__ = "0.43.1"
29
29
 
30
30
  _IMPORT_STRUCTURE = {
31
31
  "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
32
- "configs": [],
32
+ "configs": ["update_cfg_from_defaults"],
33
33
  "dataflow": [
34
34
  "DataFlowTerminated",
35
35
  "DataFlowResetStateNotCalled",
@@ -98,6 +98,7 @@ _IMPORT_STRUCTURE = {
98
98
  "List",
99
99
  "Cell",
100
100
  "Table",
101
+ "IMAGE_DEFAULTS",
101
102
  "Page",
102
103
  ],
103
104
  "datasets": [
@@ -902,3 +902,18 @@ cfg.LAYOUT_LINK.CHILD_CATEGORIES = [LayoutType.CAPTION]
902
902
  # Freezes the configuration to make it immutable.
903
903
  # This prevents accidental modification at runtime.
904
904
  cfg.freeze()
905
+
906
+ def update_cfg_from_defaults() -> None:
907
+ """
908
+ Update the configuration with current values from IMAGE_DEFAULTS.
909
+ """
910
+ cfg.freeze(False)
911
+
912
+ # Update all dependent fields from IMAGE_DEFAULTS
913
+ cfg.TEXT_CONTAINER = IMAGE_DEFAULTS.TEXT_CONTAINER
914
+ cfg.WORD_MATCHING.PARENTAL_CATEGORIES = IMAGE_DEFAULTS.TEXT_BLOCK_CATEGORIES
915
+ cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES = IMAGE_DEFAULTS.TEXT_BLOCK_CATEGORIES
916
+ cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES = IMAGE_DEFAULTS.FLOATING_TEXT_BLOCK_CATEGORIES
917
+
918
+ # Re-freeze the configuration
919
+ cfg.freeze()
@@ -269,7 +269,10 @@ class DoctrTextlineDetector(DoctrTextlineDetectorMixin):
269
269
  if self.lib == "PT":
270
270
  self.device = get_torch_device(device)
271
271
 
272
- self.doctr_predictor = self.get_wrapped_model(self.architecture, self.path_weights, self.device, self.lib)
272
+ self.doctr_predictor = self.get_wrapped_model(self.architecture,
273
+ self.path_weights,
274
+ self.device,
275
+ self.lib)
273
276
 
274
277
  def predict(self, np_img: PixelValues) -> list[DetectionResult]:
275
278
  """
@@ -424,7 +427,7 @@ class DoctrTextRecognizer(TextRecognizer):
424
427
  return _get_doctr_requirements()
425
428
 
426
429
  def clone(self) -> DoctrTextRecognizer:
427
- return self.__class__(self.architecture, self.path_weights, self.device, self.lib)
430
+ return self.__class__(self.architecture, self.path_weights, self.device, self.lib, self.path_config_json)
428
431
 
429
432
  @staticmethod
430
433
  def load_model(
@@ -60,7 +60,7 @@ def _textract_to_detectresult(response: JsonDict, width: int, height: int, text_
60
60
  return all_results
61
61
 
62
62
 
63
- def predict_text(np_img: PixelValues, client: boto3.client, text_lines: bool) -> list[DetectionResult]: # type: ignore
63
+ def predict_text(np_img: PixelValues, client: boto3.client, text_lines: bool) -> list[DetectionResult]:
64
64
  """
65
65
  Calls AWS Textract client (`detect_document_text`) and returns plain OCR results.
66
66
  AWS account required.
@@ -738,10 +738,8 @@ class TextOrderService(TextLineServiceMixin):
738
738
  text_block_anns.extend(residual_text_container_anns)
739
739
  for text_block_ann in text_block_anns:
740
740
  self.order_text_in_text_block(text_block_ann)
741
- floating_text_block_anns_to_order = [
742
- ann for ann in text_block_anns if ann.category_name in self.floating_text_block_categories
743
- ]
744
- self.order_blocks(floating_text_block_anns_to_order)
741
+ floating_text_block_anns = dp.get_annotation(category_names=self.floating_text_block_categories)
742
+ self.order_blocks(floating_text_block_anns)
745
743
  self._create_columns()
746
744
 
747
745
  def _create_columns(self) -> None:
@@ -803,9 +801,14 @@ class TextOrderService(TextLineServiceMixin):
803
801
  if self.include_residual_text_container:
804
802
  add_category.append(LayoutType.LINE)
805
803
 
806
- assert set(self.floating_text_block_categories) <= set(
804
+ if set(self.floating_text_block_categories) <= set(
807
805
  self.text_block_categories + tuple(add_category)
808
- ), "floating_text_block_categories must be a subset of text_block_categories"
806
+ ):
807
+ logger.warning("In most cases floating_text_block_categories must be a subset of text_block_categories. "
808
+ "Adding categories to floating_text_block_categories, that do not belong to "
809
+ "text_block_categories makes only sense for categories set have CHILD relationships with"
810
+ " annotations that belong to text_block_categories.")
811
+
809
812
 
810
813
  def get_meta_annotation(self) -> MetaAnnotation:
811
814
  add_category = [self.text_container]
@@ -38,7 +38,7 @@ __all__ = ["timeout_manager", "save_tmp_file", "timed_operation"]
38
38
 
39
39
 
40
40
  @contextmanager
41
- def timeout_manager(proc: Any, seconds: Optional[int] = None) -> Iterator[str]: # type: ignore
41
+ def timeout_manager(proc: Any, seconds: Optional[int] = None) -> Iterator[str]:
42
42
  """
43
43
  Manager for time handling while some process is being called.
44
44
 
@@ -71,6 +71,7 @@ def log_deprecated(name: str, text: str, eos: str = "", max_num_warnings: Option
71
71
  logger.info(LoggingRecord(f"[Deprecated] {info_msg}"))
72
72
 
73
73
 
74
+
74
75
  def deprecated(
75
76
  text: str = "", eos: str = "", max_num_warnings: Optional[int] = None
76
77
  ) -> Callable[[Callable[..., T]], Callable[..., T]]:
@@ -462,7 +462,7 @@ def pt_info(data: KeyValEnvInfos) -> KeyValEnvInfos:
462
462
  data.append(("torchvision arch flags", msg))
463
463
  except (ImportError, AttributeError):
464
464
  data.append(("torchvision._C", "Not found"))
465
- except AttributeError:
465
+ except (AttributeError, ModuleNotFoundError):
466
466
  data.append(("torchvision", "unknown"))
467
467
 
468
468
  return data
@@ -12,7 +12,6 @@ import importlib.util
12
12
  import multiprocessing as mp
13
13
  import string
14
14
  import subprocess
15
- import sys
16
15
  from os import environ, path
17
16
  from shutil import which
18
17
  from types import ModuleType
@@ -22,7 +21,6 @@ import importlib_metadata
22
21
  from packaging import version
23
22
 
24
23
  from .error import DependencyError
25
- from .logger import LoggingRecord, logger
26
24
  from .metacfg import AttrDict
27
25
  from .types import PathLikeOrStr, Requirement
28
26
 
@@ -662,13 +660,6 @@ def get_doctr_requirement() -> Requirement:
662
660
  On macOS, if `poppler` is not available, this function will recursively check the requirement.
663
661
  It is not yet known how to check whether `pango`, `gdk-pixbuf`, and `libffi` are installed.
664
662
  """
665
- if sys.platform == "darwin":
666
- if not get_poppler_version():
667
- return get_doctr_requirement()
668
- # don't know yet how to check whether pango gdk-pixbuf libffi are installed
669
- logger.info(
670
- LoggingRecord("package requires weasyprint. Check that poppler pango gdk-pixbuf libffi are installed")
671
- )
672
663
  return "doctr", doctr_available(), _DOCTR_ERR_MSG
673
664
 
674
665
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deepdoctection
3
- Version: 0.43
3
+ Version: 0.43.1
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -245,11 +245,11 @@ alt="text" width="40%">
245
245
 
246
246
  ## Requirements
247
247
 
248
- ![requirements](https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/requirements_deepdoctection_220525.png)
248
+ ![requirements](https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/install_01.png)
249
249
 
250
250
  - Linux or macOS. Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available.
251
251
  - Python >= 3.9
252
- - 1.13 \<= PyTorch **or** 2.11 \<= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
252
+ - 2.2 \<= PyTorch **or** 2.11 \<= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
253
253
  Tensorflow support will be stopped from Python 3.11 onwards.
254
254
  - To fine-tune models, a GPU is recommended.
255
255
 
@@ -1,7 +1,7 @@
1
- deepdoctection/__init__.py,sha256=T4BXZotL855uGwIHhore8lZAfSinpIcrpeIvrVsSCyc,12910
1
+ deepdoctection/__init__.py,sha256=QYGjP3fSt1deLMEKIb7LUDVlzZgi1Q7phZQADkOjlGk,12964
2
2
  deepdoctection/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  deepdoctection/analyzer/__init__.py,sha256=wg0BcFwdCeREwzZfa--Yx8HUJ9LPv5z5PmLwtkZdPH8,772
4
- deepdoctection/analyzer/config.py,sha256=CgH4etvZL0JdeIHAXMFdDro2VvVsF2itgQar_Ml94pw,41185
4
+ deepdoctection/analyzer/config.py,sha256=Uwi9MYOym0QXzFcip7bRXoy732P4tZVMFBUnUZk9c1w,41761
5
5
  deepdoctection/analyzer/dd.py,sha256=2BGvZpl9o9khcaOV52-DPHMrs0DsqUO8cpdqFVHHzDQ,5176
6
6
  deepdoctection/analyzer/factory.py,sha256=DI0S38KAG2sIROrSximsWJsMbem91a9zXaeWsDNvkGg,37574
7
7
  deepdoctection/configs/__init__.py,sha256=TX_P6tqDOF1LK1mi9ruAl7x0mtv1Asm8cYWCz3Pe2dk,646
@@ -54,7 +54,7 @@ deepdoctection/extern/__init__.py,sha256=jG2qe5_X7fJFnJlx04Lf1KUTXKKKYtCkKR7WQ7l
54
54
  deepdoctection/extern/base.py,sha256=vs4EO9vkfyTW-nVM0dnlqsmnPPeHIXI2wW5b4Wpiz-Y,31547
55
55
  deepdoctection/extern/d2detect.py,sha256=I0oEkprr5iVpKpM3a3nknAU-sXwNoDQdp_B1gzzODsk,22374
56
56
  deepdoctection/extern/deskew.py,sha256=L_jU0rXh03qzwaT79EIqE_zYMUVeFwWDbsGbtahuL2k,3124
57
- deepdoctection/extern/doctrocr.py,sha256=d468jZpR4WBvZPt5Vze70dh6tZacBM1HO0pcy8ynOFM,25358
57
+ deepdoctection/extern/doctrocr.py,sha256=vOI67NCeEznHzdY4zu1tggwg3_PfwV1eigsr2Z8EPzc,25543
58
58
  deepdoctection/extern/fastlang.py,sha256=4D9A-_hTXUcvXG6IJJknX34LrD71v08XtNdWgvXD7fE,4736
59
59
  deepdoctection/extern/hfdetr.py,sha256=N3eLNI5BsQS9_7YZyBeWndSgUydJij7ugZA9p4V1xaQ,14316
60
60
  deepdoctection/extern/hflayoutlm.py,sha256=3mZZ3byn00jSrLWO2vZFas9j4VrhbYQNmF1mwPG2ElQ,59642
@@ -62,7 +62,7 @@ deepdoctection/extern/hflm.py,sha256=y-9brzmT2NYtFoNcWHABNg2ZZQXSOP9CyqtT1OoeV9U
62
62
  deepdoctection/extern/model.py,sha256=-GbnuhLFq7jpBOvtpJe6IhGXxQdqwiM8epEd7IRELoU,18234
63
63
  deepdoctection/extern/pdftext.py,sha256=ljzPQn3yYAlS6MoZqzixD-fO2GlHwu1aMiOQ6qMIzbg,7513
64
64
  deepdoctection/extern/tessocr.py,sha256=SuPmngsJg38riL4b09z6_FIzJH6H3RIwoighG2GPMYM,17457
65
- deepdoctection/extern/texocr.py,sha256=93vGj0TX2gENMFV6_FDk3et1sDecrNeuozv5EfOR5nk,5931
65
+ deepdoctection/extern/texocr.py,sha256=wVOuu6eUGao0mUbC8vrgdCsKfY1GqA1Am9560YgWyXU,5915
66
66
  deepdoctection/extern/tpdetect.py,sha256=Kr00n80V_OfE-EGfpjiVw1eAQ2n2tuT-hSco-dLSR9E,8516
67
67
  deepdoctection/extern/pt/__init__.py,sha256=3Cu0ZHjbYsJomru7-RQXEHihEQLegZrmLetlHiqS58I,742
68
68
  deepdoctection/extern/pt/nms.py,sha256=2lSpEH8cI_QXdz5xL_OaitqsGoHhp5xvDssK5Yo8q4Q,2218
@@ -111,7 +111,7 @@ deepdoctection/pipe/doctectionpipe.py,sha256=ik5F92F3klI5Nve_AnyIRj-ApMoKHSR2Sjc
111
111
  deepdoctection/pipe/language.py,sha256=T5g5_2GIsbTltAmn_PFymMUMoik8_b0uJNx8f5dT9MM,5898
112
112
  deepdoctection/pipe/layout.py,sha256=oAldMtwyZee1IqpuflKKvmeL2Z_nXFiqwFMS4VYv5eI,6391
113
113
  deepdoctection/pipe/lm.py,sha256=nYI2bm0sc9d3JMlIPyNyd4XxXFRBIHRUYfMImuek6b4,19793
114
- deepdoctection/pipe/order.py,sha256=9OarsHKwVqT1bTDIn7XGeGLgpetEJW3uLjuJQDdhjG4,40684
114
+ deepdoctection/pipe/order.py,sha256=8CqEWUA6U9HxThKKGP9yJMbRaML2by7do0Gdhl_7AdI,40964
115
115
  deepdoctection/pipe/refine.py,sha256=SrMcAWXRO5tJpqaZCEz9RzvjPyiQiE8fZ9TXBcaBKck,23310
116
116
  deepdoctection/pipe/registry.py,sha256=uT5fnHjffoNGk2JPuD2-pMYtO3Iko7-wrwVZVCWLtok,906
117
117
  deepdoctection/pipe/segment.py,sha256=rHhEWr5zZ1ppj-gMa-q-UCr1AYTWpUW7oA1umwebqBI,61302
@@ -125,11 +125,11 @@ deepdoctection/train/hf_layoutlm_train.py,sha256=bNL5OCLKytshG6kaTJDLTQOcvWKwEsY
125
125
  deepdoctection/train/tp_frcnn_train.py,sha256=Tltb-v2JD5oPuHCZGA9B5DM4ZaidoBITlH93QX-KPKI,13570
126
126
  deepdoctection/utils/__init__.py,sha256=brBceRWeov9WXMiJTjyJOF2rHMP8trGGRRjhMdZ61nI,2371
127
127
  deepdoctection/utils/concurrency.py,sha256=9ly81D5i2ZFzKfXMIUSmhT42eMs4QttsRhWXdkIk6Aw,5832
128
- deepdoctection/utils/context.py,sha256=GXgIGQ10JwosE6FQSPdlJoCXyeM_cahEcbQ2mxGNofI,4538
129
- deepdoctection/utils/develop.py,sha256=4myrqBDypM6tQ2a2Jo3Q20RuE_W2czykpXBwgXPrxNw,3568
130
- deepdoctection/utils/env_info.py,sha256=yyRyjQT3xz73yEcqauPM6GdCpvWwYGAH-_KUHfXBrtM,19855
128
+ deepdoctection/utils/context.py,sha256=5QfdzxsiSPnNs1qtJdgjguIoD8srLQ2W8oeDzwp9F78,4522
129
+ deepdoctection/utils/develop.py,sha256=x2MhbmoKZyRluesmc01is7ldrUN9c0TX4OAuc1yt6dI,3569
130
+ deepdoctection/utils/env_info.py,sha256=b1WohrfQuoL-BPN0_s8Rjtwzx-WKvCyaX2I4qYl1Emc,19878
131
131
  deepdoctection/utils/error.py,sha256=sIry8F5MZ0yLvKfAwVz90IorKWVvjoRqcC0L8qq8mLk,2480
132
- deepdoctection/utils/file_utils.py,sha256=D4cua4i3Q-4ZeSRXXWEL4z7hp0M1qh9rrVSfS5t-Hzo,25643
132
+ deepdoctection/utils/file_utils.py,sha256=EepfAZVADaqpBdVq2LOJXLFLsMd_oZF_FAKUHOAhiZ0,25246
133
133
  deepdoctection/utils/fs.py,sha256=KTS9FJzZk9le_vmIPr9IisJw0AyTfjkyX1KoWQy4DNs,12729
134
134
  deepdoctection/utils/identifier.py,sha256=Jt12MeZf7eC1qciY5Fp_AYUGxYVcjsy7xNBUvJil7dU,2270
135
135
  deepdoctection/utils/logger.py,sha256=ddQ0xBStluf8OvoRlEB8YkqyRR-ZYgyJYLClTmJJMAU,10290
@@ -142,8 +142,8 @@ deepdoctection/utils/transform.py,sha256=jgeCyQWLN9q79jCGW7jysyKUKcJ1AVMk8OslF-3
142
142
  deepdoctection/utils/types.py,sha256=ti4WdtIJSg3TGK_YPkkoY9PYGMnR2tTX6Xfik8U1pNk,2986
143
143
  deepdoctection/utils/utils.py,sha256=NBUb1qbx8Jm-AvYN1Sdbk0huXhbAKxZ-ZtOcMespsMM,7064
144
144
  deepdoctection/utils/viz.py,sha256=bujRIujvX317rPz4jBrj0yd3WP8wPjDUiI5GUrw9MzQ,27339
145
- deepdoctection-0.43.dist-info/licenses/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
146
- deepdoctection-0.43.dist-info/METADATA,sha256=F08hSp-kyzwW1tsTWyRMtWG3TaJsA8_LgueZ7irkBqA,13404
147
- deepdoctection-0.43.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
148
- deepdoctection-0.43.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
149
- deepdoctection-0.43.dist-info/RECORD,,
145
+ deepdoctection-0.43.1.dist-info/licenses/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
146
+ deepdoctection-0.43.1.dist-info/METADATA,sha256=jD_6fJFeK-4XpVsjUL0BgFooiuhFPNE9rK0RR6-_2gY,13381
147
+ deepdoctection-0.43.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
148
+ deepdoctection-0.43.1.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
149
+ deepdoctection-0.43.1.dist-info/RECORD,,