PyPI - deepdoctection - Versions diffs - 0.30__tar.gz → 0.32__tar.gz - Mend

deepdoctection 0.30__tar.gz → 0.32__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (247) hide show

{deepdoctection-0.30 → deepdoctection-0.32}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepdoctection
-Version: 0.30
+Version: 0.32
 Summary: Repository for Document AI
 Home-page: https://github.com/deepdoctection/deepdoctection
 Author: Dr. Janis Meyer
@@ -9,94 +9,96 @@ Classifier: Development Status :: 4 - Beta
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Natural Language :: English
 Classifier: Operating System :: POSIX :: Linux
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
-Requires-Python: >=3.8
+Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: catalogue==2.0.7
+Requires-Dist: catalogue==2.0.10
 Requires-Dist: huggingface_hub>=0.12.0
-Requires-Dist: importlib-metadata>=4.11.2
+Requires-Dist: importlib-metadata>=5.0.0
 Requires-Dist: jsonlines==3.1.0
+Requires-Dist: lazy-imports==0.3.1
 Requires-Dist: mock==4.0.3
 Requires-Dist: networkx>=2.7.1
 Requires-Dist: numpy>=1.21
 Requires-Dist: packaging>=20.0
 Requires-Dist: Pillow>=10.0.0
 Requires-Dist: pypdf>=3.16.0
-Requires-Dist: pyyaml==6.0
+Requires-Dist: pyyaml>=6.0.1
 Requires-Dist: pyzmq>=16
 Requires-Dist: termcolor>=1.1
 Requires-Dist: tabulate>=0.7.7
 Requires-Dist: tqdm==4.64.0
 Provides-Extra: tf
-Requires-Dist: catalogue==2.0.7; extra == "tf"
+Requires-Dist: catalogue==2.0.10; extra == "tf"
 Requires-Dist: huggingface_hub>=0.12.0; extra == "tf"
-Requires-Dist: importlib-metadata>=4.11.2; extra == "tf"
+Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
 Requires-Dist: jsonlines==3.1.0; extra == "tf"
+Requires-Dist: lazy-imports==0.3.1; extra == "tf"
 Requires-Dist: mock==4.0.3; extra == "tf"
 Requires-Dist: networkx>=2.7.1; extra == "tf"
 Requires-Dist: numpy>=1.21; extra == "tf"
 Requires-Dist: packaging>=20.0; extra == "tf"
 Requires-Dist: Pillow>=10.0.0; extra == "tf"
 Requires-Dist: pypdf>=3.16.0; extra == "tf"
-Requires-Dist: pyyaml==6.0; extra == "tf"
+Requires-Dist: pyyaml>=6.0.1; extra == "tf"
 Requires-Dist: pyzmq>=16; extra == "tf"
 Requires-Dist: termcolor>=1.1; extra == "tf"
 Requires-Dist: tabulate>=0.7.7; extra == "tf"
 Requires-Dist: tqdm==4.64.0; extra == "tf"
-Requires-Dist: tensorpack; extra == "tf"
+Requires-Dist: tensorpack==0.11; extra == "tf"
 Requires-Dist: protobuf==3.20.1; extra == "tf"
 Requires-Dist: tensorflow-addons>=0.17.1; extra == "tf"
 Requires-Dist: tf2onnx>=1.9.2; extra == "tf"
-Requires-Dist: python-doctr==0.7.0; extra == "tf"
+Requires-Dist: python-doctr==0.8.1; extra == "tf"
 Requires-Dist: pycocotools>=2.0.2; extra == "tf"
-Requires-Dist: boto3; extra == "tf"
-Requires-Dist: pdfplumber>=0.7.1; extra == "tf"
-Requires-Dist: fasttext; extra == "tf"
-Requires-Dist: jdeskew; extra == "tf"
+Requires-Dist: boto3==1.34.102; extra == "tf"
+Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
+Requires-Dist: fasttext==0.9.2; extra == "tf"
+Requires-Dist: jdeskew>=0.2.2; extra == "tf"
 Requires-Dist: apted==1.0.3; extra == "tf"
 Requires-Dist: distance==0.1.3; extra == "tf"
 Requires-Dist: lxml>=4.9.1; extra == "tf"
 Provides-Extra: pt
-Requires-Dist: catalogue==2.0.7; extra == "pt"
+Requires-Dist: catalogue==2.0.10; extra == "pt"
 Requires-Dist: huggingface_hub>=0.12.0; extra == "pt"
-Requires-Dist: importlib-metadata>=4.11.2; extra == "pt"
+Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
 Requires-Dist: jsonlines==3.1.0; extra == "pt"
+Requires-Dist: lazy-imports==0.3.1; extra == "pt"
 Requires-Dist: mock==4.0.3; extra == "pt"
 Requires-Dist: networkx>=2.7.1; extra == "pt"
 Requires-Dist: numpy>=1.21; extra == "pt"
 Requires-Dist: packaging>=20.0; extra == "pt"
 Requires-Dist: Pillow>=10.0.0; extra == "pt"
 Requires-Dist: pypdf>=3.16.0; extra == "pt"
-Requires-Dist: pyyaml==6.0; extra == "pt"
+Requires-Dist: pyyaml>=6.0.1; extra == "pt"
 Requires-Dist: pyzmq>=16; extra == "pt"
 Requires-Dist: termcolor>=1.1; extra == "pt"
 Requires-Dist: tabulate>=0.7.7; extra == "pt"
 Requires-Dist: tqdm==4.64.0; extra == "pt"
-Requires-Dist: timm; extra == "pt"
+Requires-Dist: timm>=0.9.16; extra == "pt"
 Requires-Dist: transformers>=4.36.0; extra == "pt"
-Requires-Dist: accelerate; extra == "pt"
-Requires-Dist: python-doctr==0.7.0; extra == "pt"
-Requires-Dist: boto3; extra == "pt"
-Requires-Dist: pdfplumber>=0.7.1; extra == "pt"
-Requires-Dist: fasttext; extra == "pt"
-Requires-Dist: jdeskew; extra == "pt"
+Requires-Dist: accelerate>=0.29.1; extra == "pt"
+Requires-Dist: python-doctr==0.8.1; extra == "pt"
+Requires-Dist: boto3==1.34.102; extra == "pt"
+Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
+Requires-Dist: fasttext==0.9.2; extra == "pt"
+Requires-Dist: jdeskew>=0.2.2; extra == "pt"
 Requires-Dist: apted==1.0.3; extra == "pt"
 Requires-Dist: distance==0.1.3; extra == "pt"
 Requires-Dist: lxml>=4.9.1; extra == "pt"
 Provides-Extra: docs
-Requires-Dist: tensorpack; extra == "docs"
-Requires-Dist: boto3; extra == "docs"
+Requires-Dist: tensorpack==0.11; extra == "docs"
+Requires-Dist: boto3==1.34.102; extra == "docs"
 Requires-Dist: transformers>=4.36.0; extra == "docs"
-Requires-Dist: accelerate; extra == "docs"
-Requires-Dist: pdfplumber>=0.7.1; extra == "docs"
+Requires-Dist: accelerate>=0.29.1; extra == "docs"
+Requires-Dist: pdfplumber>=0.11.0; extra == "docs"
 Requires-Dist: lxml>=4.9.1; extra == "docs"
-Requires-Dist: lxml-stubs; extra == "docs"
-Requires-Dist: jdeskew; extra == "docs"
+Requires-Dist: lxml-stubs>=0.5.1; extra == "docs"
+Requires-Dist: jdeskew>=0.2.2; extra == "docs"
 Requires-Dist: jinja2==3.0.3; extra == "docs"
 Requires-Dist: mkdocs-material; extra == "docs"
 Requires-Dist: mkdocstrings-python; extra == "docs"
@@ -105,47 +107,20 @@ Provides-Extra: dev
 Requires-Dist: python-dotenv==1.0.0; extra == "dev"
 Requires-Dist: click; extra == "dev"
 Requires-Dist: black==23.7.0; extra == "dev"
-Requires-Dist: isort; extra == "dev"
+Requires-Dist: isort==5.13.2; extra == "dev"
 Requires-Dist: pylint==2.17.4; extra == "dev"
 Requires-Dist: mypy==1.4.1; extra == "dev"
 Requires-Dist: wandb; extra == "dev"
-Requires-Dist: types-PyYAML; extra == "dev"
-Requires-Dist: types-termcolor==1.1.3; extra == "dev"
-Requires-Dist: types-tabulate; extra == "dev"
-Requires-Dist: types-tqdm; extra == "dev"
-Requires-Dist: lxml-stubs; extra == "dev"
-Requires-Dist: types-Pillow; extra == "dev"
-Requires-Dist: types-urllib3; extra == "dev"
+Requires-Dist: types-PyYAML>=6.0.12.12; extra == "dev"
+Requires-Dist: types-termcolor>=1.1.3; extra == "dev"
+Requires-Dist: types-tabulate>=0.9.0.3; extra == "dev"
+Requires-Dist: types-tqdm>=4.66.0.5; extra == "dev"
+Requires-Dist: lxml-stubs>=0.5.1; extra == "dev"
+Requires-Dist: types-Pillow>=10.2.0.20240406; extra == "dev"
+Requires-Dist: types-urllib3>=1.26.25.14; extra == "dev"
 Provides-Extra: test
-Requires-Dist: pytest; extra == "test"
+Requires-Dist: pytest==8.0.2; extra == "test"
 Requires-Dist: pytest-cov; extra == "test"
-Provides-Extra: hf
-Requires-Dist: catalogue==2.0.7; extra == "hf"
-Requires-Dist: huggingface_hub>=0.12.0; extra == "hf"
-Requires-Dist: importlib-metadata>=4.11.2; extra == "hf"
-Requires-Dist: jsonlines==3.1.0; extra == "hf"
-Requires-Dist: mock==4.0.3; extra == "hf"
-Requires-Dist: networkx>=2.7.1; extra == "hf"
-Requires-Dist: numpy>=1.21; extra == "hf"
-Requires-Dist: packaging>=20.0; extra == "hf"
-Requires-Dist: Pillow>=10.0.0; extra == "hf"
-Requires-Dist: pypdf>=3.16.0; extra == "hf"
-Requires-Dist: pyyaml==6.0; extra == "hf"
-Requires-Dist: pyzmq>=16; extra == "hf"
-Requires-Dist: termcolor>=1.1; extra == "hf"
-Requires-Dist: tabulate>=0.7.7; extra == "hf"
-Requires-Dist: tqdm==4.64.0; extra == "hf"
-Requires-Dist: timm; extra == "hf"
-Requires-Dist: transformers>=4.36.0; extra == "hf"
-Requires-Dist: accelerate; extra == "hf"
-Requires-Dist: python-doctr==0.7.0; extra == "hf"
-Requires-Dist: boto3; extra == "hf"
-Requires-Dist: pdfplumber>=0.7.1; extra == "hf"
-Requires-Dist: fasttext; extra == "hf"
-Requires-Dist: jdeskew; extra == "hf"
-Requires-Dist: apted==1.0.3; extra == "hf"
-Requires-Dist: distance==0.1.3; extra == "hf"
-Requires-Dist: lxml>=4.9.1; extra == "hf"
 <p align="center">
@@ -180,7 +155,8 @@ pipelines. Its core function does not depend on any specific deep learning libra
  - Text mining for native PDFs with  [**pdfplumber**](https://github.com/jsvine/pdfplumber),
  - Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
  - Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
- - Document and token classification with all LayoutLM models provided by the Transformer library.
+ - Document and token classification with all LayoutLM models provided by the
+   [**Transformer library**](https://github.com/huggingface/transformers).
    (Yes, you can use any LayoutLM-model with any of the provided OCR-or pdfplumber tools straight away!).
  - Table detection and table structure recognition with
    [**table-transformer**](https://github.com/microsoft/table-transformer).
@@ -190,8 +166,16 @@ pipelines. Its core function does not depend on any specific deep learning libra
  - Comprehensive configuration of **analyzer** like choosing different models, output parsing, OCR selection.
    Check this [notebook](https://github.com/deepdoctection/notebooks/blob/main/Analyzer_Configuration.ipynb) or the
    [docs](https://deepdoctection.readthedocs.io/en/latest/tutorials/analyzer_configuration_notebook/) for more infos.
- - Document layout analysis and table recognition now runs with Torchscript (CPU) as well and Detectron2 is
-   not required anymore for basic inference.
+ - Document layout analysis and table recognition now runs with
+   [**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
+   anymore for basic inference.
+ - [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
+   (not contained in the built-in Analyzer).
+ - [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
+   [**transformers**](https://github.com/huggingface/transformers).
+   We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
+   that look promising, especially if you want to train a model on non-English data. The training script for
+   LayoutLM can be used for LiLT as well, and we will be providing a notebook on how to train a model on a custom dataset soon.
 **deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
 post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
@@ -282,9 +266,9 @@ Everything in the overview listed below the **deep**doctection layer are necessa
 separately.
 - Linux or macOS. (Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available)
-- Python >= 3.8
-- 1.12 <= PyTorch < 2.0 **or** Tensorflow >= 2.9 and CUDA. If you want to run the models provided by Tensorpack a GPU is
-  required. You can run on PyTorch with a CPU only.
+- Python >= 3.9
+- 1.13 <= PyTorch  **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
+In general, if you want to train or fine-tune models, a GPU is required.
 - **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
 images.
 - With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)

{deepdoctection-0.30 → deepdoctection-0.32}/README.md RENAMED Viewed

@@ -31,7 +31,8 @@ pipelines. Its core function does not depend on any specific deep learning libra
  - Text mining for native PDFs with  [**pdfplumber**](https://github.com/jsvine/pdfplumber),
  - Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
  - Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
- - Document and token classification with all LayoutLM models provided by the Transformer library.
+ - Document and token classification with all LayoutLM models provided by the
+   [**Transformer library**](https://github.com/huggingface/transformers).
    (Yes, you can use any LayoutLM-model with any of the provided OCR-or pdfplumber tools straight away!).
  - Table detection and table structure recognition with
    [**table-transformer**](https://github.com/microsoft/table-transformer).
@@ -41,8 +42,16 @@ pipelines. Its core function does not depend on any specific deep learning libra
  - Comprehensive configuration of **analyzer** like choosing different models, output parsing, OCR selection.
    Check this [notebook](https://github.com/deepdoctection/notebooks/blob/main/Analyzer_Configuration.ipynb) or the
    [docs](https://deepdoctection.readthedocs.io/en/latest/tutorials/analyzer_configuration_notebook/) for more infos.
- - Document layout analysis and table recognition now runs with Torchscript (CPU) as well and Detectron2 is
-   not required anymore for basic inference.
+ - Document layout analysis and table recognition now runs with
+   [**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
+   anymore for basic inference.
+ - [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
+   (not contained in the built-in Analyzer).
+ - [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
+   [**transformers**](https://github.com/huggingface/transformers).
+   We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
+   that look promising, especially if you want to train a model on non-English data. The training script for
+   LayoutLM can be used for LiLT as well, and we will be providing a notebook on how to train a model on a custom dataset soon.
 **deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
 post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
@@ -133,9 +142,9 @@ Everything in the overview listed below the **deep**doctection layer are necessa
 separately.
 - Linux or macOS. (Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available)
-- Python >= 3.8
-- 1.12 <= PyTorch < 2.0 **or** Tensorflow >= 2.9 and CUDA. If you want to run the models provided by Tensorpack a GPU is
-  required. You can run on PyTorch with a CPU only.
+- Python >= 3.9
+- 1.13 <= PyTorch  **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
+In general, if you want to train or fine-tune models, a GPU is required.
 - **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
 images.
 - With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)

{deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/__init__.py RENAMED Viewed

@@ -19,15 +19,13 @@ import os
 import sys
 from typing import TYPE_CHECKING
-from packaging import version
-from .utils.env_info import auto_select_lib_and_device
+from .utils.env_info import collect_env_info
 from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
-from .utils.logger import logger
+from .utils.logger import LoggingRecord, logger
 # pylint: enable=wrong-import-position
-__version__ = 0.30
+__version__ = 0.32
 _IMPORT_STRUCTURE = {
     "analyzer": [
@@ -179,8 +177,10 @@ _IMPORT_STRUCTURE = {
         "Jdeskewer",
         "DoctrTextlineDetector",
         "DoctrTextRecognizer",
+        "DocTrRotationTransformer",
         "FasttextLangDetector",
         "HFDetrDerivedDetector",
+        "get_tokenizer_from_architecture",
         "HFLayoutLmTokenClassifierBase",
         "HFLayoutLmTokenClassifier",
         "HFLayoutLmv2TokenClassifier",
@@ -188,12 +188,16 @@ _IMPORT_STRUCTURE = {
         "HFLayoutLmSequenceClassifier",
         "HFLayoutLmv2SequenceClassifier",
         "HFLayoutLmv3SequenceClassifier",
+        "HFLiltTokenClassifier",
+        "HFLiltSequenceClassifier",
+        "HFLmSequenceClassifier",
         "ModelProfile",
         "ModelCatalog",
         "print_model_infos",
         "ModelDownloadManager",
         "PdfPlumberTextDetector",
         "TesseractOcrDetector",
+        "TesseractRotationTransformer",
         "TextractOcrDetector",
         "TPFrcnnDetector",
     ],
@@ -266,11 +270,11 @@ _IMPORT_STRUCTURE = {
         "DoctectionPipe",
         "LanguageDetectionService",
         "ImageLayoutService",
-        "get_tokenizer_from_architecture",
         "LMTokenClassifierService",
         "LMSequenceClassifierService",
         "OrderGenerator",
         "TextLineGenerator",
+        "TextLineService",
         "TextOrderService",
         "TableSegmentationRefinementService",
         "generate_html_string",
@@ -279,7 +283,7 @@ _IMPORT_STRUCTURE = {
         "PubtablesSegmentationService",
         "SegmentationResult",
         "TextExtractionService",
-        "SimpleTransformPipelineComponent",
+        "SimpleTransformService",
     ],
     "train": [
         "D2Trainer",
@@ -295,14 +299,13 @@ _IMPORT_STRUCTURE = {
         "save_tmp_file",
         "timed_operation",
         "collect_env_info",
-        "get_device",
-        "auto_select_lib_and_device",
         "auto_select_viz_library",
         "get_tensorflow_requirement",
         "tf_addons_available",
         "get_tf_addons_requirements",
         "tensorpack_available",
         "get_tensorpack_requirement",
+        "pytorch_available",
         "get_pytorch_requirement",
         "lxml_available",
         "get_lxml_requirement",
@@ -416,25 +419,31 @@ _IMPORT_STRUCTURE = {
     ],
 }
+# Setting some environment variables so that standard functions can be invoked with available hardware
+env_info = collect_env_info()
+logger.debug(LoggingRecord(msg=env_info))
-# disable TF warnings for versions > 2.4.1
-if tf_available():
-    if version.parse(get_tf_version()) > version.parse("2.4.1"):
-        os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
-    try:
-        import tensorflow.python.util.deprecation as deprecation  # type: ignore # pylint: disable=E0401,R0402
-        deprecation._PRINT_DEPRECATION_WARNINGS = False  # pylint: disable=W0212
-    except Exception:  # pylint: disable=W0703
-        try:
-            from tensorflow.python.util import deprecation  # type: ignore # pylint: disable=E0401
-            deprecation._PRINT_DEPRECATION_WARNINGS = False  # pylint: disable=W0212
-        except Exception:  # pylint: disable=W0703
-            pass
+if os.environ.get("PYTORCH_AVAILABLE") and os.environ.get("DD_USE_TORCH") is None:
+    os.environ["DD_USE_TORCH"] = "1"
+    os.environ["USE_TORCH"] = "1"
+if os.environ.get("TENSORFLOW_AVAILABLE") and os.environ.get("DD_USE_TF") is None:
+    os.environ["DD_USE_TF"] = "1"
+    os.environ["USE_TF"] = "1"
+if os.environ.get("DD_USE_TORCH") and os.environ.get("DD_USE_TF"):
+    logger.warning(
+        "Both DD_USE_TORCH and DD_USE_TF are set. Defaulting to PyTorch. If you want a different "
+        "behaviour, set DD_USE_TORCH to None before importing deepdoctection."
+    )
+    os.environ.pop("DD_USE_TF")
+    os.environ.pop("USE_TF")
-# Setting some environment variables so that standard functions can be invoked with available hardware
-auto_select_lib_and_device()
+if not os.environ.get("PYTORCH_AVAILABLE") and not os.environ.get("TENSORFLOW_AVAILABLE"):
+    logger.warning(
+        LoggingRecord(
+            msg="Neither Tensorflow or Pytorch are available. You will not be able to use any Deep Learning "
+            "model from the library."
+        )
+    )
 # Direct imports for type-checking
@@ -442,10 +451,10 @@ if TYPE_CHECKING:
     from .analyzer import *
     from .dataflow import *
     from .datapoint import *
-    from .datasets import *
+    from .datasets import *  # type: ignore
     from .eval import *
-    from .extern import *
-    from .mapper import *
+    from .extern import *  # type: ignore
+    from .mapper import *  # type: ignore
     from .pipe import *
     from .train import *
     from .utils import *

{deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/analyzer/dd.py RENAMED Viewed

@@ -23,51 +23,43 @@ Module for **deep**doctection analyzer.
 -user factory with a reduced config setting
 """
-import ast
 import os
 from os import environ
 from shutil import copyfile
 from typing import List, Optional, Union
+from lazy_imports import try_import
 from ..extern.base import ObjectDetector
+from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
 from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
+from ..extern.hfdetr import HFDetrDerivedDetector
 from ..extern.model import ModelCatalog, ModelDownloadManager
 from ..extern.pdftext import PdfPlumberTextDetector
+from ..extern.pt.ptutils import get_torch_device
 from ..extern.tessocr import TesseractOcrDetector
 from ..extern.texocr import TextractOcrDetector
+from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
+from ..extern.tpdetect import TPFrcnnDetector
 from ..pipe.base import PipelineComponent
-from ..pipe.cell import DetectResultGenerator, SubImageLayoutService
 from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
 from ..pipe.doctectionpipe import DoctectionPipe
 from ..pipe.layout import ImageLayoutService
 from ..pipe.order import TextOrderService
 from ..pipe.refine import TableSegmentationRefinementService
 from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
+from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
 from ..pipe.text import TextExtractionService
 from ..utils.detection_types import Pathlike
-from ..utils.env_info import get_device
-from ..utils.file_utils import (
-    boto3_available,
-    detectron2_available,
-    pytorch_available,
-    tensorpack_available,
-    tf_available,
-)
+from ..utils.error import DependencyError
+from ..utils.file_utils import detectron2_available, tensorpack_available
 from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
 from ..utils.logger import LoggingRecord, logger
 from ..utils.metacfg import AttrDict, set_config_by_yaml
 from ..utils.settings import CellType, LayoutType
 from ..utils.transform import PadTransform
-if tf_available() and tensorpack_available():
-    from ..extern.tp.tfutils import disable_tp_layer_logging
-    from ..extern.tpdetect import TPFrcnnDetector
-if pytorch_available():
-    from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
-    from ..extern.hfdetr import HFDetrDerivedDetector
-if boto3_available():
+with try_import() as image_guard:
     from botocore.config import Config  # type: ignore
@@ -113,11 +105,12 @@ def config_sanity_checks(cfg: AttrDict) -> None:
     """Some config sanity checks"""
     if cfg.USE_PDF_MINER and cfg.USE_OCR and cfg.OCR.USE_DOCTR:
         raise ValueError("Configuration USE_PDF_MINER= True and USE_OCR=True and USE_DOCTR=True is not allowed")
-    if cfg.OCR.USE_TESSERACT + cfg.OCR.USE_DOCTR + cfg.OCR.USE_TEXTRACT != 1:
-        raise ValueError(
-            "Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True and set the other two "
-            "to False. Only one OCR system can be activated."
-        )
+    if cfg.USE_OCR:
+        if cfg.OCR.USE_TESSERACT + cfg.OCR.USE_DOCTR + cfg.OCR.USE_TEXTRACT != 1:
+            raise ValueError(
+                "Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True "
+                "and set the other two to False. Only one OCR system can be activated."
+            )
 def build_detector(
@@ -343,11 +336,20 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
             pipe_component_list.append(table_segmentation)
             if cfg.USE_TABLE_REFINEMENT:
-                table_segmentation_refinement = TableSegmentationRefinementService()
+                table_segmentation_refinement = TableSegmentationRefinementService(
+                    [LayoutType.table, LayoutType.table_rotated],
+                    [
+                        LayoutType.cell,
+                        CellType.column_header,
+                        CellType.projected_row_header,
+                        CellType.spanning,
+                        CellType.row_header,
+                    ],
+                )
                 pipe_component_list.append(table_segmentation_refinement)
     if cfg.USE_PDF_MINER:
-        pdf_text = PdfPlumberTextDetector()
+        pdf_text = PdfPlumberTextDetector(x_tolerance=cfg.PDF_MINER.X_TOLERANCE, y_tolerance=cfg.PDF_MINER.Y_TOLERANCE)
         d_text = TextExtractionService(pdf_text)
         pipe_component_list.append(d_text)
@@ -400,7 +402,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
 def get_dd_analyzer(
-    reset_config_file: bool = False,
+    reset_config_file: bool = True,
     config_overwrite: Optional[List[str]] = None,
     path_config_file: Optional[Pathlike] = None,
 ) -> DoctectionPipe:
@@ -429,8 +431,13 @@ def get_dd_analyzer(
     :return: A DoctectionPipe instance with given configs
     """
     config_overwrite = [] if config_overwrite is None else config_overwrite
-    lib = "TF" if ast.literal_eval(os.environ.get("USE_TENSORFLOW", "False")) else "PT"
-    device = get_device(False)
+    lib = "TF" if os.environ.get("DD_USE_TF") else "PT"
+    if lib == "TF":
+        device = get_tf_device()
+    elif lib == "PT":
+        device = get_torch_device()
+    else:
+        raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
     dd_one_config_path = maybe_copy_config_to_cache(
         get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
     )

{deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/configs/conf_dd_one.yaml RENAMED Viewed

@@ -1,38 +1,38 @@
 USE_LAYOUT: True
 USE_TABLE_SEGMENTATION: True
 TF:
-   LAYOUT:
-      WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
-      FILTER:
-   CELL:
-      WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
-      FILTER:
-   ITEM:
-      WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
-      FILTER:
+  LAYOUT:
+    WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
+    FILTER:
+  CELL:
+    WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
+    FILTER:
+  ITEM:
+    WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
+    FILTER:
 PT:
-   LAYOUT:
-      WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
-      WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
-      FILTER:
-      PAD:
-        TOP: 60
-        RIGHT: 60
-        BOTTOM: 60
-        LEFT: 60
-   ITEM:
-     WEIGHTS: item/d2_model_1639999_item_inf_only.pt
-     WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
-     FILTER:
-     PAD:
-        TOP: 60
-        RIGHT: 60
-        BOTTOM: 60
-        LEFT: 60
-   CELL:
-      WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
-      WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
-      FILTER:
+  LAYOUT:
+    WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
+    WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
+    FILTER:
+    PAD:
+      TOP: 60
+      RIGHT: 60
+      BOTTOM: 60
+      LEFT: 60
+  ITEM:
+    WEIGHTS: item/d2_model_1639999_item_inf_only.pt
+    WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
+    FILTER:
+    PAD:
+      TOP: 60
+      RIGHT: 60
+      BOTTOM: 60
+      LEFT: 60
+  CELL:
+    WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
+    WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
+    FILTER:
 LAYOUT_NMS_PAIRS:
   COMBINATIONS:
   THRESHOLDS:
@@ -48,6 +48,9 @@ SEGMENTATION:
   STRETCH_RULE: equal
 USE_TABLE_REFINEMENT: True
 USE_PDF_MINER: False
+PDF_MINER:
+  X_TOLERANCE: 3
+  Y_TOLERANCE: 3
 USE_OCR: True
 OCR:
   USE_TESSERACT: True

{deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/dataflow/base.py RENAMED Viewed

@@ -17,25 +17,6 @@ from typing import Any, Iterator, no_type_check
 from ..utils.utils import get_rng
-class DataFlowTerminated(BaseException):
-    """
-    An exception indicating that the DataFlow is unable to produce any more
-    data, i.e. something wrong happened so that calling `__iter__`
-    cannot give a valid iterator anymore.
-    In most DataFlow this will never be raised.
-    """
-class DataFlowResetStateNotCalled(BaseException):
-    """
-    An exception indicating that `reset_state()` has not been called before starting
-    iteration.
-    """
-    def __init__(self) -> None:
-        super().__init__("Iterating a dataflow requires .reset_state() to be called first")
 class DataFlowReentrantGuard:
     """
     A tool to enforce non-reentrancy.

deepdoctection 0.30__tar.gz → 0.32__tar.gz

Potentially problematic release.

deepdoctection 0.30__tar.gz → 0.32__tar.gz