PyPI - deepdoctection - Versions diffs - 0.34__py3-none-any.whl → 0.36__py3-none-any.whl - Mend

deepdoctection 0.34py3-none-any.whl → 0.36py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (25) hide show

deepdoctection/__init__.py +7 -14
deepdoctection/analyzer/__init__.py +1 -0
deepdoctection/analyzer/_config.py +142 -0
deepdoctection/analyzer/dd.py +11 -335
deepdoctection/analyzer/factory.py +718 -0
deepdoctection/configs/conf_dd_one.yaml +5 -0
deepdoctection/datapoint/annotation.py +1 -1
deepdoctection/datapoint/convert.py +6 -4
deepdoctection/datapoint/image.py +16 -6
deepdoctection/datapoint/view.py +91 -15
deepdoctection/eval/cocometric.py +59 -13
deepdoctection/extern/pdftext.py +96 -5
deepdoctection/extern/tessocr.py +1 -0
deepdoctection/mapper/match.py +4 -2
deepdoctection/utils/env_info.py +30 -1
deepdoctection/utils/file_utils.py +19 -0
deepdoctection/utils/metacfg.py +12 -0
deepdoctection/utils/pdf_utils.py +86 -3
deepdoctection/utils/utils.py +39 -0
deepdoctection/utils/viz.py +16 -13
{deepdoctection-0.34.dist-info → deepdoctection-0.36.dist-info}/METADATA +126 -116
{deepdoctection-0.34.dist-info → deepdoctection-0.36.dist-info}/RECORD +25 -23
{deepdoctection-0.34.dist-info → deepdoctection-0.36.dist-info}/WHEEL +1 -1
{deepdoctection-0.34.dist-info → deepdoctection-0.36.dist-info}/LICENSE +0 -0
{deepdoctection-0.34.dist-info → deepdoctection-0.36.dist-info}/top_level.txt +0 -0

deepdoctection/utils/env_info.py CHANGED Viewed

@@ -20,6 +20,10 @@ Some useful function for collecting environment information.
 This is also the place where we give an overview of the important environment variables.
+For env variables with boolean character, use one of the following values:
+{"1", "True", "TRUE", "true", "yes"}
 `USE_TENSORFLOW
 USE_PYTORCH
 USE_CUDA
@@ -35,6 +39,12 @@ decide what image processing library the `viz_handler` should use. The default l
 to be installed separately. However, if both libraries have been detected `viz_handler` will opt for OpenCV.
 Use the variables to let choose `viz_handler` according to your preferences.
+`USE_DD_POPPLER
+USE_DD_PDFIUM`
+For PDF rendering we use PyPDFium2 by default, but for legacy reasons we also support Poppler. If you want to enforce
+Poppler, set `USE_DD_POPPLER=True` and `USE_DD_PDFIUM=False`.
 `HF_CREDENTIALS`
 will be used by the `ModelDownloadManager` to pass your credentials if you have a model registered that resides in a
@@ -56,6 +66,7 @@ from typing import Optional
 import numpy as np
 from packaging import version
+from pypdf.errors import DependencyError
 from tabulate import tabulate
 from .file_utils import (
@@ -75,6 +86,7 @@ from .file_utils import (
     pdf_to_cairo_available,
     pdf_to_ppm_available,
     pdfplumber_available,
+    pypdfium2_available,
     pytorch_available,
     qpdf_available,
     scipy_available,
@@ -88,7 +100,7 @@ from .file_utils import (
 from .logger import LoggingRecord, logger
 from .types import KeyValEnvInfos, PathLikeOrStr
-__all__ = ["collect_env_info", "auto_select_viz_library", "ENV_VARS_TRUE"]
+__all__ = ["collect_env_info", "auto_select_viz_library", "auto_select_pdf_render_framework", "ENV_VARS_TRUE"]
 # pylint: disable=import-outside-toplevel
@@ -532,4 +544,21 @@ def auto_select_viz_library() -> None:
         os.environ["USE_DD_OPENCV"] = "False"
def auto_select_pdf_render_framework() -> None:
    """
    Choose the PDF rendering backend by setting the environment variables `USE_DD_POPPLER` and `USE_DD_PDFIUM`.

    pdfium is preferred whenever `pypdfium2_available()` reports it as installed. Otherwise Poppler is selected if
    either `pdftocairo` or `pdftoppm` is available. If one of the two variables already holds a non-empty value,
    nothing is changed.

    :raises DependencyError: If neither pdfium nor Poppler can be found.
    """
    # A non-empty value in either variable is treated as an explicit user choice and left untouched
    already_configured = os.environ.get("USE_DD_POPPLER") or os.environ.get("USE_DD_PDFIUM")
    if already_configured:
        return

    if pypdfium2_available():
        use_poppler, use_pdfium = "False", "True"
    elif pdf_to_cairo_available() or pdf_to_ppm_available():
        use_poppler, use_pdfium = "True", "False"
    else:
        raise DependencyError("No pdf rendering library found. Please install Poppler or pdfium.")

    os.environ["USE_DD_POPPLER"] = use_poppler
    os.environ["USE_DD_PDFIUM"] = use_pdfium
 # pylint: enable=import-outside-toplevel

deepdoctection/utils/file_utils.py CHANGED Viewed

@@ -616,6 +616,25 @@ def get_pillow_requirement() -> Requirement:
     return "pillow", pillow_available(), _PILLOW_ERR_MSG
# Pypdfium2
# Availability is probed once at import time via the import machinery, without importing the package itself
_PYPDFIUM2_AVAILABLE = importlib.util.find_spec("pypdfium2") is not None
_PYPDFIUM2_ERR_MSG = f"pypdfium2 must be installed. {_GENERIC_ERR_MSG}"


def pypdfium2_available() -> bool:
    """
    Report whether the `pypdfium2` package could be located when this module was imported.

    :return: True if pypdfium2 is installed, False otherwise
    """
    is_installed = bool(_PYPDFIUM2_AVAILABLE)
    return is_installed
def get_pypdfium2_requirement() -> Requirement:
    """
    Build the requirement entry for pypdfium2.

    :return: Tuple of the requirement name, the availability flag and the error message shown when it is missing
    """
    requirement = ("pypdfium2", pypdfium2_available(), _PYPDFIUM2_ERR_MSG)
    return requirement
 # SpaCy
 _SPACY_AVAILABLE = importlib.util.find_spec("spacy") is not None
 _SPACY_ERR_MSG = f"SpaCy must be installed. {_GENERIC_ERR_MSG}"

deepdoctection/utils/metacfg.py CHANGED Viewed

@@ -18,6 +18,7 @@
 """
 Class AttrDict for maintaining configs and some functions for generating and saving AttrDict instances to .yaml files
 """
+from __future__ import annotations
 import pprint
 from typing import Any
@@ -105,6 +106,17 @@ class AttrDict:
                 v = eval(v)  # pylint: disable=C0103, W0123
             setattr(dic, key, v)
+    def overwrite_config(self, other_config: AttrDict) -> None:
+        """
+        Overwrite the current config with values from another config.
+        :param other_config: The other AttrDict instance to copy values from.
+        :raises AttributeError: If a key from other_config is not an attribute of self.
+        """
+        if self._freezed:
+            raise AttributeError("Config was freezed! Cannot overwrite config.")
+        self.from_dict(other_config.to_dict())
     def freeze(self, freezed: bool = True) -> None:
         """
         :param freezed: freeze the instance, so that no attributes can be added or changed

deepdoctection/utils/pdf_utils.py CHANGED Viewed

@@ -24,13 +24,16 @@ import subprocess
 import sys
 from errno import ENOENT
 from io import BytesIO
+from pathlib import Path
 from shutil import copyfile
-from typing import Generator, Optional
+from typing import Generator, Literal, Optional
+from lazy_imports import try_import
 from numpy import uint8
 from pypdf import PdfReader, PdfWriter, errors
 from .context import save_tmp_file, timeout_manager
+from .env_info import ENV_VARS_TRUE
 from .error import DependencyError, FileExtensionError
 from .file_utils import pdf_to_cairo_available, pdf_to_ppm_available, qpdf_available
 from .logger import LoggingRecord, logger
@@ -38,7 +41,17 @@ from .types import PathLikeOrStr, PixelValues
 from .utils import is_file_extension
 from .viz import viz_handler
-__all__ = ["decrypt_pdf_document", "get_pdf_file_reader", "get_pdf_file_writer", "PDFStreamer", "pdf_to_np_array"]
+with try_import() as pt_import_guard:
+    import pypdfium2
+__all__ = [
+    "decrypt_pdf_document",
+    "get_pdf_file_reader",
+    "get_pdf_file_writer",
+    "PDFStreamer",
+    "pdf_to_np_array",
+    "split_pdf",
+]
 def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
@@ -234,7 +247,7 @@ def _run_poppler(poppler_args: list[str]) -> None:
             raise PopplerError(status=proc.returncode, message="Syntax Error: PDF cannot be read with Poppler")
-def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
+def pdf_to_np_array_poppler(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
     """
     Convert a single pdf page from its byte representation to a numpy array. This function will save the pdf as to a tmp
     file and then call poppler via `pdftoppm` resp. `pdftocairo` if the former is not available.
@@ -250,3 +263,73 @@ def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dp
         image = viz_handler.read_image(tmp_name + "-1.png")
     return image.astype(uint8)
def pdf_to_np_array_pdfmium(pdf_bytes: bytes, dpi: int = 200) -> PixelValues:
    """
    Render the first page of a pdf, given as bytes, into a numpy array with pdfium.

    :param pdf_bytes: Bytes representing the PDF file
    :param dpi: Image quality in DPI/dots-per-inch (default 200)
    :return: numpy array of dtype uint8
    """
    document = pypdfium2.PdfDocument(pdf_bytes)
    first_page = document[0]
    # the render scale is expressed relative to 72 dpi
    render_scale = dpi / 72
    bitmap = first_page.render(scale=render_scale)
    return bitmap.to_numpy().astype(uint8)
def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
    """
    Render a single pdf page from its byte representation into a numpy array. The backend is pdfium if the
    environment variable `USE_DD_PDFIUM` holds one of the values in `ENV_VARS_TRUE`, otherwise Poppler.

    :param pdf_bytes: Bytes representing the PDF file
    :param size: Size of the resulting image(s), uses (width, height) standard. Only passed on to Poppler; with
                 pdfium a warning is logged and `dpi` is used instead
    :param dpi: Image quality in DPI/dots-per-inch (default 200)
    :return: numpy array
    """
    use_pdfium = os.environ.get("USE_DD_PDFIUM", "False") in ENV_VARS_TRUE
    if not use_pdfium:
        return pdf_to_np_array_poppler(pdf_bytes, size, dpi)

    if size is not None:
        message = f"pdf_to_np_array_pdfmium does not support the size parameter. Will use dpi = {dpi} instead."
        logger.warning(LoggingRecord(message))
    return pdf_to_np_array_pdfmium(pdf_bytes, dpi)
def split_pdf(
    pdf_path: PathLikeOrStr, output_dir: PathLikeOrStr, file_type: Literal["image", "pdf"], dpi: int = 200
) -> None:
    """
    Split a pdf into single pages. The pages are saved as single pdf/png files in a subfolder of the output directory.
    The subfolder is named after the pdf file (without extension) and the files are named `<name>_<page index>`.

    :param pdf_path: Path to the pdf file
    :param output_dir: Path to the output directory
    :param file_type: Type of the output file. Either "image" or "pdf". The value ".pdf" is accepted as an alias of
                      "pdf"; every other value produces png images
    :param dpi: Image quality in DPI/dots-per-inch (default 200). Only used when pages are saved as images
    """
    pdf_path = Path(pdf_path)
    filename = pdf_path.stem
    file_dir = Path(output_dir) / filename
    file_dir.mkdir(parents=True, exist_ok=True)

    # The annotated value is "pdf". ".pdf" is kept as an alias because it was the only value
    # that selected pdf output before.
    save_as_pdf = file_type in ("pdf", ".pdf")

    with open(pdf_path, "rb") as file:
        pdf = PdfReader(file)
        for i, page in enumerate(pdf.pages):
            writer = PdfWriter()
            try:
                writer.add_page(page)
                if save_as_pdf:
                    with open(file_dir / f"{filename}_{i}.pdf", "wb") as out:
                        writer.write(out)
                else:
                    # Serialize the single page in memory and hand the bytes to the configured renderer
                    with BytesIO() as buffer:
                        writer.write(buffer)
                        np_image = pdf_to_np_array(buffer.getvalue(), dpi=dpi)
                    viz_handler.write_image(file_dir / f"{filename}_{i}.png", np_image)
            finally:
                # close the writer even if writing or rendering fails
                writer.close()

deepdoctection/utils/utils.py CHANGED Viewed

@@ -155,3 +155,42 @@ def is_file_extension(file_name: PathLikeOrStr, extension: Union[str, Sequence[s
     if isinstance(extension, str):
         return os.path.splitext(file_name)[-1].lower() == extension
     return os.path.splitext(file_name)[-1].lower() in extension
def partition_list(base_list: list[str], stop_value: str) -> list[list[str]]:
    """
    Split a list of strings into consecutive sublists. A new sublist is opened whenever the stop value occurs after
    a different value; a run of consecutive stop values stays together in the same sublist. Elements preceding the
    first stop value form a sublist of their own.

    :param base_list: The list of strings to be partitioned.
    :param stop_value: The string value that indicates the start of a new partition.
    :return: A list of lists, where each sublist is a partition of the original list.

    **Example:**

        strings = ['a', 'a', 'c', 'c', 'b', 'd', 'c', 'c', 'a', 'b', 'a', 'b', 'a', 'a']
        stop_string = 'a'
        partition_list(strings, stop_string)
        # Output [['a', 'a', 'c', 'c', 'b', 'd', 'c', 'c'], ['a', 'b'], ['a', 'b'], ['a', 'a']]
    """
    result: list[list[str]] = []
    chunk: list[str] = []
    previous_was_stop = False

    for item in base_list:
        is_stop = item == stop_value
        # Only the first stop value of a run closes the chunk collected so far
        if is_stop and not previous_was_stop and chunk:
            result.append(chunk)
            chunk = []
        chunk.append(item)
        previous_was_stop = is_stop

    if chunk:
        result.append(chunk)
    return result

deepdoctection/utils/viz.py CHANGED Viewed

@@ -205,6 +205,7 @@ def draw_boxes(
     font_scale: float = 1.0,
     rectangle_thickness: int = 4,
     box_color_by_category: bool = True,
+    show_palette: bool = True,
 ) -> PixelValues:
     """
     Draw bounding boxes with category names into the image.
@@ -216,6 +217,7 @@ def draw_boxes(
     :param font_scale: Font scale of text box
     :param rectangle_thickness: Thickness of bounding box
     :param box_color_by_category:
+    :param show_palette: Whether to show a color palette of the categories
     :return: A new image np.ndarray
     """
     if color is not None:
@@ -261,19 +263,20 @@ def draw_boxes(
         )
     # draw a (very ugly) color palette
-    y_0 = np_image.shape[0]
-    for category, col in category_to_color.items():
-        if category is not None:
-            np_image = viz_handler.draw_text(
-                np_image,
-                (np_image.shape[1], y_0),
-                category,
-                color=col,
-                font_scale=font_scale * 3,
-                rectangle_thickness=rectangle_thickness,
-            )
-            _, text_h = viz_handler.get_text_size(category, font_scale * 2)
-            y_0 = y_0 - int(10 * text_h)
+    if show_palette:
+        y_0 = np_image.shape[0]
+        for category, col in category_to_color.items():
+            if category is not None:
+                np_image = viz_handler.draw_text(
+                    np_image,
+                    (np_image.shape[1], y_0),
+                    category,
+                    color=col,
+                    font_scale=font_scale,
+                    rectangle_thickness=rectangle_thickness,
+                )
+                _, text_h = viz_handler.get_text_size(category, font_scale * 2)
+                y_0 = y_0 - int(1 * text_h)
     return np_image

{deepdoctection-0.34.dist-info → deepdoctection-0.36.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepdoctection
-Version: 0.34
+Version: 0.36
 Summary: Repository for Document AI
 Home-page: https://github.com/deepdoctection/deepdoctection
 Author: Dr. Janis Meyer
@@ -16,114 +16,117 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: catalogue ==2.0.10
-Requires-Dist: huggingface-hub >=0.12.0
-Requires-Dist: importlib-metadata >=5.0.0
-Requires-Dist: jsonlines ==3.1.0
-Requires-Dist: lazy-imports ==0.3.1
-Requires-Dist: mock ==4.0.3
-Requires-Dist: networkx >=2.7.1
-Requires-Dist: numpy <2.0,>=1.21
-Requires-Dist: packaging >=20.0
-Requires-Dist: Pillow >=10.0.0
-Requires-Dist: pypdf >=3.16.0
-Requires-Dist: pyyaml >=6.0.1
-Requires-Dist: pyzmq >=16
-Requires-Dist: scipy >=1.13.1
-Requires-Dist: termcolor >=1.1
-Requires-Dist: tabulate >=0.7.7
-Requires-Dist: tqdm ==4.64.0
-Provides-Extra: dev
-Requires-Dist: python-dotenv ==1.0.0 ; extra == 'dev'
-Requires-Dist: click ; extra == 'dev'
-Requires-Dist: black ==23.7.0 ; extra == 'dev'
-Requires-Dist: isort ==5.13.2 ; extra == 'dev'
-Requires-Dist: pylint ==2.17.4 ; extra == 'dev'
-Requires-Dist: mypy ==1.4.1 ; extra == 'dev'
-Requires-Dist: wandb ; extra == 'dev'
-Requires-Dist: types-PyYAML >=6.0.12.12 ; extra == 'dev'
-Requires-Dist: types-termcolor >=1.1.3 ; extra == 'dev'
-Requires-Dist: types-tabulate >=0.9.0.3 ; extra == 'dev'
-Requires-Dist: types-tqdm >=4.66.0.5 ; extra == 'dev'
-Requires-Dist: lxml-stubs >=0.5.1 ; extra == 'dev'
-Requires-Dist: types-Pillow >=10.2.0.20240406 ; extra == 'dev'
-Requires-Dist: types-urllib3 >=1.26.25.14 ; extra == 'dev'
-Provides-Extra: docs
-Requires-Dist: tensorpack ==0.11 ; extra == 'docs'
-Requires-Dist: boto3 ==1.34.102 ; extra == 'docs'
-Requires-Dist: transformers >=4.36.0 ; extra == 'docs'
-Requires-Dist: accelerate >=0.29.1 ; extra == 'docs'
-Requires-Dist: pdfplumber >=0.11.0 ; extra == 'docs'
-Requires-Dist: lxml >=4.9.1 ; extra == 'docs'
-Requires-Dist: lxml-stubs >=0.5.1 ; extra == 'docs'
-Requires-Dist: jdeskew >=0.2.2 ; extra == 'docs'
-Requires-Dist: jinja2 ==3.0.3 ; extra == 'docs'
-Requires-Dist: mkdocs-material ; extra == 'docs'
-Requires-Dist: mkdocstrings-python ; extra == 'docs'
-Requires-Dist: griffe ==0.25.0 ; extra == 'docs'
+Requires-Dist: catalogue==2.0.10
+Requires-Dist: huggingface_hub<0.26,>=0.12.0
+Requires-Dist: importlib-metadata>=5.0.0
+Requires-Dist: jsonlines==3.1.0
+Requires-Dist: lazy-imports==0.3.1
+Requires-Dist: mock==4.0.3
+Requires-Dist: networkx>=2.7.1
+Requires-Dist: numpy<2.0,>=1.21
+Requires-Dist: packaging>=20.0
+Requires-Dist: Pillow>=10.0.0
+Requires-Dist: pypdf>=3.16.0
+Requires-Dist: pypdfium2>=4.30.0
+Requires-Dist: pyyaml>=6.0.1
+Requires-Dist: pyzmq>=16
+Requires-Dist: scipy>=1.13.1
+Requires-Dist: termcolor>=1.1
+Requires-Dist: tabulate>=0.7.7
+Requires-Dist: tqdm==4.64.0
+Provides-Extra: tf
+Requires-Dist: catalogue==2.0.10; extra == "tf"
+Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "tf"
+Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
+Requires-Dist: jsonlines==3.1.0; extra == "tf"
+Requires-Dist: lazy-imports==0.3.1; extra == "tf"
+Requires-Dist: mock==4.0.3; extra == "tf"
+Requires-Dist: networkx>=2.7.1; extra == "tf"
+Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
+Requires-Dist: packaging>=20.0; extra == "tf"
+Requires-Dist: Pillow>=10.0.0; extra == "tf"
+Requires-Dist: pypdf>=3.16.0; extra == "tf"
+Requires-Dist: pypdfium2>=4.30.0; extra == "tf"
+Requires-Dist: pyyaml>=6.0.1; extra == "tf"
+Requires-Dist: pyzmq>=16; extra == "tf"
+Requires-Dist: scipy>=1.13.1; extra == "tf"
+Requires-Dist: termcolor>=1.1; extra == "tf"
+Requires-Dist: tabulate>=0.7.7; extra == "tf"
+Requires-Dist: tqdm==4.64.0; extra == "tf"
+Requires-Dist: tensorpack==0.11; extra == "tf"
+Requires-Dist: protobuf==3.20.1; extra == "tf"
+Requires-Dist: tensorflow-addons>=0.17.1; extra == "tf"
+Requires-Dist: tf2onnx>=1.9.2; extra == "tf"
+Requires-Dist: python-doctr==0.8.1; extra == "tf"
+Requires-Dist: pycocotools>=2.0.2; extra == "tf"
+Requires-Dist: boto3==1.34.102; extra == "tf"
+Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
+Requires-Dist: fasttext==0.9.2; extra == "tf"
+Requires-Dist: jdeskew>=0.2.2; extra == "tf"
+Requires-Dist: apted==1.0.3; extra == "tf"
+Requires-Dist: distance==0.1.3; extra == "tf"
+Requires-Dist: lxml>=4.9.1; extra == "tf"
 Provides-Extra: pt
-Requires-Dist: catalogue ==2.0.10 ; extra == 'pt'
-Requires-Dist: huggingface-hub >=0.12.0 ; extra == 'pt'
-Requires-Dist: importlib-metadata >=5.0.0 ; extra == 'pt'
-Requires-Dist: jsonlines ==3.1.0 ; extra == 'pt'
-Requires-Dist: lazy-imports ==0.3.1 ; extra == 'pt'
-Requires-Dist: mock ==4.0.3 ; extra == 'pt'
-Requires-Dist: networkx >=2.7.1 ; extra == 'pt'
-Requires-Dist: numpy <2.0,>=1.21 ; extra == 'pt'
-Requires-Dist: packaging >=20.0 ; extra == 'pt'
-Requires-Dist: Pillow >=10.0.0 ; extra == 'pt'
-Requires-Dist: pypdf >=3.16.0 ; extra == 'pt'
-Requires-Dist: pyyaml >=6.0.1 ; extra == 'pt'
-Requires-Dist: pyzmq >=16 ; extra == 'pt'
-Requires-Dist: scipy >=1.13.1 ; extra == 'pt'
-Requires-Dist: termcolor >=1.1 ; extra == 'pt'
-Requires-Dist: tabulate >=0.7.7 ; extra == 'pt'
-Requires-Dist: tqdm ==4.64.0 ; extra == 'pt'
-Requires-Dist: timm >=0.9.16 ; extra == 'pt'
-Requires-Dist: transformers >=4.36.0 ; extra == 'pt'
-Requires-Dist: accelerate >=0.29.1 ; extra == 'pt'
-Requires-Dist: python-doctr ==0.8.1 ; extra == 'pt'
-Requires-Dist: boto3 ==1.34.102 ; extra == 'pt'
-Requires-Dist: pdfplumber >=0.11.0 ; extra == 'pt'
-Requires-Dist: fasttext ==0.9.2 ; extra == 'pt'
-Requires-Dist: jdeskew >=0.2.2 ; extra == 'pt'
-Requires-Dist: apted ==1.0.3 ; extra == 'pt'
-Requires-Dist: distance ==0.1.3 ; extra == 'pt'
-Requires-Dist: lxml >=4.9.1 ; extra == 'pt'
+Requires-Dist: catalogue==2.0.10; extra == "pt"
+Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "pt"
+Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
+Requires-Dist: jsonlines==3.1.0; extra == "pt"
+Requires-Dist: lazy-imports==0.3.1; extra == "pt"
+Requires-Dist: mock==4.0.3; extra == "pt"
+Requires-Dist: networkx>=2.7.1; extra == "pt"
+Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
+Requires-Dist: packaging>=20.0; extra == "pt"
+Requires-Dist: Pillow>=10.0.0; extra == "pt"
+Requires-Dist: pypdf>=3.16.0; extra == "pt"
+Requires-Dist: pypdfium2>=4.30.0; extra == "pt"
+Requires-Dist: pyyaml>=6.0.1; extra == "pt"
+Requires-Dist: pyzmq>=16; extra == "pt"
+Requires-Dist: scipy>=1.13.1; extra == "pt"
+Requires-Dist: termcolor>=1.1; extra == "pt"
+Requires-Dist: tabulate>=0.7.7; extra == "pt"
+Requires-Dist: tqdm==4.64.0; extra == "pt"
+Requires-Dist: timm>=0.9.16; extra == "pt"
+Requires-Dist: transformers>=4.36.0; extra == "pt"
+Requires-Dist: accelerate>=0.29.1; extra == "pt"
+Requires-Dist: python-doctr==0.8.1; extra == "pt"
+Requires-Dist: boto3==1.34.102; extra == "pt"
+Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
+Requires-Dist: fasttext==0.9.2; extra == "pt"
+Requires-Dist: jdeskew>=0.2.2; extra == "pt"
+Requires-Dist: apted==1.0.3; extra == "pt"
+Requires-Dist: distance==0.1.3; extra == "pt"
+Requires-Dist: lxml>=4.9.1; extra == "pt"
+Provides-Extra: docs
+Requires-Dist: tensorpack==0.11; extra == "docs"
+Requires-Dist: boto3==1.34.102; extra == "docs"
+Requires-Dist: transformers>=4.36.0; extra == "docs"
+Requires-Dist: accelerate>=0.29.1; extra == "docs"
+Requires-Dist: pdfplumber>=0.11.0; extra == "docs"
+Requires-Dist: lxml>=4.9.1; extra == "docs"
+Requires-Dist: lxml-stubs>=0.5.1; extra == "docs"
+Requires-Dist: jdeskew>=0.2.2; extra == "docs"
+Requires-Dist: jinja2==3.0.3; extra == "docs"
+Requires-Dist: mkdocs-material; extra == "docs"
+Requires-Dist: mkdocstrings-python; extra == "docs"
+Requires-Dist: griffe==0.25.0; extra == "docs"
+Provides-Extra: dev
+Requires-Dist: python-dotenv==1.0.0; extra == "dev"
+Requires-Dist: click; extra == "dev"
+Requires-Dist: black==23.7.0; extra == "dev"
+Requires-Dist: isort==5.13.2; extra == "dev"
+Requires-Dist: pylint==2.17.4; extra == "dev"
+Requires-Dist: mypy==1.4.1; extra == "dev"
+Requires-Dist: wandb; extra == "dev"
+Requires-Dist: types-PyYAML>=6.0.12.12; extra == "dev"
+Requires-Dist: types-termcolor>=1.1.3; extra == "dev"
+Requires-Dist: types-tabulate>=0.9.0.3; extra == "dev"
+Requires-Dist: types-tqdm>=4.66.0.5; extra == "dev"
+Requires-Dist: lxml-stubs>=0.5.1; extra == "dev"
+Requires-Dist: types-Pillow>=10.2.0.20240406; extra == "dev"
+Requires-Dist: types-urllib3>=1.26.25.14; extra == "dev"
 Provides-Extra: test
-Requires-Dist: pytest ==8.0.2 ; extra == 'test'
-Requires-Dist: pytest-cov ; extra == 'test'
-Provides-Extra: tf
-Requires-Dist: catalogue ==2.0.10 ; extra == 'tf'
-Requires-Dist: huggingface-hub >=0.12.0 ; extra == 'tf'
-Requires-Dist: importlib-metadata >=5.0.0 ; extra == 'tf'
-Requires-Dist: jsonlines ==3.1.0 ; extra == 'tf'
-Requires-Dist: lazy-imports ==0.3.1 ; extra == 'tf'
-Requires-Dist: mock ==4.0.3 ; extra == 'tf'
-Requires-Dist: networkx >=2.7.1 ; extra == 'tf'
-Requires-Dist: numpy <2.0,>=1.21 ; extra == 'tf'
-Requires-Dist: packaging >=20.0 ; extra == 'tf'
-Requires-Dist: Pillow >=10.0.0 ; extra == 'tf'
-Requires-Dist: pypdf >=3.16.0 ; extra == 'tf'
-Requires-Dist: pyyaml >=6.0.1 ; extra == 'tf'
-Requires-Dist: pyzmq >=16 ; extra == 'tf'
-Requires-Dist: scipy >=1.13.1 ; extra == 'tf'
-Requires-Dist: termcolor >=1.1 ; extra == 'tf'
-Requires-Dist: tabulate >=0.7.7 ; extra == 'tf'
-Requires-Dist: tqdm ==4.64.0 ; extra == 'tf'
-Requires-Dist: tensorpack ==0.11 ; extra == 'tf'
-Requires-Dist: protobuf ==3.20.1 ; extra == 'tf'
-Requires-Dist: tensorflow-addons >=0.17.1 ; extra == 'tf'
-Requires-Dist: tf2onnx >=1.9.2 ; extra == 'tf'
-Requires-Dist: python-doctr ==0.8.1 ; extra == 'tf'
-Requires-Dist: pycocotools >=2.0.2 ; extra == 'tf'
-Requires-Dist: boto3 ==1.34.102 ; extra == 'tf'
-Requires-Dist: pdfplumber >=0.11.0 ; extra == 'tf'
-Requires-Dist: fasttext ==0.9.2 ; extra == 'tf'
-Requires-Dist: jdeskew >=0.2.2 ; extra == 'tf'
-Requires-Dist: apted ==1.0.3 ; extra == 'tf'
-Requires-Dist: distance ==0.1.3 ; extra == 'tf'
-Requires-Dist: lxml >=4.9.1 ; extra == 'tf'
+Requires-Dist: pytest==8.0.2; extra == "test"
+Requires-Dist: pytest-cov; extra == "test"
 <p align="center">
@@ -172,13 +175,17 @@ pipelines. Its core function does not depend on any specific deep learning libra
  - Document layout analysis and table recognition now runs with
    [**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
    anymore for basic inference.
- - [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
-   (not contained in the built-in Analyzer).
- - [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
+ - More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
+ - Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
    [**transformers**](https://github.com/huggingface/transformers).
    We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
    that look promising, especially if you want to train a model on non-English data. The training script for
-   LayoutLM can be used for LiLT as well and we will be providing a notebook on how to train a model on a custom dataset soon.
+   LayoutLM can be used for LiLT as well.
+ - [**new**] There are two notebooks available that show how to write a
+   [custom predictor](https://github.com/deepdoctection/notebooks/blob/main/Doclaynet_Analyzer_Config.ipynb) based on
+   a third party library that has not been supported yet and how to use
+   [advanced configuration](https://github.com/deepdoctection/notebooks/blob/main/Doclaynet_Analyzer_Config.ipynb) to
+   get links between layout segments e.g. captions and tables or figures.
 **deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
 post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
@@ -263,7 +270,7 @@ documentation.
 ## Requirements
-![requirements](https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/requirements_deepdoctection.png)
+![requirements](https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/requirements_deepdoctection_081124.png)
 Everything in the overview listed below the **deep**doctection layer is a necessary requirement and has to be installed
 separately.
@@ -272,13 +279,16 @@ separately.
 - Python >= 3.9
 - 1.13 <= PyTorch  **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
 In general, if you want to train or fine-tune models, a GPU is required.
-- **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
-images.
 - With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
   and [PyTorch](https://pytorch.org/get-started/locally/).
 - [Tesseract](https://github.com/tesseract-ocr/tesseract) OCR engine will be used through a Python wrapper. The core
   engine has to be installed separately.
+- For release `v.0.34.0` and below, **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF
+  documents into images. From release `v.0.35.0` onwards this dependency is optional.
 The following overview shows the availability of the models in conjunction with the DL framework.
 | Task                                          | PyTorch | Torchscript    |  Tensorflow  |
@@ -396,8 +406,8 @@ to develop this framework.
 ## Problems
 We try hard to eliminate bugs. We also know that the code is not free of issues. We welcome all issues relevant to this
-repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 4
-to 6 weeks.
+repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 10
+to 12 weeks.
 ## If you like **deep**doctection ...

deepdoctection 0.34__py3-none-any.whl → 0.36__py3-none-any.whl

Potentially problematic release.

deepdoctection 0.34py3-none-any.whl → 0.36py3-none-any.whl