PyPI - docling - Versions diffs - 2.15.1__py3-none-any.whl → 2.17.0__py3-none-any.whl - Mend

docling-2.15.1-py3-none-any.whl → docling-2.17.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

docling/backend/abstract_backend.py +0 -1
docling/backend/asciidoc_backend.py +0 -1
docling/backend/docling_parse_backend.py +1 -1
docling/backend/docling_parse_v2_backend.py +1 -1
docling/backend/html_backend.py +4 -3
docling/backend/json/__init__.py +0 -0
docling/backend/json/docling_json_backend.py +58 -0
docling/backend/md_backend.py +49 -36
docling/backend/msexcel_backend.py +50 -38
docling/backend/msword_backend.py +0 -1
docling/backend/pdf_backend.py +0 -2
docling/backend/pypdfium2_backend.py +1 -1
docling/backend/xml/uspto_backend.py +25 -25
docling/cli/main.py +18 -3
docling/datamodel/base_models.py +30 -3
docling/datamodel/document.py +4 -0
docling/datamodel/pipeline_options.py +7 -9
docling/document_converter.py +4 -0
docling/models/base_model.py +62 -6
docling/models/code_formula_model.py +245 -0
docling/models/document_picture_classifier.py +187 -0
docling/models/layout_model.py +10 -86
docling/models/page_assemble_model.py +1 -33
docling/models/rapid_ocr_model.py +1 -0
docling/models/tesseract_ocr_cli_model.py +72 -5
docling/models/tesseract_ocr_model.py +68 -20
docling/pipeline/base_pipeline.py +40 -17
docling/pipeline/standard_pdf_pipeline.py +31 -2
docling/utils/glm_utils.py +4 -1
docling/utils/ocr_utils.py +9 -0
docling/utils/visualization.py +80 -0
{docling-2.15.1.dist-info → docling-2.17.0.dist-info}/METADATA +17 -13
docling-2.17.0.dist-info/RECORD +62 -0
docling-2.15.1.dist-info/RECORD +0 -56
{docling-2.15.1.dist-info → docling-2.17.0.dist-info}/LICENSE +0 -0
{docling-2.15.1.dist-info → docling-2.17.0.dist-info}/WHEEL +0 -0
{docling-2.15.1.dist-info → docling-2.17.0.dist-info}/entry_points.txt +0 -0

docling/models/code_formula_model.py ADDED Viewed

@@ -0,0 +1,245 @@
+import re
+from pathlib import Path
+from typing import Iterable, List, Literal, Optional, Tuple, Union
+from docling_core.types.doc import (
+    CodeItem,
+    DocItemLabel,
+    DoclingDocument,
+    NodeItem,
+    TextItem,
+)
+from docling_core.types.doc.labels import CodeLanguageLabel
+from PIL import Image
+from pydantic import BaseModel
+from docling.datamodel.base_models import ItemAndImageEnrichmentElement
+from docling.datamodel.pipeline_options import AcceleratorOptions
+from docling.models.base_model import BaseItemAndImageEnrichmentModel
+from docling.utils.accelerator_utils import decide_device
+class CodeFormulaModelOptions(BaseModel):
+    """
+    Configuration options for the CodeFormulaModel.
+    Attributes
+    ----------
+    kind : Literal["code_formula"]
+        Type of the model. Fixed value "code_formula".
+    do_code_enrichment : bool
+        True if code enrichment is enabled, False otherwise. Defaults to True.
+    do_formula_enrichment : bool
+        True if formula enrichment is enabled, False otherwise. Defaults to True.
+    """
+    kind: Literal["code_formula"] = "code_formula"
+    do_code_enrichment: bool = True
+    do_formula_enrichment: bool = True
+class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
+    """
+    Model for processing and enriching documents with code and formula predictions.
+    Attributes
+    ----------
+    enabled : bool
+        True if the model is enabled, False otherwise.
+    options : CodeFormulaModelOptions
+        Configuration options for the CodeFormulaModel.
+    code_formula_model : CodeFormulaPredictor
+        The predictor model for code and formula processing. Only set if the model is enabled.
+    Methods
+    -------
+    __init__(self, enabled, artifacts_path, options, accelerator_options)
+        Initializes the CodeFormulaModel with the given configuration options.
+    is_processable(self, doc, element)
+        Determines if a given element in a document can be processed by the model.
+    __call__(self, doc, element_batch)
+        Processes the given batch of elements and enriches them with predictions.
+    """
+    images_scale = 1.66  # = 120 dpi, aligned with training data resolution
+    expansion_factor = 0.03
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Union[Path, str]],
+        options: CodeFormulaModelOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
+        """
+        Initializes the CodeFormulaModel with the given configuration.
+        Parameters
+        ----------
+        enabled : bool
+            True if the model is enabled, False otherwise.
+        artifacts_path : Optional[Union[Path, str]]
+            Path to the directory containing the model artifacts. If None, the artifacts are downloaded via download_models_hf().
+        options : CodeFormulaModelOptions
+            Configuration options for the model.
+        accelerator_options : AcceleratorOptions
+            Options specifying the device and number of threads for acceleration.
+        """
+        self.enabled = enabled
+        self.options = options
+        if self.enabled:
+            device = decide_device(accelerator_options.device)
+            from docling_ibm_models.code_formula_model.code_formula_predictor import (
+                CodeFormulaPredictor,
+            )
+            if artifacts_path is None:
+                artifacts_path = self.download_models_hf()
+            else:
+                artifacts_path = Path(artifacts_path)
+            self.code_formula_model = CodeFormulaPredictor(
+                artifacts_path=artifacts_path,
+                device=device,
+                num_threads=accelerator_options.num_threads,
+            )
+    @staticmethod
+    def download_models_hf(
+        local_dir: Optional[Path] = None, force: bool = False
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+        disable_progress_bars()
+        download_path = snapshot_download(
+            repo_id="ds4sd/CodeFormula",
+            force_download=force,
+            local_dir=local_dir,
+            revision="v1.0.0",
+        )
+        return Path(download_path)
+    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
+        """
+        Determines if a given element in a document can be processed by the model.
+        Parameters
+        ----------
+        doc : DoclingDocument
+            The document being processed.
+        element : NodeItem
+            The element within the document to check.
+        Returns
+        -------
+        bool
+            True if the element can be processed, False otherwise.
+        """
+        return self.enabled and (
+            (isinstance(element, CodeItem) and self.options.do_code_enrichment)
+            or (
+                isinstance(element, TextItem)
+                and element.label == DocItemLabel.FORMULA
+                and self.options.do_formula_enrichment
+            )
+        )
+    def _extract_code_language(self, input_string: str) -> Tuple[str, Optional[str]]:
+        """Extracts a programming language from the beginning of a string.
+        This function checks if the input string starts with a pattern of the form
+        ``<_some_language_>``. If it does, it extracts the language string and returns
+        a tuple of (remainder, language). Otherwise, it returns the original string
+        and `None`.
+        Args:
+            input_string (str): The input string, which may start with ``<_language_>``.
+        Returns:
+            Tuple[str, Optional[str]]:
+                A tuple where:
+                - The first element is either:
+                    - The remainder of the string (everything after ``<_language_>``),
+                    if a match is found; or
+                    - The original string, if no match is found.
+                - The second element is the extracted language if a match is found;
+                otherwise, `None`.
+        """
+        pattern = r"^<_([^>]+)_>\s*(.*)"
+        match = re.match(pattern, input_string, flags=re.DOTALL)
+        if match:
+            language = str(match.group(1))  # the captured programming language
+            remainder = str(match.group(2))  # everything after the <_language_>
+            return remainder, language
+        else:
+            return input_string, None
+    def _get_code_language_enum(self, value: Optional[str]) -> CodeLanguageLabel:
+        """
+        Converts a string to a corresponding `CodeLanguageLabel` enum member.
+        If the provided string does not match any value in `CodeLanguageLabel`,
+        it defaults to `CodeLanguageLabel.UNKNOWN`.
+        Args:
+            value (Optional[str]): The string representation of the code language or None.
+        Returns:
+            CodeLanguageLabel: The corresponding enum member if the value is valid,
+            otherwise `CodeLanguageLabel.UNKNOWN`.
+        """
+        if not isinstance(value, str):
+            return CodeLanguageLabel.UNKNOWN
+        try:
+            return CodeLanguageLabel(value)
+        except ValueError:
+            return CodeLanguageLabel.UNKNOWN
+    def __call__(
+        self,
+        doc: DoclingDocument,
+        element_batch: Iterable[ItemAndImageEnrichmentElement],
+    ) -> Iterable[NodeItem]:
+        """
+        Processes the given batch of elements and enriches them with predictions.
+        Parameters
+        ----------
+        doc : DoclingDocument
+            The document being processed.
+        element_batch : Iterable[ItemAndImageEnrichmentElement]
+            A batch of elements to be processed.
+        Returns
+        -------
+        Iterable[NodeItem]
+            The enriched items; if the model is disabled, the items are yielded unchanged.
+        """
+        if not self.enabled:
+            for element in element_batch:
+                yield element.item
+            return
+        labels: List[str] = []
+        images: List[Image.Image] = []
+        elements: List[TextItem] = []
+        for el in element_batch:
+            assert isinstance(el.item, TextItem)
+            elements.append(el.item)
+            labels.append(el.item.label)
+            images.append(el.image)
+        outputs = self.code_formula_model.predict(images, labels)
+        for item, output in zip(elements, outputs):
+            if isinstance(item, CodeItem):
+                output, code_language = self._extract_code_language(output)
+                item.code_language = self._get_code_language_enum(code_language)
+            item.text = output
+            yield item

docling/models/document_picture_classifier.py ADDED Viewed

@@ -0,0 +1,187 @@
+from pathlib import Path
+from typing import Iterable, List, Literal, Optional, Tuple, Union
+from docling_core.types.doc import (
+    DoclingDocument,
+    NodeItem,
+    PictureClassificationClass,
+    PictureClassificationData,
+    PictureItem,
+)
+from PIL import Image
+from pydantic import BaseModel
+from docling.datamodel.pipeline_options import AcceleratorOptions
+from docling.models.base_model import BaseEnrichmentModel
+from docling.utils.accelerator_utils import decide_device
+class DocumentPictureClassifierOptions(BaseModel):
+    """
+    Options for configuring the DocumentPictureClassifier.
+    Attributes
+    ----------
+    kind : Literal["document_picture_classifier"]
+        Identifier for the type of classifier. Fixed value "document_picture_classifier".
+    """
+    kind: Literal["document_picture_classifier"] = "document_picture_classifier"
+class DocumentPictureClassifier(BaseEnrichmentModel):
+    """
+    A model for classifying pictures in documents.
+    This class enriches document pictures with predicted classifications
+    based on a predefined set of classes.
+    Attributes
+    ----------
+    enabled : bool
+        Whether the classifier is enabled for use.
+    options : DocumentPictureClassifierOptions
+        Configuration options for the classifier.
+    document_picture_classifier : DocumentFigureClassifierPredictor
+        The underlying prediction model, loaded if the classifier is enabled.
+    Methods
+    -------
+    __init__(enabled, artifacts_path, options, accelerator_options)
+        Initializes the classifier with specified configurations.
+    is_processable(doc, element)
+        Checks if the given element can be processed by the classifier.
+    __call__(doc, element_batch)
+        Processes a batch of elements and adds classification annotations.
+    """
+    images_scale = 2
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Union[Path, str]],
+        options: DocumentPictureClassifierOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
+        """
+        Initializes the DocumentPictureClassifier.
+        Parameters
+        ----------
+        enabled : bool
+            Indicates whether the classifier is enabled.
+        artifacts_path : Optional[Union[Path, str]]
+            Path to the directory containing model artifacts. If None, the artifacts are downloaded via download_models_hf().
+        options : DocumentPictureClassifierOptions
+            Configuration options for the classifier.
+        accelerator_options : AcceleratorOptions
+            Options for configuring the device and parallelism.
+        """
+        self.enabled = enabled
+        self.options = options
+        if self.enabled:
+            device = decide_device(accelerator_options.device)
+            from docling_ibm_models.document_figure_classifier_model.document_figure_classifier_predictor import (
+                DocumentFigureClassifierPredictor,
+            )
+            if artifacts_path is None:
+                artifacts_path = self.download_models_hf()
+            else:
+                artifacts_path = Path(artifacts_path)
+            self.document_picture_classifier = DocumentFigureClassifierPredictor(
+                artifacts_path=artifacts_path,
+                device=device,
+                num_threads=accelerator_options.num_threads,
+            )
+    @staticmethod
+    def download_models_hf(
+        local_dir: Optional[Path] = None, force: bool = False
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+        disable_progress_bars()
+        download_path = snapshot_download(
+            repo_id="ds4sd/DocumentFigureClassifier",
+            force_download=force,
+            local_dir=local_dir,
+            revision="v1.0.0",
+        )
+        return Path(download_path)
+    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
+        """
+        Determines if the given element can be processed by the classifier.
+        Parameters
+        ----------
+        doc : DoclingDocument
+            The document containing the element.
+        element : NodeItem
+            The element to be checked.
+        Returns
+        -------
+        bool
+            True if the element is a PictureItem and processing is enabled; False otherwise.
+        """
+        return self.enabled and isinstance(element, PictureItem)
+    def __call__(
+        self,
+        doc: DoclingDocument,
+        element_batch: Iterable[NodeItem],
+    ) -> Iterable[NodeItem]:
+        """
+        Processes a batch of elements and enriches them with classification predictions.
+        Parameters
+        ----------
+        doc : DoclingDocument
+            The document containing the elements to be processed.
+        element_batch : Iterable[NodeItem]
+            A batch of pictures to classify.
+        Returns
+        -------
+        Iterable[NodeItem]
+            An iterable of NodeItem objects after processing. A PictureClassificationData
+            entry with the predicted classes is appended to each picture's 'annotations'.
+        """
+        if not self.enabled:
+            for element in element_batch:
+                yield element
+            return
+        images: List[Image.Image] = []
+        elements: List[PictureItem] = []
+        for el in element_batch:
+            assert isinstance(el, PictureItem)
+            elements.append(el)
+            img = el.get_image(doc)
+            assert img is not None
+            images.append(img)
+        outputs = self.document_picture_classifier.predict(images)
+        for element, output in zip(elements, outputs):
+            element.annotations.append(
+                PictureClassificationData(
+                    provenance="DocumentPictureClassifier",
+                    predicted_classes=[
+                        PictureClassificationClass(
+                            class_name=pred[0],
+                            confidence=pred[1],
+                        )
+                        for pred in output
+                    ],
+                )
+            )
+            yield element

docling/models/layout_model.py CHANGED Viewed

@@ -1,28 +1,21 @@
 import copy
 import logging
-import random
-import time
 from pathlib import Path
-from typing import Iterable, List
+from typing import Iterable
-from docling_core.types.doc import CoordOrigin, DocItemLabel
+from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
-from PIL import Image, ImageDraw, ImageFont
-from docling.datamodel.base_models import (
-    BoundingBox,
-    Cell,
-    Cluster,
-    LayoutPrediction,
-    Page,
-)
+from PIL import Image
+from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.pipeline_options import AcceleratorOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.layout_postprocessor import LayoutPostprocessor
 from docling.utils.profiling import TimeRecorder
+from docling.utils.visualization import draw_clusters
 _log = logging.getLogger(__name__)
@@ -40,7 +33,7 @@ class LayoutModel(BasePageModel):
         DocItemLabel.PAGE_FOOTER,
         DocItemLabel.CODE,
         DocItemLabel.LIST_ITEM,
-        # "Formula",
+        DocItemLabel.FORMULA,
     ]
     PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
@@ -82,78 +75,9 @@ class LayoutModel(BasePageModel):
         left_image = copy.deepcopy(page.image)
         right_image = copy.deepcopy(page.image)
-        # Function to draw clusters on an image
-        def draw_clusters(image, clusters):
-            draw = ImageDraw.Draw(image, "RGBA")
-            # Create a smaller font for the labels
-            try:
-                font = ImageFont.truetype("arial.ttf", 12)
-            except OSError:
-                # Fallback to default font if arial is not available
-                font = ImageFont.load_default()
-            for c_tl in clusters:
-                all_clusters = [c_tl, *c_tl.children]
-                for c in all_clusters:
-                    # Draw cells first (underneath)
-                    cell_color = (0, 0, 0, 40)  # Transparent black for cells
-                    for tc in c.cells:
-                        cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
-                        cx0 *= scale_x
-                        cx1 *= scale_x
-                        cy0 *= scale_x
-                        cy1 *= scale_y
-                        draw.rectangle(
-                            [(cx0, cy0), (cx1, cy1)],
-                            outline=None,
-                            fill=cell_color,
-                        )
-                    # Draw cluster rectangle
-                    x0, y0, x1, y1 = c.bbox.as_tuple()
-                    x0 *= scale_x
-                    x1 *= scale_x
-                    y0 *= scale_x
-                    y1 *= scale_y
-                    cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
-                    cluster_outline_color = (
-                        *list(DocItemLabel.get_color(c.label)),
-                        255,
-                    )
-                    draw.rectangle(
-                        [(x0, y0), (x1, y1)],
-                        outline=cluster_outline_color,
-                        fill=cluster_fill_color,
-                    )
-                    # Add label name and confidence
-                    label_text = f"{c.label.name} ({c.confidence:.2f})"
-                    # Create semi-transparent background for text
-                    text_bbox = draw.textbbox((x0, y0), label_text, font=font)
-                    text_bg_padding = 2
-                    draw.rectangle(
-                        [
-                            (
-                                text_bbox[0] - text_bg_padding,
-                                text_bbox[1] - text_bg_padding,
-                            ),
-                            (
-                                text_bbox[2] + text_bg_padding,
-                                text_bbox[3] + text_bg_padding,
-                            ),
-                        ],
-                        fill=(255, 255, 255, 180),  # Semi-transparent white
-                    )
-                    # Draw text
-                    draw.text(
-                        (x0, y0),
-                        label_text,
-                        fill=(0, 0, 0, 255),  # Solid black
-                        font=font,
-                    )
         # Draw clusters on both images
-        draw_clusters(left_image, left_clusters)
-        draw_clusters(right_image, right_clusters)
+        draw_clusters(left_image, left_clusters, scale_x, scale_y)
+        draw_clusters(right_image, right_clusters, scale_x, scale_y)
         # Combine the images side by side
         combined_width = left_image.width * 2
         combined_height = left_image.height

docling/models/page_assemble_model.py CHANGED Viewed

@@ -22,7 +22,7 @@ _log = logging.getLogger(__name__)
 class PageAssembleOptions(BaseModel):
-    keep_images: bool = False
+    pass
 class PageAssembleModel(BasePageModel):
@@ -135,31 +135,6 @@ class PageAssembleModel(BasePageModel):
                                 )
                             elements.append(fig)
                             body.append(fig)
-                        elif cluster.label == LayoutModel.FORMULA_LABEL:
-                            equation = None
-                            if page.predictions.equations_prediction:
-                                equation = page.predictions.equations_prediction.equation_map.get(
-                                    cluster.id, None
-                                )
-                            if (
-                                not equation
-                            ):  # fallback: add empty formula, if it isn't present
-                                text = self.sanitize_text(
-                                    [
-                                        cell.text.replace("\x02", "-").strip()
-                                        for cell in cluster.cells
-                                        if len(cell.text.strip()) > 0
-                                    ]
-                                )
-                                equation = TextElement(
-                                    label=cluster.label,
-                                    id=cluster.id,
-                                    cluster=cluster,
-                                    page_no=page.page_no,
-                                    text=text,
-                                )
-                            elements.append(equation)
-                            body.append(equation)
                         elif cluster.label in LayoutModel.CONTAINER_LABELS:
                             container_el = ContainerElement(
                                 label=cluster.label,
@@ -174,11 +149,4 @@ class PageAssembleModel(BasePageModel):
                         elements=elements, headers=headers, body=body
                     )
-                    # Remove page images (can be disabled)
-                    if not self.options.keep_images:
-                        page._image_cache = {}
-                    # Unload backend
-                    page._backend.unload()
                 yield page

docling/models/rapid_ocr_model.py CHANGED Viewed

@@ -59,6 +59,7 @@ class RapidOcrModel(BaseOcrModel):
                 det_model_path=self.options.det_model_path,
                 cls_model_path=self.options.cls_model_path,
                 rec_model_path=self.options.rec_model_path,
+                rec_keys_path=self.options.rec_keys_path,
             )
     def __call__(

docling 2.15.1__py3-none-any.whl → 2.17.0__py3-none-any.whl

docling-2.15.1-py3-none-any.whl → docling-2.17.0-py3-none-any.whl