PyPI - docling - Versions diffs - 2.56.1__py3-none-any.whl → 2.57.0__py3-none-any.whl - Mend

docling 2.56.1__py3-none-any.whl → 2.57.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling might be problematic. Click here for more details.

Files changed (9) hide show

docling/backend/docx/drawingml/utils.py ADDED Viewed

@@ -0,0 +1,131 @@
+import os
+import shutil
+import subprocess
+from pathlib import Path
+from tempfile import mkdtemp
+from typing import Callable, Optional
+import pypdfium2
+from docx.document import Document
+from PIL import Image, ImageChops
+def get_libreoffice_cmd(raise_if_unavailable: bool = False) -> Optional[str]:
+    """Locate the LibreOffice executable and optionally verify that it runs.
+
+    Lookup order: the ``DOCLING_LIBREOFFICE_CMD`` environment variable, then
+    ``libreoffice`` and ``soffice`` on ``PATH``, then the default macOS
+    application bundle location.
+
+    Args:
+        raise_if_unavailable: When true, raise if no command was found and run
+            ``<cmd> -h`` to confirm that the command can actually be executed.
+
+    Returns:
+        The command (or path) to invoke, or ``None`` when nothing was found and
+        ``raise_if_unavailable`` is false.
+
+    Raises:
+        RuntimeError: No command was found and ``raise_if_unavailable`` is true.
+        subprocess.CalledProcessError: The ``-h`` probe exited with a non-zero
+            status.
+        OSError: The command could not be executed at all.
+    """
+    macos_soffice = "/Applications/LibreOffice.app/Contents/MacOS/soffice"
+    libreoffice_cmd = (
+        # Explicit user override: the DrawingML warning emitted by the MS Word
+        # backend tells users to set this variable, so it has to be honoured.
+        os.environ.get("DOCLING_LIBREOFFICE_CMD")
+        or shutil.which("libreoffice")
+        or shutil.which("soffice")
+        or (macos_soffice if os.path.isfile(macos_soffice) else None)
+    )
+    if raise_if_unavailable:
+        if libreoffice_cmd is None:
+            raise RuntimeError("Libreoffice not found")
+        # The following test will raise if the libreoffice_cmd cannot be used
+        subprocess.run(
+            [
+                libreoffice_cmd,
+                "-h",
+            ],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+            check=True,
+        )
+    return libreoffice_cmd
+def get_docx_to_pdf_converter() -> Optional[Callable]:
+    """
+    Detects the best available DOCX to PDF tool and returns a conversion function.
+    The returned function accepts (input_path, output_path); each may be a
+    ``str`` or a path-like object.
+    Returns None if no tool is available.
+    """
+    # Try LibreOffice
+    libreoffice_cmd = get_libreoffice_cmd()
+    if libreoffice_cmd:
+        def convert_with_libreoffice(input_path, output_path):
+            # Normalise to plain strings first. A ``Path`` never compares equal
+            # to a ``str``, so without this the comparison further down was
+            # always true for Path arguments and the freshly written PDF was
+            # renamed onto itself.
+            input_path = os.fspath(input_path)
+            output_path = os.fspath(output_path)
+            output_dir = os.path.dirname(output_path)
+            subprocess.run(
+                [
+                    libreoffice_cmd,
+                    "--headless",
+                    "--convert-to",
+                    "pdf",
+                    "--outdir",
+                    output_dir,
+                    input_path,
+                ],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+                check=True,
+            )
+            # The PDF is expected next to output_path, named after the input
+            # file's stem; move it only if the caller asked for another name.
+            expected_output = os.path.join(
+                output_dir,
+                os.path.splitext(os.path.basename(input_path))[0] + ".pdf",
+            )
+            if expected_output != output_path:
+                os.rename(expected_output, output_path)
+        return convert_with_libreoffice
+    ## Space for other DOCX to PDF converters if available
+    # No tools found
+    return None
+def crop_whitespace(image: Image.Image, bg_color=None, padding=0) -> Image.Image:
+    """Trim the uniform background border around the content of ``image``.
+
+    Args:
+        image: Image to crop.
+        bg_color: Colour treated as background. When ``None`` the colour of the
+            top-left pixel is used.
+        padding: Number of pixels of background to keep around the content,
+            clamped to the image bounds.
+
+    Returns:
+        The cropped image, or ``image`` itself when no pixel differs from the
+        background colour.
+    """
+    background_color = image.getpixel((0, 0)) if bg_color is None else bg_color
+    background = Image.new(image.mode, image.size, background_color)
+    # Bounding box of every pixel that differs from the flat background
+    content_box = ImageChops.difference(image, background).getbbox()
+    if not content_box:
+        return image
+    x0, y0, x1, y1 = content_box
+    return image.crop(
+        (
+            max(0, x0 - padding),
+            max(0, y0 - padding),
+            min(image.width, x1 + padding),
+            min(image.height, y1 + padding),
+        )
+    )
+def get_pil_from_dml_docx(
+    docx: Document, converter: Optional[Callable]
+) -> Optional[Image.Image]:
+    """Render the first page of ``docx`` to a whitespace-cropped PIL image.
+
+    The document is saved to a temporary directory, converted to PDF with
+    ``converter``, and the first PDF page is rasterised at 2x scale.
+
+    Args:
+        docx: Document to render.
+        converter: Callable taking ``(input_path, output_path)`` that writes a
+            PDF; ``None`` disables rendering.
+
+    Returns:
+        The cropped page image, or ``None`` when ``converter`` is ``None``.
+
+    Raises:
+        Any exception raised while saving, converting or rendering is
+        propagated after the temporary files have been removed.
+    """
+    if converter is None:
+        return None
+    temp_dir = Path(mkdtemp())
+    # try/finally so the temporary directory and the pdfium handles are
+    # released even when saving, converting or rendering fails.
+    try:
+        temp_docx = temp_dir / "drawing_only.docx"
+        temp_pdf = temp_dir / "drawing_only.pdf"
+        # 1) Save docx temporarily
+        docx.save(str(temp_docx))
+        # 2) Export to PDF
+        converter(temp_docx, temp_pdf)
+        # 3) Load PDF as PNG
+        pdf = pypdfium2.PdfDocument(temp_pdf)
+        try:
+            page = pdf[0]
+            try:
+                image = crop_whitespace(page.render(scale=2).to_pil())
+            finally:
+                page.close()
+        finally:
+            pdf.close()
+    finally:
+        shutil.rmtree(temp_dir, ignore_errors=True)
+    return image

docling/backend/msword_backend.py CHANGED Viewed

@@ -1,8 +1,9 @@
 import logging
 import re
+from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
-from typing import Any, List, Optional, Union
+from typing import Any, Callable, List, Optional, Union
 from docling_core.types.doc import (
     DocItemLabel,
@@ -33,6 +34,11 @@ from pydantic import AnyUrl
 from typing_extensions import override
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.backend.docx.drawingml.utils import (
+    get_docx_to_pdf_converter,
+    get_libreoffice_cmd,
+    get_pil_from_dml_docx,
+)
 from docling.backend.docx.latex.omml import oMath2Latex
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
@@ -64,6 +70,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         self.equation_bookends: str = "<eq>{EQ}</eq>"
         # Track processed textbox elements to avoid duplication
         self.processed_textbox_elements: List[int] = []
+        self.docx_to_pdf_converter: Optional[Callable] = None
+        self.docx_to_pdf_converter_init = False
+        self.display_drawingml_warning = True
         for i in range(-1, self.max_levels):
             self.parents[i] = None
@@ -80,18 +89,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             "indents": [None],
         }
-        self.docx_obj = None
-        try:
-            if isinstance(self.path_or_stream, BytesIO):
-                self.docx_obj = Document(self.path_or_stream)
-            elif isinstance(self.path_or_stream, Path):
-                self.docx_obj = Document(str(self.path_or_stream))
+        self.docx_obj = self.load_msword_file(
+            path_or_stream=self.path_or_stream, document_hash=self.document_hash
+        )
+        if self.docx_obj:
             self.valid = True
-        except Exception as e:
-            raise RuntimeError(
-                f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
-            ) from e
     @override
     def is_valid(self) -> bool:
@@ -139,6 +141,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 f"Cannot convert doc with {self.document_hash} because the backend failed to init."
             )
+    @staticmethod
+    def load_msword_file(
+        path_or_stream: Union[BytesIO, Path], document_hash: str
+    ) -> Optional[DocxDocument]:
+        """Open a Word document from an in-memory stream or a filesystem path.
+
+        Args:
+            path_or_stream: ``BytesIO`` holding the document bytes, or a
+                ``Path`` to the file.
+            document_hash: Identifier used only in the error message.
+
+        Returns:
+            The loaded document, or ``None`` when ``path_or_stream`` is neither
+            a ``BytesIO`` nor a ``Path``.
+
+        Raises:
+            RuntimeError: The document could not be opened; the original
+                exception is chained as the cause.
+        """
+        try:
+            if isinstance(path_or_stream, BytesIO):
+                return Document(path_or_stream)
+            if isinstance(path_or_stream, Path):
+                return Document(str(path_or_stream))
+        except Exception as e:
+            raise RuntimeError(
+                f"MsWordDocumentBackend could not load document with hash {document_hash}"
+            ) from e
+        # Unsupported input type: nothing was loaded
+        return None
     def _update_history(
         self,
         name: str,
@@ -195,6 +213,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             }
             xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
             drawing_blip = xpath_expr(element)
+            drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)
             # Check for textbox content - check multiple textbox formats
             # Only process if the element hasn't been processed before
@@ -274,6 +293,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 ):
                     te1 = self._handle_text_elements(element, docx_obj, doc)
                     added_elements.extend(te1)
+            # Check for DrawingML elements
+            elif drawingml_els:
+                if (
+                    self.docx_to_pdf_converter is None
+                    and self.docx_to_pdf_converter_init is False
+                ):
+                    self.docx_to_pdf_converter = get_docx_to_pdf_converter()
+                    self.docx_to_pdf_converter_init = True
+                if self.docx_to_pdf_converter is None:
+                    if self.display_drawingml_warning:
+                        if self.docx_to_pdf_converter is None:
+                            _log.warning(
+                                "Found DrawingML elements in document, but no DOCX to PDF converters. "
+                                "If you want these exported, make sure you have "
+                                "LibreOffice binary in PATH or specify its path with DOCLING_LIBREOFFICE_CMD."
+                            )
+                            self.display_drawingml_warning = False
+                else:
+                    self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
             # Check for the sdt containers, like table of contents
             elif tag_name in ["sdt"]:
                 sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -1381,3 +1420,39 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 )
                 elem_ref.append(p3.get_ref())
         return elem_ref
+    def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any):
+        """Render DrawingML elements to an image and add it to ``doc`` as a picture.
+
+        The source document is reloaded and its body emptied, the drawing
+        elements are copied into a single new run, and the result is rendered
+        through ``self.docx_to_pdf_converter``. When no image is produced, or an
+        ``UnidentifiedImageError``/``OSError`` is raised, a picture without
+        image data is added instead.
+
+        Args:
+            doc: Document that receives the picture item.
+            drawingml_els: Iterable of DrawingML elements to render.
+        """
+        # Step 1: reload the source file and strip every child of its body
+        scratch_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
+        scratch_body = scratch_doc._element.body
+        for node in list(scratch_body):
+            scratch_body.remove(node)
+        # Step 2: copy the drawings into one run of the now-empty document
+        drawing_run = scratch_doc.add_paragraph().add_run()
+        for drawing in drawingml_els:
+            drawing_run._r.append(deepcopy(drawing))
+        # Step 3: DOCX -> PDF -> image, then register it in the DoclingDocument
+        level = self._get_level()
+        try:
+            rendered = get_pil_from_dml_docx(
+                scratch_doc, converter=self.docx_to_pdf_converter
+            )
+            if rendered is None:
+                # Route "nothing rendered" through the same fallback as a
+                # failed image load
+                raise UnidentifiedImageError
+            doc.add_picture(
+                parent=self.parents[level - 1],
+                image=ImageRef.from_pil(image=rendered, dpi=72),
+                caption=None,
+            )
+        except (UnidentifiedImageError, OSError):
+            _log.warning("Warning: DrawingML image cannot be loaded by Pillow")
+            doc.add_picture(parent=self.parents[level - 1], caption=None)

docling/pipeline/vlm_pipeline.py CHANGED Viewed

@@ -6,6 +6,7 @@ from typing import List, Optional, Union, cast
 from docling_core.types.doc import (
     BoundingBox,
+    ContentLayer,
     DocItem,
     DoclingDocument,
     ImageRef,
@@ -251,9 +252,9 @@ class VlmPipeline(PaginatedPipeline):
                 # No code blocks found, return original text
                 return text
-        for pg_idx, page in enumerate(conv_res.pages):
-            page_no = pg_idx + 1  # FIXME: might be incorrect
+        page_docs = []
+        for pg_idx, page in enumerate(conv_res.pages):
             predicted_text = ""
             if page.predictions.vlm_response:
                 predicted_text = page.predictions.vlm_response.text + "\n\n"
@@ -273,6 +274,24 @@ class VlmPipeline(PaginatedPipeline):
             )
             page_doc = backend.convert()
+            # Modify provenance in place for all items in the page document
+            for item, level in page_doc.iterate_items(
+                with_groups=True,
+                traverse_pictures=True,
+                included_content_layers=set(ContentLayer),
+            ):
+                if isinstance(item, DocItem):
+                    item.prov = [
+                        ProvenanceItem(
+                            page_no=pg_idx + 1,
+                            bbox=BoundingBox(
+                                t=0.0, b=0.0, l=0.0, r=0.0
+                            ),  # FIXME: would be nice not to have to "fake" it
+                            charspan=[0, 0],
+                        )
+                    ]
+            # Add page metadata to the page document before concatenation
             if page.image is not None:
                 pg_width = page.image.width
                 pg_height = page.image.height
@@ -280,27 +299,18 @@ class VlmPipeline(PaginatedPipeline):
                 pg_width = 1
                 pg_height = 1
-            conv_res.document.add_page(
-                page_no=page_no,
+            page_doc.add_page(
+                page_no=pg_idx + 1,
                 size=Size(width=pg_width, height=pg_height),
                 image=ImageRef.from_pil(image=page.image, dpi=72)
                 if page.image
                 else None,
             )
-            for item, level in page_doc.iterate_items():
-                item.prov = [
-                    ProvenanceItem(
-                        page_no=pg_idx + 1,
-                        bbox=BoundingBox(
-                            t=0.0, b=0.0, l=0.0, r=0.0
-                        ),  # FIXME: would be nice not to have to "fake" it
-                        charspan=[0, 0],
-                    )
-                ]
-                conv_res.document.append_child_item(child=item)
+            page_docs.append(page_doc)
-        return conv_res.document
+        final_doc = DoclingDocument.concatenate(docs=page_docs)
+        return final_doc
     def _turn_html_into_doc(self, conv_res):
         def _extract_html_code(text):
@@ -328,9 +338,9 @@ class VlmPipeline(PaginatedPipeline):
                 # No code blocks found, return original text
                 return text
-        for pg_idx, page in enumerate(conv_res.pages):
-            page_no = pg_idx + 1  # FIXME: might be incorrect
+        page_docs = []
+        for pg_idx, page in enumerate(conv_res.pages):
             predicted_text = ""
             if page.predictions.vlm_response:
                 predicted_text = page.predictions.vlm_response.text + "\n\n"
@@ -341,7 +351,7 @@ class VlmPipeline(PaginatedPipeline):
             out_doc = InputDocument(
                 path_or_stream=response_bytes,
                 filename=conv_res.input.file.name,
-                format=InputFormat.MD,
+                format=InputFormat.HTML,
                 backend=HTMLDocumentBackend,
             )
             backend = HTMLDocumentBackend(
@@ -350,6 +360,24 @@ class VlmPipeline(PaginatedPipeline):
             )
             page_doc = backend.convert()
+            # Modify provenance in place for all items in the page document
+            for item, level in page_doc.iterate_items(
+                with_groups=True,
+                traverse_pictures=True,
+                included_content_layers=set(ContentLayer),
+            ):
+                if isinstance(item, DocItem):
+                    item.prov = [
+                        ProvenanceItem(
+                            page_no=pg_idx + 1,
+                            bbox=BoundingBox(
+                                t=0.0, b=0.0, l=0.0, r=0.0
+                            ),  # FIXME: would be nice not to have to "fake" it
+                            charspan=[0, 0],
+                        )
+                    ]
+            # Add page metadata to the page document before concatenation
             if page.image is not None:
                 pg_width = page.image.width
                 pg_height = page.image.height
@@ -357,27 +385,19 @@ class VlmPipeline(PaginatedPipeline):
                 pg_width = 1
                 pg_height = 1
-            conv_res.document.add_page(
-                page_no=page_no,
+            page_doc.add_page(
+                page_no=pg_idx + 1,
                 size=Size(width=pg_width, height=pg_height),
                 image=ImageRef.from_pil(image=page.image, dpi=72)
                 if page.image
                 else None,
             )
-            for item, level in page_doc.iterate_items():
-                item.prov = [
-                    ProvenanceItem(
-                        page_no=pg_idx + 1,
-                        bbox=BoundingBox(
-                            t=0.0, b=0.0, l=0.0, r=0.0
-                        ),  # FIXME: would be nice not to have to "fake" it
-                        charspan=[0, 0],
-                    )
-                ]
-                conv_res.document.append_child_item(child=item)
+            page_docs.append(page_doc)
-        return conv_res.document
+        # Concatenate all page documents to preserve hierarchy
+        final_doc = DoclingDocument.concatenate(docs=page_docs)
+        return final_doc
     @classmethod
     def get_default_options(cls) -> VlmPipelineOptions:

{docling-2.56.1.dist-info → docling-2.57.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.56.1
+Version: 2.57.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT

{docling-2.56.1.dist-info → docling-2.57.0.dist-info}/RECORD RENAMED Viewed

@@ -15,12 +15,13 @@ docling/backend/md_backend.py,sha256=TWboEPHl93pqI_Go1a3XpP-KpzI3d17xo5ZW42Ul0kY
 docling/backend/mets_gbs_backend.py,sha256=EA8sY6tbmGiysKGYPPZiNlK-i7Adn8bLTo-7Ym15hTU,12774
 docling/backend/msexcel_backend.py,sha256=GOuA-MlShpzFmCmJq3-Z28iquwWUg4k8v-AT4O-aAQI,19305
 docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
-docling/backend/msword_backend.py,sha256=Jfd57hzG8iFVAzqsOAHe5jG8LCHAIBXJhQCW0tESnMM,54405
+docling/backend/msword_backend.py,sha256=L44vFoSHOtbX-S_lSb8EKW-nzwL_ptVPhNV74ydmwqE,57457
 docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
 docling/backend/pdf_backend.py,sha256=Wcd1NSrAMjXK8VicTki5p-j-JLofklt07eF0kIG17_0,3361
 docling/backend/pypdfium2_backend.py,sha256=AYhWs9S8W_TkAK0-OkRmUNf4HUZl26FP7-XYjwU5zDk,14209
 docling/backend/webvtt_backend.py,sha256=9xPcfWVLuqhEAFrkv8aU36qHnSgjeINZAXT_C9C6XJA,19165
 docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/backend/docx/drawingml/utils.py,sha256=E9Iq8_052eEV5L1IN3ZqFX9eBidH56DKNlh6Tk7Do0I,3640
 docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/docx/latex/latex_dict.py,sha256=tFJp4ScT_AkY2ON7nLEa560p601Jq2glcZvMKxxjn7w,6593
 docling/backend/docx/latex/omml.py,sha256=4vh9FCbXh-Tb6KJGqNwzlMUMYEnnJgBtBI24dwy6t2U,12416
@@ -88,7 +89,7 @@ docling/pipeline/extraction_vlm_pipeline.py,sha256=veUOTe8nGdnduZKaGn1RRb-NfU1H6
 docling/pipeline/simple_pipeline.py,sha256=FSL_ucDd9k0D9DjNKMUkyCULIU8a057dvWfLEPmAc2A,2287
 docling/pipeline/standard_pdf_pipeline.py,sha256=xOge0zP5wli51n_6QLrFHQlwwvsivI7OMt00tht3my4,10479
 docling/pipeline/threaded_standard_pdf_pipeline.py,sha256=i67G5AOW7PIFCe5JS2sdBmPAKvAH6ScxIBhjwOGZcrI,28183
-docling/pipeline/vlm_pipeline.py,sha256=oMcdgzym_UQbVN3bajux_hENY40XGOnb6NU6Kwje2Os,15376
+docling/pipeline/vlm_pipeline.py,sha256=HSbSoGZyy4eIK8eOL2g_NymrHg8r-DrB2buggJQAqHU,16189
 docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/utils/accelerator_utils.py,sha256=DSajLxVx1JEVT0zt5de26llciLNlVfIDfSa2zYCFJzQ,2909
 docling/utils/api_image_request.py,sha256=kQDmTvQT6M2IgXnGYeoNflI6sLUG6WTCcEft94CRwWg,5379
@@ -102,9 +103,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
 docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
-docling-2.56.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.56.1.dist-info/METADATA,sha256=LgkCRBKjI20tarzBBeFpRi0vo_fZ4Sh6rD6K8yQkrY8,11364
-docling-2.56.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-docling-2.56.1.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
-docling-2.56.1.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
-docling-2.56.1.dist-info/RECORD,,
+docling-2.57.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.57.0.dist-info/METADATA,sha256=oDfwFunLJTLSDVastMVq9JkUpIgeKOOVX1MZb6rtqcE,11364
+docling-2.57.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+docling-2.57.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
+docling-2.57.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
+docling-2.57.0.dist-info/RECORD,,

{docling-2.56.1.dist-info → docling-2.57.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{docling-2.56.1.dist-info → docling-2.57.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{docling-2.56.1.dist-info → docling-2.57.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{docling-2.56.1.dist-info → docling-2.57.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

docling 2.56.1__py3-none-any.whl → 2.57.0__py3-none-any.whl

Potentially problematic release.

docling 2.56.1__py3-none-any.whl → 2.57.0__py3-none-any.whl