PyPI - docling - Versions diffs - 2.30.0__py3-none-any.whl → 2.31.1__py3-none-any.whl - Mend

docling 2.30.0__py3-none-any.whl → 2.31.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

docling/backend/asciidoc_backend.py +7 -15
docling/backend/csv_backend.py +1 -1
docling/backend/docling_parse_backend.py +2 -2
docling/backend/docling_parse_v2_backend.py +2 -2
docling/backend/docling_parse_v4_backend.py +3 -4
docling/backend/docx/latex/latex_dict.py +0 -5
docling/backend/docx/latex/omml.py +4 -7
docling/backend/html_backend.py +66 -25
docling/backend/md_backend.py +6 -8
docling/backend/msexcel_backend.py +1 -7
docling/backend/mspowerpoint_backend.py +4 -7
docling/backend/msword_backend.py +5 -5
docling/backend/pdf_backend.py +2 -1
docling/backend/pypdfium2_backend.py +3 -3
docling/backend/xml/jats_backend.py +11 -14
docling/backend/xml/uspto_backend.py +19 -23
docling/cli/main.py +8 -8
docling/cli/models.py +6 -3
docling/datamodel/base_models.py +7 -5
docling/datamodel/document.py +19 -10
docling/datamodel/pipeline_options.py +0 -1
docling/document_converter.py +8 -6
docling/models/api_vlm_model.py +1 -2
docling/models/base_model.py +2 -4
docling/models/base_ocr_model.py +2 -2
docling/models/code_formula_model.py +2 -1
docling/models/document_picture_classifier.py +2 -1
docling/models/easyocr_model.py +10 -11
docling/models/factories/__init__.py +2 -2
docling/models/factories/base_factory.py +1 -1
docling/models/hf_mlx_model.py +4 -6
docling/models/hf_vlm_model.py +7 -5
docling/models/layout_model.py +2 -2
docling/models/ocr_mac_model.py +3 -4
docling/models/page_assemble_model.py +7 -12
docling/models/page_preprocessing_model.py +2 -1
docling/models/picture_description_api_model.py +2 -1
docling/models/picture_description_base_model.py +2 -3
docling/models/picture_description_vlm_model.py +6 -4
docling/models/rapid_ocr_model.py +2 -3
docling/models/readingorder_model.py +9 -24
docling/models/table_structure_model.py +4 -8
docling/models/tesseract_ocr_cli_model.py +17 -16
docling/models/tesseract_ocr_model.py +9 -5
docling/pipeline/base_pipeline.py +4 -8
docling/pipeline/simple_pipeline.py +0 -1
docling/pipeline/standard_pdf_pipeline.py +0 -1
docling/pipeline/vlm_pipeline.py +0 -3
docling/utils/export.py +2 -4
docling/utils/glm_utils.py +2 -2
docling/utils/layout_postprocessor.py +4 -2
docling/utils/model_downloader.py +31 -7
docling/utils/utils.py +3 -3
{docling-2.30.0.dist-info → docling-2.31.1.dist-info}/METADATA +2 -1
docling-2.31.1.dist-info/RECORD +86 -0
docling-2.30.0.dist-info/RECORD +0 -86
{docling-2.30.0.dist-info → docling-2.31.1.dist-info}/LICENSE +0 -0
{docling-2.30.0.dist-info → docling-2.31.1.dist-info}/WHEEL +0 -0
{docling-2.30.0.dist-info → docling-2.31.1.dist-info}/entry_points.txt +0 -0

docling/backend/msword_backend.py CHANGED Viewed

@@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def _get_level(self) -> int:
         """Return the first None index."""
         for k, v in self.parents.items():
-            if k >= 0 and v == None:
+            if k >= 0 and v is None:
                 return k
         return 0
@@ -418,7 +418,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             else prev_parent
         )
-    def _handle_text_elements(
+    def _handle_text_elements(  # noqa: C901
         self,
         element: BaseOxmlElement,
         docx_obj: DocxDocument,
@@ -436,7 +436,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         # Common styles for bullet and numbered lists.
         # "List Bullet", "List Number", "List Paragraph"
-        # Identify wether list is a numbered list or not
+        # Identify whether list is a numbered list or not
         # is_numbered = "List Bullet" not in paragraph.style.name
         is_numbered = False
         p_style_id, p_level = self._get_label_and_level(paragraph)
@@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
                 )
                 if cell is None or cell._tc in cell_set:
-                    _log.debug(f"  skipped since repeated content")
+                    _log.debug("  skipped since repeated content")
                     col_idx += cell.grid_span
                     continue
                 else:
@@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     image=ImageRef.from_pil(image=pil_image, dpi=72),
                     caption=None,
                 )
-            except (UnidentifiedImageError, OSError) as e:
+            except (UnidentifiedImageError, OSError):
                 _log.warning("Warning: image cannot be loaded by Pillow")
                 doc.add_picture(
                     parent=self.parents[level - 1],

docling/backend/pdf_backend.py CHANGED Viewed

@@ -1,7 +1,8 @@
 from abc import ABC, abstractmethod
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, Optional, Set, Union
+from typing import Optional, Set, Union
 from docling_core.types.doc import BoundingBox, Size
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell

docling/backend/pypdfium2_backend.py CHANGED Viewed

@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
@@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
         self.valid = True  # No better way to tell from pypdfium.
         try:
             self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
-        except PdfiumError as e:
+        except PdfiumError:
             _log.info(
                 f"An exception occurred when loading page {page_no} of document {document_hash}.",
                 exc_info=True,
@@ -225,7 +226,6 @@ class PyPdfiumPageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
         page_size = self.get_size()
         if not cropbox:

docling/backend/xml/jats_backend.py CHANGED Viewed

@@ -91,7 +91,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
         super().__init__(in_doc, path_or_stream)
         self.path_or_stream = path_or_stream
-        # Initialize the root of the document hiearchy
+        # Initialize the root of the document hierarchy
         self.root: Optional[NodeItem] = None
         self.valid = False
@@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
             doc_info: etree.DocInfo = self.tree.docinfo
             if doc_info.system_url and any(
-                [kwd in doc_info.system_url for kwd in JATS_DTD_URL]
+                kwd in doc_info.system_url for kwd in JATS_DTD_URL
             ):
                 self.valid = True
                 return
             for ent in doc_info.internalDTD.iterentities():
                 if ent.system_url and any(
-                    [kwd in ent.system_url for kwd in JATS_DTD_URL]
+                    kwd in ent.system_url for kwd in JATS_DTD_URL
                 ):
                     self.valid = True
                     return
@@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
                 # TODO: once superscript is supported, add label with formatting
                 aff = aff.removeprefix(f"{label[0].text}, ")
             affiliation_names.append(aff)
-        affiliation_ids_names = {
-            id: name
-            for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
-        }
+        affiliation_ids_names = dict(
+            zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
+        )
         # Get author names and affiliation names
         for author_node in meta.xpath(
@@ -300,7 +299,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
     def _add_abstract(
         self, doc: DoclingDocument, xml_components: XMLComponents
     ) -> None:
         for abstract in xml_components["abstract"]:
             text: str = abstract["content"]
             title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
@@ -349,7 +347,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
         return
-    def _parse_element_citation(self, node: etree._Element) -> str:
+    def _parse_element_citation(self, node: etree._Element) -> str:  # noqa: C901
         citation: Citation = {
             "author_names": "",
             "title": "",
@@ -440,7 +438,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
             citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
             if len(node.xpath("lpage")) > 0:
                 citation["page"] += (
-                    "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
+                    "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()  # noqa: RUF001
                 )
         # Flatten the citation to string
@@ -595,9 +593,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
         try:
             self._add_table(doc, parent, table)
-        except Exception as e:
-            _log.warning(f"Skipping unsupported table in {str(self.file)}")
-            pass
+        except Exception:
+            _log.warning(f"Skipping unsupported table in {self.file!s}")
         return
@@ -609,7 +606,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
         )
         return
-    def _walk_linear(
+    def _walk_linear(  # noqa: C901
         self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
     ) -> str:
         skip_tags = ["term"]

docling/backend/xml/uspto_backend.py CHANGED Viewed

@@ -1,6 +1,6 @@
 """Backend to parse patents from the United States Patent Office (USPTO).
-The parsers included in this module can handle patent grants pubished since 1976 and
+The parsers included in this module can handle patent grants published since 1976 and
 patent applications since 2001.
 The original files can be found in https://bulkdata.uspto.gov.
 """
@@ -122,7 +122,6 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
     @override
     def convert(self) -> DoclingDocument:
         if self.parser is not None:
             doc = self.parser.parse(self.patent_content)
             if doc is None:
@@ -163,7 +162,6 @@ class PatentUspto(ABC):
         Returns:
             The patent parsed as a docling document.
         """
-        pass
 class PatentUsptoIce(PatentUspto):
@@ -265,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
             self.style_html = HtmlEntity()
         @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
             """Signal the start of an element.
             Args:
@@ -281,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
             self._start_registered_elements(tag, attributes)
         @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
             """Receive notification of a skipped entity.
             HTML entities will be skipped by the parser. This method will unescape them
@@ -315,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
                         self.text += unescaped
         @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
             """Signal the end of an element.
             Args:
@@ -442,7 +440,7 @@ class PatentUsptoIce(PatentUspto):
                     )
             elif name == self.Element.PARAGRAPH.value and text:
-                # remmove blank spaces added in paragraphs
+                # remove blank spaces added in paragraphs
                 text = re.sub("\\s+", " ", text)
                 if self.Element.ABSTRACT.value in self.property:
                     self.abstract = (
@@ -603,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
             self.style_html = HtmlEntity()
         @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
             """Signal the start of an element.
             Args:
@@ -616,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
             self._start_registered_elements(tag, attributes)
         @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
             """Receive notification of a skipped entity.
             HTML entities will be skipped by the parser. This method will unescape them
@@ -650,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
                         self.text += unescaped
         @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
             """Signal the end of an element.
             Args:
@@ -691,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
             if tag in [member.value for member in self.Element]:
                 if (
                     tag == self.Element.HEADING.value
-                    and not self.Element.SDOCL.value in self.property
+                    and self.Element.SDOCL.value not in self.property
                 ):
                     level_attr: str = attributes.get("LVL", "")
                     new_level: int = int(level_attr) if level_attr.isnumeric() else 1
@@ -743,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
                 # headers except claims statement
                 elif (
                     self.Element.HEADING.value in self.property
-                    and not self.Element.SDOCL.value in self.property
+                    and self.Element.SDOCL.value not in self.property
                     and text.strip()
                 ):
                     self.parents[self.level + 1] = self.doc.add_heading(
@@ -1164,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
             self.style_html = HtmlEntity()
         @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
             """Signal the start of an element.
             Args:
@@ -1177,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
             self._start_registered_elements(tag, attributes)
         @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
             """Receive notification of a skipped entity.
             HTML entities will be skipped by the parser. This method will unescape them
@@ -1211,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
                         self.text += unescaped
         @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
             """Signal the end of an element.
             Args:
@@ -1474,9 +1472,7 @@ class XmlTable:
                 if cw == 0:
                     offset_w0.append(col["offset"][ic])
-            min_colinfo["offset"] = sorted(
-                list(set(col["offset"] + min_colinfo["offset"]))
-            )
+            min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))
         # add back the 0 width cols to offset list
         offset_w0 = list(set(offset_w0))
@@ -1527,7 +1523,7 @@ class XmlTable:
         return ncols_max
-    def _parse_table(self, table: Tag) -> TableData:
+    def _parse_table(self, table: Tag) -> TableData:  # noqa: C901
         """Parse the content of a table tag.
         Args:
@@ -1701,7 +1697,7 @@ class XmlTable:
 class HtmlEntity:
     """Provide utility functions to get the HTML entities of styled characters.
-    This class has been developped from:
+    This class has been developed from:
     https://unicode-table.com/en/html-entities/
     https://www.w3.org/TR/WD-math-970515/table03.html
     """
@@ -1722,7 +1718,7 @@ class HtmlEntity:
                 "0": "&#8304;",
                 "+": "&#8314;",
                 "-": "&#8315;",
-                "−": "&#8315;",
+                "−": "&#8315;",  # noqa: RUF001
                 "=": "&#8316;",
                 "(": "&#8317;",
                 ")": "&#8318;",
@@ -1746,7 +1742,7 @@ class HtmlEntity:
                 "0": "&#8320;",
                 "+": "&#8330;",
                 "-": "&#8331;",
-                "−": "&#8331;",
+                "−": "&#8331;",  # noqa: RUF001
                 "=": "&#8332;",
                 "(": "&#8333;",
                 ")": "&#8334;",
@@ -1900,7 +1896,7 @@ class HtmlEntity:
         """Get an HTML entity of a greek letter in ISO 8879.
         Args:
-            The text to transform, as an ISO 8879 entitiy.
+            The text to transform, as an ISO 8879 entity.
         Returns:
             The HTML entity representing a greek letter. If the input text is not

docling/cli/main.py CHANGED Viewed

@@ -6,14 +6,16 @@ import sys
 import tempfile
 import time
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Annotated, Dict, Iterable, List, Optional, Type
+from typing import Annotated, Dict, List, Optional, Type
 import rich.table
 import typer
 from docling_core.types.doc import ImageRefMode
 from docling_core.utils.file import resolve_source_to_path
 from pydantic import TypeAdapter
+from rich.console import Console
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -53,7 +55,6 @@ warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
 _log = logging.getLogger(__name__)
-from rich.console import Console
 console = Console()
 err_console = Console(stderr=True)
@@ -160,7 +161,6 @@ def export_documents(
     export_doctags: bool,
     image_export_mode: ImageRefMode,
 ):
     success_count = 0
     failure_count = 0
@@ -233,7 +233,7 @@ def _split_list(raw: Optional[str]) -> Optional[List[str]]:
 @app.command(no_args_is_help=True)
-def convert(
+def convert(  # noqa: C901
     input_sources: Annotated[
         List[str],
         typer.Argument(
@@ -289,7 +289,7 @@ def convert(
             ...,
             help=(
                 f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
-                f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
+                f"{', '.join(o.value for o in ocr_engines_enum_internal)}. "
                 f"Use the option --show-external-plugins to see the options allowed with external plugins."
             ),
         ),
@@ -421,7 +421,7 @@ def convert(
         logging.basicConfig(level=logging.WARNING)
     elif verbose == 1:
         logging.basicConfig(level=logging.INFO)
-    elif verbose == 2:
+    else:
         logging.basicConfig(level=logging.DEBUG)
     settings.debug.visualize_cells = debug_visualize_cells
@@ -430,7 +430,7 @@ def convert(
     settings.debug.visualize_ocr = debug_visualize_ocr
     if from_formats is None:
-        from_formats = [e for e in InputFormat]
+        from_formats = list(InputFormat)
     parsed_headers: Optional[Dict[str, str]] = None
     if headers is not None:
@@ -521,7 +521,7 @@ def convert(
             if image_export_mode != ImageRefMode.PLACEHOLDER:
                 pipeline_options.generate_page_images = True
                 pipeline_options.generate_picture_images = (
-                    True  # FIXME: to be deprecated in verson 3
+                    True  # FIXME: to be deprecated in version 3
                 )
                 pipeline_options.images_scale = 2

docling/cli/models.py CHANGED Viewed

@@ -32,6 +32,8 @@ class _AvailableModels(str, Enum):
     CODE_FORMULA = "code_formula"
     PICTURE_CLASSIFIER = "picture_classifier"
     SMOLVLM = "smolvlm"
+    SMOLDOCLING = "smoldocling"
+    SMOLDOCLING_MLX = "smoldocling_mlx"
     GRANITE_VISION = "granite_vision"
     EASYOCR = "easyocr"
@@ -62,7 +64,7 @@ def download(
     models: Annotated[
         Optional[list[_AvailableModels]],
         typer.Argument(
-            help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
+            help="Models to download (default behavior: a predefined set of models will be downloaded).",
         ),
     ] = None,
     all: Annotated[
@@ -89,14 +91,13 @@ def download(
             "Cannot simultaneously set 'all' parameter and specify models to download."
         )
     if not quiet:
-        FORMAT = "%(message)s"
         logging.basicConfig(
             level=logging.INFO,
             format="[blue]%(message)s[/blue]",
             datefmt="[%X]",
             handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
         )
-    to_download = models or ([m for m in _AvailableModels] if all else _default_models)
+    to_download = models or (list(_AvailableModels) if all else _default_models)
     output_dir = download_models(
         output_dir=output_dir,
         force=force,
@@ -106,6 +107,8 @@ def download(
         with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
         with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
         with_smolvlm=_AvailableModels.SMOLVLM in to_download,
+        with_smoldocling=_AvailableModels.SMOLDOCLING in to_download,
+        with_smoldocling_mlx=_AvailableModels.SMOLDOCLING_MLX in to_download,
         with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
         with_easyocr=_AvailableModels.EASYOCR in to_download,
     )

docling/datamodel/base_models.py CHANGED Viewed

@@ -10,7 +10,9 @@ from docling_core.types.doc import (
     TableCell,
 )
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
-from docling_core.types.io import (  # DO ΝΟΤ REMOVE; explicitly exposed from this location
+# DO NOT REMOVE; explicitly exposed from this location
+from docling_core.types.io import (
     DocumentStream,
 )
 from PIL.Image import Image
@@ -233,9 +235,9 @@ class Page(BaseModel):
         None  # Internal PDF backend. By default it is cleared during assembling.
     )
     _default_image_scale: float = 1.0  # Default image scale for external usage.
-    _image_cache: Dict[float, Image] = (
-        {}
-    )  # Cache of images in different scales. By default it is cleared during assembling.
+    _image_cache: Dict[
+        float, Image
+    ] = {}  # Cache of images in different scales. By default it is cleared during assembling.
     def get_image(
         self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
@@ -243,7 +245,7 @@ class Page(BaseModel):
         if self._backend is None:
             return self._image_cache.get(scale, None)
-        if not scale in self._image_cache:
+        if scale not in self._image_cache:
             if cropbox is None:
                 self._image_cache[scale] = self._backend.get_page_image(scale=scale)
             else:

docling/datamodel/document.py CHANGED Viewed

@@ -1,13 +1,13 @@
 import csv
 import logging
 import re
+from collections.abc import Iterable
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
 from typing import (
     TYPE_CHECKING,
     Dict,
-    Iterable,
     List,
     Literal,
     Optional,
@@ -17,6 +17,8 @@ from typing import (
 )
 import filetype
+# DO NOT REMOVE; explicitly exposed from this location
 from docling_core.types.doc import (
     DocItem,
     DocItemLabel,
@@ -35,14 +37,14 @@ from docling_core.types.legacy_doc.base import (
     PageReference,
     Prov,
     Ref,
+    Table as DsSchemaTable,
+    TableCell,
 )
-from docling_core.types.legacy_doc.base import Table as DsSchemaTable
-from docling_core.types.legacy_doc.base import TableCell
 from docling_core.types.legacy_doc.document import (
     CCSDocumentDescription as DsDocumentDescription,
+    CCSFileInfoObject as DsFileInfoObject,
+    ExportedCCSDocument as DsDocument,
 )
-from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
-from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
 from docling_core.utils.file import resolve_source_to_stream
 from docling_core.utils.legacy import docling_document_to_legacy
 from pydantic import BaseModel
@@ -65,7 +67,7 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.settings import DocumentLimits
 from docling.utils.profiling import ProfilingItem
-from docling.utils.utils import create_file_hash, create_hash
+from docling.utils.utils import create_file_hash
 if TYPE_CHECKING:
     from docling.document_converter import FormatOption
@@ -134,9 +136,9 @@ class InputDocument(BaseModel):
                     self._init_doc(backend, path_or_stream)
             elif isinstance(path_or_stream, BytesIO):
-                assert (
-                    filename is not None
-                ), "Can't construct InputDocument from stream without providing filename arg."
+                assert filename is not None, (
+                    "Can't construct InputDocument from stream without providing filename arg."
+                )
                 self.file = PurePath(filename)
                 self.filesize = path_or_stream.getbuffer().nbytes
@@ -228,7 +230,6 @@ class _DummyBackend(AbstractDocumentBackend):
 class _DocumentConversionInput(BaseModel):
     path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
     headers: Optional[Dict[str, str]] = None
     limits: Optional[DocumentLimits] = DocumentLimits()
@@ -302,6 +303,14 @@ class _DocumentConversionInput(BaseModel):
                     else ""
                 )
                 mime = _DocumentConversionInput._mime_from_extension(ext)
+            if mime is not None and mime.lower() == "application/zip":
+                objname = obj.name.lower()
+                if objname.endswith(".xlsx"):
+                    mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+                elif objname.endswith(".docx"):
+                    mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+                elif objname.endswith(".pptx"):
+                    mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
         mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
         mime = mime or _DocumentConversionInput._detect_csv(content)

docling/datamodel/pipeline_options.py CHANGED Viewed

@@ -380,7 +380,6 @@ class PaginatedPipelineOptions(PipelineOptions):
 class VlmPipelineOptions(PaginatedPipelineOptions):
     generate_page_images: bool = True
     force_backend_text: bool = (
         False  # (To be used with vlms, or other generative models)

docling/document_converter.py CHANGED Viewed

@@ -1,11 +1,11 @@
 import hashlib
 import logging
-import math
 import sys
 import time
+from collections.abc import Iterable, Iterator
 from functools import partial
 from pathlib import Path
-from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
+from typing import Dict, List, Optional, Tuple, Type, Union
 from pydantic import BaseModel, ConfigDict, model_validator, validate_call
@@ -172,7 +172,7 @@ class DocumentConverter:
         format_options: Optional[Dict[InputFormat, FormatOption]] = None,
     ):
         self.allowed_formats = (
-            allowed_formats if allowed_formats is not None else [e for e in InputFormat]
+            allowed_formats if allowed_formats is not None else list(InputFormat)
         )
         self.format_to_options = {
             format: (
@@ -189,7 +189,9 @@ class DocumentConverter:
     def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
         """Generate a hash of pipeline options to use as part of the cache key."""
         options_str = str(pipeline_options.model_dump())
-        return hashlib.md5(options_str.encode("utf-8")).hexdigest()
+        return hashlib.md5(
+            options_str.encode("utf-8"), usedforsecurity=False
+        ).hexdigest()
     def initialize_pipeline(self, format: InputFormat):
         """Initialize the conversion pipeline for the selected format."""
@@ -254,7 +256,7 @@ class DocumentConverter:
         if not had_result and raises_on_error:
             raise ConversionError(
-                f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
+                "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
             )
     def _convert(
@@ -266,7 +268,7 @@ class DocumentConverter:
             conv_input.docs(self.format_to_options),
             settings.perf.doc_batch_size,  # pass format_options
         ):
-            _log.info(f"Going to convert document batch...")
+            _log.info("Going to convert document batch...")
             # parallel processing only within input_batch
             # with ThreadPoolExecutor(

docling/models/api_vlm_model.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Iterable
+from collections.abc import Iterable
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
@@ -10,7 +10,6 @@ from docling.utils.profiling import TimeRecorder
 class ApiVlmModel(BasePageModel):
     def __init__(
         self,
         enabled: bool,

docling 2.30.0__py3-none-any.whl → 2.31.1__py3-none-any.whl

docling 2.30.0__py3-none-any.whl → 2.31.1__py3-none-any.whl