PyPI - docling - Versions diffs - 2.29.0__py3-none-any.whl → 2.31.0__py3-none-any.whl - Mend

docling 2.29.0__py3-none-any.whl → 2.31.0__py3-none-any.whl

This diff shows the differences in content between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (60)

docling/backend/asciidoc_backend.py +7 -15
docling/backend/csv_backend.py +1 -1
docling/backend/docling_parse_backend.py +2 -2
docling/backend/docling_parse_v2_backend.py +2 -2
docling/backend/docling_parse_v4_backend.py +3 -4
docling/backend/docx/latex/latex_dict.py +0 -5
docling/backend/docx/latex/omml.py +4 -7
docling/backend/html_backend.py +26 -9
docling/backend/md_backend.py +5 -7
docling/backend/msexcel_backend.py +271 -95
docling/backend/mspowerpoint_backend.py +4 -7
docling/backend/msword_backend.py +23 -15
docling/backend/pdf_backend.py +2 -1
docling/backend/pypdfium2_backend.py +3 -3
docling/backend/xml/jats_backend.py +10 -13
docling/backend/xml/uspto_backend.py +15 -19
docling/cli/main.py +27 -9
docling/cli/models.py +2 -3
docling/datamodel/base_models.py +40 -5
docling/datamodel/document.py +18 -10
docling/datamodel/pipeline_options.py +29 -4
docling/document_converter.py +5 -5
docling/models/api_vlm_model.py +66 -0
docling/models/base_model.py +2 -4
docling/models/base_ocr_model.py +2 -2
docling/models/code_formula_model.py +2 -1
docling/models/document_picture_classifier.py +2 -1
docling/models/easyocr_model.py +10 -11
docling/models/factories/__init__.py +2 -2
docling/models/factories/base_factory.py +1 -1
docling/models/hf_mlx_model.py +4 -6
docling/models/hf_vlm_model.py +7 -5
docling/models/layout_model.py +2 -2
docling/models/ocr_mac_model.py +3 -4
docling/models/page_assemble_model.py +7 -12
docling/models/page_preprocessing_model.py +2 -1
docling/models/picture_description_api_model.py +9 -75
docling/models/picture_description_base_model.py +16 -5
docling/models/picture_description_vlm_model.py +2 -3
docling/models/rapid_ocr_model.py +2 -3
docling/models/readingorder_model.py +8 -23
docling/models/table_structure_model.py +2 -6
docling/models/tesseract_ocr_cli_model.py +17 -16
docling/models/tesseract_ocr_model.py +8 -6
docling/pipeline/base_pipeline.py +4 -8
docling/pipeline/simple_pipeline.py +0 -1
docling/pipeline/standard_pdf_pipeline.py +6 -3
docling/pipeline/vlm_pipeline.py +27 -20
docling/utils/api_image_request.py +61 -0
docling/utils/export.py +2 -4
docling/utils/glm_utils.py +2 -2
docling/utils/layout_postprocessor.py +4 -2
docling/utils/model_downloader.py +7 -7
docling/utils/utils.py +1 -1
{docling-2.29.0.dist-info → docling-2.31.0.dist-info}/METADATA +4 -3
docling-2.31.0.dist-info/RECORD +86 -0
docling-2.29.0.dist-info/RECORD +0 -84
{docling-2.29.0.dist-info → docling-2.31.0.dist-info}/LICENSE +0 -0
{docling-2.29.0.dist-info → docling-2.31.0.dist-info}/WHEEL +0 -0
{docling-2.29.0.dist-info → docling-2.31.0.dist-info}/entry_points.txt +0 -0

docling/models/readingorder_model.py CHANGED Viewed

@@ -1,12 +1,7 @@
-import copy
-import random
 from pathlib import Path
 from typing import Dict, List
 from docling_core.types.doc import (
-    BoundingBox,
-    CoordOrigin,
-    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
@@ -17,13 +12,10 @@ from docling_core.types.doc import (
     TableData,
 )
 from docling_core.types.doc.document import ContentLayer
-from docling_core.types.legacy_doc.base import Ref
-from docling_core.types.legacy_doc.document import BaseText
 from docling_ibm_models.reading_order.reading_order_rb import (
     PageElement as ReadingOrderPageElement,
+    ReadingOrderPredictor,
 )
-from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
-from PIL import ImageDraw
 from pydantic import BaseModel, ConfigDict
 from docling.datamodel.base_models import (
@@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
     TextElement,
 )
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.settings import settings
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -53,12 +44,10 @@ class ReadingOrderModel:
     def _assembled_to_readingorder_elements(
         self, conv_res: ConversionResult
     ) -> List[ReadingOrderPageElement]:
         elements: List[ReadingOrderPageElement] = []
         page_no_to_pages = {p.page_no: p for p in conv_res.pages}
         for element in conv_res.assembled.elements:
             page_height = page_no_to_pages[element.page_no].size.height  # type: ignore
             bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
             text = element.text or ""
@@ -84,7 +73,6 @@ class ReadingOrderModel:
     def _add_child_elements(
         self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
     ):
         child: Cluster
         for child in element.cluster.children:
             c_label = child.label
@@ -110,7 +98,7 @@ class ReadingOrderModel:
             else:
                 doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
-    def _readingorder_elements_to_docling_doc(
+    def _readingorder_elements_to_docling_doc(  # noqa: C901
         self,
         conv_res: ConversionResult,
         ro_elements: List[ReadingOrderPageElement],
@@ -118,7 +106,6 @@ class ReadingOrderModel:
         el_to_footnotes_mapping: Dict[int, List[int]],
         el_merges_mapping: Dict[int, List[int]],
     ) -> DoclingDocument:
         id_to_elem = {
             RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
             for elem in conv_res.assembled.elements
@@ -192,7 +179,6 @@ class ReadingOrderModel:
                             code_item.footnotes.append(new_footnote_item.get_ref())
                 else:
                     new_item, current_list = self._handle_text_element(
                         element, out_doc, current_list, page_height
                     )
@@ -206,7 +192,6 @@ class ReadingOrderModel:
                             )
             elif isinstance(element, Table):
                 tbl_data = TableData(
                     num_rows=element.num_rows,
                     num_cols=element.num_cols,
@@ -342,12 +327,12 @@ class ReadingOrderModel:
         return new_item, current_list
     def _merge_elements(self, element, merged_elem, new_item, page_height):
-        assert isinstance(
-            merged_elem, type(element)
-        ), "Merged element must be of same type as element."
-        assert (
-            merged_elem.label == new_item.label
-        ), "Labels of merged elements must match."
+        assert isinstance(merged_elem, type(element)), (
+            "Merged element must be of same type as element."
+        )
+        assert merged_elem.label == new_item.label, (
+            "Labels of merged elements must match."
+        )
         prov = ProvenanceItem(
             page_no=element.page_no + 1,
             charspan=(

docling/models/table_structure_model.py CHANGED Viewed

@@ -1,13 +1,13 @@
 import copy
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Union
+from typing import Optional
 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
 from docling_core.types.doc.page import (
     BoundingRectangle,
-    SegmentedPdfPage,
     TextCellUnit,
 )
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
@@ -44,7 +44,6 @@ class TableStructureModel(BasePageModel):
         self.enabled = enabled
         if self.enabled:
             if artifacts_path is None:
                 artifacts_path = self.download_models() / self._model_path
             else:
@@ -175,7 +174,6 @@ class TableStructureModel(BasePageModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
         if not self.enabled:
             yield from page_batch
             return
@@ -186,7 +184,6 @@ class TableStructureModel(BasePageModel):
                 yield page
             else:
                 with TimeRecorder(conv_res, "table_structure"):
                     assert page.predictions.layout is not None
                     assert page.size is not None
@@ -260,7 +257,6 @@ class TableStructureModel(BasePageModel):
                             table_out = tf_output[0]
                             table_cells = []
                             for element in table_out["tf_responses"]:
                                 if not self.do_cell_matching:
                                     the_bbox = BoundingBox.model_validate(
                                         element["bbox"]

docling/models/tesseract_ocr_cli_model.py CHANGED Viewed

@@ -3,9 +3,10 @@ import io
 import logging
 import os
 import tempfile
+from collections.abc import Iterable
 from pathlib import Path
 from subprocess import DEVNULL, PIPE, Popen
-from typing import Iterable, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type
 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -63,8 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel):
                 )
     def _get_name_and_version(self) -> Tuple[str, str]:
-        if self._name != None and self._version != None:
+        if self._name is not None and self._version is not None:
             return self._name, self._version  # type: ignore
         cmd = [self.options.tesseract_cmd, "--version"]
@@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
         # _log.info(decoded_data)
         # Read the TSV file generated by Tesseract
-        df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
+        df_result = pd.read_csv(
+            io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
+        )
         # Display the dataframe (optional)
         # _log.info("df: ", df.head())
         # Filter rows that contain actual text (ignore header or empty rows)
-        df_filtered = df[
-            df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
+        df_filtered = df_result[
+            df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
         ]
         return df_filtered
@@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
         proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
         output, _ = proc.communicate()
         decoded_data = output.decode("utf-8")
-        df = pd.read_csv(
+        df_detected = pd.read_csv(
             io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
         )
-        scripts = df.loc[df["key"] == "Script"].value.tolist()
+        scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
         if len(scripts) == 0:
             _log.warning("Tesseract cannot detect the script of the page")
             return None
@@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
         proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
         output, _ = proc.communicate()
         decoded_data = output.decode("utf-8")
-        df = pd.read_csv(io.StringIO(decoded_data), header=None)
-        self._tesseract_languages = df[0].tolist()[1:]
+        df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
+        self._tesseract_languages = df_list[0].tolist()[1:]
         # Decide the script prefix
-        if any([l.startswith("script/") for l in self._tesseract_languages]):
+        if any(lang.startswith("script/") for lang in self._tesseract_languages):
             script_prefix = "script/"
         else:
             script_prefix = ""
@@ -197,7 +199,6 @@ class TesseractOcrCliModel(BaseOcrModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
         if not self.enabled:
             yield from page_batch
             return
@@ -225,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
                                 fname = image_file.name
                                 high_res_image.save(image_file)
-                            df = self._run_tesseract(fname)
+                            df_result = self._run_tesseract(fname)
                         finally:
                             if os.path.exists(fname):
                                 os.remove(fname)
-                        # _log.info(df)
+                        # _log.info(df_result)
                         # Print relevant columns (bounding box and text)
-                        for ix, row in df.iterrows():
+                        for ix, row in df_result.iterrows():
                             text = row["text"]
                             conf = row["conf"]
-                            l = float(row["left"])
+                            l = float(row["left"])  # noqa: E741
                             b = float(row["top"])
                             w = float(row["width"])
                             h = float(row["height"])

docling/models/tesseract_ocr_model.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type
+from typing import Optional, Type
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
@@ -37,9 +38,6 @@ class TesseractOcrModel(BaseOcrModel):
         self.options: TesseractOcrOptions
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
-        self.reader = None
-        self.osd_reader = None
-        self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
         if self.enabled:
             install_errmsg = (
@@ -64,7 +62,7 @@ class TesseractOcrModel(BaseOcrModel):
                 raise ImportError(install_errmsg)
             try:
                 tesseract_version = tesserocr.tesseract_version()
-            except:
+            except Exception:
                 raise ImportError(install_errmsg)
             _, self._tesserocr_languages = tesserocr.get_languages()
@@ -75,7 +73,7 @@ class TesseractOcrModel(BaseOcrModel):
             _log.debug("Initializing TesserOCR: %s", tesseract_version)
             lang = "+".join(self.options.lang)
-            if any([l.startswith("script/") for l in self._tesserocr_languages]):
+            if any(lang.startswith("script/") for lang in self._tesserocr_languages):
                 self.script_prefix = "script/"
             else:
                 self.script_prefix = ""
@@ -86,6 +84,10 @@ class TesseractOcrModel(BaseOcrModel):
                 "oem": tesserocr.OEM.DEFAULT,
             }
+            self.reader = None
+            self.osd_reader = None
+            self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
             if self.options.path is not None:
                 tesserocr_kwargs["path"] = self.options.path

docling/pipeline/base_pipeline.py CHANGED Viewed

@@ -3,9 +3,10 @@ import logging
 import time
 import traceback
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Iterable, List
+from collections.abc import Iterable
+from typing import Any, Callable, List
-from docling_core.types.doc import DoclingDocument, NodeItem
+from docling_core.types.doc import NodeItem
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
@@ -64,7 +65,6 @@ class BasePipeline(ABC):
         return conv_res
     def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
         def _prepare_elements(
             conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
         ) -> Iterable[NodeItem]:
@@ -113,7 +113,6 @@ class BasePipeline(ABC):
 class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
     def __init__(self, pipeline_options: PipelineOptions):
         super().__init__(pipeline_options)
         self.keep_backend = False
@@ -127,7 +126,6 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
         yield from page_batch
     def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
         if not isinstance(conv_res.input._backend, PdfDocumentBackend):
             raise RuntimeError(
                 f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
@@ -139,8 +137,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
         total_elapsed_time = 0.0
         with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
-            for i in range(0, conv_res.input.page_count):
+            for i in range(conv_res.input.page_count):
                 start_page, end_page = conv_res.input.limits.page_range
                 if (start_page - 1) <= i <= (end_page - 1):
                     conv_res.pages.append(Page(page_no=i))
@@ -161,7 +158,6 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                     pipeline_pages = self._apply_on_pages(conv_res, init_pages)
                     for p in pipeline_pages:  # Must exhaust!
                         # Cleanup cached images
                         if not self.keep_images:
                             p._image_cache = {}

docling/pipeline/simple_pipeline.py CHANGED Viewed

@@ -24,7 +24,6 @@ class SimplePipeline(BasePipeline):
         super().__init__(pipeline_options)
     def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
         if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
             raise RuntimeError(
                 f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "

docling/pipeline/standard_pdf_pipeline.py CHANGED Viewed

@@ -1,8 +1,7 @@
 import logging
-import sys
 import warnings
 from pathlib import Path
-from typing import Optional
+from typing import Optional, cast
 from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
@@ -226,7 +225,11 @@ class StandardPdfPipeline(PaginatedPipeline):
                         and self.pipeline_options.generate_table_images
                     ):
                         page_ix = element.prov[0].page_no - 1
-                        page = conv_res.pages[page_ix]
+                        page = next(
+                            (p for p in conv_res.pages if p.page_no == page_ix),
+                            cast("Page", None),
+                        )
+                        assert page is not None
                         assert page.size is not None
                         assert page.image is not None

docling/pipeline/vlm_pipeline.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import logging
-import warnings
 from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Union, cast
@@ -15,11 +14,14 @@ from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import (
+    ApiVlmOptions,
+    HuggingFaceVlmOptions,
     InferenceFramework,
     ResponseFormat,
     VlmPipelineOptions,
 )
 from docling.datamodel.settings import settings
+from docling.models.api_vlm_model import ApiVlmModel
 from docling.models.hf_mlx_model import HuggingFaceMlxModel
 from docling.models.hf_vlm_model import HuggingFaceVlmModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
@@ -29,7 +31,6 @@ _log = logging.getLogger(__name__)
 class VlmPipeline(PaginatedPipeline):
     def __init__(self, pipeline_options: VlmPipelineOptions):
         super().__init__(pipeline_options)
         self.keep_backend = True
@@ -57,27 +58,34 @@ class VlmPipeline(PaginatedPipeline):
         self.keep_images = self.pipeline_options.generate_page_images
-        if (
-            self.pipeline_options.vlm_options.inference_framework
-            == InferenceFramework.MLX
-        ):
+        if isinstance(pipeline_options.vlm_options, ApiVlmOptions):
             self.build_pipe = [
-                HuggingFaceMlxModel(
+                ApiVlmModel(
                     enabled=True,  # must be always enabled for this pipeline to make sense.
-                    artifacts_path=artifacts_path,
-                    accelerator_options=pipeline_options.accelerator_options,
-                    vlm_options=self.pipeline_options.vlm_options,
-                ),
-            ]
-        else:
-            self.build_pipe = [
-                HuggingFaceVlmModel(
-                    enabled=True,  # must be always enabled for this pipeline to make sense.
-                    artifacts_path=artifacts_path,
-                    accelerator_options=pipeline_options.accelerator_options,
-                    vlm_options=self.pipeline_options.vlm_options,
+                    enable_remote_services=self.pipeline_options.enable_remote_services,
+                    vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
                 ),
             ]
+        elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
+            vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options)
+            if vlm_options.inference_framework == InferenceFramework.MLX:
+                self.build_pipe = [
+                    HuggingFaceMlxModel(
+                        enabled=True,  # must be always enabled for this pipeline to make sense.
+                        artifacts_path=artifacts_path,
+                        accelerator_options=pipeline_options.accelerator_options,
+                        vlm_options=vlm_options,
+                    ),
+                ]
+            else:
+                self.build_pipe = [
+                    HuggingFaceVlmModel(
+                        enabled=True,  # must be always enabled for this pipeline to make sense.
+                        artifacts_path=artifacts_path,
+                        accelerator_options=pipeline_options.accelerator_options,
+                        vlm_options=vlm_options,
+                    ),
+                ]
         self.enrichment_pipe = [
             # Other models working on `NodeItem` elements in the DoclingDocument
@@ -104,7 +112,6 @@ class VlmPipeline(PaginatedPipeline):
     def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
         with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
             if (
                 self.pipeline_options.vlm_options.response_format
                 == ResponseFormat.DOCTAGS

docling/utils/api_image_request.py ADDED Viewed

@@ -0,0 +1,61 @@
+import base64
+import logging
+from io import BytesIO
+from typing import Dict, Optional
+import requests
+from PIL import Image
+from pydantic import AnyUrl
+from docling.datamodel.base_models import OpenAiApiResponse
+_log = logging.getLogger(__name__)
+def api_image_request(
+    image: Image.Image,
+    prompt: str,
+    url: AnyUrl,
+    timeout: float = 20,
+    headers: Optional[Dict[str, str]] = None,
+    **params,
+) -> str:
+    img_io = BytesIO()
+    image.save(img_io, "PNG")
+    image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
+                },
+                {
+                    "type": "text",
+                    "text": prompt,
+                },
+            ],
+        }
+    ]
+    payload = {
+        "messages": messages,
+        **params,
+    }
+    headers = headers or {}
+    r = requests.post(
+        str(url),
+        headers=headers,
+        json=payload,
+        timeout=timeout,
+    )
+    if not r.ok:
+        _log.error(f"Error calling the API. Response was {r.text}")
+    r.raise_for_status()
+    api_resp = OpenAiApiResponse.model_validate_json(r.text)
+    generated_text = api_resp.choices[0].message.content.strip()
+    return generated_text

docling/utils/export.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import logging
-from typing import Any, Dict, Iterable, List, Tuple, Union
+from collections.abc import Iterable
+from typing import Any, Dict, List, Tuple, Union
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import TextCell
 from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
 from docling.datamodel.document import ConversionResult, Page
@@ -13,7 +13,6 @@ _log = logging.getLogger(__name__)
 def generate_multimodal_pages(
     doc_result: ConversionResult,
 ) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
     label_to_doclaynet = {
         "title": "title",
         "table-of-contents": "document_index",
@@ -122,7 +121,6 @@ def generate_multimodal_pages(
     if doc.main_text is None:
         return
     for ix, orig_item in enumerate(doc.main_text):
         item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
         if item is None or item.prov is None or len(item.prov) == 0:
             _log.debug(f"Skipping item {orig_item}")

docling/utils/glm_utils.py CHANGED Viewed

@@ -29,7 +29,7 @@ def resolve_item(paths, obj):
     try:
         key = int(paths[0])
-    except:
+    except Exception:
         key = paths[0]
     if len(paths) == 1:
@@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
     return unique_objects
-def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
+def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:  # noqa: C901
     origin = DocumentOrigin(
         mimetype="application/pdf",
         filename=doc_glm["file-info"]["filename"],

docling/utils/layout_postprocessor.py CHANGED Viewed

@@ -18,7 +18,7 @@ class UnionFind:
     def __init__(self, elements):
         self.parent = {elem: elem for elem in elements}
-        self.rank = {elem: 0 for elem in elements}
+        self.rank = dict.fromkeys(elements, 0)
     def find(self, x):
         if self.parent[x] != x:
@@ -484,7 +484,9 @@ class LayoutPostprocessor:
         spatial_index = (
             self.regular_index
             if cluster_type == "regular"
-            else self.picture_index if cluster_type == "picture" else self.wrapper_index
+            else self.picture_index
+            if cluster_type == "picture"
+            else self.wrapper_index
         )
         # Map of currently valid clusters

docling/utils/model_downloader.py CHANGED Viewed

@@ -37,7 +37,7 @@ def download_models(
     output_dir.mkdir(exist_ok=True, parents=True)
     if with_layout:
-        _log.info(f"Downloading layout model...")
+        _log.info("Downloading layout model...")
         LayoutModel.download_models(
             local_dir=output_dir / LayoutModel._model_repo_folder,
             force=force,
@@ -45,7 +45,7 @@ def download_models(
         )
     if with_tableformer:
-        _log.info(f"Downloading tableformer model...")
+        _log.info("Downloading tableformer model...")
         TableStructureModel.download_models(
             local_dir=output_dir / TableStructureModel._model_repo_folder,
             force=force,
@@ -53,7 +53,7 @@ def download_models(
         )
     if with_picture_classifier:
-        _log.info(f"Downloading picture classifier model...")
+        _log.info("Downloading picture classifier model...")
         DocumentPictureClassifier.download_models(
             local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
             force=force,
@@ -61,7 +61,7 @@ def download_models(
         )
     if with_code_formula:
-        _log.info(f"Downloading code formula model...")
+        _log.info("Downloading code formula model...")
         CodeFormulaModel.download_models(
             local_dir=output_dir / CodeFormulaModel._model_repo_folder,
             force=force,
@@ -69,7 +69,7 @@ def download_models(
         )
     if with_smolvlm:
-        _log.info(f"Downloading SmolVlm model...")
+        _log.info("Downloading SmolVlm model...")
         PictureDescriptionVlmModel.download_models(
             repo_id=smolvlm_picture_description.repo_id,
             local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
@@ -78,7 +78,7 @@ def download_models(
         )
     if with_granite_vision:
-        _log.info(f"Downloading Granite Vision model...")
+        _log.info("Downloading Granite Vision model...")
         PictureDescriptionVlmModel.download_models(
             repo_id=granite_picture_description.repo_id,
             local_dir=output_dir / granite_picture_description.repo_cache_folder,
@@ -87,7 +87,7 @@ def download_models(
         )
     if with_easyocr:
-        _log.info(f"Downloading easyocr models...")
+        _log.info("Downloading easyocr models...")
         EasyOcrModel.download_models(
             local_dir=output_dir / EasyOcrModel._model_repo_folder,
             force=force,

docling/utils/utils.py CHANGED Viewed

@@ -13,7 +13,7 @@ def chunkify(iterator, chunk_size):
     if isinstance(iterator, List):
         iterator = iter(iterator)
     for first in iterator:  # Take the first element from the iterator
-        yield [first] + list(islice(iterator, chunk_size - 1))
+        yield [first, *list(islice(iterator, chunk_size - 1))]
 def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:

docling 2.29.0__py3-none-any.whl → 2.31.0__py3-none-any.whl

docling 2.29.0__py3-none-any.whl → 2.31.0__py3-none-any.whl