docling 2.27.0__py3-none-any.whl → 2.28.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_v4_backend.py +20 -13
- docling/backend/mspowerpoint_backend.py +18 -0
- docling/backend/msword_backend.py +56 -14
- docling/cli/main.py +81 -38
- docling/datamodel/pipeline_options.py +28 -2
- docling/document_converter.py +29 -17
- docling/models/hf_mlx_model.py +137 -0
- docling/models/page_preprocessing_model.py +7 -1
- docling/pipeline/vlm_pipeline.py +78 -398
- {docling-2.27.0.dist-info → docling-2.28.1.dist-info}/METADATA +27 -32
- {docling-2.27.0.dist-info → docling-2.28.1.dist-info}/RECORD +14 -13
- {docling-2.27.0.dist-info → docling-2.28.1.dist-info}/LICENSE +0 -0
- {docling-2.27.0.dist-info → docling-2.28.1.dist-info}/WHEEL +0 -0
- {docling-2.27.0.dist-info → docling-2.28.1.dist-info}/entry_points.txt +0 -0
docling/backend/docling_parse_v4_backend.py
CHANGED
@@ -112,23 +112,30 @@ class DoclingParseV4PageBackend(PdfPageBackend):
         padbox.r = page_size.width - padbox.r
         padbox.t = page_size.height - padbox.t

-
-
-
-
-
-
-
-
-
+        with pypdfium2_lock:
+            image = (
+                self._ppage.render(
+                    scale=scale * 1.5,
+                    rotation=0,  # no additional rotation
+                    crop=padbox.as_tuple(),
+                )
+                .to_pil()
+                .resize(
+                    size=(round(cropbox.width * scale), round(cropbox.height * scale))
+                )
+            )  # We resize the image from 1.5x the given scale to make it sharper.

         return image

     def get_size(self) -> Size:
-
-        width=self.
-
-
+        with pypdfium2_lock:
+            return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
+
+        # TODO: Take width and height from docling-parse.
+        # return Size(
+        #     width=self._dpage.dimension.width,
+        #     height=self._dpage.dimension.height,
+        # )

     def unload(self):
         self._ppage = None
docling/backend/mspowerpoint_backend.py
CHANGED
@@ -16,6 +16,7 @@ from docling_core.types.doc import (
     TableCell,
     TableData,
 )
+from docling_core.types.doc.document import ContentLayer
 from PIL import Image, UnidentifiedImageError
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
@@ -421,4 +422,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         for shape in slide.shapes:
             handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)

+        # Handle notes slide
+        if slide.has_notes_slide:
+            notes_slide = slide.notes_slide
+            notes_text = notes_slide.notes_text_frame.text.strip()
+            if notes_text:
+                bbox = BoundingBox(l=0, t=0, r=0, b=0)
+                prov = ProvenanceItem(
+                    page_no=slide_ind + 1, charspan=[0, len(notes_text)], bbox=bbox
+                )
+                doc.add_text(
+                    label=DocItemLabel.TEXT,
+                    parent=parent_slide,
+                    text=notes_text,
+                    prov=prov,
+                    content_layer=ContentLayer.FURNITURE,
+                )
+
         return doc
docling/backend/msword_backend.py
CHANGED
@@ -53,6 +53,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         self.max_levels: int = 10
         self.level_at_new_list: Optional[int] = None
         self.parents: dict[int, Optional[NodeItem]] = {}
+        self.numbered_headers: dict[int, int] = {}
         for i in range(-1, self.max_levels):
             self.parents[i] = None

@@ -275,8 +276,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             only_equations.append(latex_equation)
             texts_and_equations.append(latex_equation)

-        if "".join(only_texts) != text:
-
+        if "".join(only_texts).strip() != text.strip():
+            # If we are not able to reconstruct the initial raw text
+            # do not try to parse equations and return the original
+            return text, []

         return "".join(texts_and_equations), only_equations

@@ -344,7 +347,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 parent=None, label=DocItemLabel.TITLE, text=text
             )
         elif "Heading" in p_style_id:
-
+            style_element = getattr(paragraph.style, "element", None)
+            if style_element:
+                is_numbered_style = (
+                    "<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
+                )
+            else:
+                is_numbered_style = False
+            self.add_header(doc, p_level, text, is_numbered_style)

         elif len(equations) > 0:
             if (raw_text is None or len(raw_text) == 0) and len(text) > 0:

@@ -365,6 +375,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             for eq in equations:
                 if len(text_tmp) == 0:
                     break
+
                 pre_eq_text = text_tmp.split(eq, maxsplit=1)[0]
                 text_tmp = text_tmp.split(eq, maxsplit=1)[1]
                 if len(pre_eq_text) > 0:

@@ -412,7 +423,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             return

     def add_header(
-        self,
+        self,
+        doc: DoclingDocument,
+        curr_level: Optional[int],
+        text: str,
+        is_numbered_style: bool = False,
     ) -> None:
         level = self.get_level()
         if isinstance(curr_level, int):

@@ -430,17 +445,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 if key >= curr_level:
                     self.parents[key] = None

-
-
-
-                level=curr_level,
-            )
+            current_level = curr_level
+            parent_level = curr_level - 1
+            add_level = curr_level
         else:
-
-
-
-
-
+            current_level = self.level
+            parent_level = self.level - 1
+            add_level = 1
+
+        if is_numbered_style:
+            if add_level in self.numbered_headers:
+                self.numbered_headers[add_level] += 1
+            else:
+                self.numbered_headers[add_level] = 1
+            text = f"{self.numbered_headers[add_level]} {text}"
+
+            # Reset deeper levels
+            next_level = add_level + 1
+            while next_level in self.numbered_headers:
+                self.numbered_headers[next_level] = 0
+                next_level += 1
+
+            # Scan upper levels
+            previous_level = add_level - 1
+            while previous_level in self.numbered_headers:
+                # MSWord convention: no empty sublevels
+                # I.e., sub-sub section (2.0.1) without a sub-section (2.1)
+                # is processed as 2.1.1
+                if self.numbered_headers[previous_level] == 0:
+                    self.numbered_headers[previous_level] += 1

+                text = f"{self.numbered_headers[previous_level]}.{text}"
+                previous_level -= 1
+
+        self.parents[current_level] = doc.add_heading(
+            parent=self.parents[parent_level],
+            text=text,
+            level=add_level,
+        )
         return

     def add_listitem(
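To illustrate what the new `numbered_headers` bookkeeping produces, here is a simplified, standalone sketch of the numbering scheme; `number_heading` is a hypothetical helper written for illustration, not part of the backend's API.

```python
numbered_headers: dict[int, int] = {}


def number_heading(text: str, level: int) -> str:
    # Bump the counter for this level and reset all deeper levels.
    numbered_headers[level] = numbered_headers.get(level, 0) + 1
    deeper = level + 1
    while deeper in numbered_headers:
        numbered_headers[deeper] = 0
        deeper += 1

    # Prepend counters of upper levels; an upper level still at zero is bumped
    # to 1 first (MSWord convention: no empty sublevels).
    prefix = str(numbered_headers[level])
    upper = level - 1
    while upper in numbered_headers:
        if numbered_headers[upper] == 0:
            numbered_headers[upper] += 1
        prefix = f"{numbered_headers[upper]}.{prefix}"
        upper -= 1
    return f"{prefix} {text}"


print(number_heading("Introduction", 1))  # 1 Introduction
print(number_heading("Background", 2))    # 1.1 Background
print(number_heading("Methods", 1))       # 2 Methods
print(number_heading("Data", 2))          # 2.1 Data
```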
docling/cli/main.py
CHANGED
@@ -32,13 +32,21 @@ from docling.datamodel.pipeline_options import (
     AcceleratorOptions,
     EasyOcrOptions,
     OcrOptions,
+    PaginatedPipelineOptions,
     PdfBackend,
+    PdfPipeline,
     PdfPipelineOptions,
     TableFormerMode,
+    VlmModelType,
+    VlmPipelineOptions,
+    granite_vision_vlm_conversion_options,
+    smoldocling_vlm_conversion_options,
+    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 from docling.models.factories import get_ocr_factory
+from docling.pipeline.vlm_pipeline import VlmPipeline

 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")

@@ -200,6 +208,14 @@ def convert(
             help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
         ),
     ] = ImageRefMode.EMBEDDED,
+    pipeline: Annotated[
+        PdfPipeline,
+        typer.Option(..., help="Choose the pipeline to process PDF or image files."),
+    ] = PdfPipeline.STANDARD,
+    vlm_model: Annotated[
+        VlmModelType,
+        typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
+    ] = VlmModelType.SMOLDOCLING,
     ocr: Annotated[
         bool,
         typer.Option(

@@ -420,50 +436,77 @@ def convert(
     ocr_options.lang = ocr_lang_list

     accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
-    pipeline_options
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    pipeline_options: PaginatedPipelineOptions
+
+    if pipeline == PdfPipeline.STANDARD:
+        pipeline_options = PdfPipelineOptions(
+            allow_external_plugins=allow_external_plugins,
+            enable_remote_services=enable_remote_services,
+            accelerator_options=accelerator_options,
+            do_ocr=ocr,
+            ocr_options=ocr_options,
+            do_table_structure=True,
+            do_code_enrichment=enrich_code,
+            do_formula_enrichment=enrich_formula,
+            do_picture_description=enrich_picture_description,
+            do_picture_classification=enrich_picture_classes,
+            document_timeout=document_timeout,
+        )
+        pipeline_options.table_structure_options.do_cell_matching = (
+            True  # do_cell_matching
+        )
+        pipeline_options.table_structure_options.mode = table_mode
+
+        if image_export_mode != ImageRefMode.PLACEHOLDER:
+            pipeline_options.generate_page_images = True
+            pipeline_options.generate_picture_images = (
+                True  # FIXME: to be deprecated in verson 3
+            )
+            pipeline_options.images_scale = 2
+
+        backend: Type[PdfDocumentBackend]
+        if pdf_backend == PdfBackend.DLPARSE_V1:
+            backend = DoclingParseDocumentBackend
+        elif pdf_backend == PdfBackend.DLPARSE_V2:
+            backend = DoclingParseV2DocumentBackend
+        elif pdf_backend == PdfBackend.DLPARSE_V4:
+            backend = DoclingParseV4DocumentBackend  # type: ignore
+        elif pdf_backend == PdfBackend.PYPDFIUM2:
+            backend = PyPdfiumDocumentBackend  # type: ignore
+        else:
+            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
+
+        pdf_format_option = PdfFormatOption(
+            pipeline_options=pipeline_options,
+            backend=backend,  # pdf_backend
+        )
+    elif pipeline == PdfPipeline.VLM:
+        pipeline_options = VlmPipelineOptions()
+
+        if vlm_model == VlmModelType.GRANITE_VISION:
+            pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+        elif vlm_model == VlmModelType.SMOLDOCLING:
+            pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+            if sys.platform == "darwin":
+                try:
+                    import mlx_vlm
+
+                    pipeline_options.vlm_options = (
+                        smoldocling_vlm_mlx_conversion_options
+                    )
+                except ImportError:
+                    _log.warning(
+                        "To run SmolDocling faster, please install mlx-vlm:\n"
+                        "pip install mlx-vlm"
+                    )

-
-
-        pipeline_options.generate_picture_images = (
-            True  # FIXME: to be deprecated in verson 3
+        pdf_format_option = PdfFormatOption(
+            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
         )
-    pipeline_options.images_scale = 2

     if artifacts_path is not None:
         pipeline_options.artifacts_path = artifacts_path

-    backend: Type[PdfDocumentBackend]
-    if pdf_backend == PdfBackend.DLPARSE_V1:
-        backend = DoclingParseDocumentBackend
-    elif pdf_backend == PdfBackend.DLPARSE_V2:
-        backend = DoclingParseV2DocumentBackend
-    elif pdf_backend == PdfBackend.DLPARSE_V4:
-        backend = DoclingParseV4DocumentBackend  # type: ignore
-    elif pdf_backend == PdfBackend.PYPDFIUM2:
-        backend = PyPdfiumDocumentBackend  # type: ignore
-    else:
-        raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
-
-    pdf_format_option = PdfFormatOption(
-        pipeline_options=pipeline_options,
-        backend=backend,  # pdf_backend
-    )
     format_options: Dict[InputFormat, FormatOption] = {
         InputFormat.PDF: pdf_format_option,
         InputFormat.IMAGE: pdf_format_option,
docling/datamodel/pipeline_options.py
CHANGED
@@ -263,6 +263,11 @@ class ResponseFormat(str, Enum):
     MARKDOWN = "markdown"


+class InferenceFramework(str, Enum):
+    MLX = "mlx"
+    TRANSFORMERS = "transformers"
+
+
 class HuggingFaceVlmOptions(BaseVlmOptions):
     kind: Literal["hf_model_options"] = "hf_model_options"

@@ -271,6 +276,7 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
     llm_int8_threshold: float = 6.0
     quantized: bool = False

+    inference_framework: InferenceFramework
     response_format: ResponseFormat

     @property

@@ -278,10 +284,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
         return self.repo_id.replace("/", "--")


+smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.MLX,
+)
+
+
 smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
     repo_id="ds4sd/SmolDocling-256M-preview",
     prompt="Convert this page to docling.",
     response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.TRANSFORMERS,
 )

 granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(

@@ -289,9 +304,15 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
     # prompt="OCR the full page to markdown.",
     prompt="OCR this image.",
     response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
 )


+class VlmModelType(str, Enum):
+    SMOLDOCLING = "smoldocling"
+    GRANITE_VISION = "granite_vision"
+
+
 # Define an enum for the backend options
 class PdfBackend(str, Enum):
     """Enum of valid PDF backends."""

@@ -327,13 +348,14 @@ class PipelineOptions(BaseModel):


 class PaginatedPipelineOptions(PipelineOptions):
+    artifacts_path: Optional[Union[Path, str]] = None
+
     images_scale: float = 1.0
     generate_page_images: bool = False
     generate_picture_images: bool = False


 class VlmPipelineOptions(PaginatedPipelineOptions):
-    artifacts_path: Optional[Union[Path, str]] = None

     generate_page_images: bool = True
     force_backend_text: bool = (

@@ -346,7 +368,6 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
 class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""

-    artifacts_path: Optional[Union[Path, str]] = None
     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
     do_code_enrichment: bool = False  # True: perform code OCR

@@ -377,3 +398,8 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     )

     generate_parsed_pages: bool = False
+
+
+class PdfPipeline(str, Enum):
+    STANDARD = "standard"
+    VLM = "vlm"
docling/document_converter.py
CHANGED
@@ -1,3 +1,4 @@
+import hashlib
 import logging
 import math
 import sys

@@ -181,7 +182,14 @@ class DocumentConverter:
             )
             for format in self.allowed_formats
         }
-        self.initialized_pipelines: Dict[
+        self.initialized_pipelines: Dict[
+            Tuple[Type[BasePipeline], str], BasePipeline
+        ] = {}
+
+    def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
+        """Generate a hash of pipeline options to use as part of the cache key."""
+        options_str = str(pipeline_options.model_dump())
+        return hashlib.md5(options_str.encode("utf-8")).hexdigest()

     def initialize_pipeline(self, format: InputFormat):
         """Initialize the conversion pipeline for the selected format."""

@@ -279,31 +287,36 @@ class DocumentConverter:
             yield item

     def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
+        """Retrieve or initialize a pipeline, reusing instances based on class and options."""
         fopt = self.format_to_options.get(doc_format)

-        if fopt is None:
+        if fopt is None or fopt.pipeline_options is None:
             return None
-        else:
-            pipeline_class = fopt.pipeline_cls
-            pipeline_options = fopt.pipeline_options

-
-
-
-
-
-
-
-
-
+        pipeline_class = fopt.pipeline_cls
+        pipeline_options = fopt.pipeline_options
+        options_hash = self._get_pipeline_options_hash(pipeline_options)
+
+        # Use a composite key to cache pipelines
+        cache_key = (pipeline_class, options_hash)
+
+        if cache_key not in self.initialized_pipelines:
+            _log.info(
+                f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
+            )
+            self.initialized_pipelines[cache_key] = pipeline_class(
                 pipeline_options=pipeline_options
             )
-
+        else:
+            _log.debug(
+                f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
+            )
+
+        return self.initialized_pipelines[cache_key]

     def _process_document(
         self, in_doc: InputDocument, raises_on_error: bool
     ) -> ConversionResult:
-
         valid = (
             self.allowed_formats is not None and in_doc.format in self.allowed_formats
         )

@@ -345,7 +358,6 @@ class DocumentConverter:
         else:
             if raises_on_error:
                 raise ConversionError(f"Input document {in_doc.file} is not valid.")
-
         else:
             # invalid doc or not of desired format
             conv_res = ConversionResult(
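A minimal, self-contained sketch of the caching idea introduced here: pipelines are now keyed by the pipeline class plus an MD5 hash of the serialized options, so converters whose formats share identical options reuse one pipeline instance. The `Opts` model below is a hypothetical stand-in, not docling's `PipelineOptions`.

```python
import hashlib

from pydantic import BaseModel


class Opts(BaseModel):  # hypothetical stand-in for docling's PipelineOptions
    do_ocr: bool = True
    images_scale: float = 1.0


def options_hash(opts: BaseModel) -> str:
    # Same approach as the new _get_pipeline_options_hash helper above.
    return hashlib.md5(str(opts.model_dump()).encode("utf-8")).hexdigest()


# Identical options produce identical cache keys ...
assert options_hash(Opts()) == options_hash(Opts())
# ... while any change to an option yields a new key, hence a new pipeline.
assert options_hash(Opts()) != options_hash(Opts(do_ocr=False))
```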
docling/models/hf_mlx_model.py
ADDED
@@ -0,0 +1,137 @@
+import logging
+import time
+from pathlib import Path
+from typing import Iterable, List, Optional
+
+from docling.datamodel.base_models import Page, VlmPrediction
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
+    HuggingFaceVlmOptions,
+)
+from docling.datamodel.settings import settings
+from docling.models.base_model import BasePageModel
+from docling.utils.accelerator_utils import decide_device
+from docling.utils.profiling import TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class HuggingFaceMlxModel(BasePageModel):
+
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        vlm_options: HuggingFaceVlmOptions,
+    ):
+        self.enabled = enabled
+
+        self.vlm_options = vlm_options
+
+        if self.enabled:
+
+            try:
+                from mlx_vlm import generate, load  # type: ignore
+                from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
+                from mlx_vlm.utils import load_config, stream_generate  # type: ignore
+            except ImportError:
+                raise ImportError(
+                    "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
+                )
+
+            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+            self.apply_chat_template = apply_chat_template
+            self.stream_generate = stream_generate
+
+            # PARAMETERS:
+            if artifacts_path is None:
+                artifacts_path = self.download_models(self.vlm_options.repo_id)
+            elif (artifacts_path / repo_cache_folder).exists():
+                artifacts_path = artifacts_path / repo_cache_folder
+
+            self.param_question = vlm_options.prompt  # "Perform Layout Analysis."
+
+            ## Load the model
+            self.vlm_model, self.processor = load(artifacts_path)
+            self.config = load_config(artifacts_path)
+
+    @staticmethod
+    def download_models(
+        repo_id: str,
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+
+        if not progress:
+            disable_progress_bars()
+        download_path = snapshot_download(
+            repo_id=repo_id,
+            force_download=force,
+            local_dir=local_dir,
+            # revision="v0.0.1",
+        )
+
+        return Path(download_path)
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "vlm"):
+                    assert page.size is not None
+
+                    hi_res_image = page.get_image(scale=2.0)  # 144dpi
+                    # hi_res_image = page.get_image(scale=1.0)  # 72dpi
+
+                    if hi_res_image is not None:
+                        im_width, im_height = hi_res_image.size
+
+                    # populate page_tags with predicted doc tags
+                    page_tags = ""
+
+                    if hi_res_image:
+                        if hi_res_image.mode != "RGB":
+                            hi_res_image = hi_res_image.convert("RGB")
+
+                    prompt = self.apply_chat_template(
+                        self.processor, self.config, self.param_question, num_images=1
+                    )
+
+                    start_time = time.time()
+                    # Call model to generate:
+                    output = ""
+                    for token in self.stream_generate(
+                        self.vlm_model,
+                        self.processor,
+                        prompt,
+                        [hi_res_image],
+                        max_tokens=4096,
+                        verbose=False,
+                    ):
+                        output += token.text
+                        if "</doctag>" in token.text:
+                            break
+
+                    generation_time = time.time() - start_time
+                    page_tags = output
+
+                    # inference_time = time.time() - start_time
+                    # tokens_per_second = num_tokens / generation_time
+                    # print("")
+                    # print(f"Page Inference Time: {inference_time:.2f} seconds")
+                    # print(f"Total tokens on page: {num_tokens:.2f}")
+                    # print(f"Tokens/sec: {tokens_per_second:.2f}")
+                    # print("")
+                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
+
+            yield page
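Note that `HuggingFaceMlxModel` imports `mlx_vlm` lazily and raises an `ImportError` with an install hint if it is missing, so this code path only works where `pip install mlx-vlm` succeeds (in practice Apple Silicon); the CLI change above selects the MLX options automatically on `darwin` when the package is importable.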
docling/models/page_preprocessing_model.py
CHANGED
@@ -63,7 +63,13 @@ class PagePreprocessingModel(BasePageModel):
         def draw_text_boxes(image, cells, show: bool = False):
             draw = ImageDraw.Draw(image)
             for c in cells:
-                x0, y0, x1, y1 =
+                x0, y0, x1, y1 = (
+                    c.to_bounding_box().l,
+                    c.to_bounding_box().t,
+                    c.to_bounding_box().r,
+                    c.to_bounding_box().b,
+                )
+
                 draw.rectangle([(x0, y0), (x1, y1)], outline="red")
             if show:
                 image.show()