PyPI - docling - Versions diffs - 2.43.0__py3-none-any.whl → 2.45.0__py3-none-any.whl - Mend

docling 2.43.0__py3-none-any.whl → 2.45.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

docling/backend/html_backend.py +406 -69
docling/backend/mets_gbs_backend.py +399 -0
docling/backend/pdf_backend.py +3 -3
docling/cli/main.py +16 -0
docling/datamodel/base_models.py +3 -0
docling/datamodel/document.py +26 -0
docling/datamodel/pipeline_options_vlm_model.py +8 -2
docling/document_converter.py +34 -0
docling/models/api_vlm_model.py +2 -5
docling/models/vlm_models_inline/hf_transformers_model.py +2 -4
docling/models/vlm_models_inline/mlx_model.py +4 -6
docling/pipeline/base_pipeline.py +7 -4
{docling-2.43.0.dist-info → docling-2.45.0.dist-info}/METADATA +2 -2
{docling-2.43.0.dist-info → docling-2.45.0.dist-info}/RECORD +18 -17
{docling-2.43.0.dist-info → docling-2.45.0.dist-info}/WHEEL +0 -0
{docling-2.43.0.dist-info → docling-2.45.0.dist-info}/entry_points.txt +0 -0
{docling-2.43.0.dist-info → docling-2.45.0.dist-info}/licenses/LICENSE +0 -0
{docling-2.43.0.dist-info → docling-2.45.0.dist-info}/top_level.txt +0 -0

docling/backend/mets_gbs_backend.py ADDED Viewed

@@ -0,0 +1,399 @@
+"""Backend for GBS Google Books schema."""
+import logging
+import tarfile
+from collections.abc import Iterable
+from dataclasses import dataclass
+from enum import Enum
+from io import BytesIO
+from pathlib import Path
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
+from docling_core.types.doc import BoundingBox, CoordOrigin, Size
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PdfPageBoundaryType,
+    PdfPageGeometry,
+    SegmentedPdfPage,
+    TextCell,
+)
+from lxml import etree
+from PIL import Image
+from PIL.Image import Image as PILImage
+from docling.backend.abstract_backend import PaginatedDocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.base_models import InputFormat
+if TYPE_CHECKING:
+    from docling.datamodel.document import InputDocument
+_log = logging.getLogger(__name__)
+def _get_pdf_page_geometry(
+    size: Size,
+) -> PdfPageGeometry:
+    """Build a page geometry in which every boundary box spans the whole page.
+    Args:
+        size: Page width and height.
+    Returns:
+        A PdfPageGeometry with angle 0.0, boundary type CROP_BOX, and the art,
+        bleed, crop, media and trim boxes all set to the same top-left-origin
+        box (0, 0, size.width, size.height).
+    """
+    boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX
+    bbox_tuple = (0, 0, size.width, size.height)
+    bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.TOPLEFT)
+    # The same box is reused for every boundary kind below.
+    return PdfPageGeometry(
+        angle=0.0,
+        rect=BoundingRectangle.from_bounding_box(bbox),
+        boundary_type=boundary_type,
+        art_bbox=bbox,
+        bleed_bbox=bbox,
+        crop_bbox=bbox,
+        media_bbox=bbox,
+        trim_bbox=bbox,
+    )
+class MetsGbsPageBackend(PdfPageBackend):
+    """Page backend backed by an already-parsed page and its page image."""
+    def __init__(self, parsed_page: SegmentedPdfPage, page_im: PILImage):
+        """Store the parsed page and image; the page is valid when parsed_page is not None."""
+        self._im = page_im
+        self._dpage = parsed_page
+        self.valid = parsed_page is not None
+    def is_valid(self) -> bool:
+        """Return the validity flag computed in __init__."""
+        return self.valid
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        """Return the space-joined text of the text-line cells that overlap bbox.
+        A cell contributes when cell_bbox.intersection_over_self(bbox) exceeds 0.5.
+        Cells are visited in the order of self._dpage.textline_cells.
+        """
+        # Find intersecting cells on the page
+        text_piece = ""
+        page_size = self.get_size()
+        scale = (
+            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
+        )
+        for i, cell in enumerate(self._dpage.textline_cells):
+            # Bring the cell box to top-left origin before comparing it with bbox.
+            cell_bbox = (
+                cell.rect.to_bounding_box()
+                .to_top_left_origin(page_height=page_size.height)
+                .scaled(scale)
+            )
+            # presumably the share of the cell's own area that lies inside bbox -- TODO confirm
+            overlap_frac = cell_bbox.intersection_over_self(bbox)
+            if overlap_frac > 0.5:
+                # Separate consecutive cell texts with a single space.
+                if len(text_piece) > 0:
+                    text_piece += " "
+                text_piece += cell.text
+        return text_piece
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        """Return the parsed page passed to __init__."""
+        return self._dpage
+    def get_text_cells(self) -> Iterable[TextCell]:
+        """Return the text-line cells of the parsed page."""
+        return self._dpage.textline_cells
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        """Yield the top-left-origin boxes of the page's bitmap resources, scaled by scale.
+        Boxes whose area is not greater than AREA_THRESHOLD (currently 0) are skipped.
+        """
+        AREA_THRESHOLD = 0  # 32 * 32
+        images = self._dpage.bitmap_resources
+        for img in images:
+            cropbox = img.rect.to_bounding_box().to_top_left_origin(
+                self.get_size().height
+            )
+            # The area test is applied before scaling.
+            if cropbox.area() > AREA_THRESHOLD:
+                cropbox = cropbox.scaled(scale=scale)
+                yield cropbox
+    def get_page_image(
+        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
+    ) -> Image.Image:
+        """Return the page image resized by scale and cropped to cropbox.
+        Args:
+            scale: Factor applied to the page width and height before cropping.
+            cropbox: Region to keep, in unscaled top-left-origin coordinates;
+                a falsy value selects the full page.
+        """
+        page_size = self.get_size()
+        # The stored image must have exactly the page dimensions.
+        assert (
+            page_size.width == self._im.size[0] and page_size.height == self._im.size[1]
+        )
+        if not cropbox:
+            cropbox = BoundingBox(
+                l=0,
+                r=page_size.width,
+                t=0,
+                b=page_size.height,
+                coord_origin=CoordOrigin.TOPLEFT,
+            )
+        # Resize the whole page first, then crop with the cropbox scaled by the same factor.
+        image = self._im.resize(
+            size=(round(page_size.width * scale), round(page_size.height * scale))
+        ).crop(cropbox.scaled(scale=scale).as_tuple())
+        return image
+    def get_size(self) -> Size:
+        """Return the page size taken from the parsed page's dimension."""
+        return Size(
+            width=self._dpage.dimension.width, height=self._dpage.dimension.height
+        )
+    def unload(self) -> None:
+        """Drop the references to the image and the parsed page, if still present."""
+        if hasattr(self, "_im"):
+            delattr(self, "_im")
+        if hasattr(self, "_dpage"):
+            delattr(self, "_dpage")
+class _UseType(str, Enum):
+    """Values of the METS fileGrp USE attribute that this backend recognizes."""
+    IMAGE = "image"
+    OCR = "OCR"
+    COORD_OCR = "coordOCR"
+@dataclass
+class _FileInfo:
+    """One file entry collected from a METS fileGrp."""
+    # Value of the file element's ID attribute.
+    file_id: str
+    # Value of the file element's MIMETYPE attribute.
+    mimetype: str
+    # xlink:href of the file's FLocat element; used as the member name inside the tar.
+    path: str
+    # USE of the enclosing fileGrp.
+    use: _UseType
+@dataclass
+class _PageFiles:
+    """Files referenced by one page div, with one optional slot per _UseType."""
+    # File whose fileGrp USE is "image".
+    image: Optional[_FileInfo] = None
+    # File whose fileGrp USE is "OCR".
+    ocr: Optional[_FileInfo] = None
+    # File whose fileGrp USE is "coordOCR".
+    coordOCR: Optional[_FileInfo] = None
+def _extract_rect(title_str: str) -> Optional[BoundingRectangle]:
+    """
+    Extracts bbox from title string like 'bbox 279 177 306 214;x_wconf 97'
+    The string is split on ';' and the first part starting with 'bbox ' is used;
+    its remaining whitespace-separated fields are converted with int() and
+    interpreted as a top-left-origin box.
+    Returns None when no such part exists or when building the box raises.
+    """
+    parts = title_str.split(";")
+    for part in parts:
+        part = part.strip()
+        if part.startswith("bbox "):
+            try:
+                # Drop the leading "bbox" keyword, keep the coordinate fields.
+                coords = part.split()[1:]
+                rect = BoundingRectangle.from_bounding_box(
+                    bbox=BoundingBox.from_tuple(
+                        tuple(map(int, coords)), origin=CoordOrigin.TOPLEFT
+                    )
+                )
+                return rect
+            except Exception:
+                # Any failure on the first bbox part ends the search; later parts are not tried.
+                return None
+    return None
+def _extract_confidence(title_str) -> float:
+    """Extracts x_wconf (OCR confidence) value from title string.
+    The second whitespace-separated field of the first ';'-separated part that
+    starts with 'x_wconf' is divided by 100.0. Returns 1 when no such part
+    exists or when the value cannot be read.
+    """
+    for part in title_str.split(";"):
+        part = part.strip()
+        if part.startswith("x_wconf"):
+            try:
+                return float(part.split()[1]) / 100.0
+            except Exception:
+                # NOTE(review): the fallback is the int 1 although the annotation says float.
+                return 1
+    return 1
+class MetsGbsDocumentBackend(PdfDocumentBackend):
+    """Document backend for a gzip-compressed tar archive described by a METS file with PROFILE="gbs".
+    The archive is opened once in __init__ and stays open until unload(); page
+    images and OCR files are read from it on demand in _parse_page.
+    """
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        """Open the archive, find the METS root and index the files of each page.
+        Args:
+            in_doc: Input document forwarded to the parent constructor.
+            path_or_stream: Archive location (Path) or archive bytes (BytesIO).
+        Raises:
+            RuntimeError: If no ".xml" member of the archive is accepted by
+                _validate_mets_xml.
+        """
+        super().__init__(in_doc, path_or_stream)
+        # self.path_or_stream is read back here rather than the argument;
+        # presumably the parent constructor sets it -- TODO confirm.
+        self._tar: tarfile.TarFile = (
+            tarfile.open(name=self.path_or_stream, mode="r:gz")
+            if isinstance(self.path_or_stream, Path)
+            else tarfile.open(fileobj=self.path_or_stream, mode="r:gz")
+        )
+        self.root_mets: Optional[etree._Element] = None
+        # Maps 0-based page number -> files referenced by that page.
+        self.page_map: Dict[int, _PageFiles] = {}
+        # Use the first ".xml" member that validates as a GBS METS document.
+        for member in self._tar.getmembers():
+            if member.name.endswith(".xml"):
+                file = self._tar.extractfile(member)
+                if file is not None:
+                    content = file.read()
+                    self.root_mets = self._validate_mets_xml(content)
+                    if self.root_mets is not None:
+                        break
+        if self.root_mets is None:
+            raise RuntimeError(
+                f"METS GBS backend could not load document {self.document_hash}."
+            )
+        # Prefix map for the XPath queries below; only the "mets" prefix is used in them.
+        ns = {
+            "mets": "http://www.loc.gov/METS/",
+            "xlink": "http://www.w3.org/1999/xlink",
+            "xsi": "http://www.w3.org/2001/XMLSchema-instance",
+            "gbs": "http://books.google.com/gbs",
+            "premis": "info:lc/xmlns/premis-v2",
+            "marc": "http://www.loc.gov/MARC21/slim",
+        }
+        # Step 1: collect every file of a recognized fileGrp, keyed by its ID attribute.
+        file_info_by_id: Dict[str, _FileInfo] = {}
+        for filegrp in self.root_mets.xpath(".//mets:fileGrp", namespaces=ns):
+            use_raw = filegrp.get("USE")
+            try:
+                use = _UseType(use_raw)
+            except ValueError:
+                continue  # Ignore unknown USE types
+            for file_elem in filegrp.xpath("./mets:file", namespaces=ns):
+                file_id = file_elem.get("ID")
+                mimetype = file_elem.get("MIMETYPE")
+                flocat_elem = file_elem.find("mets:FLocat", namespaces=ns)
+                href = (
+                    flocat_elem.get("{http://www.w3.org/1999/xlink}href")
+                    if flocat_elem is not None
+                    else None
+                )
+                # Files without a location cannot be read from the archive; skip them.
+                if href is None:
+                    continue
+                file_info_by_id[file_id] = _FileInfo(
+                    file_id=file_id, mimetype=mimetype, path=href, use=use
+                )
+        # Name of the _PageFiles attribute that receives a file of each USE.
+        USE_TO_ATTR = {
+            _UseType.IMAGE: "image",
+            _UseType.OCR: "ocr",
+            _UseType.COORD_OCR: "coordOCR",
+        }
+        # Step 2: for every page div, resolve its fptr references to the files from step 1.
+        for div in self.root_mets.xpath('.//mets:div[@TYPE="page"]', namespaces=ns):
+            order_str = div.get("ORDER")
+            if not order_str:
+                continue
+            try:
+                page_no = int(order_str) - 1  # make 0-index pages
+            except ValueError:
+                continue
+            page_files = _PageFiles()
+            for fptr in div.xpath("./mets:fptr", namespaces=ns):
+                file_id = fptr.get("FILEID")
+                file_info = file_info_by_id.get(file_id)
+                if file_info:
+                    attr = USE_TO_ATTR.get(file_info.use)
+                    if attr:
+                        # A later fptr with the same USE replaces an earlier one.
+                        setattr(page_files, attr, file_info)
+            self.page_map[page_no] = page_files
+    def _validate_mets_xml(self, xml_string) -> Optional[etree._Element]:
+        """Parse xml_string and return its root if it is a METS root with PROFILE="gbs".
+        Otherwise a warning is logged and None is returned. Exceptions raised by
+        etree.fromstring are not caught here.
+        """
+        root: etree._Element = etree.fromstring(xml_string)
+        if (
+            root.tag == "{http://www.loc.gov/METS/}mets"
+            and root.get("PROFILE") == "gbs"
+        ):
+            return root
+        _log.warning(f"The root element is not <mets:mets> with PROFILE='gbs': {root}")
+        return None
+    def _parse_page(self, page_no: int) -> Tuple[SegmentedPdfPage, PILImage]:
+        """Read the image and coordOCR file of a page and build its segmented page.
+        Args:
+            page_no: 0-based key into self.page_map.
+        Returns:
+            The segmented page (line and word cells, no char cells) and the page
+            image, resized to the page size and converted to RGB.
+        """
+        # TODO: use better fallbacks...
+        # Both the image and the coordOCR file are required; the "ocr" slot is not used here.
+        image_info = self.page_map[page_no].image
+        assert image_info is not None
+        ocr_info = self.page_map[page_no].coordOCR
+        assert ocr_info is not None
+        image_file = self._tar.extractfile(image_info.path)
+        assert image_file is not None
+        # Copy the image bytes into memory before handing them to PIL.
+        buf = BytesIO(image_file.read())
+        im: PILImage = Image.open(buf)
+        ocr_file = self._tar.extractfile(ocr_info.path)
+        assert ocr_file is not None
+        ocr_content = ocr_file.read()
+        # The OCR file is parsed with lxml's HTML parser.
+        parser = etree.HTMLParser()
+        ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser)
+        line_cells: List[TextCell] = []
+        word_cells: List[TextCell] = []
+        page_div = ocr_root.xpath("//div[@class='ocr_page']")
+        # Default page size is the image's pixel size; the bbox in the title of
+        # the first ocr_page div overrides it when it can be parsed.
+        size = Size(width=im.size[0], height=im.size[1])
+        if page_div:
+            title = page_div[0].attrib.get("title", "")
+            rect = _extract_rect(title)
+            if rect:
+                size = Size(width=rect.width, height=rect.height)
+        else:
+            _log.error(f"Could not find ocr_page for page {page_no}")
+        # MetsGbsPageBackend.get_page_image asserts that image size equals page size.
+        im = im.resize(size=(round(size.width), round(size.height)))
+        im = im.convert("RGB")
+        # Extract all ocrx_word spans
+        for ix, word in enumerate(ocr_root.xpath("//span[@class='ocrx_word']")):
+            text = "".join(word.itertext()).strip()
+            title = word.attrib.get("title", "")
+            rect = _extract_rect(title)
+            conf = _extract_confidence(title)
+            # Spans without a usable bbox are dropped; ix still counts them, so indices may have gaps.
+            if rect:
+                word_cells.append(
+                    TextCell(
+                        index=ix,
+                        text=text,
+                        orig=text,
+                        rect=rect,
+                        from_ocr=True,
+                        confidence=conf,
+                    )
+                )
+        # Extract all ocr_line spans
+        # line: etree._Element
+        for ix, line in enumerate(ocr_root.xpath("//span[@class='ocr_line']")):
+            text = "".join(line.itertext()).strip()
+            title = line.attrib.get("title", "")
+            rect = _extract_rect(title)
+            conf = _extract_confidence(title)
+            if rect:
+                line_cells.append(
+                    TextCell(
+                        index=ix,
+                        text=text,
+                        orig=text,
+                        rect=rect,
+                        from_ocr=True,
+                        confidence=conf,
+                    )
+                )
+        page = SegmentedPdfPage(
+            dimension=_get_pdf_page_geometry(size),
+            textline_cells=line_cells,
+            char_cells=[],
+            word_cells=word_cells,
+            has_textlines=True,
+            has_words=True,
+            has_chars=False,
+        )
+        return page, im
+    def page_count(self) -> int:
+        """Return the number of entries in self.page_map."""
+        return len(self.page_map)
+    def load_page(self, page_no: int) -> MetsGbsPageBackend:
+        """Parse page page_no from the archive and wrap it in a MetsGbsPageBackend."""
+        # TODO: is this thread-safe?
+        page, im = self._parse_page(page_no)
+        return MetsGbsPageBackend(parsed_page=page, page_im=im)
+    def is_valid(self) -> bool:
+        """Return True when a METS root was found and at least one page was indexed."""
+        return self.root_mets is not None and self.page_count() > 0
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the set containing only InputFormat.METS_GBS."""
+        return {InputFormat.METS_GBS}
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """Return True."""
+        return True
+    def unload(self) -> None:
+        """Run the parent's unload, then close the tar archive opened in __init__."""
+        super().unload()
+        self._tar.close()

docling/backend/pdf_backend.py CHANGED Viewed

@@ -84,9 +84,9 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
                 buf.seek(0)
                 self.path_or_stream = buf
-            else:
+            elif self.input_format not in self.supported_formats():
                 raise RuntimeError(
-                    f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
+                    f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
                 )
     @abstractmethod
@@ -99,7 +99,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
     @classmethod
     def supported_formats(cls) -> Set[InputFormat]:
-        return {InputFormat.PDF}
+        return {InputFormat.PDF, InputFormat.IMAGE}
     @classmethod
     def supports_pagination(cls) -> bool:

docling/cli/main.py CHANGED Viewed

@@ -26,6 +26,7 @@ from rich.console import Console
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
@@ -262,6 +263,12 @@ def export_documents(
         else:
             _log.warning(f"Document {conv_res.input.file} failed to convert.")
+            if _log.isEnabledFor(logging.INFO):
+                for err in conv_res.errors:
+                    _log.info(
+                        f"  [Failure Detail] Component: {err.component_type}, "
+                        f"Module: {err.module_name}, Message: {err.error_message}"
+                    )
             failure_count += 1
     _log.info(
@@ -601,9 +608,18 @@ def convert(  # noqa: C901
                 backend=backend,  # pdf_backend
             )
+            # METS GBS options
+            mets_gbs_options = pipeline_options.model_copy()
+            mets_gbs_options.do_ocr = False
+            mets_gbs_format_option = PdfFormatOption(
+                pipeline_options=mets_gbs_options,
+                backend=MetsGbsDocumentBackend,
+            )
             format_options = {
                 InputFormat.PDF: pdf_format_option,
                 InputFormat.IMAGE: pdf_format_option,
+                InputFormat.METS_GBS: mets_gbs_format_option,
             }
         elif pipeline == ProcessingPipeline.VLM:

docling/datamodel/base_models.py CHANGED Viewed

@@ -56,6 +56,7 @@ class InputFormat(str, Enum):
     XLSX = "xlsx"
     XML_USPTO = "xml_uspto"
     XML_JATS = "xml_jats"
+    METS_GBS = "mets_gbs"
     JSON_DOCLING = "json_docling"
     AUDIO = "audio"
@@ -81,6 +82,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.CSV: ["csv"],
     InputFormat.XLSX: ["xlsx", "xlsm"],
     InputFormat.XML_USPTO: ["xml", "txt"],
+    InputFormat.METS_GBS: ["tar.gz"],
     InputFormat.JSON_DOCLING: ["json"],
     InputFormat.AUDIO: ["wav", "mp3"],
 }
@@ -113,6 +115,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
     ],
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
+    InputFormat.METS_GBS: ["application/mets+xml"],
     InputFormat.JSON_DOCLING: ["application/json"],
     InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
 }

docling/datamodel/document.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import csv
 import logging
 import re
+import tarfile
 from collections.abc import Iterable
 from enum import Enum
 from io import BytesIO
@@ -314,6 +315,10 @@ class _DocumentConversionInput(BaseModel):
                 elif objname.endswith(".pptx"):
                     mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+        if mime is not None and mime.lower() == "application/gzip":
+            if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj):
+                mime = detected_mime
         mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
         mime = mime or _DocumentConversionInput._detect_csv(content)
         mime = mime or "text/plain"
@@ -457,3 +462,24 @@ class _DocumentConversionInput(BaseModel):
             return None
         return None
+    @staticmethod
+    def _detect_mets_gbs(
+        obj: Union[Path, DocumentStream],
+    ) -> Optional[Literal["application/mets+xml"]]:
+        """Return "application/mets+xml" if the gzip-compressed tar holds a METS-looking XML member.
+        Each member whose name ends with ".xml" is read and decoded (decode errors
+        ignored); the first one containing the METS namespace URI decides the
+        result. The XML is not parsed, only searched for that substring.
+        Returns None when no member matches.
+        """
+        # Either the Path itself or the stream carried by the DocumentStream.
+        content = obj if isinstance(obj, Path) else obj.stream
+        tar: tarfile.TarFile
+        member: tarfile.TarInfo
+        # NOTE(review): exceptions from tarfile.open are not caught here, and the
+        # stream position is not reset after reading -- confirm callers handle both.
+        with tarfile.open(
+            name=content if isinstance(content, Path) else None,
+            fileobj=content if isinstance(content, BytesIO) else None,
+            mode="r:gz",
+        ) as tar:
+            for member in tar.getmembers():
+                if member.name.endswith(".xml"):
+                    file = tar.extractfile(member)
+                    if file is not None:
+                        content_str = file.read().decode(errors="ignore")
+                        if "http://www.loc.gov/METS/" in content_str:
+                            return "application/mets+xml"
+        return None

docling/datamodel/pipeline_options_vlm_model.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Any, Callable, Dict, List, Literal, Optional, Union
+from typing import Any, Dict, List, Literal, Optional
 from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
@@ -10,11 +10,17 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 class BaseVlmOptions(BaseModel):
     kind: str
-    prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
+    prompt: str
     scale: float = 2.0
     max_size: Optional[int] = None
     temperature: float = 0.0
+    def build_prompt(self, page: Optional[SegmentedPage]) -> str:
+        """Return self.prompt; the page argument is not used by this implementation."""
+        return self.prompt
+    def decode_response(self, text: str) -> str:
+        """Return text unchanged."""
+        return text
 class ResponseFormat(str, Enum):
     DOCTAGS = "doctags"

docling/document_converter.py CHANGED Viewed

@@ -5,7 +5,9 @@ import threading
 import time
 from collections.abc import Iterable, Iterator
 from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
 from functools import partial
+from io import BytesIO
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Type, Union
@@ -18,6 +20,7 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.json.docling_json_backend import DoclingJSONBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
+from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
@@ -157,6 +160,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.XML_JATS: FormatOption(
             pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
         ),
+        InputFormat.METS_GBS: FormatOption(
+            pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend
+        ),
         InputFormat.IMAGE: FormatOption(
             pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
         ),
@@ -275,6 +281,34 @@ class DocumentConverter:
                 "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
             )
+    @validate_call(config=ConfigDict(strict=True))
+    def convert_string(
+        self,
+        content: str,
+        format: InputFormat,
+        name: Optional[str],
+    ) -> ConversionResult:
+        """Convert Markdown or HTML given as a string.
+        The content is UTF-8 encoded, wrapped in a DocumentStream and passed to
+        self.convert.
+        Args:
+            content: Document text.
+            format: InputFormat.MD or InputFormat.HTML.
+            name: Name for the stream; a falsy value is replaced by the current
+                local time formatted as "%Y-%m-%d_%H-%M-%S". The matching
+                extension (".md" / ".html") is appended when missing.
+        Raises:
+            ValueError: If format is neither MD nor HTML.
+        """
+        name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        if format == InputFormat.MD:
+            if not name.endswith(".md"):
+                name += ".md"
+            buff = BytesIO(content.encode("utf-8"))
+            doc_stream = DocumentStream(name=name, stream=buff)
+            return self.convert(doc_stream)
+        elif format == InputFormat.HTML:
+            if not name.endswith(".html"):
+                name += ".html"
+            buff = BytesIO(content.encode("utf-8"))
+            doc_stream = DocumentStream(name=name, stream=buff)
+            return self.convert(doc_stream)
+        else:
+            raise ValueError(f"format {format} is not supported in `convert_string`")
     def _convert(
         self, conv_input: _DocumentConversionInput, raises_on_error: bool
     ) -> Iterator[ConversionResult]:

docling/models/api_vlm_model.py CHANGED Viewed

@@ -53,11 +53,7 @@ class ApiVlmModel(BasePageModel):
                         if hi_res_image.mode != "RGB":
                             hi_res_image = hi_res_image.convert("RGB")
-                    if callable(self.vlm_options.prompt):
-                        prompt = self.vlm_options.prompt(page.parsed_page)
-                    else:
-                        prompt = self.vlm_options.prompt
+                    prompt = self.vlm_options.build_prompt(page.parsed_page)
                     page_tags = api_image_request(
                         image=hi_res_image,
                         prompt=prompt,
@@ -67,6 +63,7 @@ class ApiVlmModel(BasePageModel):
                         **self.params,
                     )
+                    page_tags = self.vlm_options.decode_response(page_tags)
                     page.predictions.vlm_response = VlmPrediction(text=page_tags)
                 return page

docling/models/vlm_models_inline/hf_transformers_model.py CHANGED Viewed

@@ -135,10 +135,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                     )
                     # Define prompt structure
-                    if callable(self.vlm_options.prompt):
-                        user_prompt = self.vlm_options.prompt(page.parsed_page)
-                    else:
-                        user_prompt = self.vlm_options.prompt
+                    user_prompt = self.vlm_options.build_prompt(page.parsed_page)
                     prompt = self.formulate_prompt(user_prompt)
                     inputs = self.processor(
@@ -166,6 +163,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                     _log.debug(
                         f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
                     )
+                    generated_texts = self.vlm_options.decode_response(generated_texts)
                     page.predictions.vlm_response = VlmPrediction(
                         text=generated_texts,
                         generation_time=generation_time,

docling/models/vlm_models_inline/mlx_model.py CHANGED Viewed

@@ -35,9 +35,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
         if self.enabled:
             try:
-                from mlx_vlm import generate, load  # type: ignore
+                from mlx_vlm import generate, load, stream_generate  # type: ignore
                 from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
-                from mlx_vlm.utils import load_config, stream_generate  # type: ignore
+                from mlx_vlm.utils import load_config  # type: ignore
             except ImportError:
                 raise ImportError(
                     "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
@@ -84,10 +84,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                         if hi_res_image.mode != "RGB":
                             hi_res_image = hi_res_image.convert("RGB")
-                    if callable(self.vlm_options.prompt):
-                        user_prompt = self.vlm_options.prompt(page.parsed_page)
-                    else:
-                        user_prompt = self.vlm_options.prompt
+                    user_prompt = self.vlm_options.build_prompt(page.parsed_page)
                     prompt = self.apply_chat_template(
                         self.processor, self.config, user_prompt, num_images=1
                     )
@@ -142,6 +139,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                     _log.debug(
                         f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)."
                     )
+                    page_tags = self.vlm_options.decode_response(page_tags)
                     page.predictions.vlm_response = VlmPrediction(
                         text=page_tags,
                         generation_time=generation_time,

docling 2.43.0__py3-none-any.whl → 2.45.0__py3-none-any.whl

docling 2.43.0__py3-none-any.whl → 2.45.0__py3-none-any.whl