docling 2.44.0__py3-none-any.whl → 2.46.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docling/backend/mets_gbs_backend.py ADDED
@@ -0,0 +1,399 @@
+ """Backend for GBS Google Books schema."""
2
+
3
+ import logging
4
+ import tarfile
5
+ from collections.abc import Iterable
6
+ from dataclasses import dataclass
7
+ from enum import Enum
8
+ from io import BytesIO
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
11
+
12
+ from docling_core.types.doc import BoundingBox, CoordOrigin, Size
13
+ from docling_core.types.doc.page import (
14
+ BoundingRectangle,
15
+ PdfPageBoundaryType,
16
+ PdfPageGeometry,
17
+ SegmentedPdfPage,
18
+ TextCell,
19
+ )
20
+ from lxml import etree
21
+ from PIL import Image
22
+ from PIL.Image import Image as PILImage
23
+
24
+ from docling.backend.abstract_backend import PaginatedDocumentBackend
25
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
26
+ from docling.datamodel.base_models import InputFormat
27
+
28
+ if TYPE_CHECKING:
29
+ from docling.datamodel.document import InputDocument
30
+
31
+ _log = logging.getLogger(__name__)
32
+
33
+
34
+ def _get_pdf_page_geometry(
35
+ size: Size,
36
+ ) -> PdfPageGeometry:
37
+ boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX
38
+
39
+ bbox_tuple = (0, 0, size.width, size.height)
40
+ bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.TOPLEFT)
41
+
42
+ return PdfPageGeometry(
43
+ angle=0.0,
44
+ rect=BoundingRectangle.from_bounding_box(bbox),
45
+ boundary_type=boundary_type,
46
+ art_bbox=bbox,
47
+ bleed_bbox=bbox,
48
+ crop_bbox=bbox,
49
+ media_bbox=bbox,
50
+ trim_bbox=bbox,
51
+ )
52
+
53
+
54
+class MetsGbsPageBackend(PdfPageBackend):
+    def __init__(self, parsed_page: SegmentedPdfPage, page_im: PILImage):
+        self._im = page_im
+        self._dpage = parsed_page
+        self.valid = parsed_page is not None
+
+    def is_valid(self) -> bool:
+        return self.valid
+
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        # Find intersecting cells on the page
+        text_piece = ""
+        page_size = self.get_size()
+
+        scale = (
+            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
+        )
+
+        for i, cell in enumerate(self._dpage.textline_cells):
+            cell_bbox = (
+                cell.rect.to_bounding_box()
+                .to_top_left_origin(page_height=page_size.height)
+                .scaled(scale)
+            )
+
+            overlap_frac = cell_bbox.intersection_over_self(bbox)
+
+            if overlap_frac > 0.5:
+                if len(text_piece) > 0:
+                    text_piece += " "
+                text_piece += cell.text
+
+        return text_piece
+
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        return self._dpage
+
+    def get_text_cells(self) -> Iterable[TextCell]:
+        return self._dpage.textline_cells
+
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        AREA_THRESHOLD = 0  # 32 * 32
+
+        images = self._dpage.bitmap_resources
+
+        for img in images:
+            cropbox = img.rect.to_bounding_box().to_top_left_origin(
+                self.get_size().height
+            )
+
+            if cropbox.area() > AREA_THRESHOLD:
+                cropbox = cropbox.scaled(scale=scale)
+
+                yield cropbox
+
+    def get_page_image(
+        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
+    ) -> Image.Image:
+        page_size = self.get_size()
+        assert (
+            page_size.width == self._im.size[0] and page_size.height == self._im.size[1]
+        )
+
+        if not cropbox:
+            cropbox = BoundingBox(
+                l=0,
+                r=page_size.width,
+                t=0,
+                b=page_size.height,
+                coord_origin=CoordOrigin.TOPLEFT,
+            )
+
+        image = self._im.resize(
+            size=(round(page_size.width * scale), round(page_size.height * scale))
+        ).crop(cropbox.scaled(scale=scale).as_tuple())
+        return image
+
+    def get_size(self) -> Size:
+        return Size(
+            width=self._dpage.dimension.width, height=self._dpage.dimension.height
+        )
+
+    def unload(self) -> None:
+        if hasattr(self, "_im"):
+            delattr(self, "_im")
+        if hasattr(self, "_dpage"):
+            delattr(self, "_dpage")
+
+
+class _UseType(str, Enum):
+    IMAGE = "image"
+    OCR = "OCR"
+    COORD_OCR = "coordOCR"
+
+
+@dataclass
+class _FileInfo:
+    file_id: str
+    mimetype: str
+    path: str
+    use: _UseType
+
+
+@dataclass
+class _PageFiles:
+    image: Optional[_FileInfo] = None
+    ocr: Optional[_FileInfo] = None
+    coordOCR: Optional[_FileInfo] = None
+
+
+def _extract_rect(title_str: str) -> Optional[BoundingRectangle]:
+    """
+    Extracts bbox from title string like 'bbox 279 177 306 214;x_wconf 97'
+    """
+    parts = title_str.split(";")
+    for part in parts:
+        part = part.strip()
+        if part.startswith("bbox "):
+            try:
+                coords = part.split()[1:]
+                rect = BoundingRectangle.from_bounding_box(
+                    bbox=BoundingBox.from_tuple(
+                        tuple(map(int, coords)), origin=CoordOrigin.TOPLEFT
+                    )
+                )
+                return rect
+            except Exception:
+                return None
+    return None
+
+
+def _extract_confidence(title_str) -> float:
+    """Extracts x_wconf (OCR confidence) value from title string."""
+    for part in title_str.split(";"):
+        part = part.strip()
+        if part.startswith("x_wconf"):
+            try:
+                return float(part.split()[1]) / 100.0
+            except Exception:
+                return 1
+    return 1
+
+
+class MetsGbsDocumentBackend(PdfDocumentBackend):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+
+        self._tar: tarfile.TarFile = (
+            tarfile.open(name=self.path_or_stream, mode="r:gz")
+            if isinstance(self.path_or_stream, Path)
+            else tarfile.open(fileobj=self.path_or_stream, mode="r:gz")
+        )
+        self.root_mets: Optional[etree._Element] = None
+        self.page_map: Dict[int, _PageFiles] = {}
+
+        for member in self._tar.getmembers():
+            if member.name.endswith(".xml"):
+                file = self._tar.extractfile(member)
+                if file is not None:
+                    content = file.read()
+                    self.root_mets = self._validate_mets_xml(content)
+                    if self.root_mets is not None:
+                        break
+
+        if self.root_mets is None:
+            raise RuntimeError(
+                f"METS GBS backend could not load document {self.document_hash}."
+            )
+
+        ns = {
+            "mets": "http://www.loc.gov/METS/",
+            "xlink": "http://www.w3.org/1999/xlink",
+            "xsi": "http://www.w3.org/2001/XMLSchema-instance",
+            "gbs": "http://books.google.com/gbs",
+            "premis": "info:lc/xmlns/premis-v2",
+            "marc": "http://www.loc.gov/MARC21/slim",
+        }
+
+        file_info_by_id: Dict[str, _FileInfo] = {}
+
+        for filegrp in self.root_mets.xpath(".//mets:fileGrp", namespaces=ns):
+            use_raw = filegrp.get("USE")
+            try:
+                use = _UseType(use_raw)
+            except ValueError:
+                continue  # Ignore unknown USE types
+
+            for file_elem in filegrp.xpath("./mets:file", namespaces=ns):
+                file_id = file_elem.get("ID")
+                mimetype = file_elem.get("MIMETYPE")
+                flocat_elem = file_elem.find("mets:FLocat", namespaces=ns)
+                href = (
+                    flocat_elem.get("{http://www.w3.org/1999/xlink}href")
+                    if flocat_elem is not None
+                    else None
+                )
+                if href is None:
+                    continue
+
+                file_info_by_id[file_id] = _FileInfo(
+                    file_id=file_id, mimetype=mimetype, path=href, use=use
+                )
+
+        USE_TO_ATTR = {
+            _UseType.IMAGE: "image",
+            _UseType.OCR: "ocr",
+            _UseType.COORD_OCR: "coordOCR",
+        }
+
+        for div in self.root_mets.xpath('.//mets:div[@TYPE="page"]', namespaces=ns):
+            order_str = div.get("ORDER")
+            if not order_str:
+                continue
+            try:
+                page_no = int(order_str) - 1  # make 0-index pages
+            except ValueError:
+                continue
+
+            page_files = _PageFiles()
+
+            for fptr in div.xpath("./mets:fptr", namespaces=ns):
+                file_id = fptr.get("FILEID")
+                file_info = file_info_by_id.get(file_id)
+
+                if file_info:
+                    attr = USE_TO_ATTR.get(file_info.use)
+                    if attr:
+                        setattr(page_files, attr, file_info)
+
+            self.page_map[page_no] = page_files
+
+    def _validate_mets_xml(self, xml_string) -> Optional[etree._Element]:
+        root: etree._Element = etree.fromstring(xml_string)
+        if (
+            root.tag == "{http://www.loc.gov/METS/}mets"
+            and root.get("PROFILE") == "gbs"
+        ):
+            return root
+
+        _log.warning(f"The root element is not <mets:mets> with PROFILE='gbs': {root}")
+        return None
+
+    def _parse_page(self, page_no: int) -> Tuple[SegmentedPdfPage, PILImage]:
+        # TODO: use better fallbacks...
+        image_info = self.page_map[page_no].image
+        assert image_info is not None
+        ocr_info = self.page_map[page_no].coordOCR
+        assert ocr_info is not None
+
+        image_file = self._tar.extractfile(image_info.path)
+        assert image_file is not None
+        buf = BytesIO(image_file.read())
+        im: PILImage = Image.open(buf)
+        ocr_file = self._tar.extractfile(ocr_info.path)
+        assert ocr_file is not None
+        ocr_content = ocr_file.read()
+        parser = etree.HTMLParser()
+        ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser)
+
+        line_cells: List[TextCell] = []
+        word_cells: List[TextCell] = []
+
+        page_div = ocr_root.xpath("//div[@class='ocr_page']")
+
+        size = Size(width=im.size[0], height=im.size[1])
+        if page_div:
+            title = page_div[0].attrib.get("title", "")
+            rect = _extract_rect(title)
+            if rect:
+                size = Size(width=rect.width, height=rect.height)
+        else:
+            _log.error(f"Could not find ocr_page for page {page_no}")
+
+        im = im.resize(size=(round(size.width), round(size.height)))
+        im = im.convert("RGB")
+
+        # Extract all ocrx_word spans
+        for ix, word in enumerate(ocr_root.xpath("//span[@class='ocrx_word']")):
+            text = "".join(word.itertext()).strip()
+            title = word.attrib.get("title", "")
+            rect = _extract_rect(title)
+            conf = _extract_confidence(title)
+            if rect:
+                word_cells.append(
+                    TextCell(
+                        index=ix,
+                        text=text,
+                        orig=text,
+                        rect=rect,
+                        from_ocr=True,
+                        confidence=conf,
+                    )
+                )
+
+        # Extract all ocr_line spans
+        # line: etree._Element
+        for ix, line in enumerate(ocr_root.xpath("//span[@class='ocr_line']")):
+            text = "".join(line.itertext()).strip()
+            title = line.attrib.get("title", "")
+            rect = _extract_rect(title)
+            conf = _extract_confidence(title)
+            if rect:
+                line_cells.append(
+                    TextCell(
+                        index=ix,
+                        text=text,
+                        orig=text,
+                        rect=rect,
+                        from_ocr=True,
+                        confidence=conf,
+                    )
+                )
+
+        page = SegmentedPdfPage(
+            dimension=_get_pdf_page_geometry(size),
+            textline_cells=line_cells,
+            char_cells=[],
+            word_cells=word_cells,
+            has_textlines=True,
+            has_words=True,
+            has_chars=False,
+        )
+        return page, im
+
+    def page_count(self) -> int:
+        return len(self.page_map)
+
+    def load_page(self, page_no: int) -> MetsGbsPageBackend:
+        # TODO: is this thread-safe?
+        page, im = self._parse_page(page_no)
+        return MetsGbsPageBackend(parsed_page=page, page_im=im)
+
+    def is_valid(self) -> bool:
+        return self.root_mets is not None and self.page_count() > 0
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return {InputFormat.METS_GBS}
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return True
+
+    def unload(self) -> None:
+        super().unload()
+        self._tar.close()
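
The new backend plugs into docling's standard converter API. Below is a minimal usage sketch (not part of the diff) based on the format options registered later in this diff; the archive path sample_gbs.tar.gz is a placeholder.

```python
# Minimal sketch, not part of the diff: convert a Google Books METS/GBS
# tarball (METS XML + hOCR + page images) with the new backend.
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption

converter = DocumentConverter(
    format_options={
        # Explicit mapping; _get_default_option (see the document_converter.py
        # hunk below) registers the same backend as the default for METS_GBS.
        InputFormat.METS_GBS: PdfFormatOption(backend=MetsGbsDocumentBackend),
    }
)

result = converter.convert("sample_gbs.tar.gz")  # placeholder path
print(result.document.export_to_markdown())
```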
docling/backend/pdf_backend.py CHANGED
@@ -84,9 +84,9 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
 
             buf.seek(0)
             self.path_or_stream = buf
-        else:
+        elif self.input_format not in self.supported_formats():
             raise RuntimeError(
-                f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
+                f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
             )
 
     @abstractmethod
@@ -99,7 +99,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
 
     @classmethod
     def supported_formats(cls) -> Set[InputFormat]:
-        return {InputFormat.PDF}
+        return {InputFormat.PDF, InputFormat.IMAGE}
 
     @classmethod
     def supports_pagination(cls) -> bool:
docling/cli/main.py CHANGED
@@ -26,6 +26,7 @@ from rich.console import Console
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
@@ -607,9 +608,18 @@ def convert(  # noqa: C901
             backend=backend,  # pdf_backend
         )
 
+        # METS GBS options
+        mets_gbs_options = pipeline_options.model_copy()
+        mets_gbs_options.do_ocr = False
+        mets_gbs_format_option = PdfFormatOption(
+            pipeline_options=mets_gbs_options,
+            backend=MetsGbsDocumentBackend,
+        )
+
         format_options = {
             InputFormat.PDF: pdf_format_option,
             InputFormat.IMAGE: pdf_format_option,
+            InputFormat.METS_GBS: mets_gbs_format_option,
         }
 
     elif pipeline == ProcessingPipeline.VLM:
docling/datamodel/base_models.py CHANGED
@@ -56,6 +56,7 @@ class InputFormat(str, Enum):
     XLSX = "xlsx"
     XML_USPTO = "xml_uspto"
     XML_JATS = "xml_jats"
+    METS_GBS = "mets_gbs"
     JSON_DOCLING = "json_docling"
     AUDIO = "audio"
 
@@ -81,6 +82,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.CSV: ["csv"],
     InputFormat.XLSX: ["xlsx", "xlsm"],
     InputFormat.XML_USPTO: ["xml", "txt"],
+    InputFormat.METS_GBS: ["tar.gz"],
     InputFormat.JSON_DOCLING: ["json"],
     InputFormat.AUDIO: ["wav", "mp3"],
 }
@@ -113,6 +115,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
     ],
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
+    InputFormat.METS_GBS: ["application/mets+xml"],
     InputFormat.JSON_DOCLING: ["application/json"],
     InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
 }
docling/datamodel/document.py CHANGED
@@ -1,6 +1,7 @@
 import csv
 import logging
 import re
+import tarfile
 from collections.abc import Iterable
 from enum import Enum
 from io import BytesIO
@@ -314,6 +315,10 @@ class _DocumentConversionInput(BaseModel):
         elif objname.endswith(".pptx"):
             mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
 
+        if mime is not None and mime.lower() == "application/gzip":
+            if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj):
+                mime = detected_mime
+
         mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
         mime = mime or _DocumentConversionInput._detect_csv(content)
         mime = mime or "text/plain"
@@ -457,3 +462,24 @@ class _DocumentConversionInput(BaseModel):
             return None
 
         return None
+
+    @staticmethod
+    def _detect_mets_gbs(
+        obj: Union[Path, DocumentStream],
+    ) -> Optional[Literal["application/mets+xml"]]:
+        content = obj if isinstance(obj, Path) else obj.stream
+        tar: tarfile.TarFile
+        member: tarfile.TarInfo
+        with tarfile.open(
+            name=content if isinstance(content, Path) else None,
+            fileobj=content if isinstance(content, BytesIO) else None,
+            mode="r:gz",
+        ) as tar:
+            for member in tar.getmembers():
+                if member.name.endswith(".xml"):
+                    file = tar.extractfile(member)
+                    if file is not None:
+                        content_str = file.read().decode(errors="ignore")
+                        if "http://www.loc.gov/METS/" in content_str:
+                            return "application/mets+xml"
+        return None
docling/datamodel/pipeline_options.py CHANGED
@@ -323,9 +323,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
         ),
     )
 
-    generate_parsed_pages: Literal[True] = (
-        True  # Always True since parsed_page is now mandatory
-    )
+    generate_parsed_pages: bool = False
 
 
 class ProcessingPipeline(str, Enum):
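
Since generate_parsed_pages is now an ordinary boolean defaulting to False, callers that still need the parsed pages kept on the result have to opt in explicitly. A brief sketch (an assumption based on the option shown above, not part of the diff):

```python
# Sketch: opt back in to parsed-page generation now that the default is False.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions(generate_parsed_pages=True)
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
```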
docling/datamodel/pipeline_options_vlm_model.py CHANGED
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Any, Callable, Dict, List, Literal, Optional, Union
+from typing import Any, Dict, List, Literal, Optional
 
 from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
@@ -10,11 +10,17 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 
 class BaseVlmOptions(BaseModel):
     kind: str
-    prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
+    prompt: str
     scale: float = 2.0
     max_size: Optional[int] = None
     temperature: float = 0.0
 
+    def build_prompt(self, page: Optional[SegmentedPage]) -> str:
+        return self.prompt
+
+    def decode_response(self, text: str) -> str:
+        return text
+
 
 class ResponseFormat(str, Enum):
     DOCTAGS = "doctags"
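
With prompt reduced to a plain string, dynamic prompting and output post-processing move into the new overridable hooks. A hedged sketch of a custom options class (MyVlmOptions and its behavior are hypothetical, not part of the diff):

```python
# Sketch, not from the diff: customizing the new BaseVlmOptions hooks.
from typing import Optional

from docling_core.types.doc.page import SegmentedPage

from docling.datamodel.pipeline_options_vlm_model import BaseVlmOptions


class MyVlmOptions(BaseVlmOptions):  # hypothetical subclass
    kind: str = "my_vlm_options"

    def build_prompt(self, page: Optional[SegmentedPage]) -> str:
        # Augment the static prompt when a parsed page is available.
        note = " A parsed page layout is available." if page is not None else ""
        return self.prompt + note

    def decode_response(self, text: str) -> str:
        # Trim whitespace and a hypothetical "Answer:" prefix from the reply.
        return text.strip().removeprefix("Answer:").strip()


options = MyVlmOptions(prompt="Convert this page to Markdown.")
```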
docling/document_converter.py CHANGED
@@ -20,6 +20,7 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.json.docling_json_backend import DoclingJSONBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
+from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
@@ -159,6 +160,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.XML_JATS: FormatOption(
             pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
         ),
+        InputFormat.METS_GBS: FormatOption(
+            pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend
+        ),
         InputFormat.IMAGE: FormatOption(
             pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
         ),
docling/models/api_vlm_model.py CHANGED
@@ -53,11 +53,7 @@ class ApiVlmModel(BasePageModel):
                    if hi_res_image.mode != "RGB":
                        hi_res_image = hi_res_image.convert("RGB")
 
-                   if callable(self.vlm_options.prompt):
-                       prompt = self.vlm_options.prompt(page.parsed_page)
-                   else:
-                       prompt = self.vlm_options.prompt
-
+                   prompt = self.vlm_options.build_prompt(page.parsed_page)
                    page_tags = api_image_request(
                        image=hi_res_image,
                        prompt=prompt,
@@ -67,6 +63,7 @@
                        **self.params,
                    )
 
+                   page_tags = self.vlm_options.decode_response(page_tags)
                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
 
            return page