PyPI - docling - Versions diffs - 2.44.0__py3-none-any.whl → 2.45.0__py3-none-any.whl - Mend

docling 2.44.0-py3-none-any.whl → 2.45.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

docling/backend/html_backend.py +349 -77
docling/backend/mets_gbs_backend.py +399 -0
docling/backend/pdf_backend.py +3 -3
docling/cli/main.py +10 -0
docling/datamodel/base_models.py +3 -0
docling/datamodel/document.py +26 -0
docling/datamodel/pipeline_options_vlm_model.py +8 -2
docling/document_converter.py +4 -0
docling/models/api_vlm_model.py +2 -5
docling/models/vlm_models_inline/hf_transformers_model.py +2 -4
docling/models/vlm_models_inline/mlx_model.py +2 -4
docling/pipeline/base_pipeline.py +7 -4
{docling-2.44.0.dist-info → docling-2.45.0.dist-info}/METADATA +1 -1
{docling-2.44.0.dist-info → docling-2.45.0.dist-info}/RECORD +18 -17
{docling-2.44.0.dist-info → docling-2.45.0.dist-info}/WHEEL +0 -0
{docling-2.44.0.dist-info → docling-2.45.0.dist-info}/entry_points.txt +0 -0
{docling-2.44.0.dist-info → docling-2.45.0.dist-info}/licenses/LICENSE +0 -0
{docling-2.44.0.dist-info → docling-2.45.0.dist-info}/top_level.txt +0 -0

docling/backend/html_backend.py CHANGED Viewed

@@ -1,8 +1,11 @@
 import logging
 import re
+from contextlib import contextmanager
+from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
 from typing import Final, Optional, Union, cast
+from urllib.parse import urljoin
 from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
 from bs4.element import PreformattedString
@@ -18,7 +21,7 @@ from docling_core.types.doc import (
     TextItem,
 )
 from docling_core.types.doc.document import ContentLayer
-from pydantic import BaseModel
+from pydantic import AnyUrl, BaseModel, ValidationError
 from typing_extensions import override
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -56,12 +59,76 @@ class _Context(BaseModel):
     list_start_by_ref: dict[str, int] = {}
+class AnnotatedText(BaseModel):
+    text: str
+    hyperlink: Union[AnyUrl, Path, None] = None
+class AnnotatedTextList(list):
+    def to_single_text_element(self) -> AnnotatedText:
+        current_h = None
+        current_text = ""
+        for at in self:
+            t = at.text
+            h = at.hyperlink
+            current_text += t.strip() + " "
+            if h is not None and current_h is None:
+                current_h = h
+            elif h is not None and current_h is not None and h != current_h:
+                _log.warning(
+                    f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
+                )
+        return AnnotatedText(text=current_text.strip(), hyperlink=current_h)
+    def simplify_text_elements(self) -> "AnnotatedTextList":
+        simplified = AnnotatedTextList()
+        if not self:
+            return self
+        text = self[0].text
+        hyperlink = self[0].hyperlink
+        last_elm = text
+        for i in range(1, len(self)):
+            if hyperlink == self[i].hyperlink:
+                sep = " "
+                if not self[i].text.strip() or not last_elm.strip():
+                    sep = ""
+                text += sep + self[i].text
+                last_elm = self[i].text
+            else:
+                simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
+                text = self[i].text
+                last_elm = text
+                hyperlink = self[i].hyperlink
+        if text:
+            simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
+        return simplified
+    def split_by_newline(self):
+        super_list = []
+        active_annotated_text_list = AnnotatedTextList()
+        for el in self:
+            sub_texts = el.text.split("\n")
+            if len(sub_texts) == 1:
+                active_annotated_text_list.append(el)
+            else:
+                for text in sub_texts:
+                    sub_el = deepcopy(el)
+                    sub_el.text = text
+                    active_annotated_text_list.append(sub_el)
+                    super_list.append(active_annotated_text_list)
+                    active_annotated_text_list = AnnotatedTextList()
+        if active_annotated_text_list:
+            super_list.append(active_annotated_text_list)
+        return super_list
 class HTMLDocumentBackend(DeclarativeDocumentBackend):
     @override
     def __init__(
         self,
         in_doc: InputDocument,
         path_or_stream: Union[BytesIO, Path],
+        original_url: Optional[AnyUrl] = None,
     ):
         super().__init__(in_doc, path_or_stream)
         self.soup: Optional[Tag] = None
@@ -74,6 +141,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.ctx = _Context()
         for i in range(self.max_levels):
             self.parents[i] = None
+        self.hyperlink = None
+        self.original_url = original_url
         try:
             raw = (
@@ -160,26 +229,32 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             element: The XML tag to parse.
             doc: The Docling document to be updated with the parsed content.
         """
-        buffer: list[str] = []
+        buffer: AnnotatedTextList = AnnotatedTextList()
         def flush_buffer():
             if not buffer:
                 return
-            text = "".join(buffer).strip()
+            annotated_text_list = buffer.simplify_text_elements()
+            parts = annotated_text_list.split_by_newline()
             buffer.clear()
-            if not text:
+            if not "".join([el.text for el in annotated_text_list]):
                 return
-            for part in text.split("\n"):
-                seg = part.strip()
-                seg_clean = HTMLDocumentBackend._clean_unicode(seg)
-                if seg:
-                    doc.add_text(
-                        label=DocItemLabel.TEXT,
-                        text=seg_clean,
-                        orig=seg,
-                        parent=self.parents[self.level],
-                        content_layer=self.content_layer,
-                    )
+            for annotated_text_list in parts:
+                with self.use_inline_group(annotated_text_list, doc):
+                    for annotated_text in annotated_text_list:
+                        if annotated_text.text.strip():
+                            seg_clean = HTMLDocumentBackend._clean_unicode(
+                                annotated_text.text.strip()
+                            )
+                            doc.add_text(
+                                parent=self.parents[self.level],
+                                label=DocItemLabel.TEXT,
+                                text=seg_clean,
+                                content_layer=self.content_layer,
+                                hyperlink=annotated_text.hyperlink,
+                            )
         for node in element.contents:
             if isinstance(node, Tag):
@@ -187,6 +262,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 if name == "img":
                     flush_buffer()
                     self._emit_image(node, doc)
+                elif name == "a":
+                    with self.use_hyperlink(node):
+                        self._walk(node, doc)
                 elif name in _BLOCK_TAGS:
                     flush_buffer()
                     self._handle_block(node, doc)
@@ -194,28 +272,154 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     flush_buffer()
                     self._walk(node, doc)
                 else:
-                    buffer.append(node.text)
+                    buffer.extend(
+                        self._extract_text_and_hyperlink_recursively(
+                            node, find_parent_annotation=True, keep_newlines=True
+                        )
+                    )
             elif isinstance(node, NavigableString) and not isinstance(
                 node, PreformattedString
             ):
-                buffer.append(str(node))
+                if str(node).strip("\n\r") == "":
+                    flush_buffer()
+                else:
+                    buffer.extend(
+                        self._extract_text_and_hyperlink_recursively(
+                            node, find_parent_annotation=True, keep_newlines=True
+                        )
+                    )
         flush_buffer()
+    def _extract_text_and_hyperlink_recursively(
+        self,
+        item: PageElement,
+        ignore_list=False,
+        find_parent_annotation=False,
+        keep_newlines=False,
+    ) -> AnnotatedTextList:
+        result: AnnotatedTextList = AnnotatedTextList()
+        # If find_parent_annotation, make sure that we keep track of
+        # any a-tag that has been present in the DOM-parents already.
+        if find_parent_annotation:
+            this_parent = item.parent
+            while this_parent is not None:
+                if this_parent.name == "a" and this_parent.get("href"):
+                    with self.use_hyperlink(this_parent):
+                        return self._extract_text_and_hyperlink_recursively(
+                            item, ignore_list
+                        )
+                this_parent = this_parent.parent
+        if isinstance(item, PreformattedString):
+            return AnnotatedTextList()
+        if isinstance(item, NavigableString):
+            text = item.strip()
+            if text:
+                return AnnotatedTextList(
+                    [AnnotatedText(text=text, hyperlink=self.hyperlink)]
+                )
+            if keep_newlines and item.strip("\n\r") == "":
+                return AnnotatedTextList(
+                    [AnnotatedText(text="\n", hyperlink=self.hyperlink)]
+                )
+            return AnnotatedTextList()
+        tag = cast(Tag, item)
+        if not ignore_list or (tag.name not in ["ul", "ol"]):
+            for child in tag:
+                if isinstance(child, Tag) and child.name == "a":
+                    with self.use_hyperlink(child):
+                        result.extend(
+                            self._extract_text_and_hyperlink_recursively(
+                                child, ignore_list, keep_newlines=keep_newlines
+                            )
+                        )
+                else:
+                    # Recursively get the child's text content
+                    result.extend(
+                        self._extract_text_and_hyperlink_recursively(
+                            child, ignore_list, keep_newlines=keep_newlines
+                        )
+                    )
+        return result
+    @contextmanager
+    def use_hyperlink(self, tag):
+        this_href = tag.get("href")
+        if this_href is None:
+            yield None
+        else:
+            if this_href:
+                old_hyperlink = self.hyperlink
+                if self.original_url is not None:
+                    this_href = urljoin(self.original_url, this_href)
+                # ugly fix for relative links since pydantic does not support them.
+                try:
+                    AnyUrl(this_href)
+                except ValidationError:
+                    this_href = Path(this_href)
+                self.hyperlink = this_href
+            try:
+                yield None
+            finally:
+                if this_href:
+                    self.hyperlink = old_hyperlink
+    @contextmanager
+    def use_inline_group(
+        self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
+    ):
+        """Create an inline group for annotated texts.
+        Checks if annotated_text_list has more than one item and if so creates an inline
+        group in which the text elements can then be generated. While the context manager
+        is active the inline group is set as the current parent.
+        Args:
+            annotated_text_list (AnnotatedTextList): Annotated text
+            doc (DoclingDocument): Currently used document
+        Yields:
+            None: _description_
+        """
+        if len(annotated_text_list) > 1:
+            inline_fmt = doc.add_group(
+                label=GroupLabel.INLINE,
+                parent=self.parents[self.level],
+                content_layer=self.content_layer,
+            )
+            self.parents[self.level + 1] = inline_fmt
+            self.level += 1
+            try:
+                yield None
+            finally:
+                self.parents[self.level] = None
+                self.level -= 1
+        else:
+            yield None
     def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
         tag_name = tag.name.lower()
         # set default content layer to BODY as soon as we encounter a heading
         self.content_layer = ContentLayer.BODY
         level = int(tag_name[1])
-        text = tag.get_text(strip=True, separator=" ")
-        text_clean = HTMLDocumentBackend._clean_unicode(text)
+        annotated_text_list = self._extract_text_and_hyperlink_recursively(
+            tag, find_parent_annotation=True
+        )
+        annotated_text = annotated_text_list.to_single_text_element()
+        text_clean = HTMLDocumentBackend._clean_unicode(annotated_text.text)
         # the first level is for the title item
         if level == 1:
             for key in self.parents.keys():
                 self.parents[key] = None
             self.level = 0
             self.parents[self.level + 1] = doc.add_title(
-                text=text_clean, orig=text, content_layer=self.content_layer
+                text_clean,
+                content_layer=self.content_layer,
+                hyperlink=annotated_text.hyperlink,
             )
         # the other levels need to be lowered by 1 if a title was set
         else:
@@ -241,9 +445,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self.parents[self.level + 1] = doc.add_heading(
                 parent=self.parents[self.level],
                 text=text_clean,
-                orig=text,
+                orig=annotated_text.text,
                 level=self.level,
                 content_layer=self.content_layer,
+                hyperlink=annotated_text.hyperlink,
             )
         self.level += 1
         for img_tag in tag("img"):
@@ -292,37 +497,69 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     marker = ""
                 # 2) extract only the "direct" text from this <li>
-                parts: list[str] = []
-                for child in li.contents:
-                    if isinstance(child, NavigableString) and not isinstance(
-                        child, PreformattedString
-                    ):
-                        parts.append(child)
-                    elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
-                        text_part = HTMLDocumentBackend.get_text(child)
-                        if text_part:
-                            parts.append(text_part)
-                li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
-                li_clean = HTMLDocumentBackend._clean_unicode(li_text)
+                parts = self._extract_text_and_hyperlink_recursively(
+                    li, ignore_list=True, find_parent_annotation=True
+                )
+                min_parts = parts.simplify_text_elements()
+                li_text = re.sub(
+                    r"\s+|\n+", " ", "".join([el.text for el in min_parts])
+                ).strip()
                 # 3) add the list item
                 if li_text:
-                    self.parents[self.level + 1] = doc.add_list_item(
-                        text=li_clean,
-                        enumerated=is_ordered,
-                        marker=marker,
-                        orig=li_text,
-                        parent=list_group,
-                        content_layer=self.content_layer,
-                    )
-                    # 4) recurse into any nested lists, attaching them to this <li> item
-                    for sublist in li({"ul", "ol"}, recursive=False):
-                        if isinstance(sublist, Tag):
-                            self.level += 1
-                            self._handle_block(sublist, doc)
-                            self.parents[self.level + 1] = None
-                            self.level -= 1
+                    if len(min_parts) > 1:
+                        # create an empty list element in order to hook the inline group onto that one
+                        self.parents[self.level + 1] = doc.add_list_item(
+                            text="",
+                            enumerated=is_ordered,
+                            marker=marker,
+                            parent=list_group,
+                            content_layer=self.content_layer,
+                        )
+                        self.level += 1
+                        with self.use_inline_group(min_parts, doc):
+                            for annotated_text in min_parts:
+                                li_text = re.sub(
+                                    r"\s+|\n+", " ", annotated_text.text
+                                ).strip()
+                                li_clean = HTMLDocumentBackend._clean_unicode(li_text)
+                                doc.add_text(
+                                    parent=self.parents[self.level],
+                                    label=DocItemLabel.TEXT,
+                                    text=li_clean,
+                                    content_layer=self.content_layer,
+                                    hyperlink=annotated_text.hyperlink,
+                                )
+                        # 4) recurse into any nested lists, attaching them to this <li> item
+                        for sublist in li({"ul", "ol"}, recursive=False):
+                            if isinstance(sublist, Tag):
+                                self._handle_block(sublist, doc)
+                        # now the list element with inline group is not a parent anymore
+                        self.parents[self.level] = None
+                        self.level -= 1
+                    else:
+                        annotated_text = min_parts[0]
+                        li_text = re.sub(r"\s+|\n+", " ", annotated_text.text).strip()
+                        li_clean = HTMLDocumentBackend._clean_unicode(li_text)
+                        self.parents[self.level + 1] = doc.add_list_item(
+                            text=li_clean,
+                            enumerated=is_ordered,
+                            marker=marker,
+                            orig=li_text,
+                            parent=list_group,
+                            content_layer=self.content_layer,
+                            hyperlink=annotated_text.hyperlink,
+                        )
+                        # 4) recurse into any nested lists, attaching them to this <li> item
+                        for sublist in li({"ul", "ol"}, recursive=False):
+                            if isinstance(sublist, Tag):
+                                self.level += 1
+                                self._handle_block(sublist, doc)
+                                self.parents[self.level + 1] = None
+                                self.level -= 1
                 else:
                     for sublist in li({"ul", "ol"}, recursive=False):
                         if isinstance(sublist, Tag):
@@ -351,17 +588,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self._handle_list(tag, doc)
         elif tag_name in {"p", "address", "summary"}:
-            for part in tag.text.split("\n"):
-                seg = part.strip()
-                seg_clean = HTMLDocumentBackend._clean_unicode(seg)
-                if seg:
-                    doc.add_text(
-                        label=DocItemLabel.TEXT,
-                        text=seg_clean,
-                        orig=seg,
-                        parent=self.parents[self.level],
-                        content_layer=self.content_layer,
-                    )
+            text_list = self._extract_text_and_hyperlink_recursively(
+                tag, find_parent_annotation=True
+            )
+            annotated_texts = text_list.simplify_text_elements()
+            for part in annotated_texts.split_by_newline():
+                with self.use_inline_group(part, doc):
+                    for annotated_text in part:
+                        if seg := annotated_text.text.strip():
+                            seg_clean = HTMLDocumentBackend._clean_unicode(seg)
+                            doc.add_text(
+                                parent=self.parents[self.level],
+                                label=DocItemLabel.TEXT,
+                                text=seg_clean,
+                                content_layer=self.content_layer,
+                                hyperlink=annotated_text.hyperlink,
+                            )
             for img_tag in tag("img"):
                 if isinstance(img_tag, Tag):
                     self._emit_image(img_tag, doc)
@@ -380,15 +623,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         elif tag_name in {"pre", "code"}:
             # handle monospace code snippets (pre).
-            text = tag.get_text(strip=True)
-            text_clean = HTMLDocumentBackend._clean_unicode(text)
-            if text:
-                doc.add_code(
-                    parent=self.parents[self.level],
-                    text=text_clean,
-                    orig=text,
-                    content_layer=self.content_layer,
-                )
+            text_list = self._extract_text_and_hyperlink_recursively(
+                tag, find_parent_annotation=True
+            )
+            annotated_texts = text_list.simplify_text_elements()
+            with self.use_inline_group(annotated_texts, doc):
+                for annotated_text in annotated_texts:
+                    text_clean = HTMLDocumentBackend._clean_unicode(
+                        annotated_text.text.strip()
+                    )
+                    doc.add_code(
+                        parent=self.parents[self.level],
+                        text=text_clean,
+                        content_layer=self.content_layer,
+                        hyperlink=annotated_text.hyperlink,
+                    )
         elif tag_name == "details":
             # handle details and its content.
@@ -405,22 +654,45 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
     def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
         figure = img_tag.find_parent("figure")
-        caption: str = ""
+        caption: AnnotatedTextList = AnnotatedTextList()
+        # check if the figure has a link - this is HACK:
+        def get_img_hyperlink(img_tag):
+            this_parent = img_tag.parent
+            while this_parent is not None:
+                if this_parent.name == "a" and this_parent.get("href"):
+                    return this_parent.get("href")
+                this_parent = this_parent.parent
+            return None
+        if img_hyperlink := get_img_hyperlink(img_tag):
+            caption.append(
+                AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
+            )
         if isinstance(figure, Tag):
             caption_tag = figure.find("figcaption", recursive=False)
             if isinstance(caption_tag, Tag):
-                caption = caption_tag.get_text()
-        if not caption:
-            caption = str(img_tag.get("alt", "")).strip()
+                caption = self._extract_text_and_hyperlink_recursively(
+                    caption_tag, find_parent_annotation=True
+                )
+        if not caption and img_tag.get("alt"):
+            caption = AnnotatedTextList([AnnotatedText(text=img_tag.get("alt"))])
+        caption_anno_text = caption.to_single_text_element()
         caption_item: Optional[TextItem] = None
-        if caption:
-            caption_clean = HTMLDocumentBackend._clean_unicode(caption)
+        if caption_anno_text.text:
+            text_clean = HTMLDocumentBackend._clean_unicode(
+                caption_anno_text.text.strip()
+            )
+            print(caption_anno_text)
             caption_item = doc.add_text(
                 label=DocItemLabel.CAPTION,
-                text=caption_clean,
-                orig=caption,
+                text=text_clean,
+                orig=caption_anno_text.text,
                 content_layer=self.content_layer,
+                hyperlink=caption_anno_text.hyperlink,
             )
         doc.add_picture(

docling/backend/mets_gbs_backend.py ADDED Viewed

@@ -0,0 +1,399 @@
+"""Backend for GBS Google Books schema."""
+import logging
+import tarfile
+from collections.abc import Iterable
+from dataclasses import dataclass
+from enum import Enum
+from io import BytesIO
+from pathlib import Path
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
+from docling_core.types.doc import BoundingBox, CoordOrigin, Size
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PdfPageBoundaryType,
+    PdfPageGeometry,
+    SegmentedPdfPage,
+    TextCell,
+)
+from lxml import etree
+from PIL import Image
+from PIL.Image import Image as PILImage
+from docling.backend.abstract_backend import PaginatedDocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.base_models import InputFormat
+if TYPE_CHECKING:
+    from docling.datamodel.document import InputDocument
+_log = logging.getLogger(__name__)
+def _get_pdf_page_geometry(
+    size: Size,
+) -> PdfPageGeometry:
+    boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX
+    bbox_tuple = (0, 0, size.width, size.height)
+    bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.TOPLEFT)
+    return PdfPageGeometry(
+        angle=0.0,
+        rect=BoundingRectangle.from_bounding_box(bbox),
+        boundary_type=boundary_type,
+        art_bbox=bbox,
+        bleed_bbox=bbox,
+        crop_bbox=bbox,
+        media_bbox=bbox,
+        trim_bbox=bbox,
+    )
+class MetsGbsPageBackend(PdfPageBackend):
+    def __init__(self, parsed_page: SegmentedPdfPage, page_im: PILImage):
+        self._im = page_im
+        self._dpage = parsed_page
+        self.valid = parsed_page is not None
+    def is_valid(self) -> bool:
+        return self.valid
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        # Find intersecting cells on the page
+        text_piece = ""
+        page_size = self.get_size()
+        scale = (
+            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
+        )
+        for i, cell in enumerate(self._dpage.textline_cells):
+            cell_bbox = (
+                cell.rect.to_bounding_box()
+                .to_top_left_origin(page_height=page_size.height)
+                .scaled(scale)
+            )
+            overlap_frac = cell_bbox.intersection_over_self(bbox)
+            if overlap_frac > 0.5:
+                if len(text_piece) > 0:
+                    text_piece += " "
+                text_piece += cell.text
+        return text_piece
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        return self._dpage
+    def get_text_cells(self) -> Iterable[TextCell]:
+        return self._dpage.textline_cells
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        AREA_THRESHOLD = 0  # 32 * 32
+        images = self._dpage.bitmap_resources
+        for img in images:
+            cropbox = img.rect.to_bounding_box().to_top_left_origin(
+                self.get_size().height
+            )
+            if cropbox.area() > AREA_THRESHOLD:
+                cropbox = cropbox.scaled(scale=scale)
+                yield cropbox
+    def get_page_image(
+        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
+    ) -> Image.Image:
+        page_size = self.get_size()
+        assert (
+            page_size.width == self._im.size[0] and page_size.height == self._im.size[1]
+        )
+        if not cropbox:
+            cropbox = BoundingBox(
+                l=0,
+                r=page_size.width,
+                t=0,
+                b=page_size.height,
+                coord_origin=CoordOrigin.TOPLEFT,
+            )
+        image = self._im.resize(
+            size=(round(page_size.width * scale), round(page_size.height * scale))
+        ).crop(cropbox.scaled(scale=scale).as_tuple())
+        return image
+    def get_size(self) -> Size:
+        return Size(
+            width=self._dpage.dimension.width, height=self._dpage.dimension.height
+        )
+    def unload(self) -> None:
+        if hasattr(self, "_im"):
+            delattr(self, "_im")
+        if hasattr(self, "_dpage"):
+            delattr(self, "_dpage")
+class _UseType(str, Enum):
+    IMAGE = "image"
+    OCR = "OCR"
+    COORD_OCR = "coordOCR"
+@dataclass
+class _FileInfo:
+    file_id: str
+    mimetype: str
+    path: str
+    use: _UseType
+@dataclass
+class _PageFiles:
+    image: Optional[_FileInfo] = None
+    ocr: Optional[_FileInfo] = None
+    coordOCR: Optional[_FileInfo] = None
+def _extract_rect(title_str: str) -> Optional[BoundingRectangle]:
+    """
+    Extracts bbox from title string like 'bbox 279 177 306 214;x_wconf 97'
+    """
+    parts = title_str.split(";")
+    for part in parts:
+        part = part.strip()
+        if part.startswith("bbox "):
+            try:
+                coords = part.split()[1:]
+                rect = BoundingRectangle.from_bounding_box(
+                    bbox=BoundingBox.from_tuple(
+                        tuple(map(int, coords)), origin=CoordOrigin.TOPLEFT
+                    )
+                )
+                return rect
+            except Exception:
+                return None
+    return None
+def _extract_confidence(title_str) -> float:
+    """Extracts x_wconf (OCR confidence) value from title string."""
+    for part in title_str.split(";"):
+        part = part.strip()
+        if part.startswith("x_wconf"):
+            try:
+                return float(part.split()[1]) / 100.0
+            except Exception:
+                return 1
+    return 1
+class MetsGbsDocumentBackend(PdfDocumentBackend):
+    """Paginated backend reading a gzip-compressed tar archive with a GBS METS manifest.
+    On construction the archive is scanned for an ``.xml`` member whose root is
+    ``<mets:mets PROFILE="gbs">``. Its ``fileGrp`` and page ``div`` entries are
+    indexed into ``page_map`` (0-based page number -> files of that page).
+    Pages are parsed on demand from the page image and the ``coordOCR`` file.
+    """
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        """Open the archive, locate the METS manifest and index its pages.
+        Raises:
+            RuntimeError: If no ``.xml`` member is a GBS METS manifest.
+        """
+        super().__init__(in_doc, path_or_stream)
+        self._tar: tarfile.TarFile = (
+            tarfile.open(name=self.path_or_stream, mode="r:gz")
+            if isinstance(self.path_or_stream, Path)
+            else tarfile.open(fileobj=self.path_or_stream, mode="r:gz")
+        )
+        self.root_mets: Optional[etree._Element] = None
+        self.page_map: Dict[int, _PageFiles] = {}
+        try:
+            self.root_mets = self._find_mets_root()
+            if self.root_mets is None:
+                raise RuntimeError(
+                    f"METS GBS backend could not load document {self.document_hash}."
+                )
+            self.page_map = self._build_page_map(self.root_mets)
+        except Exception:
+            # Construction failed: nobody will call unload(), so release the
+            # archive handle here before propagating the error.
+            self._tar.close()
+            raise
+    def _find_mets_root(self) -> Optional[etree._Element]:
+        """Return the root of the first ``.xml`` member accepted by ``_validate_mets_xml``."""
+        for member in self._tar.getmembers():
+            if not member.name.endswith(".xml"):
+                continue
+            file = self._tar.extractfile(member)
+            if file is None:
+                continue
+            root = self._validate_mets_xml(file.read())
+            if root is not None:
+                return root
+        return None
+    def _build_page_map(self, root_mets: etree._Element) -> "Dict[int, _PageFiles]":
+        """Index the manifest: map each 0-based page number to its referenced files."""
+        ns = {
+            "mets": "http://www.loc.gov/METS/",
+            "xlink": "http://www.w3.org/1999/xlink",
+            "xsi": "http://www.w3.org/2001/XMLSchema-instance",
+            "gbs": "http://books.google.com/gbs",
+            "premis": "info:lc/xmlns/premis-v2",
+            "marc": "http://www.loc.gov/MARC21/slim",
+        }
+        # First pass: every <mets:file> with a location, keyed by its ID.
+        file_info_by_id: Dict[str, _FileInfo] = {}
+        for filegrp in root_mets.xpath(".//mets:fileGrp", namespaces=ns):
+            try:
+                use = _UseType(filegrp.get("USE"))
+            except ValueError:
+                continue  # Ignore unknown USE types
+            for file_elem in filegrp.xpath("./mets:file", namespaces=ns):
+                flocat_elem = file_elem.find("mets:FLocat", namespaces=ns)
+                if flocat_elem is None:
+                    continue
+                href = flocat_elem.get("{http://www.w3.org/1999/xlink}href")
+                if href is None:
+                    continue
+                file_id = file_elem.get("ID")
+                file_info_by_id[file_id] = _FileInfo(
+                    file_id=file_id,
+                    mimetype=file_elem.get("MIMETYPE"),
+                    path=href,
+                    use=use,
+                )
+        use_to_attr = {
+            _UseType.IMAGE: "image",
+            _UseType.OCR: "ocr",
+            _UseType.COORD_OCR: "coordOCR",
+        }
+        # Second pass: resolve the file pointers of every page <mets:div>.
+        page_map: Dict[int, _PageFiles] = {}
+        for div in root_mets.xpath('.//mets:div[@TYPE="page"]', namespaces=ns):
+            order_str = div.get("ORDER")
+            if not order_str:
+                continue
+            try:
+                page_no = int(order_str) - 1  # make 0-index pages
+            except ValueError:
+                continue
+            page_files = _PageFiles()
+            for fptr in div.xpath("./mets:fptr", namespaces=ns):
+                file_info = file_info_by_id.get(fptr.get("FILEID"))
+                if not file_info:
+                    continue
+                attr = use_to_attr.get(file_info.use)
+                if attr:
+                    setattr(page_files, attr, file_info)
+            page_map[page_no] = page_files
+        return page_map
+    def _validate_mets_xml(self, xml_string) -> Optional[etree._Element]:
+        """Parse ``xml_string`` and return its root if it is a GBS METS manifest.
+        Returns ``None`` (after logging a warning) when the content is not
+        well-formed XML or the root is not ``<mets:mets PROFILE="gbs">``.
+        """
+        try:
+            root: etree._Element = etree.fromstring(xml_string)
+        except etree.XMLSyntaxError as exc:
+            # A broken XML member must not prevent trying the remaining ones.
+            _log.warning(f"Skipping XML file that could not be parsed: {exc}")
+            return None
+        if (
+            root.tag == "{http://www.loc.gov/METS/}mets"
+            and root.get("PROFILE") == "gbs"
+        ):
+            return root
+        _log.warning(f"The root element is not <mets:mets> with PROFILE='gbs': {root}")
+        return None
+    @staticmethod
+    def _collect_cells(ocr_root: etree._Element, css_class: str) -> List[TextCell]:
+        """Build a TextCell for every ``<span class=css_class>`` that has a parsable bbox.
+        The cell index is the span's position among all spans of that class,
+        including the ones skipped for lack of a bounding box.
+        """
+        cells: List[TextCell] = []
+        for ix, span in enumerate(ocr_root.xpath(f"//span[@class='{css_class}']")):
+            title = span.attrib.get("title", "")
+            rect = _extract_rect(title)
+            if not rect:
+                continue
+            text = "".join(span.itertext()).strip()
+            cells.append(
+                TextCell(
+                    index=ix,
+                    text=text,
+                    orig=text,
+                    rect=rect,
+                    from_ocr=True,
+                    confidence=_extract_confidence(title),
+                )
+            )
+        return cells
+    def _parse_page(self, page_no: int) -> Tuple[SegmentedPdfPage, PILImage]:
+        """Load the image and OCR cells of page ``page_no`` (0-based).
+        The page size is taken from the ``ocr_page`` bbox when available,
+        otherwise from the image; the image is resized to that size and
+        converted to RGB.
+        """
+        # TODO: use better fallbacks...
+        page_files = self.page_map[page_no]
+        image_info = page_files.image
+        assert image_info is not None
+        ocr_info = page_files.coordOCR
+        assert ocr_info is not None
+        image_file = self._tar.extractfile(image_info.path)
+        assert image_file is not None
+        buf = BytesIO(image_file.read())
+        im: PILImage = Image.open(buf)
+        ocr_file = self._tar.extractfile(ocr_info.path)
+        assert ocr_file is not None
+        ocr_content = ocr_file.read()
+        parser = etree.HTMLParser()
+        ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser)
+        page_div = ocr_root.xpath("//div[@class='ocr_page']")
+        size = Size(width=im.size[0], height=im.size[1])
+        if page_div:
+            rect = _extract_rect(page_div[0].attrib.get("title", ""))
+            if rect:
+                size = Size(width=rect.width, height=rect.height)
+        else:
+            _log.error(f"Could not find ocr_page for page {page_no}")
+        im = im.resize(size=(round(size.width), round(size.height)))
+        im = im.convert("RGB")
+        word_cells = self._collect_cells(ocr_root, "ocrx_word")
+        line_cells = self._collect_cells(ocr_root, "ocr_line")
+        page = SegmentedPdfPage(
+            dimension=_get_pdf_page_geometry(size),
+            textline_cells=line_cells,
+            char_cells=[],
+            word_cells=word_cells,
+            has_textlines=True,
+            has_words=True,
+            has_chars=False,
+        )
+        return page, im
+    def page_count(self) -> int:
+        """Return the number of pages indexed from the manifest."""
+        return len(self.page_map)
+    def load_page(self, page_no: int) -> MetsGbsPageBackend:
+        """Parse page ``page_no`` (0-based) and wrap it in a page backend."""
+        # TODO: is this thread-safe?
+        page, im = self._parse_page(page_no)
+        return MetsGbsPageBackend(parsed_page=page, page_im=im)
+    def is_valid(self) -> bool:
+        """Return True when a manifest was found and it lists at least one page."""
+        return self.root_mets is not None and self.page_count() > 0
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the input formats handled by this backend."""
+        return {InputFormat.METS_GBS}
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """This backend exposes its content page by page."""
+        return True
+    def unload(self) -> None:
+        """Release base-class resources and close the archive."""
+        try:
+            super().unload()
+        finally:
+            # Close the archive even if the base class cleanup fails.
+            self._tar.close()

docling/backend/pdf_backend.py CHANGED Viewed

@@ -84,9 +84,9 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
                 buf.seek(0)
                 self.path_or_stream = buf
-            else:
+            elif self.input_format not in self.supported_formats():
                 raise RuntimeError(
-                    f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
+                    f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
                 )
     @abstractmethod
@@ -99,7 +99,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
     @classmethod
     def supported_formats(cls) -> Set[InputFormat]:
-        return {InputFormat.PDF}
+        return {InputFormat.PDF, InputFormat.IMAGE}
     @classmethod
     def supports_pagination(cls) -> bool:

docling/cli/main.py CHANGED Viewed

@@ -26,6 +26,7 @@ from rich.console import Console
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
@@ -607,9 +608,18 @@ def convert(  # noqa: C901
                 backend=backend,  # pdf_backend
             )
+            # METS GBS options
+            mets_gbs_options = pipeline_options.model_copy()
+            mets_gbs_options.do_ocr = False
+            mets_gbs_format_option = PdfFormatOption(
+                pipeline_options=mets_gbs_options,
+                backend=MetsGbsDocumentBackend,
+            )
             format_options = {
                 InputFormat.PDF: pdf_format_option,
                 InputFormat.IMAGE: pdf_format_option,
+                InputFormat.METS_GBS: mets_gbs_format_option,
             }
         elif pipeline == ProcessingPipeline.VLM:

docling/datamodel/base_models.py CHANGED Viewed

@@ -56,6 +56,7 @@ class InputFormat(str, Enum):
     XLSX = "xlsx"
     XML_USPTO = "xml_uspto"
     XML_JATS = "xml_jats"
+    METS_GBS = "mets_gbs"
     JSON_DOCLING = "json_docling"
     AUDIO = "audio"
@@ -81,6 +82,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.CSV: ["csv"],
     InputFormat.XLSX: ["xlsx", "xlsm"],
     InputFormat.XML_USPTO: ["xml", "txt"],
+    InputFormat.METS_GBS: ["tar.gz"],
     InputFormat.JSON_DOCLING: ["json"],
     InputFormat.AUDIO: ["wav", "mp3"],
 }
@@ -113,6 +115,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
     ],
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
+    InputFormat.METS_GBS: ["application/mets+xml"],
     InputFormat.JSON_DOCLING: ["application/json"],
     InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
 }

docling/datamodel/document.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import csv
 import logging
 import re
+import tarfile
 from collections.abc import Iterable
 from enum import Enum
 from io import BytesIO
@@ -314,6 +315,10 @@ class _DocumentConversionInput(BaseModel):
                 elif objname.endswith(".pptx"):
                     mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+        if mime is not None and mime.lower() == "application/gzip":
+            if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj):
+                mime = detected_mime
         mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
         mime = mime or _DocumentConversionInput._detect_csv(content)
         mime = mime or "text/plain"
@@ -457,3 +462,24 @@ class _DocumentConversionInput(BaseModel):
             return None
         return None
+    @staticmethod
+    def _detect_mets_gbs(
+        obj: Union[Path, DocumentStream],
+    ) -> Optional[Literal["application/mets+xml"]]:
+        """Probe a gzip input for a METS manifest inside a tar archive.
+        Every ``.xml`` member is read and searched for the METS namespace URI.
+        Args:
+            obj: Path to the file, or a document stream whose ``stream`` holds it.
+        Returns:
+            ``"application/mets+xml"`` if a member mentions the METS namespace,
+            otherwise ``None`` (also when the gzip data is not a readable tar).
+        """
+        content = obj if isinstance(obj, Path) else obj.stream
+        is_stream = isinstance(content, BytesIO)
+        # Remember where the stream was so later readers are not affected.
+        start = content.tell() if is_stream else None
+        tar: tarfile.TarFile
+        member: tarfile.TarInfo
+        try:
+            with tarfile.open(
+                name=content if isinstance(content, Path) else None,
+                fileobj=content if is_stream else None,
+                mode="r:gz",
+            ) as tar:
+                for member in tar.getmembers():
+                    if not member.name.endswith(".xml"):
+                        continue
+                    file = tar.extractfile(member)
+                    if file is None:
+                        continue
+                    content_str = file.read().decode(errors="ignore")
+                    if "http://www.loc.gov/METS/" in content_str:
+                        return "application/mets+xml"
+        except (tarfile.TarError, EOFError):
+            # Gzip data that is not a (complete) tar archive is not METS GBS;
+            # leave the decision to the remaining format detectors.
+            return None
+        finally:
+            if is_stream:
+                content.seek(start)
+        return None

docling/datamodel/pipeline_options_vlm_model.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Any, Callable, Dict, List, Literal, Optional, Union
+from typing import Any, Dict, List, Literal, Optional
 from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
@@ -10,11 +10,17 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 class BaseVlmOptions(BaseModel):
     kind: str
-    prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
+    prompt: str
     scale: float = 2.0
     max_size: Optional[int] = None
     temperature: float = 0.0
+    def build_prompt(self, page: Optional[SegmentedPage]) -> str:
+        """Return the prompt text to use for ``page``.
+        This implementation ignores ``page`` and returns ``self.prompt`` unchanged.
+        """
+        return self.prompt
+    def decode_response(self, text: str) -> str:
+        """Post-process the raw model output ``text``.
+        This implementation returns ``text`` unchanged.
+        """
+        return text
 class ResponseFormat(str, Enum):
     DOCTAGS = "doctags"

docling/document_converter.py CHANGED Viewed

@@ -20,6 +20,7 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.json.docling_json_backend import DoclingJSONBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
+from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
@@ -159,6 +160,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.XML_JATS: FormatOption(
             pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
         ),
+        InputFormat.METS_GBS: FormatOption(
+            pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend
+        ),
         InputFormat.IMAGE: FormatOption(
             pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
         ),

docling/models/api_vlm_model.py CHANGED Viewed

@@ -53,11 +53,7 @@ class ApiVlmModel(BasePageModel):
                         if hi_res_image.mode != "RGB":
                             hi_res_image = hi_res_image.convert("RGB")
-                    if callable(self.vlm_options.prompt):
-                        prompt = self.vlm_options.prompt(page.parsed_page)
-                    else:
-                        prompt = self.vlm_options.prompt
+                    prompt = self.vlm_options.build_prompt(page.parsed_page)
                     page_tags = api_image_request(
                         image=hi_res_image,
                         prompt=prompt,
@@ -67,6 +63,7 @@ class ApiVlmModel(BasePageModel):
                         **self.params,
                     )
+                    page_tags = self.vlm_options.decode_response(page_tags)
                     page.predictions.vlm_response = VlmPrediction(text=page_tags)
                 return page

docling/models/vlm_models_inline/hf_transformers_model.py CHANGED Viewed

@@ -135,10 +135,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                     )
                     # Define prompt structure
-                    if callable(self.vlm_options.prompt):
-                        user_prompt = self.vlm_options.prompt(page.parsed_page)
-                    else:
-                        user_prompt = self.vlm_options.prompt
+                    user_prompt = self.vlm_options.build_prompt(page.parsed_page)
                     prompt = self.formulate_prompt(user_prompt)
                     inputs = self.processor(
@@ -166,6 +163,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                     _log.debug(
                         f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
                     )
+                    generated_texts = self.vlm_options.decode_response(generated_texts)
                     page.predictions.vlm_response = VlmPrediction(
                         text=generated_texts,
                         generation_time=generation_time,

docling/models/vlm_models_inline/mlx_model.py CHANGED Viewed

@@ -84,10 +84,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                         if hi_res_image.mode != "RGB":
                             hi_res_image = hi_res_image.convert("RGB")
-                    if callable(self.vlm_options.prompt):
-                        user_prompt = self.vlm_options.prompt(page.parsed_page)
-                    else:
-                        user_prompt = self.vlm_options.prompt
+                    user_prompt = self.vlm_options.build_prompt(page.parsed_page)
                     prompt = self.apply_chat_template(
                         self.processor, self.config, user_prompt, num_images=1
                     )
@@ -142,6 +139,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                     _log.debug(
                         f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)."
                     )
+                    page_tags = self.vlm_options.decode_response(page_tags)
                     page.predictions.vlm_response = VlmPrediction(
                         text=page_tags,
                         generation_time=generation_time,

docling/pipeline/base_pipeline.py CHANGED Viewed

@@ -8,7 +8,10 @@ from typing import Any, Callable, List
 from docling_core.types.doc import NodeItem
-from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.abstract_backend import (
+    AbstractDocumentBackend,
+    PaginatedDocumentBackend,
+)
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import (
     ConversionStatus,
@@ -126,10 +129,10 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
         yield from page_batch
     def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
-        if not isinstance(conv_res.input._backend, PdfDocumentBackend):
+        if not isinstance(conv_res.input._backend, PaginatedDocumentBackend):
             raise RuntimeError(
-                f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
-                f"Can not convert this with a PDF pipeline. "
+                f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a paginated backend. "
+                f"Can not convert this with a paginated PDF pipeline. "
                 f"Please check your format configuration on DocumentConverter."
             )
             # conv_res.status = ConversionStatus.FAILURE

{docling-2.44.0.dist-info → docling-2.45.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.44.0
+Version: 2.45.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT

{docling-2.44.0.dist-info → docling-2.45.0.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
 docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/document_converter.py,sha256=l4b9m9NcbnwzXNNvf777nszyXznQJiaTXyIl_WehkyQ,15724
+docling/document_converter.py,sha256=7lid_uhGNuurYICweaA1jqtSbnhf3hpuUYUNleHh-Ww,15924
 docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
 docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -9,13 +9,14 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
 docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
 docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
 docling/backend/docling_parse_v4_backend.py,sha256=qR_WRVq9JGtRioWCw6MnLWgbvXbC6Y1yds7Ol1-E6UQ,6550
-docling/backend/html_backend.py,sha256=0_l-I9gBAs0HKU3yKLQ3OqyYgB3V48hInv42GudnSjA,22856
+docling/backend/html_backend.py,sha256=jTkpdJ-EKMmkbUfh88DONVG-gENE7m0_cnIhWpWSobI,34523
 docling/backend/md_backend.py,sha256=qCI7SD9hnWWGrkG_drpzQv2Z7DVBG4Tsq3hhTsYV790,22562
+docling/backend/mets_gbs_backend.py,sha256=EA8sY6tbmGiysKGYPPZiNlK-i7Adn8bLTo-7Ym15hTU,12774
 docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
 docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
 docling/backend/msword_backend.py,sha256=DxMgPcq-Ao1vq7X2v8qqWeMs9MryPw_Jw3YRAAdXBtM,44904
 docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
-docling/backend/pdf_backend.py,sha256=sUBrCz1zvt6E7sVl4xHtrkpTBClOK0vBV2lLi_TRHNg,3237
+docling/backend/pdf_backend.py,sha256=Wcd1NSrAMjXK8VicTki5p-j-JLofklt07eF0kIG17_0,3361
 docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA2xIfS7U,13370
 docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -28,22 +29,22 @@ docling/backend/xml/jats_backend.py,sha256=LPj33EFdi2MRCakkLWrRLlUAc-B-949f8zp5g
 docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
 docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=rXWR2QJFLeHLPWkMsLXvsVblX-KOXwbM8r0ku80KU5Q,29925
+docling/cli/main.py,sha256=-W_vdKvSm5gZUZyvRpFH0YMI_1iJrP5sJOZ5_1bLorw,30359
 docling/cli/models.py,sha256=9yLGp6QRJGpR86U3SjmWAXDt3MvBaJLLY4xDVdsu3O8,4160
 docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/datamodel/accelerator_options.py,sha256=wv6dOFTVAwr9onkE-0pfUqX_fDb6gX53iPPE6o8nKjI,2511
 docling/datamodel/asr_model_specs.py,sha256=Wg7z3zm_wXIWu122iPVy0RMECsA_JCFHrlFF-xxHoVQ,2187
-docling/datamodel/base_models.py,sha256=9FslHkGUNmBp264LpLL_2JTfDAdaikldYs3SiQOHb5A,11828
-docling/datamodel/document.py,sha256=CA_dgt4V_phze5HXpfgfKNBKd1cPC1o3WE_IENX63EM,16252
+docling/datamodel/base_models.py,sha256=Ifd8PPHs4sW7ScwSqpa-y3rwgPbde_iw13Y2NUCPfU8,11944
+docling/datamodel/document.py,sha256=zsxFYXvo6GtwGNogSDoBB1TFvkm7IOrP_VnqXNqBhJs,17329
 docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
 docling/datamodel/pipeline_options.py,sha256=TaBmCBRjSxyoh79UkpEkPzokLYS8BA2QJam86g9pT5g,10544
 docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
-docling/datamodel/pipeline_options_vlm_model.py,sha256=z-pUqwRA8nJp6C3SEXZLem2zvSYdgavaAVYa8wkAIZY,2400
+docling/datamodel/pipeline_options_vlm_model.py,sha256=eH-Cj_8aic9FdX4xGlBcf5_R9e152JAL2LhtY8d0rhw,2498
 docling/datamodel/settings.py,sha256=c0MTw6pO5be_BKxHKYl4SaBJAw_qL-aapxp-g5HHj1A,2084
 docling/datamodel/vlm_model_specs.py,sha256=--jZexGeu-s_lWp7y_WwWEf6CD1J4XqADrS1-OY_pWM,4737
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/models/api_vlm_model.py,sha256=foBvzaWeHFH1t-VdvRWLdiXiiofhvhjvHqRI0eNA_3w,2923
+docling/models/api_vlm_model.py,sha256=-zisU32pgDRbychyG6-neB0qweNbPaYnLXwiGT7SEdI,2859
 docling/models/base_model.py,sha256=NNjIapqCruAEAWR-CCdsNgXc2QkwiPYAcaQ_ZYe1W28,2978
 docling/models/base_ocr_model.py,sha256=kT8TylASOpPlY60rIG6VL6_eLVsfg5KvEVnZHzDWtR0,8193
 docling/models/code_formula_model.py,sha256=5uWh-eI-Ejmv3DujKJoKKgJBuvPLokt7AJ_ybt8VHEw,11373
@@ -70,11 +71,11 @@ docling/models/plugins/defaults.py,sha256=OAHWW2tCcUXSyDMFxV_lXVRjSBJ1n6z-Eb3R8c
 docling/models/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/utils/hf_model_download.py,sha256=scBEfsM4yl7xPzqe7UtPvDh9RfQZQnuOhqQKilYBHls,984
 docling/models/vlm_models_inline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/models/vlm_models_inline/hf_transformers_model.py,sha256=LAnWFIHGblWln6DQMLtCQQW3-YUPDMbgeD2tjfM8vLM,8415
-docling/models/vlm_models_inline/mlx_model.py,sha256=tqbJ8tmf2VBDuMLYIv9s1Ysn3G831k2uE_PdOv0kCaE,5948
+docling/models/vlm_models_inline/hf_transformers_model.py,sha256=Rwdr7neDpn5ehtrp6n7G21fcPBK2m9Har_6BFNdyw-Q,8359
+docling/models/vlm_models_inline/mlx_model.py,sha256=YYYmopsITlX17JVS5KhLlb1IQSEVoSECNx_fXLHNpAc,5880
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
-docling/pipeline/base_pipeline.py,sha256=iwUqmttXF9D2myXyCAaIqFuGjBFhPkjAybcSAGpww-Q,9525
+docling/pipeline/base_pipeline.py,sha256=MOKZtx3jNYotfntgoJHoyb6UsvdvG6bQLyDl9Lxvc1w,9586
 docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
 docling/pipeline/standard_pdf_pipeline.py,sha256=yFishq4Cu01BiBGHk3Irr7ogcTQKeSC0QZImQVAhIaY,12740
 docling/pipeline/threaded_standard_pdf_pipeline.py,sha256=Rjdq1x2fRHBA0rMHJ6rqqHzxVVzgTEALBBj5d30oOZ8,26018
@@ -92,9 +93,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
 docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
-docling-2.44.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.44.0.dist-info/METADATA,sha256=SjD3EXlvgfyXIo8YoeldcAFX0r_nbJszp7VPoMLPFBk,10459
-docling-2.44.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-docling-2.44.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
-docling-2.44.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
-docling-2.44.0.dist-info/RECORD,,
+docling-2.45.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.45.0.dist-info/METADATA,sha256=-iB6xJ4H7DIStzPn-ruYcBa_Tq45Ijk52zfoM_6FkCE,10459
+docling-2.45.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+docling-2.45.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
+docling-2.45.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
+docling-2.45.0.dist-info/RECORD,,

{docling-2.44.0.dist-info → docling-2.45.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{docling-2.44.0.dist-info → docling-2.45.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{docling-2.44.0.dist-info → docling-2.45.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{docling-2.44.0.dist-info → docling-2.45.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

docling 2.44.0__py3-none-any.whl → 2.45.0__py3-none-any.whl

docling 2.44.0py3-none-any.whl → 2.45.0py3-none-any.whl