PyPI - docling - Versions diffs - 2.43.0__py3-none-any.whl → 2.45.0__py3-none-any.whl - Mend

docling 2.43.0__py3-none-any.whl → 2.45.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

docling/backend/html_backend.py +406 -69
docling/backend/mets_gbs_backend.py +399 -0
docling/backend/pdf_backend.py +3 -3
docling/cli/main.py +16 -0
docling/datamodel/base_models.py +3 -0
docling/datamodel/document.py +26 -0
docling/datamodel/pipeline_options_vlm_model.py +8 -2
docling/document_converter.py +34 -0
docling/models/api_vlm_model.py +2 -5
docling/models/vlm_models_inline/hf_transformers_model.py +2 -4
docling/models/vlm_models_inline/mlx_model.py +4 -6
docling/pipeline/base_pipeline.py +7 -4
{docling-2.43.0.dist-info → docling-2.45.0.dist-info}/METADATA +2 -2
{docling-2.43.0.dist-info → docling-2.45.0.dist-info}/RECORD +18 -17
{docling-2.43.0.dist-info → docling-2.45.0.dist-info}/WHEEL +0 -0
{docling-2.43.0.dist-info → docling-2.45.0.dist-info}/entry_points.txt +0 -0
{docling-2.43.0.dist-info → docling-2.45.0.dist-info}/licenses/LICENSE +0 -0
{docling-2.43.0.dist-info → docling-2.45.0.dist-info}/top_level.txt +0 -0

docling/backend/html_backend.py CHANGED Viewed

@@ -1,8 +1,11 @@
 import logging
 import re
+from contextlib import contextmanager
+from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
 from typing import Final, Optional, Union, cast
+from urllib.parse import urljoin
 from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
 from bs4.element import PreformattedString
@@ -18,7 +21,7 @@ from docling_core.types.doc import (
     TextItem,
 )
 from docling_core.types.doc.document import ContentLayer
-from pydantic import BaseModel
+from pydantic import AnyUrl, BaseModel, ValidationError
 from typing_extensions import override
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -56,12 +59,76 @@ class _Context(BaseModel):
     list_start_by_ref: dict[str, int] = {}
+class AnnotatedText(BaseModel):
+    """A piece of text together with the hyperlink (if any) attached to it."""
+    # The text content of this fragment.
+    text: str
+    # Link target for the text, or None when the text is not linked. A Path is
+    # stored for hrefs that do not validate as AnyUrl (see use_hyperlink).
+    hyperlink: Union[AnyUrl, Path, None] = None
+class AnnotatedTextList(list):
+    """A list of AnnotatedText items with helpers to merge and split them."""
+    def to_single_text_element(self) -> AnnotatedText:
+        """Collapse all items into one AnnotatedText.
+        The stripped texts are joined with single spaces. The first hyperlink
+        that is not None is kept; any later, different hyperlink only triggers
+        a warning.
+        Returns:
+            The merged text with the chosen hyperlink (None if no item has one).
+        """
+        current_h = None
+        for h in (entry.hyperlink for entry in self):
+            if h is None:
+                continue
+            if current_h is None:
+                current_h = h
+            elif h != current_h:
+                _log.warning(
+                    f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
+                )
+        joined = " ".join(entry.text.strip() for entry in self)
+        return AnnotatedText(text=joined.strip(), hyperlink=current_h)
+    def simplify_text_elements(self) -> "AnnotatedTextList":
+        """Merge runs of consecutive items that share the same hyperlink.
+        Texts within a run are joined with a single space, except that no space
+        is inserted when either neighbour is blank. A final run whose text is
+        empty is dropped.
+        Returns:
+            A new list of merged items, or this same list when it is empty.
+        """
+        if not self:
+            return self
+        merged = AnnotatedTextList()
+        first, *rest = self
+        run_text = first.text
+        run_link = first.hyperlink
+        previous_text = run_text
+        for entry in rest:
+            if not (run_link == entry.hyperlink):
+                # The hyperlink changed: close the current run and start a new one.
+                merged.append(AnnotatedText(text=run_text, hyperlink=run_link))
+                run_text = entry.text
+                previous_text = run_text
+                run_link = entry.hyperlink
+                continue
+            glue = " " if entry.text.strip() and previous_text.strip() else ""
+            run_text += glue + entry.text
+            previous_text = entry.text
+        if run_text:
+            merged.append(AnnotatedText(text=run_text, hyperlink=run_link))
+        return merged
+    def split_by_newline(self):
+        """Split the items into groups at newline characters.
+        Items without a newline are appended to the group being built. An item
+        that contains newlines is split into one copy per line, and every such
+        copy (including the last one) closes the group it is appended to.
+        Returns:
+            A plain list of AnnotatedTextList groups.
+        """
+        groups = []
+        current_group = AnnotatedTextList()
+        for entry in self:
+            lines = entry.text.split("\n")
+            if len(lines) == 1:
+                current_group.append(entry)
+                continue
+            for line in lines:
+                # Copy so the original item keeps its full text.
+                fragment = deepcopy(entry)
+                fragment.text = line
+                current_group.append(fragment)
+                groups.append(current_group)
+                current_group = AnnotatedTextList()
+        if current_group:
+            groups.append(current_group)
+        return groups
 class HTMLDocumentBackend(DeclarativeDocumentBackend):
     @override
     def __init__(
         self,
         in_doc: InputDocument,
         path_or_stream: Union[BytesIO, Path],
+        original_url: Optional[AnyUrl] = None,
     ):
         super().__init__(in_doc, path_or_stream)
         self.soup: Optional[Tag] = None
@@ -74,6 +141,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.ctx = _Context()
         for i in range(self.max_levels):
             self.parents[i] = None
+        self.hyperlink = None
+        self.original_url = original_url
         try:
             raw = (
@@ -125,8 +194,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         # set the title as furniture, since it is part of the document metadata
         title = self.soup.title
         if title:
+            title_text = title.get_text(separator=" ", strip=True)
+            title_clean = HTMLDocumentBackend._clean_unicode(title_text)
             doc.add_title(
-                text=title.get_text(separator=" ", strip=True),
+                text=title_clean,
+                orig=title_text,
                 content_layer=ContentLayer.FURNITURE,
             )
         # remove scripts/styles
@@ -157,24 +229,32 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             element: The XML tag to parse.
             doc: The Docling document to be updated with the parsed content.
         """
-        buffer: list[str] = []
+        buffer: AnnotatedTextList = AnnotatedTextList()
         def flush_buffer():
             if not buffer:
                 return
-            text = "".join(buffer).strip()
+            annotated_text_list = buffer.simplify_text_elements()
+            parts = annotated_text_list.split_by_newline()
             buffer.clear()
-            if not text:
+            if not "".join([el.text for el in annotated_text_list]):
                 return
-            for part in text.split("\n"):
-                seg = part.strip()
-                if seg:
-                    doc.add_text(
-                        DocItemLabel.TEXT,
-                        seg,
-                        parent=self.parents[self.level],
-                        content_layer=self.content_layer,
-                    )
+            for annotated_text_list in parts:
+                with self.use_inline_group(annotated_text_list, doc):
+                    for annotated_text in annotated_text_list:
+                        if annotated_text.text.strip():
+                            seg_clean = HTMLDocumentBackend._clean_unicode(
+                                annotated_text.text.strip()
+                            )
+                            doc.add_text(
+                                parent=self.parents[self.level],
+                                label=DocItemLabel.TEXT,
+                                text=seg_clean,
+                                content_layer=self.content_layer,
+                                hyperlink=annotated_text.hyperlink,
+                            )
         for node in element.contents:
             if isinstance(node, Tag):
@@ -182,6 +262,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 if name == "img":
                     flush_buffer()
                     self._emit_image(node, doc)
+                elif name == "a":
+                    with self.use_hyperlink(node):
+                        self._walk(node, doc)
                 elif name in _BLOCK_TAGS:
                     flush_buffer()
                     self._handle_block(node, doc)
@@ -189,27 +272,154 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     flush_buffer()
                     self._walk(node, doc)
                 else:
-                    buffer.append(node.text)
+                    buffer.extend(
+                        self._extract_text_and_hyperlink_recursively(
+                            node, find_parent_annotation=True, keep_newlines=True
+                        )
+                    )
             elif isinstance(node, NavigableString) and not isinstance(
                 node, PreformattedString
             ):
-                buffer.append(str(node))
+                if str(node).strip("\n\r") == "":
+                    flush_buffer()
+                else:
+                    buffer.extend(
+                        self._extract_text_and_hyperlink_recursively(
+                            node, find_parent_annotation=True, keep_newlines=True
+                        )
+                    )
         flush_buffer()
+    def _extract_text_and_hyperlink_recursively(
+        self,
+        item: PageElement,
+        ignore_list=False,
+        find_parent_annotation=False,
+        keep_newlines=False,
+    ) -> AnnotatedTextList:
+        """Collect the text below ``item`` as fragments tagged with a hyperlink.
+        Each fragment carries the value of ``self.hyperlink`` that is active
+        when the fragment is created; ``a`` children are visited inside
+        ``use_hyperlink`` so their text picks up their own ``href``.
+        Args:
+            item: The string or tag to extract text from.
+            ignore_list: If True, a ``ul``/``ol`` tag contributes no text.
+            find_parent_annotation: If True, look for an enclosing ``a`` tag
+                with an ``href`` and extract under that hyperlink.
+            keep_newlines: If True, a string made up only of newline or
+                carriage-return characters yields a single "\\n" fragment.
+        Returns:
+            The fragments in document order; empty for preformatted strings.
+        """
+        result: AnnotatedTextList = AnnotatedTextList()
+        # If find_parent_annotation, make sure that we keep track of
+        # any a-tag that has been present in the DOM-parents already.
+        if find_parent_annotation:
+            this_parent = item.parent
+            while this_parent is not None:
+                if this_parent.name == "a" and this_parent.get("href"):
+                    # Re-run the extraction with the ancestor's href active.
+                    # Only item and ignore_list are forwarded, so this call runs
+                    # with find_parent_annotation=False and keep_newlines=False.
+                    # NOTE(review): dropping keep_newlines here may be
+                    # unintentional - confirm against the callers.
+                    with self.use_hyperlink(this_parent):
+                        return self._extract_text_and_hyperlink_recursively(
+                            item, ignore_list
+                        )
+                this_parent = this_parent.parent
+        if isinstance(item, PreformattedString):
+            return AnnotatedTextList()
+        if isinstance(item, NavigableString):
+            text = item.strip()
+            if text:
+                return AnnotatedTextList(
+                    [AnnotatedText(text=text, hyperlink=self.hyperlink)]
+                )
+            # Reached only for whitespace-only strings; a "\n" fragment is kept
+            # when nothing but newline/carriage-return characters are present.
+            if keep_newlines and item.strip("\n\r") == "":
+                return AnnotatedTextList(
+                    [AnnotatedText(text="\n", hyperlink=self.hyperlink)]
+                )
+            return AnnotatedTextList()
+        tag = cast(Tag, item)
+        # With ignore_list set, a ul/ol tag is not descended into.
+        if not ignore_list or (tag.name not in ["ul", "ol"]):
+            for child in tag:
+                if isinstance(child, Tag) and child.name == "a":
+                    # Text below an anchor is tagged with that anchor's href.
+                    with self.use_hyperlink(child):
+                        result.extend(
+                            self._extract_text_and_hyperlink_recursively(
+                                child, ignore_list, keep_newlines=keep_newlines
+                            )
+                        )
+                else:
+                    # Recursively get the child's text content
+                    result.extend(
+                        self._extract_text_and_hyperlink_recursively(
+                            child, ignore_list, keep_newlines=keep_newlines
+                        )
+                    )
+        return result
+    @contextmanager
+    def use_hyperlink(self, tag):
+        """Make the ``href`` of ``tag`` the active hyperlink inside the block.
+        While the context is active, ``self.hyperlink`` holds the link target of
+        ``tag``; the previous value is restored on exit, even if the body
+        raises. A tag without an ``href`` (or with an empty one) leaves
+        ``self.hyperlink`` untouched.
+        Args:
+            tag: The tag whose ``href`` attribute should become active.
+        Yields:
+            None.
+        """
+        this_href = tag.get("href")
+        if not this_href:
+            # Missing or empty href: nothing to activate, nothing to restore.
+            yield None
+            return
+        old_hyperlink = self.hyperlink
+        if self.original_url is not None:
+            # urljoin() refuses to mix str and non-str arguments, and pydantic
+            # URL objects are not str instances, so coerce the base explicitly.
+            this_href = urljoin(str(self.original_url), this_href)
+        # pydantic cannot represent relative links as URLs, so fall back to a
+        # Path for hrefs that do not validate.
+        try:
+            new_hyperlink = AnyUrl(this_href)
+        except ValidationError:
+            new_hyperlink = Path(this_href)
+        self.hyperlink = new_hyperlink
+        try:
+            yield None
+        finally:
+            self.hyperlink = old_hyperlink
+    @contextmanager
+    def use_inline_group(
+        self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
+    ):
+        """Wrap several annotated texts in an inline group.
+        When annotated_text_list holds more than one item, an inline group is
+        added to the document under the current parent and becomes the current
+        parent (one level deeper) for the duration of the context. With zero or
+        one item the context does nothing.
+        Args:
+            annotated_text_list (AnnotatedTextList): Texts about to be emitted.
+            doc (DoclingDocument): Document the group is added to.
+        Yields:
+            None.
+        """
+        if len(annotated_text_list) <= 1:
+            # A single text needs no grouping; leave the parent stack alone.
+            yield None
+            return
+        inline_fmt = doc.add_group(
+            label=GroupLabel.INLINE,
+            parent=self.parents[self.level],
+            content_layer=self.content_layer,
+        )
+        # Descend one level and register the new group as the parent there.
+        self.level += 1
+        self.parents[self.level] = inline_fmt
+        try:
+            yield None
+        finally:
+            # Undo the descent so later items attach to the previous parent.
+            self.parents[self.level] = None
+            self.level -= 1
     def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
         tag_name = tag.name.lower()
         # set default content layer to BODY as soon as we encounter a heading
         self.content_layer = ContentLayer.BODY
         level = int(tag_name[1])
-        text = tag.get_text(strip=True, separator=" ")
+        annotated_text_list = self._extract_text_and_hyperlink_recursively(
+            tag, find_parent_annotation=True
+        )
+        annotated_text = annotated_text_list.to_single_text_element()
+        text_clean = HTMLDocumentBackend._clean_unicode(annotated_text.text)
         # the first level is for the title item
         if level == 1:
             for key in self.parents.keys():
                 self.parents[key] = None
             self.level = 0
             self.parents[self.level + 1] = doc.add_title(
-                text, content_layer=self.content_layer
+                text_clean,
+                content_layer=self.content_layer,
+                hyperlink=annotated_text.hyperlink,
             )
         # the other levels need to be lowered by 1 if a title was set
         else:
@@ -234,9 +444,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 self.level = level
             self.parents[self.level + 1] = doc.add_heading(
                 parent=self.parents[self.level],
-                text=text,
+                text=text_clean,
+                orig=annotated_text.text,
                 level=self.level,
                 content_layer=self.content_layer,
+                hyperlink=annotated_text.hyperlink,
             )
         self.level += 1
         for img_tag in tag("img"):
@@ -285,35 +497,69 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     marker = ""
                 # 2) extract only the "direct" text from this <li>
-                parts: list[str] = []
-                for child in li.contents:
-                    if isinstance(child, NavigableString) and not isinstance(
-                        child, PreformattedString
-                    ):
-                        parts.append(child)
-                    elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
-                        text_part = HTMLDocumentBackend.get_text(child)
-                        if text_part:
-                            parts.append(text_part)
-                li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
+                parts = self._extract_text_and_hyperlink_recursively(
+                    li, ignore_list=True, find_parent_annotation=True
+                )
+                min_parts = parts.simplify_text_elements()
+                li_text = re.sub(
+                    r"\s+|\n+", " ", "".join([el.text for el in min_parts])
+                ).strip()
                 # 3) add the list item
                 if li_text:
-                    self.parents[self.level + 1] = doc.add_list_item(
-                        text=li_text,
-                        enumerated=is_ordered,
-                        marker=marker,
-                        parent=list_group,
-                        content_layer=self.content_layer,
-                    )
-                    # 4) recurse into any nested lists, attaching them to this <li> item
-                    for sublist in li({"ul", "ol"}, recursive=False):
-                        if isinstance(sublist, Tag):
-                            self.level += 1
-                            self._handle_block(sublist, doc)
-                            self.parents[self.level + 1] = None
-                            self.level -= 1
+                    if len(min_parts) > 1:
+                        # create an empty list element in order to hook the inline group onto that one
+                        self.parents[self.level + 1] = doc.add_list_item(
+                            text="",
+                            enumerated=is_ordered,
+                            marker=marker,
+                            parent=list_group,
+                            content_layer=self.content_layer,
+                        )
+                        self.level += 1
+                        with self.use_inline_group(min_parts, doc):
+                            for annotated_text in min_parts:
+                                li_text = re.sub(
+                                    r"\s+|\n+", " ", annotated_text.text
+                                ).strip()
+                                li_clean = HTMLDocumentBackend._clean_unicode(li_text)
+                                doc.add_text(
+                                    parent=self.parents[self.level],
+                                    label=DocItemLabel.TEXT,
+                                    text=li_clean,
+                                    content_layer=self.content_layer,
+                                    hyperlink=annotated_text.hyperlink,
+                                )
+                        # 4) recurse into any nested lists, attaching them to this <li> item
+                        for sublist in li({"ul", "ol"}, recursive=False):
+                            if isinstance(sublist, Tag):
+                                self._handle_block(sublist, doc)
+                        # now the list element with inline group is not a parent anymore
+                        self.parents[self.level] = None
+                        self.level -= 1
+                    else:
+                        annotated_text = min_parts[0]
+                        li_text = re.sub(r"\s+|\n+", " ", annotated_text.text).strip()
+                        li_clean = HTMLDocumentBackend._clean_unicode(li_text)
+                        self.parents[self.level + 1] = doc.add_list_item(
+                            text=li_clean,
+                            enumerated=is_ordered,
+                            marker=marker,
+                            orig=li_text,
+                            parent=list_group,
+                            content_layer=self.content_layer,
+                            hyperlink=annotated_text.hyperlink,
+                        )
+                        # 4) recurse into any nested lists, attaching them to this <li> item
+                        for sublist in li({"ul", "ol"}, recursive=False):
+                            if isinstance(sublist, Tag):
+                                self.level += 1
+                                self._handle_block(sublist, doc)
+                                self.parents[self.level + 1] = None
+                                self.level -= 1
                 else:
                     for sublist in li({"ul", "ol"}, recursive=False):
                         if isinstance(sublist, Tag):
@@ -342,15 +588,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self._handle_list(tag, doc)
         elif tag_name in {"p", "address", "summary"}:
-            for part in tag.text.split("\n"):
-                seg = part.strip()
-                if seg:
-                    doc.add_text(
-                        parent=self.parents[self.level],
-                        label=DocItemLabel.TEXT,
-                        text=seg,
-                        content_layer=self.content_layer,
-                    )
+            text_list = self._extract_text_and_hyperlink_recursively(
+                tag, find_parent_annotation=True
+            )
+            annotated_texts = text_list.simplify_text_elements()
+            for part in annotated_texts.split_by_newline():
+                with self.use_inline_group(part, doc):
+                    for annotated_text in part:
+                        if seg := annotated_text.text.strip():
+                            seg_clean = HTMLDocumentBackend._clean_unicode(seg)
+                            doc.add_text(
+                                parent=self.parents[self.level],
+                                label=DocItemLabel.TEXT,
+                                text=seg_clean,
+                                content_layer=self.content_layer,
+                                hyperlink=annotated_text.hyperlink,
+                            )
             for img_tag in tag("img"):
                 if isinstance(img_tag, Tag):
                     self._emit_image(img_tag, doc)
@@ -369,13 +623,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         elif tag_name in {"pre", "code"}:
             # handle monospace code snippets (pre).
-            text = tag.get_text(strip=True)
-            if text:
-                doc.add_code(
-                    parent=self.parents[self.level],
-                    text=text,
-                    content_layer=self.content_layer,
-                )
+            text_list = self._extract_text_and_hyperlink_recursively(
+                tag, find_parent_annotation=True
+            )
+            annotated_texts = text_list.simplify_text_elements()
+            with self.use_inline_group(annotated_texts, doc):
+                for annotated_text in annotated_texts:
+                    text_clean = HTMLDocumentBackend._clean_unicode(
+                        annotated_text.text.strip()
+                    )
+                    doc.add_code(
+                        parent=self.parents[self.level],
+                        text=text_clean,
+                        content_layer=self.content_layer,
+                        hyperlink=annotated_text.hyperlink,
+                    )
         elif tag_name == "details":
             # handle details and its content.
@@ -392,18 +654,45 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
     def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
         figure = img_tag.find_parent("figure")
-        caption: str = ""
+        caption: AnnotatedTextList = AnnotatedTextList()
+        # check if the figure has a link - this is HACK:
+        def get_img_hyperlink(img_tag):
+            this_parent = img_tag.parent
+            while this_parent is not None:
+                if this_parent.name == "a" and this_parent.get("href"):
+                    return this_parent.get("href")
+                this_parent = this_parent.parent
+            return None
+        if img_hyperlink := get_img_hyperlink(img_tag):
+            caption.append(
+                AnnotatedText(text="Image Hyperlink.", hyperlink=img_hyperlink)
+            )
         if isinstance(figure, Tag):
             caption_tag = figure.find("figcaption", recursive=False)
             if isinstance(caption_tag, Tag):
-                caption = caption_tag.get_text()
-        if not caption:
-            caption = str(img_tag.get("alt", "")).strip()
+                caption = self._extract_text_and_hyperlink_recursively(
+                    caption_tag, find_parent_annotation=True
+                )
+        if not caption and img_tag.get("alt"):
+            caption = AnnotatedTextList([AnnotatedText(text=img_tag.get("alt"))])
+        caption_anno_text = caption.to_single_text_element()
         caption_item: Optional[TextItem] = None
-        if caption:
+        if caption_anno_text.text:
+            text_clean = HTMLDocumentBackend._clean_unicode(
+                caption_anno_text.text.strip()
+            )
+            print(caption_anno_text)
             caption_item = doc.add_text(
-                DocItemLabel.CAPTION, text=caption, content_layer=self.content_layer
+                label=DocItemLabel.CAPTION,
+                text=text_clean,
+                orig=caption_anno_text.text,
+                content_layer=self.content_layer,
+                hyperlink=caption_anno_text.hyperlink,
             )
         doc.add_picture(
@@ -442,6 +731,46 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return "".join(parts)
+    @staticmethod
+    def _clean_unicode(text: str) -> str:
+        """Normalize Unicode characters that commonly show up in HTML text.
+        Invisible formatting characters are removed and typographic variants
+        of spaces, hyphens, dashes, quotes and the ellipsis are replaced by
+        plain ASCII equivalents, so that downstream text processing sees
+        consistent input.
+        Args:
+            text: The original text.
+        Returns:
+            The text with the characters listed below replaced.
+        """
+        # One translation table applied in a single pass over the text.
+        table = str.maketrans(
+            {
+                "\u00a0": " ",  # non-breaking space
+                "\u200b": "",  # zero-width space
+                "\u200c": "",  # zero-width non-joiner
+                "\u200d": "",  # zero-width joiner
+                "\u2010": "-",  # hyphen
+                "\u2011": "-",  # non-breaking hyphen
+                "\u2012": "-",  # figure dash
+                "\u2013": "-",  # en dash
+                "\u2014": "-",  # em dash
+                "\u2015": "-",  # horizontal bar
+                "\u2018": "'",  # left single quotation mark
+                "\u2019": "'",  # right single quotation mark
+                "\u201c": '"',  # left double quotation mark
+                "\u201d": '"',  # right double quotation mark
+                "\u2026": "...",  # ellipsis
+                "\u00ad": "",  # soft hyphen
+                "\ufeff": "",  # zero width non-break space
+                "\u202f": " ",  # narrow non-break space
+                "\u2060": "",  # word joiner
+            }
+        )
+        return text.translate(table)
     @staticmethod
     def _get_cell_spans(cell: Tag) -> tuple[int, int]:
         """Extract colspan and rowspan values from a table cell tag.
@@ -454,9 +783,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             str(cell.get("colspan", "1")),
             str(cell.get("rowspan", "1")),
         )
+        def _extract_num(s: str) -> int:
+            if s and s[0].isnumeric():
+                match = re.search(r"\d+", s)
+                if match:
+                    return int(match.group())
+            return 1
         int_spans: tuple[int, int] = (
-            int(raw_spans[0]) if raw_spans[0].isnumeric() else 1,
-            int(raw_spans[1]) if raw_spans[0].isnumeric() else 1,
+            _extract_num(raw_spans[0]),
+            _extract_num(raw_spans[1]),
         )
         return int_spans

docling 2.43.0__py3-none-any.whl → 2.45.0__py3-none-any.whl

docling 2.43.0__py3-none-any.whl → 2.45.0__py3-none-any.whl