PyPI - docling - Versions diffs - 2.41.0__py3-none-any.whl → 2.42.1__py3-none-any.whl - Mend

docling 2.41.0__py3-none-any.whl → 2.42.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

docling/backend/docx/latex/omml.py CHANGED Viewed

@@ -260,7 +260,15 @@ class oMath2Latex(Tag2Method):
         the fraction object
         """
         c_dict = self.process_children_dict(elm)
-        pr = c_dict["fPr"]
+        pr = c_dict.get("fPr")
+        if pr is None:
+            # Handle missing fPr element gracefully
+            _log.debug("Missing fPr element in fraction, using default formatting")
+            latex_s = F_DEFAULT
+            return latex_s.format(
+                num=c_dict.get("num"),
+                den=c_dict.get("den"),
+            )
         latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
         return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den"))

docling/backend/html_backend.py CHANGED Viewed

@@ -1,10 +1,11 @@
 import logging
+import re
 import traceback
 from io import BytesIO
 from pathlib import Path
 from typing import Final, Optional, Union, cast
-from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
+from bs4 import BeautifulSoup, NavigableString, Tag
 from bs4.element import PreformattedString
 from docling_core.types.doc import (
     DocItem,
@@ -15,6 +16,7 @@ from docling_core.types.doc import (
     GroupLabel,
     TableCell,
     TableData,
+    TextItem,
 )
 from docling_core.types.doc.document import ContentLayer
 from pydantic import BaseModel
@@ -26,10 +28,14 @@ from docling.datamodel.document import InputDocument
 _log = logging.getLogger(__name__)
-# tags that generate NodeItem elements
-TAGS_FOR_NODE_ITEMS: Final = [
+DEFAULT_IMAGE_WIDTH = 128
+DEFAULT_IMAGE_HEIGHT = 128
+# Tags that initiate distinct Docling items
+_BLOCK_TAGS: Final = {
     "address",
     "details",
+    "figure",
     "h1",
     "h2",
     "h3",
@@ -41,12 +47,9 @@ TAGS_FOR_NODE_ITEMS: Final = [
     "code",
     "ul",
     "ol",
-    "li",
     "summary",
     "table",
-    "figure",
-    "img",
-]
+}
 class _Context(BaseModel):
@@ -56,12 +59,16 @@ class _Context(BaseModel):
 class HTMLDocumentBackend(DeclarativeDocumentBackend):
     @override
-    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+    def __init__(
+        self,
+        in_doc: InputDocument,
+        path_or_stream: Union[BytesIO, Path],
+    ):
         super().__init__(in_doc, path_or_stream)
         self.soup: Optional[Tag] = None
-        # HTML file:
         self.path_or_stream = path_or_stream
-        # Initialise the parents for the hierarchy
+        # Initialize the parents for the hierarchy
         self.max_levels = 10
         self.level = 0
         self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
@@ -70,13 +77,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self.parents[i] = None
         try:
-            if isinstance(self.path_or_stream, BytesIO):
-                text_stream = self.path_or_stream.getvalue()
-                self.soup = BeautifulSoup(text_stream, "html.parser")
-            if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "rb") as f:
-                    html_content = f.read()
-                    self.soup = BeautifulSoup(html_content, "html.parser")
+            raw = (
+                path_or_stream.getvalue()
+                if isinstance(path_or_stream, BytesIO)
+                else Path(path_or_stream).read_bytes()
+            )
+            self.soup = BeautifulSoup(raw, "html.parser")
         except Exception as e:
             raise RuntimeError(
                 "Could not initialize HTML backend for file with "
@@ -96,7 +102,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
     def unload(self):
         if isinstance(self.path_or_stream, BytesIO):
             self.path_or_stream.close()
         self.path_or_stream = None
     @classmethod
@@ -106,211 +111,156 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
     @override
     def convert(self) -> DoclingDocument:
-        # access self.path_or_stream to load stuff
+        _log.debug("Starting HTML conversion...")
+        if not self.is_valid():
+            raise RuntimeError("Invalid HTML document.")
         origin = DocumentOrigin(
             filename=self.file.name or "file",
             mimetype="text/html",
             binary_hash=self.document_hash,
         )
         doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
-        _log.debug("Trying to convert HTML...")
-        if self.is_valid():
-            assert self.soup is not None
-            content = self.soup.body or self.soup
-            # Replace <br> tags with newline characters
-            # TODO: remove style to avoid losing text from tags like i, b, span, ...
-            for br in content("br"):
-                br.replace_with(NavigableString("\n"))
-            headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
-            self.content_layer = (
-                ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
-            )
-            self.ctx = _Context()  # reset context
-            self.walk(content, doc)
-        else:
-            raise RuntimeError(
-                f"Cannot convert doc with {self.document_hash} because the backend "
-                "failed to init."
-            )
-        return doc
-    def walk(self, tag: Tag, doc: DoclingDocument) -> None:
-        # Iterate over elements in the body of the document
-        text: str = ""
-        for element in tag.children:
-            if isinstance(element, Tag):
-                try:
-                    self.analyze_tag(cast(Tag, element), doc)
-                except Exception as exc_child:
-                    _log.error(
-                        f"Error processing child from tag {tag.name}:\n{traceback.format_exc()}"
-                    )
-                    raise exc_child
-            elif isinstance(element, NavigableString) and not isinstance(
-                element, PreformattedString
-            ):
-                # Floating text outside paragraphs or analyzed tags
-                text += element
-                siblings: list[Tag] = [
-                    item for item in element.next_siblings if isinstance(item, Tag)
-                ]
-                if element.next_sibling is None or any(
-                    item.name in TAGS_FOR_NODE_ITEMS for item in siblings
-                ):
-                    text = text.strip()
-                    if text and tag.name in ["div"]:
-                        doc.add_text(
-                            parent=self.parents[self.level],
-                            label=DocItemLabel.TEXT,
-                            text=text,
-                            content_layer=self.content_layer,
-                        )
-                    text = ""
-        return
-    def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
-        if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
-            self.handle_header(tag, doc)
-        elif tag.name in ["p", "address", "summary"]:
-            self.handle_paragraph(tag, doc)
-        elif tag.name in ["pre", "code"]:
-            self.handle_code(tag, doc)
-        elif tag.name in ["ul", "ol"]:
-            self.handle_list(tag, doc)
-        elif tag.name in ["li"]:
-            self.handle_list_item(tag, doc)
-        elif tag.name == "table":
-            self.handle_table(tag, doc)
-        elif tag.name == "figure":
-            self.handle_figure(tag, doc)
-        elif tag.name == "img":
-            self.handle_image(tag, doc)
-        elif tag.name == "details":
-            self.handle_details(tag, doc)
-        else:
-            self.walk(tag, doc)
-    def get_text(self, item: PageElement) -> str:
-        """Get the text content of a tag."""
-        parts: list[str] = self.extract_text_recursively(item)
-        return "".join(parts) + " "
-    # Function to recursively extract text from all child nodes
-    def extract_text_recursively(self, item: PageElement) -> list[str]:
-        result: list[str] = []
-        if isinstance(item, NavigableString):
-            return [item]
-        tag = cast(Tag, item)
-        if tag.name not in ["ul", "ol"]:
-            for child in tag:
-                # Recursively get the child's text content
-                result.extend(self.extract_text_recursively(child))
+        assert self.soup is not None
+        # set the title as furniture, since it is part of the document metadata
+        title = self.soup.title
+        if title:
+            doc.add_title(
+                text=title.get_text(separator=" ", strip=True),
+                content_layer=ContentLayer.FURNITURE,
+            )
+        # remove scripts/styles
+        for tag in self.soup(["script", "style"]):
+            tag.decompose()
+        content = self.soup.body or self.soup
+        # normalize <br> tags
+        for br in content("br"):
+            br.replace_with(NavigableString("\n"))
+        # set default content layer
+        headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
+        self.content_layer = (
+            ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
+        )
+        # reset context
+        self.ctx = _Context()
-        return ["".join(result) + " "]
+        try:
+            self._walk(content, doc)
+        except Exception:
+            print(traceback.format_exc())
-    def handle_details(self, element: Tag, doc: DoclingDocument) -> None:
-        """Handle details tag (details) and its content."""
+        return doc
-        self.parents[self.level + 1] = doc.add_group(
-            name="details",
-            label=GroupLabel.SECTION,
-            parent=self.parents[self.level],
-            content_layer=self.content_layer,
-        )
+    def _walk(self, element: Tag, doc: DoclingDocument) -> None:
+        """Parse an XML tag by recursively walking its content.
+        While walking, the method buffers inline text across tags like <b> or <span>,
+        emitting text nodes only at block boundaries.
+        Args:
+            element: The XML tag to parse.
+            doc: The Docling document to be updated with the parsed content.
+        """
+        buffer: list[str] = []
+        def flush_buffer():
+            if not buffer:
+                return
+            text = "".join(buffer).strip()
+            buffer.clear()
+            if not text:
+                return
+            for part in text.split("\n"):
+                seg = part.strip()
+                if seg:
+                    doc.add_text(
+                        DocItemLabel.TEXT,
+                        seg,
+                        parent=self.parents[self.level],
+                        content_layer=self.content_layer,
+                    )
-        self.level += 1
-        self.walk(element, doc)
-        self.parents[self.level + 1] = None
-        self.level -= 1
+        for node in element.contents:
+            if isinstance(node, Tag):
+                name = node.name.lower()
+                if name == "img":
+                    flush_buffer()
+                    self._emit_image(node, doc)
+                elif name in _BLOCK_TAGS:
+                    flush_buffer()
+                    self._handle_block(node, doc)
+                elif node.find(_BLOCK_TAGS):
+                    flush_buffer()
+                    self._walk(node, doc)
+                else:
+                    buffer.append(node.text)
+            elif isinstance(node, NavigableString) and not isinstance(
+                node, PreformattedString
+            ):
+                buffer.append(str(node))
-    def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
-        """Handles header tags (h1, h2, etc.)."""
-        hlevel = int(element.name.replace("h", ""))
-        text = element.text.strip()
+        flush_buffer()
+    def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
+        tag_name = tag.name.lower()
+        # set default content layer to BODY as soon as we encounter a heading
         self.content_layer = ContentLayer.BODY
-        if hlevel == 1:
+        level = int(tag_name[1])
+        text = tag.get_text(strip=True, separator=" ")
+        # the first level is for the title item
+        if level == 1:
             for key in self.parents.keys():
                 self.parents[key] = None
-            self.level = 1
-            self.parents[self.level] = doc.add_text(
-                parent=self.parents[0],
-                label=DocItemLabel.TITLE,
-                text=text,
-                content_layer=self.content_layer,
+            self.level = 0
+            self.parents[self.level + 1] = doc.add_title(
+                text, content_layer=self.content_layer
             )
+        # the other levels need to be lowered by 1 if a title was set
         else:
-            if hlevel > self.level:
+            level -= 1
+            if level > self.level:
                 # add invisible group
-                for i in range(self.level + 1, hlevel):
-                    self.parents[i] = doc.add_group(
-                        name=f"header-{i}",
+                for i in range(self.level, level):
+                    _log.debug(f"Adding invisible group to level {i}")
+                    self.parents[i + 1] = doc.add_group(
+                        name=f"header-{i + 1}",
                         label=GroupLabel.SECTION,
-                        parent=self.parents[i - 1],
+                        parent=self.parents[i],
                         content_layer=self.content_layer,
                     )
-                self.level = hlevel
-            elif hlevel < self.level:
+                self.level = level
+            elif level < self.level:
                 # remove the tail
                 for key in self.parents.keys():
-                    if key > hlevel:
+                    if key > level + 1:
+                        _log.debug(f"Remove the tail of level {key}")
                         self.parents[key] = None
-                self.level = hlevel
-            self.parents[hlevel] = doc.add_heading(
-                parent=self.parents[hlevel - 1],
-                text=text,
-                level=hlevel - 1,
-                content_layer=self.content_layer,
-            )
-    def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
-        """Handles monospace code snippets (pre)."""
-        if element.text is None:
-            return
-        text = element.text.strip()
-        if text:
-            doc.add_code(
-                parent=self.parents[self.level],
-                text=text,
-                content_layer=self.content_layer,
-            )
-    def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
-        """Handles paragraph tags (p) or equivalent ones."""
-        if element.text is None:
-            return
-        text = element.text.strip()
-        if text:
-            doc.add_text(
+                self.level = level
+            self.parents[self.level + 1] = doc.add_heading(
                 parent=self.parents[self.level],
-                label=DocItemLabel.TEXT,
                 text=text,
+                level=self.level,
                 content_layer=self.content_layer,
             )
+        self.level += 1
+        for img_tag in tag("img"):
+            if isinstance(img_tag, Tag):
+                self._emit_image(img_tag, doc)
-    def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
-        """Handles list tags (ul, ol) and their list items."""
+    def _handle_list(self, tag: Tag, doc: DoclingDocument) -> None:
+        tag_name = tag.name.lower()
         start: Optional[int] = None
-        if is_ordered := element.name == "ol":
-            start_attr = element.get("start")
+        name: str = ""
+        is_ordered = tag_name == "ol"
+        if is_ordered:
+            start_attr = tag.get("start")
             if isinstance(start_attr, str) and start_attr.isnumeric():
                 start = int(start_attr)
             name = "ordered list" + (f" start {start}" if start is not None else "")
         else:
             name = "list"
-        # create a list group
+        # Create the list container
         list_group = doc.add_list_group(
             name=name,
             parent=self.parents[self.level],
@@ -320,64 +270,171 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered
         if is_ordered and start is not None:
             self.ctx.list_start_by_ref[list_group.self_ref] = start
         self.level += 1
-        self.walk(element, doc)
+        # For each top-level <li> in this list
+        for li in tag.find_all({"li", "ul", "ol"}, recursive=False):
+            if not isinstance(li, Tag):
+                continue
+            # sub-list items should be indented under main list items, but temporarily
+            # addressing invalid HTML (docling-core/issues/357)
+            if li.name in {"ul", "ol"}:
+                self._handle_block(li, doc)
+            else:
+                # 1) determine the marker
+                if is_ordered and start is not None:
+                    marker = f"{start + len(list_group.children)}."
+                else:
+                    marker = ""
+                # 2) extract only the "direct" text from this <li>
+                parts: list[str] = []
+                for child in li.contents:
+                    if isinstance(child, NavigableString) and not isinstance(
+                        child, PreformattedString
+                    ):
+                        parts.append(child)
+                    elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
+                        text_part = child.get_text()
+                        if text_part:
+                            parts.append(text_part)
+                li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
+                # 3) add the list item
+                if li_text:
+                    self.parents[self.level + 1] = doc.add_list_item(
+                        text=li_text,
+                        enumerated=is_ordered,
+                        marker=marker,
+                        parent=list_group,
+                        content_layer=self.content_layer,
+                    )
+                    # 4) recurse into any nested lists, attaching them to this <li> item
+                    for sublist in li({"ul", "ol"}, recursive=False):
+                        if isinstance(sublist, Tag):
+                            self.level += 1
+                            self._handle_block(sublist, doc)
+                            self.parents[self.level + 1] = None
+                            self.level -= 1
+                else:
+                    for sublist in li({"ul", "ol"}, recursive=False):
+                        if isinstance(sublist, Tag):
+                            self._handle_block(sublist, doc)
+                # 5) extract any images under this <li>
+                for img_tag in li("img"):
+                    if isinstance(img_tag, Tag):
+                        self._emit_image(img_tag, doc)
         self.parents[self.level + 1] = None
         self.level -= 1
-    def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
-        """Handles list item tags (li)."""
-        nested_list = element.find(["ul", "ol"])
-        parent = self.parents[self.level]
-        if parent is None:
-            _log.debug(f"list-item has no parent in DoclingDocument: {element}")
-            return
-        enumerated = self.ctx.list_ordered_flag_by_ref.get(parent.self_ref, False)
-        if enumerated and (start := self.ctx.list_start_by_ref.get(parent.self_ref)):
-            marker = f"{start + len(parent.children)}."
-        else:
-            marker = ""
-        if nested_list:
-            # Text in list item can be hidden within hierarchy, hence
-            # we need to extract it recursively
-            text: str = self.get_text(element)
-            # Flatten text, remove break lines:
-            text = text.replace("\n", "").replace("\r", "")
-            text = " ".join(text.split()).strip()
-            if len(text) > 0:
-                # create a list-item
-                self.parents[self.level + 1] = doc.add_list_item(
-                    text=text,
-                    enumerated=enumerated,
-                    marker=marker,
-                    parent=parent,
+    def _handle_block(self, tag: Tag, doc: DoclingDocument) -> None:
+        tag_name = tag.name.lower()
+        if tag_name == "figure":
+            img_tag = tag.find("img")
+            if isinstance(img_tag, Tag):
+                self._emit_image(img_tag, doc)
+        elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
+            self._handle_heading(tag, doc)
+        elif tag_name in {"ul", "ol"}:
+            self._handle_list(tag, doc)
+        elif tag_name in {"p", "address", "summary"}:
+            for part in tag.text.split("\n"):
+                seg = part.strip()
+                if seg:
+                    doc.add_text(
+                        parent=self.parents[self.level],
+                        label=DocItemLabel.TEXT,
+                        text=seg,
+                        content_layer=self.content_layer,
+                    )
+            for img_tag in tag("img"):
+                if isinstance(img_tag, Tag):
+                    self._emit_image(img_tag, doc)
+        elif tag_name == "table":
+            data = HTMLDocumentBackend.parse_table_data(tag)
+            for img_tag in tag("img"):
+                if isinstance(img_tag, Tag):
+                    self._emit_image(tag, doc)
+            if data is not None:
+                doc.add_table(
+                    data=data,
+                    parent=self.parents[self.level],
                     content_layer=self.content_layer,
                 )
-                self.level += 1
-                self.walk(element, doc)
-                self.parents[self.level + 1] = None
-                self.level -= 1
-            else:
-                self.walk(element, doc)
-        elif element.text.strip():
-            text = element.text.strip()
+        elif tag_name in {"pre", "code"}:
+            # handle monospace code snippets (pre).
+            text = tag.get_text(strip=True)
+            if text:
+                doc.add_code(
+                    parent=self.parents[self.level],
+                    text=text,
+                    content_layer=self.content_layer,
+                )
-            doc.add_list_item(
-                text=text,
-                enumerated=enumerated,
-                marker=marker,
-                parent=parent,
+        elif tag_name == "details":
+            # handle details and its content.
+            self.parents[self.level + 1] = doc.add_group(
+                name="details",
+                label=GroupLabel.SECTION,
+                parent=self.parents[self.level],
                 content_layer=self.content_layer,
             )
-        else:
-            _log.debug(f"list-item has no text: {element}")
+            self.level += 1
+            self._walk(tag, doc)
+            self.parents[self.level + 1] = None
+            self.level -= 1
+    def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
+        figure = img_tag.find_parent("figure")
+        caption: str = ""
+        if isinstance(figure, Tag):
+            caption_tag = figure.find("figcaption", recursive=False)
+            if isinstance(caption_tag, Tag):
+                caption = caption_tag.get_text()
+        if not caption:
+            caption = str(img_tag.get("alt", "")).strip()
+        caption_item: Optional[TextItem] = None
+        if caption:
+            caption_item = doc.add_text(
+                DocItemLabel.CAPTION, text=caption, content_layer=self.content_layer
+            )
+        doc.add_picture(
+            caption=caption_item,
+            parent=self.parents[self.level],
+            content_layer=self.content_layer,
+        )
+    @staticmethod
+    def _get_cell_spans(cell: Tag) -> tuple[int, int]:
+        """Extract colspan and rowspan values from a table cell tag.
+        This function retrieves the 'colspan' and 'rowspan' attributes from a given
+        table cell tag.
+        If the attribute does not exist or it is not numeric, it defaults to 1.
+        """
+        raw_spans: tuple[str, str] = (
+            str(cell.get("colspan", "1")),
+            str(cell.get("rowspan", "1")),
+        )
+        int_spans: tuple[int, int] = (
+            int(raw_spans[0]) if raw_spans[0].isnumeric() else 1,
+            int(raw_spans[1]) if raw_spans[0].isnumeric() else 1,
+        )
+        return int_spans
     @staticmethod
     def parse_table_data(element: Tag) -> Optional[TableData]:  # noqa: C901
@@ -398,10 +455,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 if not isinstance(row, Tag):
                     continue
                 cell_tag = cast(Tag, cell)
-                val = cell_tag.get("colspan", "1")
-                colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
-                col_count += colspan
-                if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
+                col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell_tag)
+                col_count += col_span
+                if cell_tag.name == "td" or row_span == 1:
                     is_row_header = False
             num_cols = max(num_cols, col_count)
             if not is_row_header:
@@ -428,10 +484,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             row_header = True
             for html_cell in cells:
                 if isinstance(html_cell, Tag):
+                    _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
                     if html_cell.name == "td":
                         col_header = False
                         row_header = False
-                    elif html_cell.get("rowspan") is None:
+                    elif row_span == 1:
                         row_header = False
             if not row_header:
                 row_idx += 1
@@ -456,18 +513,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 text = html_cell.text
                 # label = html_cell.name
-                col_val = html_cell.get("colspan", "1")
-                col_span = (
-                    int(col_val)
-                    if isinstance(col_val, str) and col_val.isnumeric()
-                    else 1
-                )
-                row_val = html_cell.get("rowspan", "1")
-                row_span = (
-                    int(row_val)
-                    if isinstance(row_val, str) and row_val.isnumeric()
-                    else 1
-                )
+                col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
                 if row_header:
                     row_span -= 1
                 while (
@@ -494,84 +540,3 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 data.table_cells.append(table_cell)
         return data
-    def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
-        """Handles table tags."""
-        table_data = HTMLDocumentBackend.parse_table_data(element)
-        if table_data is not None:
-            doc.add_table(
-                data=table_data,
-                parent=self.parents[self.level],
-                content_layer=self.content_layer,
-            )
-    def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
-        """Recursively extract text from <ul> or <ol> with proper indentation."""
-        result = []
-        bullet_char = "*"  # Default bullet character for unordered lists
-        if list_element.name == "ol":  # For ordered lists, use numbers
-            for i, li in enumerate(list_element("li", recursive=False), 1):
-                if not isinstance(li, Tag):
-                    continue
-                # Add numbering for ordered lists
-                result.append(f"{'    ' * level}{i}. {li.get_text(strip=True)}")
-                # Handle nested lists
-                nested_list = li.find(["ul", "ol"])
-                if isinstance(nested_list, Tag):
-                    result.extend(self.get_list_text(nested_list, level + 1))
-        elif list_element.name == "ul":  # For unordered lists, use bullet points
-            for li in list_element("li", recursive=False):
-                if not isinstance(li, Tag):
-                    continue
-                # Add bullet points for unordered lists
-                result.append(
-                    f"{'    ' * level}{bullet_char} {li.get_text(strip=True)}"
-                )
-                # Handle nested lists
-                nested_list = li.find(["ul", "ol"])
-                if isinstance(nested_list, Tag):
-                    result.extend(self.get_list_text(nested_list, level + 1))
-        return result
-    def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:
-        """Handles image tags (img)."""
-        # Extract the image URI from the <img> tag
-        # image_uri = root.xpath('//figure//img/@src')[0]
-        contains_captions = element.find(["figcaption"])
-        if not isinstance(contains_captions, Tag):
-            doc.add_picture(
-                parent=self.parents[self.level],
-                caption=None,
-                content_layer=self.content_layer,
-            )
-        else:
-            texts = []
-            for item in contains_captions:
-                texts.append(item.text)
-            fig_caption = doc.add_text(
-                label=DocItemLabel.CAPTION,
-                text=("".join(texts)).strip(),
-                content_layer=self.content_layer,
-            )
-            doc.add_picture(
-                parent=self.parents[self.level],
-                caption=fig_caption,
-                content_layer=self.content_layer,
-            )
-    def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
-        """Handles image tags (img)."""
-        _log.debug(f"ignoring <img> tags at the moment: {element}")
-        doc.add_picture(
-            parent=self.parents[self.level],
-            caption=None,
-            content_layer=self.content_layer,
-        )

docling/backend/xml/jats_backend.py CHANGED Viewed

@@ -93,8 +93,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
         # Initialize the root of the document hierarchy
         self.root: Optional[NodeItem] = None
-        self.valid = False
+        self.hlevel: int = 0
+        self.valid: bool = False
         try:
             if isinstance(self.path_or_stream, BytesIO):
                 self.path_or_stream.seek(0)
@@ -147,6 +147,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
                 binary_hash=self.document_hash,
             )
             doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
+            self.hlevel = 0
             # Get metadata XML components
             xml_components: XMLComponents = self._parse_metadata()
@@ -304,7 +305,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
             title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
             if not text:
                 continue
-            parent = doc.add_heading(parent=self.root, text=title)
+            parent = doc.add_heading(
+                parent=self.root, text=title, level=self.hlevel + 1
+            )
             doc.add_text(
                 parent=parent,
                 text=text,
@@ -637,7 +640,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
                 elif child.tag == "ack":
                     text = DEFAULT_HEADER_ACKNOWLEDGMENTS
                 if text:
-                    new_parent = doc.add_heading(text=text, parent=parent)
+                    self.hlevel += 1
+                    new_parent = doc.add_heading(
+                        text=text, parent=parent, level=self.hlevel
+                    )
             elif child.tag == "list":
                 new_parent = doc.add_group(
                     label=GroupLabel.LIST, name="list", parent=parent
@@ -694,6 +700,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
                 new_text = self._walk_linear(doc, new_parent, child)
                 if not (node.getparent().tag == "p" and node.tag in flush_tags):
                     node_text += new_text
+                if child.tag in ("sec", "ack") and text:
+                    self.hlevel -= 1
             # pick up the tail text
             node_text += child.tail.replace("\n", " ") if child.tail else ""

docling/datamodel/pipeline_options.py CHANGED Viewed

@@ -217,7 +217,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
 # GraniteVision
 granite_picture_description = PictureDescriptionVlmOptions(
-    repo_id="ibm-granite/granite-vision-3.2-2b-preview",
+    repo_id="ibm-granite/granite-vision-3.3-2b",
     prompt="What is shown in this image?",
 )
@@ -279,6 +279,9 @@ class LayoutOptions(BaseModel):
     """Options for layout processing."""
     create_orphan_clusters: bool = True  # Whether to create clusters for orphaned cells
+    keep_empty_clusters: bool = (
+        False  # Whether to keep clusters that contain no text cells
+    )
     model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2

docling/document_converter.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import hashlib
 import logging
 import sys
+import threading
 import time
 from collections.abc import Iterable, Iterator
 from functools import partial
@@ -49,6 +50,7 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 from docling.utils.utils import chunkify
 _log = logging.getLogger(__name__)
+_PIPELINE_CACHE_LOCK = threading.Lock()
 class FormatOption(BaseModel):
@@ -315,17 +317,18 @@ class DocumentConverter:
         # Use a composite key to cache pipelines
         cache_key = (pipeline_class, options_hash)
-        if cache_key not in self.initialized_pipelines:
-            _log.info(
-                f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
-            )
-            self.initialized_pipelines[cache_key] = pipeline_class(
-                pipeline_options=pipeline_options
-            )
-        else:
-            _log.debug(
-                f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
-            )
+        with _PIPELINE_CACHE_LOCK:
+            if cache_key not in self.initialized_pipelines:
+                _log.info(
+                    f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
+                )
+                self.initialized_pipelines[cache_key] = pipeline_class(
+                    pipeline_options=pipeline_options
+                )
+            else:
+                _log.debug(
+                    f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
+                )
         return self.initialized_pipelines[cache_key]

docling/models/picture_description_vlm_model.py CHANGED Viewed

@@ -65,6 +65,7 @@ class PictureDescriptionVlmModel(
                 self.processor = AutoProcessor.from_pretrained(artifacts_path)
                 self.model = AutoModelForVision2Seq.from_pretrained(
                     artifacts_path,
+                    device_map=self.device,
                     torch_dtype=torch.bfloat16,
                     _attn_implementation=(
                         "flash_attention_2"
@@ -72,7 +73,7 @@ class PictureDescriptionVlmModel(
                         and accelerator_options.cuda_use_flash_attention2
                         else "eager"
                     ),
-                ).to(self.device)
+                )
             self.provenance = f"{self.options.repo_id}"

docling/utils/layout_postprocessor.py CHANGED Viewed

@@ -267,8 +267,14 @@ class LayoutPostprocessor:
         # Initial cell assignment
         clusters = self._assign_cells_to_clusters(clusters)
-        # Remove clusters with no cells
-        clusters = [cluster for cluster in clusters if cluster.cells]
+        # Remove clusters with no cells (if keep_empty_clusters is False),
+        # but always keep clusters with label DocItemLabel.FORMULA
+        if not self.options.keep_empty_clusters:
+            clusters = [
+                cluster
+                for cluster in clusters
+                if cluster.cells or cluster.label == DocItemLabel.FORMULA
+            ]
         # Handle orphaned cells
         unassigned = self._find_unassigned_cells(clusters)

{docling-2.41.0.dist-info → docling-2.42.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.41.0
+Version: 2.42.1
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -50,6 +50,7 @@ Requires-Dist: tqdm<5.0.0,>=4.65.0
 Requires-Dist: pluggy<2.0.0,>=1.0.0
 Requires-Dist: pylatexenc<3.0,>=2.10
 Requires-Dist: scipy<2.0.0,>=1.6.0
+Requires-Dist: accelerate<2,>=1.0.0
 Provides-Extra: tesserocr
 Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
 Provides-Extra: ocrmac

{docling-2.41.0.dist-info → docling-2.42.1.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
 docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/document_converter.py,sha256=3jWywP_TLy-1PMvjJBUlnTM9FNzpBLRCHYA1RKFvGR4,14333
+docling/document_converter.py,sha256=9aH8B30_jOYN4P_ySCCvtgEb3GoIpec15r7lEAFlMDU,14469
 docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
 docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -9,7 +9,7 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
 docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
 docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
 docling/backend/docling_parse_v4_backend.py,sha256=qR_WRVq9JGtRioWCw6MnLWgbvXbC6Y1yds7Ol1-E6UQ,6550
-docling/backend/html_backend.py,sha256=Z959dzqYQO2pPE4xgPRxC5MR9j3nFGtiD6_F_osQ2iI,20670
+docling/backend/html_backend.py,sha256=gGkm3i7FpW2WCJ-_GPpOJNh1LUq1_-vRGyGURuPagck,19284
 docling/backend/md_backend.py,sha256=mfwGj8g2hGC-Q_HREtl_Web65uMVXD-Ie1nRqWTXzF0,21013
 docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
 docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
@@ -20,11 +20,11 @@ docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA
 docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/docx/latex/latex_dict.py,sha256=tFJp4ScT_AkY2ON7nLEa560p601Jq2glcZvMKxxjn7w,6593
-docling/backend/docx/latex/omml.py,sha256=nEpcfyyrOucJyj6cD7wfThrIa-q0CQCoqMb3dkrhCRg,12094
+docling/backend/docx/latex/omml.py,sha256=4vh9FCbXh-Tb6KJGqNwzlMUMYEnnJgBtBI24dwy6t2U,12416
 docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
 docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/backend/xml/jats_backend.py,sha256=ghGi9bHjx3BvaOtmzLw86-wZy4UxpQPOPQL4e73-BI8,24927
+docling/backend/xml/jats_backend.py,sha256=LPj33EFdi2MRCakkLWrRLlUAc-B-949f8zp5gKNvBcg,25238
 docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
 docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -37,7 +37,7 @@ docling/datamodel/asr_model_specs.py,sha256=Wg7z3zm_wXIWu122iPVy0RMECsA_JCFHrlFF
 docling/datamodel/base_models.py,sha256=9FslHkGUNmBp264LpLL_2JTfDAdaikldYs3SiQOHb5A,11828
 docling/datamodel/document.py,sha256=CA_dgt4V_phze5HXpfgfKNBKd1cPC1o3WE_IENX63EM,16252
 docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
-docling/datamodel/pipeline_options.py,sha256=aMwpbyEMbAC-xGJnjQp8iw2ocpSU4eiD8D73gHf7T4U,10033
+docling/datamodel/pipeline_options.py,sha256=nlejeQjnJx2RBMkCukDECHGuVEOol9hbsSLUi2ee9hY,10134
 docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
 docling/datamodel/pipeline_options_vlm_model.py,sha256=z-pUqwRA8nJp6C3SEXZLem2zvSYdgavaAVYa8wkAIZY,2400
 docling/datamodel/settings.py,sha256=ajMz7Ao2m0ZGYkfArqTDDbiF89O408mtgeh06PUi0MA,1900
@@ -55,7 +55,7 @@ docling/models/page_assemble_model.py,sha256=TvN1naez7dUodLxpUUBzpuMCpqZBTf6YSpe
 docling/models/page_preprocessing_model.py,sha256=x8MI4mvjizqEqAb5511dtrNRCJSb-lSmwHw0tmHPFiI,5103
 docling/models/picture_description_api_model.py,sha256=o3EkV5aHW_6WzE_fdj_VRnNCrS_btclO_ZCLAUqrfl0,2377
 docling/models/picture_description_base_model.py,sha256=kLthLhdlgwhootQ4_xhhcAk6A-vso5-qcsFJ3TcYfO0,2991
-docling/models/picture_description_vlm_model.py,sha256=nAUt-eZOX2GvaCiV2BJO7VppxUbP7udVIF4oe_sEYXo,4000
+docling/models/picture_description_vlm_model.py,sha256=yfyAFOy8RjxQJrafPMSAMrrpaYu3anahjRX6tCnVcs0,4028
 docling/models/rapid_ocr_model.py,sha256=AMdc66s_iWO4p6nQ0LNjQMUYVxrDSxMyLNPpjPYt6N8,5916
 docling/models/readingorder_model.py,sha256=bZoXHaSwUsa8niSmJrbCuy784ixCeBXT-RQBUfgHJ4A,14925
 docling/models/table_structure_model.py,sha256=RFXo73f2q4XuKyaSqbxpznh7JVtlLcT0FsOWl9oZbSg,12518
@@ -83,7 +83,7 @@ docling/utils/accelerator_utils.py,sha256=DSajLxVx1JEVT0zt5de26llciLNlVfIDfSa2zY
 docling/utils/api_image_request.py,sha256=_CgdzmPqdsyXmyYUFGLZcXcoH586qC6A1p5vsNbj1Q0,1416
 docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
 docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
-docling/utils/layout_postprocessor.py,sha256=QuTZZq4LNs1eM_n_2gubVfAuLBMkJiozfs3hp-jUpK4,24399
+docling/utils/layout_postprocessor.py,sha256=LFLbBE-o3kWu79d8ZcyHlZPIqzQfCabZCIPTJ51lZsY,24657
 docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
 docling/utils/model_downloader.py,sha256=3vijCsAIVwWqehGBDRxRq7mJ3yRb9-zBsG00iqjqegU,4076
 docling/utils/ocr_utils.py,sha256=nmresYyfin0raanpQc_GGeU3WoLsfExf6SEXNIQ7Djg,2325
@@ -91,9 +91,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
 docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
-docling-2.41.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.41.0.dist-info/METADATA,sha256=KYqB0miKX2x2ESNy8tNHdAlyTCONqhwGLR2iag2PcQ0,10274
-docling-2.41.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-docling-2.41.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
-docling-2.41.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
-docling-2.41.0.dist-info/RECORD,,
+docling-2.42.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.42.1.dist-info/METADATA,sha256=d46NOPDEps6dVLLMh3tWBCEQv7b_bwQQ46ndyqVO-ag,10310
+docling-2.42.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+docling-2.42.1.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
+docling-2.42.1.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
+docling-2.42.1.dist-info/RECORD,,

{docling-2.41.0.dist-info → docling-2.42.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{docling-2.41.0.dist-info → docling-2.42.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{docling-2.41.0.dist-info → docling-2.42.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{docling-2.41.0.dist-info → docling-2.42.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

docling 2.41.0__py3-none-any.whl → 2.42.1__py3-none-any.whl

docling 2.41.0__py3-none-any.whl → 2.42.1__py3-none-any.whl