PyPI - docling - Versions diffs - 2.30.0__py3-none-any.whl → 2.31.1__py3-none-any.whl - Mend

docling 2.30.0__py3-none-any.whl → 2.31.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

docling/backend/asciidoc_backend.py +7 -15
docling/backend/csv_backend.py +1 -1
docling/backend/docling_parse_backend.py +2 -2
docling/backend/docling_parse_v2_backend.py +2 -2
docling/backend/docling_parse_v4_backend.py +3 -4
docling/backend/docx/latex/latex_dict.py +0 -5
docling/backend/docx/latex/omml.py +4 -7
docling/backend/html_backend.py +66 -25
docling/backend/md_backend.py +6 -8
docling/backend/msexcel_backend.py +1 -7
docling/backend/mspowerpoint_backend.py +4 -7
docling/backend/msword_backend.py +5 -5
docling/backend/pdf_backend.py +2 -1
docling/backend/pypdfium2_backend.py +3 -3
docling/backend/xml/jats_backend.py +11 -14
docling/backend/xml/uspto_backend.py +19 -23
docling/cli/main.py +8 -8
docling/cli/models.py +6 -3
docling/datamodel/base_models.py +7 -5
docling/datamodel/document.py +19 -10
docling/datamodel/pipeline_options.py +0 -1
docling/document_converter.py +8 -6
docling/models/api_vlm_model.py +1 -2
docling/models/base_model.py +2 -4
docling/models/base_ocr_model.py +2 -2
docling/models/code_formula_model.py +2 -1
docling/models/document_picture_classifier.py +2 -1
docling/models/easyocr_model.py +10 -11
docling/models/factories/__init__.py +2 -2
docling/models/factories/base_factory.py +1 -1
docling/models/hf_mlx_model.py +4 -6
docling/models/hf_vlm_model.py +7 -5
docling/models/layout_model.py +2 -2
docling/models/ocr_mac_model.py +3 -4
docling/models/page_assemble_model.py +7 -12
docling/models/page_preprocessing_model.py +2 -1
docling/models/picture_description_api_model.py +2 -1
docling/models/picture_description_base_model.py +2 -3
docling/models/picture_description_vlm_model.py +6 -4
docling/models/rapid_ocr_model.py +2 -3
docling/models/readingorder_model.py +9 -24
docling/models/table_structure_model.py +4 -8
docling/models/tesseract_ocr_cli_model.py +17 -16
docling/models/tesseract_ocr_model.py +9 -5
docling/pipeline/base_pipeline.py +4 -8
docling/pipeline/simple_pipeline.py +0 -1
docling/pipeline/standard_pdf_pipeline.py +0 -1
docling/pipeline/vlm_pipeline.py +0 -3
docling/utils/export.py +2 -4
docling/utils/glm_utils.py +2 -2
docling/utils/layout_postprocessor.py +4 -2
docling/utils/model_downloader.py +31 -7
docling/utils/utils.py +3 -3
{docling-2.30.0.dist-info → docling-2.31.1.dist-info}/METADATA +2 -1
docling-2.31.1.dist-info/RECORD +86 -0
docling-2.30.0.dist-info/RECORD +0 -86
{docling-2.30.0.dist-info → docling-2.31.1.dist-info}/LICENSE +0 -0
{docling-2.30.0.dist-info → docling-2.31.1.dist-info}/WHEEL +0 -0
{docling-2.30.0.dist-info → docling-2.31.1.dist-info}/entry_points.txt +0 -0

docling/backend/asciidoc_backend.py CHANGED Viewed

@@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 text_stream = self.path_or_stream.getvalue().decode("utf-8")
                 self.lines = text_stream.split("\n")
             if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                with open(self.path_or_stream, encoding="utf-8") as f:
                     self.lines = f.readlines()
             self.valid = True
@@ -75,14 +75,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return doc
-    def _parse(self, doc: DoclingDocument):
+    def _parse(self, doc: DoclingDocument):  # noqa: C901
         """
         Main function that orchestrates the parsing by yielding components:
         title, section headers, text, lists, and tables.
         """
-        content = ""
         in_list = False
         in_table = False
@@ -95,7 +93,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         # indents: dict[int, Union[DocItem, GroupItem, None]] = {}
         indents: dict[int, Union[GroupItem, None]] = {}
-        for i in range(0, 10):
+        for i in range(10):
             parents[i] = None
             indents[i] = None
@@ -125,7 +123,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
             # Lists
             elif self._is_list_item(line):
                 _log.debug(f"line: {line}")
                 item = self._parse_list_item(line)
                 _log.debug(f"parsed list-item: {item}")
@@ -147,7 +144,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                     indents[level + 1] = item["indent"]
                 elif in_list and item["indent"] < indents[level]:
                     # print(item["indent"], " => ", indents[level])
                     while item["indent"] < indents[level]:
                         # print(item["indent"], " => ", indents[level])
@@ -176,7 +172,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
             elif in_table and (
                 (not self._is_table_line(line)) or line.strip() == "|==="
             ):  # end of table
                 caption = None
                 if len(caption_data) > 0:
                     caption = doc.add_text(
@@ -195,7 +190,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
             # Picture
             elif self._is_picture(line):
                 caption = None
                 if len(caption_data) > 0:
                     caption = doc.add_text(
@@ -250,7 +244,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 text_data = []
             elif len(line.strip()) > 0:  # allow multiline texts
                 item = self._parse_text(line)
                 text_data.append(item["text"])
@@ -273,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
     def _get_current_level(self, parents):
         for k, v in parents.items():
-            if v == None and k > 0:
+            if v is None and k > 0:
                 return k - 1
         return 0
     def _get_current_parent(self, parents):
         for k, v in parents.items():
-            if v == None and k > 0:
+            if v is None and k > 0:
                 return parents[k - 1]
         return None
@@ -328,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                     "marker": marker,
                     "text": text.strip(),
                     "numbered": False,
-                    "indent": 0 if indent == None else len(indent),
+                    "indent": 0 if indent is None else len(indent),
                 }
             else:
                 return {
@@ -336,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                     "marker": marker,
                     "text": text.strip(),
                     "numbered": True,
-                    "indent": 0 if indent == None else len(indent),
+                    "indent": 0 if indent is None else len(indent),
                 }
         else:
             # Fallback if no match
@@ -357,7 +350,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return [cell.strip() for cell in line.split("|") if cell.strip()]
     def _populate_table_as_grid(self, table_data):
         num_rows = len(table_data)
         # Adjust the table data into a grid format

docling/backend/csv_backend.py CHANGED Viewed

@@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
         head = self.content.readline()
         dialect = csv.Sniffer().sniff(head, ",;\t|:")
         _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
-        if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
+        if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
             raise RuntimeError(
                 f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
             )

docling/backend/docling_parse_backend.py CHANGED Viewed

@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import List, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
@@ -156,7 +157,6 @@ class DoclingParsePageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
         page_size = self.get_size()
         if not cropbox:

docling/backend/docling_parse_v2_backend.py CHANGED Viewed

@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -172,7 +173,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
         page_size = self.get_size()
         if not cropbox:

docling/backend/docling_parse_v4_backend.py CHANGED Viewed

@@ -1,14 +1,14 @@
 import logging
-import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
-from PIL import Image, ImageDraw
+from PIL import Image
 from pypdfium2 import PdfPage
 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
@@ -93,7 +93,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
         page_size = self.get_size()
         if not cropbox:

docling/backend/docx/latex/latex_dict.py CHANGED Viewed

@@ -1,12 +1,8 @@
-# -*- coding: utf-8 -*-
 """
 Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
 On 23/01/2025
 """
-from __future__ import unicode_literals
 CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
 BLANK = ""
@@ -79,7 +75,6 @@ CHR_BO = {
 }
 T = {
-    "\u2192": "\\rightarrow ",
     # Greek letters
     "\U0001d6fc": "\\alpha ",
     "\U0001d6fd": "\\beta ",

docling/backend/docx/latex/omml.py CHANGED Viewed

@@ -76,8 +76,7 @@ def get_val(key, default=None, store=CHR):
         return default
-class Tag2Method(object):
+class Tag2Method:
     def call_method(self, elm, stag=None):
         getmethod = self.tag2meth.get
         if stag is None:
@@ -130,7 +129,6 @@ class Tag2Method(object):
 class Pr(Tag2Method):
     text = ""
     __val_tags = ("chr", "pos", "begChr", "endChr", "type")
@@ -159,7 +157,7 @@ class Pr(Tag2Method):
     def do_common(self, elm):
         stag = elm.tag.replace(OMML_NS, "")
         if stag in self.__val_tags:
-            t = elm.get("{0}val".format(OMML_NS))
+            t = elm.get(f"{OMML_NS}val")
             self.__innerdict[stag] = t
         return None
@@ -248,7 +246,6 @@ class oMath2Latex(Tag2Method):
         """
         the Pre-Sub-Superscript object -- Not support yet
         """
-        pass
     def do_sub(self, elm):
         text = self.process_children(elm)
@@ -331,7 +328,7 @@ class oMath2Latex(Tag2Method):
         t_dict = self.process_children_dict(elm, include=("e", "lim"))
         latex_s = LIM_FUNC.get(t_dict["e"])
         if not latex_s:
-            raise NotSupport("Not support lim %s" % t_dict["e"])
+            raise RuntimeError("Not support lim {}".format(t_dict["e"]))
         else:
             return latex_s.format(lim=t_dict.get("lim"))
@@ -413,7 +410,7 @@ class oMath2Latex(Tag2Method):
         """
         _str = []
         _base_str = []
-        found_text = elm.findtext("./{0}t".format(OMML_NS))
+        found_text = elm.findtext(f"./{OMML_NS}t")
         if found_text:
             for s in found_text:
                 out_latex_str = self.process_unicode(s)

docling/backend/html_backend.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import logging
+import traceback
 from io import BytesIO
 from pathlib import Path
 from typing import Final, Optional, Union, cast
@@ -26,6 +27,8 @@ _log = logging.getLogger(__name__)
 # tags that generate NodeItem elements
 TAGS_FOR_NODE_ITEMS: Final = [
+    "address",
+    "details",
     "h1",
     "h2",
     "h3",
@@ -38,6 +41,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
     "ul",
     "ol",
     "li",
+    "summary",
     "table",
     "figure",
     "img",
@@ -55,7 +59,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.max_levels = 10
         self.level = 0
         self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
-        for i in range(0, self.max_levels):
+        for i in range(self.max_levels):
             self.parents[i] = None
         try:
@@ -126,7 +130,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return doc
     def walk(self, tag: Tag, doc: DoclingDocument) -> None:
         # Iterate over elements in the body of the document
         text: str = ""
         for element in tag.children:
@@ -135,7 +138,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     self.analyze_tag(cast(Tag, element), doc)
                 except Exception as exc_child:
                     _log.error(
-                        f"Error processing child from tag {tag.name}: {repr(exc_child)}"
+                        f"Error processing child from tag {tag.name}:\n{traceback.format_exc()}"
                     )
                     raise exc_child
             elif isinstance(element, NavigableString) and not isinstance(
@@ -147,7 +150,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     item for item in element.next_siblings if isinstance(item, Tag)
                 ]
                 if element.next_sibling is None or any(
-                    [item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
+                    item.name in TAGS_FOR_NODE_ITEMS for item in siblings
                 ):
                     text = text.strip()
                     if text and tag.name in ["div"]:
@@ -164,7 +167,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
     def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
         if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
             self.handle_header(tag, doc)
-        elif tag.name in ["p"]:
+        elif tag.name in ["p", "address", "summary"]:
             self.handle_paragraph(tag, doc)
         elif tag.name in ["pre", "code"]:
             self.handle_code(tag, doc)
@@ -178,6 +181,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self.handle_figure(tag, doc)
         elif tag.name == "img":
             self.handle_image(tag, doc)
+        elif tag.name == "details":
+            self.handle_details(tag, doc)
         else:
             self.walk(tag, doc)
@@ -202,6 +207,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return ["".join(result) + " "]
+    def handle_details(self, element: Tag, doc: DoclingDocument) -> None:
+        """Handle details tag (details) and its content."""
+        self.parents[self.level + 1] = doc.add_group(
+            name="details",
+            label=GroupLabel.SECTION,
+            parent=self.parents[self.level],
+            content_layer=self.content_layer,
+        )
+        self.level += 1
+        self.walk(element, doc)
+        self.parents[self.level + 1] = None
+        self.level -= 1
     def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles header tags (h1, h2, etc.)."""
         hlevel = int(element.name.replace("h", ""))
@@ -222,7 +242,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             )
         else:
             if hlevel > self.level:
                 # add invisible group
                 for i in range(self.level + 1, hlevel):
                     self.parents[i] = doc.add_group(
@@ -234,7 +253,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 self.level = hlevel
             elif hlevel < self.level:
                 # remove the tail
                 for key in self.parents.keys():
                     if key > hlevel:
@@ -261,7 +279,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             )
     def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
-        """Handles paragraph tags (p)."""
+        """Handles paragraph tags (p) or equivalent ones."""
         if element.text is None:
             return
         text = element.text.strip()
@@ -360,7 +378,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             marker = ""
             enumerated = False
             if parent_label == GroupLabel.ORDERED_LIST:
-                marker = f"{str(index_in_list)}."
+                marker = f"{index_in_list!s}."
                 enumerated = True
             doc.add_list_item(
                 text=text,
@@ -373,46 +391,64 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             _log.debug(f"list-item has no text: {element}")
     @staticmethod
-    def parse_table_data(element: Tag) -> Optional[TableData]:
+    def parse_table_data(element: Tag) -> Optional[TableData]:  # noqa: C901
         nested_tables = element.find("table")
         if nested_tables is not None:
             _log.debug("Skipping nested table.")
             return None
-        # Count the number of rows (number of <tr> elements)
-        num_rows = len(element("tr"))
-        # Find the number of columns (taking into account colspan)
+        # Find the number of rows and columns (taking into account spans)
+        num_rows = 0
         num_cols = 0
         for row in element("tr"):
             col_count = 0
+            is_row_header = True
             if not isinstance(row, Tag):
                 continue
             for cell in row(["td", "th"]):
                 if not isinstance(row, Tag):
                     continue
-                val = cast(Tag, cell).get("colspan", "1")
+                cell_tag = cast(Tag, cell)
+                val = cell_tag.get("colspan", "1")
                 colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
                 col_count += colspan
+                if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
+                    is_row_header = False
             num_cols = max(num_cols, col_count)
+            if not is_row_header:
+                num_rows += 1
+        _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
         grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
         data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
         # Iterate over the rows in the table
-        for row_idx, row in enumerate(element("tr")):
+        start_row_span = 0
+        row_idx = -1
+        for row in element("tr"):
             if not isinstance(row, Tag):
                 continue
             # For each row, find all the column cells (both <td> and <th>)
             cells = row(["td", "th"])
-            # Check if each cell in the row is a header -> means it is a column header
+            # Check if cell is in a column header or row header
             col_header = True
+            row_header = True
             for html_cell in cells:
-                if isinstance(html_cell, Tag) and html_cell.name == "td":
-                    col_header = False
+                if isinstance(html_cell, Tag):
+                    if html_cell.name == "td":
+                        col_header = False
+                        row_header = False
+                    elif html_cell.get("rowspan") is None:
+                        row_header = False
+            if not row_header:
+                row_idx += 1
+                start_row_span = 0
+            else:
+                start_row_span += 1
             # Extract the text content of each cell
             col_idx = 0
@@ -443,19 +479,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     if isinstance(row_val, str) and row_val.isnumeric()
                     else 1
                 )
-                while grid[row_idx][col_idx] is not None:
+                if row_header:
+                    row_span -= 1
+                while (
+                    col_idx < num_cols
+                    and grid[row_idx + start_row_span][col_idx] is not None
+                ):
                     col_idx += 1
-                for r in range(row_span):
+                for r in range(start_row_span, start_row_span + row_span):
                     for c in range(col_span):
-                        grid[row_idx + r][col_idx + c] = text
+                        if row_idx + r < num_rows and col_idx + c < num_cols:
+                            grid[row_idx + r][col_idx + c] = text
                 table_cell = TableCell(
                     text=text,
                     row_span=row_span,
                     col_span=col_span,
-                    start_row_offset_idx=row_idx,
-                    end_row_offset_idx=row_idx + row_span,
+                    start_row_offset_idx=start_row_span + row_idx,
+                    end_row_offset_idx=start_row_span + row_idx + row_span,
                     start_col_offset_idx=col_idx,
                     end_col_offset_idx=col_idx + col_span,
                     column_header=col_header,

docling/backend/md_backend.py CHANGED Viewed

@@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 # otherwise they represent emphasis (bold or italic)
                 self.markdown = self._shorten_underscore_sequences(text_stream)
             if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                with open(self.path_or_stream, encoding="utf-8") as f:
                     md_content = f.read()
                     # remove invalid sequences
                     # very long sequences of underscores will lead to unnecessary long processing times.
@@ -168,7 +168,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             )
         self.inline_texts = []
-    def _iterate_elements(
+    def _iterate_elements(  # noqa: C901
         self,
         element: marko.element.Element,
         depth: int,
@@ -176,7 +176,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         visited: Set[marko.element.Element],
         parent_item: Optional[NodeItem] = None,
     ):
         if element in visited:
             return
@@ -236,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             if has_non_empty_list_items:
                 label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
                 parent_item = doc.add_group(
-                    label=label, name=f"list", parent=parent_item
+                    label=label, name="list", parent=parent_item
                 )
         elif (
@@ -320,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self._html_blocks += 1
             self._process_inline_text(parent_item, doc)
             self._close_table(doc)
-            _log.debug("HTML Block: {}".format(element))
+            _log.debug(f"HTML Block: {element}")
             if (
                 len(element.body) > 0
             ):  # If Marko doesn't return any content for HTML block, skip it
@@ -332,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         else:
             if not isinstance(element, str):
                 self._close_table(doc)
-                _log.debug("Some other element: {}".format(element))
+                _log.debug(f"Some other element: {element}")
         processed_block_types = (
             marko.block.Heading,
@@ -398,7 +397,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             # if HTML blocks were detected, export to HTML and delegate to HTML backend
             if self._html_blocks > 0:
                 # export to HTML
                 html_backend_cls = HTMLDocumentBackend
                 html_str = doc.export_to_html()
@@ -411,7 +409,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                         )
                     return _txt
-                # restore original HTML by removing previouly added markers
+                # restore original HTML by removing previously added markers
                 for regex in [
                     rf"<pre>\s*<code>\s*{_START_MARKER}",
                     rf"{_STOP_MARKER}\s*</code>\s*</pre>",

docling/backend/msexcel_backend.py CHANGED Viewed

@@ -184,7 +184,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         """
         if self.workbook is not None:
             # Iterate over all sheets
             for sheet_name in self.workbook.sheetnames:
                 _log.info(f"Processing sheet: {sheet_name}")
@@ -253,7 +252,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
                 )
                 for excel_cell in excel_table.data:
                     cell = TableCell(
                         text=excel_cell.text,
                         row_span=excel_cell.row_span,
@@ -303,7 +301,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         # Iterate over all cells in the sheet
         for ri, row in enumerate(sheet.iter_rows(values_only=False)):
             for rj, cell in enumerate(row):
                 # Skip empty or already visited cells
                 if cell.value is None or (ri, rj) in visited:
                     continue
@@ -342,7 +339,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
         visited_cells: set[tuple[int, int]] = set()
         for ri in range(start_row, max_row + 1):
             for rj in range(start_col, max_col + 1):
                 cell = sheet.cell(row=ri + 1, column=rj + 1)  # 1-based indexing
                 # Check if the cell belongs to a merged range
@@ -350,14 +346,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
                 col_span = 1
                 for merged_range in sheet.merged_cells.ranges:
                     if (
                         merged_range.min_row <= ri + 1
                         and ri + 1 <= merged_range.max_row
                         and merged_range.min_col <= rj + 1
                         and rj + 1 <= merged_range.max_col
                     ):
                         row_span = merged_range.max_row - merged_range.min_row + 1
                         col_span = merged_range.max_col - merged_range.min_col + 1
                         break
@@ -499,7 +493,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
                             ),
                         ),
                     )
-                except:
+                except Exception:
                     _log.error("could not extract the image from excel sheets")
         return doc

docling/backend/mspowerpoint_backend.py CHANGED Viewed

@@ -120,13 +120,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         return prov
-    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
+    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):  # noqa: C901
         is_a_list = False
         is_list_group_created = False
         enum_list_item_value = 0
         new_list = None
         bullet_type = "None"
-        list_text = ""
         list_label = GroupLabel.LIST
         doc_label = DocItemLabel.LIST_ITEM
         prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
@@ -243,7 +242,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
                     enum_marker = str(enum_list_item_value) + "."
                 if not is_list_group_created:
                     new_list = doc.add_group(
-                        label=list_label, name=f"list", parent=parent_slide
+                        label=list_label, name="list", parent=parent_slide
                     )
                     is_list_group_created = True
                 doc.add_list_item(
@@ -368,11 +367,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         slide_width = pptx_obj.slide_width
         slide_height = pptx_obj.slide_height
-        text_content = []  # type: ignore
         max_levels = 10
         parents = {}  # type: ignore
-        for i in range(0, max_levels):
+        for i in range(max_levels):
             parents[i] = None
         # Loop through each slide
@@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
             )
             slide_size = Size(width=slide_width, height=slide_height)
-            parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
+            doc.add_page(page_no=slide_ind + 1, size=slide_size)
             def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
                 handle_groups(shape, parent_slide, slide_ind, doc, slide_size)

docling 2.30.0__py3-none-any.whl → 2.31.1__py3-none-any.whl

docling 2.30.0__py3-none-any.whl → 2.31.1__py3-none-any.whl