PyPI - docling - Versions diffs - 2.45.0__tar.gz → 2.47.0__tar.gz - Mend

docling 2.45.0__tar.gz → 2.47.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (142)

{docling-2.45.0 → docling-2.47.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.45.0
+Version: 2.47.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -27,7 +27,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: pydantic<3.0.0,>=2.0.0
 Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
-Requires-Dist: docling-parse<5.0.0,>=4.0.0
+Requires-Dist: docling-parse<5.0.0,>=4.2.2
 Requires-Dist: docling-ibm-models<4,>=3.9.0
 Requires-Dist: filetype<2.0.0,>=1.2.0
 Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
@@ -59,6 +59,7 @@ Provides-Extra: vlm
 Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
 Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
 Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
+Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux") and extra == "vlm"
 Provides-Extra: rapidocr
 Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
 Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"

{docling-2.45.0 → docling-2.47.0}/docling/backend/docling_parse_v4_backend.py RENAMED Viewed

@@ -22,15 +22,52 @@ _log = logging.getLogger(__name__)
 class DoclingParseV4PageBackend(PdfPageBackend):
-    def __init__(self, parsed_page: SegmentedPdfPage, page_obj: PdfPage):
+    def __init__(
+        self,
+        *,
+        dp_doc: PdfDocument,
+        page_obj: PdfPage,
+        page_no: int,
+        create_words: bool = True,
+        create_textlines: bool = True,
+    ):
         self._ppage = page_obj
-        self._dpage = parsed_page
-        self.valid = parsed_page is not None
+        self._dp_doc = dp_doc
+        self._page_no = page_no
+        self._create_words = create_words
+        self._create_textlines = create_textlines
+        self._dpage: Optional[SegmentedPdfPage] = None
+        self._unloaded = False
+        self.valid = (self._ppage is not None) and (self._dp_doc is not None)
+    def _ensure_parsed(self) -> None:
+        if self._dpage is not None:
+            return
+        seg_page = self._dp_doc.get_page(
+            self._page_no + 1,
+            create_words=self._create_words,
+            create_textlines=self._create_textlines,
+        )
+        # In Docling, all TextCell instances are expected with top-left origin.
+        [
+            tc.to_top_left_origin(seg_page.dimension.height)
+            for tc in seg_page.textline_cells
+        ]
+        [tc.to_top_left_origin(seg_page.dimension.height) for tc in seg_page.char_cells]
+        [tc.to_top_left_origin(seg_page.dimension.height) for tc in seg_page.word_cells]
+        self._dpage = seg_page
     def is_valid(self) -> bool:
         return self.valid
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        self._ensure_parsed()
+        assert self._dpage is not None
         # Find intersecting cells on the page
         text_piece = ""
         page_size = self.get_size()
@@ -56,12 +93,19 @@ class DoclingParseV4PageBackend(PdfPageBackend):
         return text_piece
     def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        self._ensure_parsed()
         return self._dpage
     def get_text_cells(self) -> Iterable[TextCell]:
+        self._ensure_parsed()
+        assert self._dpage is not None
         return self._dpage.textline_cells
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        self._ensure_parsed()
+        assert self._dpage is not None
         AREA_THRESHOLD = 0  # 32 * 32
         images = self._dpage.bitmap_resources
@@ -123,8 +167,13 @@ class DoclingParseV4PageBackend(PdfPageBackend):
         # )
     def unload(self):
+        if not self._unloaded and self._dp_doc is not None:
+            self._dp_doc.unload_pages((self._page_no + 1, self._page_no + 2))
+            self._unloaded = True
         self._ppage = None
         self._dpage = None
+        self._dp_doc = None
 class DoclingParseV4DocumentBackend(PdfDocumentBackend):
@@ -157,30 +206,15 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
         self, page_no: int, create_words: bool = True, create_textlines: bool = True
     ) -> DoclingParseV4PageBackend:
         with pypdfium2_lock:
-            seg_page = self.dp_doc.get_page(
-                page_no + 1,
-                create_words=create_words,
-                create_textlines=create_textlines,
-            )
-            # In Docling, all TextCell instances are expected with top-left origin.
-            [
-                tc.to_top_left_origin(seg_page.dimension.height)
-                for tc in seg_page.textline_cells
-            ]
-            [
-                tc.to_top_left_origin(seg_page.dimension.height)
-                for tc in seg_page.char_cells
-            ]
-            [
-                tc.to_top_left_origin(seg_page.dimension.height)
-                for tc in seg_page.word_cells
-            ]
-            return DoclingParseV4PageBackend(
-                seg_page,
-                self._pdoc[page_no],
-            )
+            ppage = self._pdoc[page_no]
+        return DoclingParseV4PageBackend(
+            dp_doc=self.dp_doc,
+            page_obj=ppage,
+            page_no=page_no,
+            create_words=create_words,
+            create_textlines=create_textlines,
+        )
     def is_valid(self) -> bool:
         return self.page_count() > 0

{docling-2.45.0 → docling-2.47.0}/docling/backend/html_backend.py RENAMED Viewed

@@ -20,7 +20,7 @@ from docling_core.types.doc import (
     TableData,
     TextItem,
 )
-from docling_core.types.doc.document import ContentLayer
+from docling_core.types.doc.document import ContentLayer, Formatting, Script
 from pydantic import AnyUrl, BaseModel, ValidationError
 from typing_extensions import override
@@ -38,6 +38,7 @@ _BLOCK_TAGS: Final = {
     "address",
     "details",
     "figure",
+    "footer",
     "h1",
     "h2",
     "h3",
@@ -53,6 +54,21 @@ _BLOCK_TAGS: Final = {
     "table",
 }
+_FORMAT_TAG_MAP: Final = {
+    "b": {"bold": True},
+    "strong": {"bold": True},
+    "i": {"italic": True},
+    "em": {"italic": True},
+    # "mark",
+    # "small",
+    "s": {"strikethrough": True},
+    "del": {"strikethrough": True},
+    "u": {"underline": True},
+    "ins": {"underline": True},
+    "sub": {"script": Script.SUB},
+    "sup": {"script": Script.SUPER},
+}
 class _Context(BaseModel):
     list_ordered_flag_by_ref: dict[str, bool] = {}
@@ -62,23 +78,34 @@ class _Context(BaseModel):
 class AnnotatedText(BaseModel):
     text: str
     hyperlink: Union[AnyUrl, Path, None] = None
+    formatting: Union[Formatting, None] = None
 class AnnotatedTextList(list):
     def to_single_text_element(self) -> AnnotatedText:
         current_h = None
         current_text = ""
+        current_f = None
         for at in self:
             t = at.text
             h = at.hyperlink
+            f = at.formatting
             current_text += t.strip() + " "
+            if f is not None and current_f is None:
+                current_f = f
+            elif f is not None and current_f is not None and f != current_f:
+                _log.warning(
+                    f"Clashing formatting: '{f}' and '{current_f}'! Chose '{current_f}'"
+                )
             if h is not None and current_h is None:
                 current_h = h
             elif h is not None and current_h is not None and h != current_h:
                 _log.warning(
                     f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
                 )
-        return AnnotatedText(text=current_text.strip(), hyperlink=current_h)
+        return AnnotatedText(
+            text=current_text.strip(), hyperlink=current_h, formatting=current_f
+        )
     def simplify_text_elements(self) -> "AnnotatedTextList":
         simplified = AnnotatedTextList()
@@ -86,21 +113,27 @@ class AnnotatedTextList(list):
             return self
         text = self[0].text
         hyperlink = self[0].hyperlink
+        formatting = self[0].formatting
         last_elm = text
         for i in range(1, len(self)):
-            if hyperlink == self[i].hyperlink:
+            if hyperlink == self[i].hyperlink and formatting == self[i].formatting:
                 sep = " "
                 if not self[i].text.strip() or not last_elm.strip():
                     sep = ""
                 text += sep + self[i].text
                 last_elm = self[i].text
             else:
-                simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
+                simplified.append(
+                    AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
+                )
                 text = self[i].text
                 last_elm = text
                 hyperlink = self[i].hyperlink
+                formatting = self[i].formatting
         if text:
-            simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
+            simplified.append(
+                AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
+            )
         return simplified
     def split_by_newline(self):
@@ -143,6 +176,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self.parents[i] = None
         self.hyperlink = None
         self.original_url = original_url
+        self.format_tags: list[str] = []
         try:
             raw = (
@@ -253,6 +287,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                                 label=DocItemLabel.TEXT,
                                 text=seg_clean,
                                 content_layer=self.content_layer,
+                                formatting=annotated_text.formatting,
                                 hyperlink=annotated_text.hyperlink,
                             )
@@ -262,6 +297,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 if name == "img":
                     flush_buffer()
                     self._emit_image(node, doc)
+                elif name in _FORMAT_TAG_MAP:
+                    with self.use_format([name]):
+                        self._walk(node, doc)
                 elif name == "a":
                     with self.use_hyperlink(node):
                         self._walk(node, doc)
@@ -291,6 +329,27 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         flush_buffer()
+    @staticmethod
+    def _collect_parent_format_tags(item: PageElement) -> list[str]:
+        tags = []
+        for format_tag in _FORMAT_TAG_MAP:
+            this_parent = item.parent
+            while this_parent is not None:
+                if this_parent.name == format_tag:
+                    tags.append(format_tag)
+                    break
+                this_parent = this_parent.parent
+        return tags
+    @property
+    def _formatting(self):
+        kwargs = {}
+        for t in self.format_tags:
+            kwargs.update(_FORMAT_TAG_MAP[t])
+        if not kwargs:
+            return None
+        return Formatting(**kwargs)
     def _extract_text_and_hyperlink_recursively(
         self,
         item: PageElement,
@@ -301,15 +360,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         result: AnnotatedTextList = AnnotatedTextList()
         # If find_parent_annotation, make sure that we keep track of
-        # any a-tag that has been present in the DOM-parents already.
+        # any a- or formatting-tag that has been present in the
+        # DOM-parents already.
         if find_parent_annotation:
+            format_tags = self._collect_parent_format_tags(item)
             this_parent = item.parent
             while this_parent is not None:
                 if this_parent.name == "a" and this_parent.get("href"):
-                    with self.use_hyperlink(this_parent):
-                        return self._extract_text_and_hyperlink_recursively(
-                            item, ignore_list
-                        )
+                    with self.use_format(format_tags):
+                        with self.use_hyperlink(this_parent):
+                            return self._extract_text_and_hyperlink_recursively(
+                                item, ignore_list
+                            )
                 this_parent = this_parent.parent
         if isinstance(item, PreformattedString):
@@ -319,18 +381,37 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             text = item.strip()
             if text:
                 return AnnotatedTextList(
-                    [AnnotatedText(text=text, hyperlink=self.hyperlink)]
+                    [
+                        AnnotatedText(
+                            text=text,
+                            hyperlink=self.hyperlink,
+                            formatting=self._formatting,
+                        )
+                    ]
                 )
             if keep_newlines and item.strip("\n\r") == "":
                 return AnnotatedTextList(
-                    [AnnotatedText(text="\n", hyperlink=self.hyperlink)]
+                    [
+                        AnnotatedText(
+                            text="\n",
+                            hyperlink=self.hyperlink,
+                            formatting=self._formatting,
+                        )
+                    ]
                 )
             return AnnotatedTextList()
         tag = cast(Tag, item)
         if not ignore_list or (tag.name not in ["ul", "ol"]):
             for child in tag:
-                if isinstance(child, Tag) and child.name == "a":
+                if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
+                    with self.use_format([child.name]):
+                        result.extend(
+                            self._extract_text_and_hyperlink_recursively(
+                                child, ignore_list, keep_newlines=keep_newlines
+                            )
+                        )
+                elif isinstance(child, Tag) and child.name == "a":
                     with self.use_hyperlink(child):
                         result.extend(
                             self._extract_text_and_hyperlink_recursively(
@@ -368,6 +449,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 if this_href:
                     self.hyperlink = old_hyperlink
+    @contextmanager
+    def use_format(self, tags: list[str]):
+        if not tags:
+            yield None
+        else:
+            self.format_tags.extend(tags)
+            try:
+                yield None
+            finally:
+                self.format_tags = self.format_tags[: -len(tags)]
     @contextmanager
     def use_inline_group(
         self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
@@ -419,6 +511,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self.parents[self.level + 1] = doc.add_title(
                 text_clean,
                 content_layer=self.content_layer,
+                formatting=annotated_text.formatting,
                 hyperlink=annotated_text.hyperlink,
             )
         # the other levels need to be lowered by 1 if a title was set
@@ -448,6 +541,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 orig=annotated_text.text,
                 level=self.level,
                 content_layer=self.content_layer,
+                formatting=annotated_text.formatting,
                 hyperlink=annotated_text.hyperlink,
             )
         self.level += 1
@@ -528,6 +622,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                                     label=DocItemLabel.TEXT,
                                     text=li_clean,
                                     content_layer=self.content_layer,
+                                    formatting=annotated_text.formatting,
                                     hyperlink=annotated_text.hyperlink,
                                 )
@@ -550,6 +645,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                             orig=li_text,
                             parent=list_group,
                             content_layer=self.content_layer,
+                            formatting=annotated_text.formatting,
                             hyperlink=annotated_text.hyperlink,
                         )
@@ -602,6 +698,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                                 label=DocItemLabel.TEXT,
                                 text=seg_clean,
                                 content_layer=self.content_layer,
+                                formatting=annotated_text.formatting,
                                 hyperlink=annotated_text.hyperlink,
                             )
@@ -636,13 +733,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                         parent=self.parents[self.level],
                         text=text_clean,
                         content_layer=self.content_layer,
+                        formatting=annotated_text.formatting,
                         hyperlink=annotated_text.hyperlink,
                     )
-        elif tag_name == "details":
-            # handle details and its content.
+        elif tag_name in {"details", "footer"}:
+            if tag_name == "footer":
+                current_layer = self.content_layer
+                self.content_layer = ContentLayer.FURNITURE
             self.parents[self.level + 1] = doc.add_group(
-                name="details",
+                name=tag_name,
                 label=GroupLabel.SECTION,
                 parent=self.parents[self.level],
                 content_layer=self.content_layer,
@@ -651,6 +751,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self._walk(tag, doc)
             self.parents[self.level + 1] = None
             self.level -= 1
+            if tag_name == "footer":
+                self.content_layer = current_layer
     def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
         figure = img_tag.find_parent("figure")
@@ -686,12 +788,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             text_clean = HTMLDocumentBackend._clean_unicode(
                 caption_anno_text.text.strip()
             )
-            print(caption_anno_text)
             caption_item = doc.add_text(
                 label=DocItemLabel.CAPTION,
                 text=text_clean,
                 orig=caption_anno_text.text,
                 content_layer=self.content_layer,
+                formatting=caption_anno_text.formatting,
                 hyperlink=caption_anno_text.hyperlink,
             )

{docling-2.45.0 → docling-2.47.0}/docling/backend/msword_backend.py RENAMED Viewed

@@ -67,6 +67,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         self.level = 0
         self.listIter = 0
+        # Track list counters per numId and ilvl
+        self.list_counters: dict[tuple[int, int], int] = {}
         self.history: dict[str, Any] = {
             "names": [None],
@@ -315,6 +317,108 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         return None, None  # If the paragraph is not part of a list
+    def _get_list_counter(self, numid: int, ilvl: int) -> int:
+        """Get and increment the counter for a specific numId and ilvl combination."""
+        key = (numid, ilvl)
+        if key not in self.list_counters:
+            self.list_counters[key] = 0
+        self.list_counters[key] += 1
+        return self.list_counters[key]
+    def _reset_list_counters_for_new_sequence(self, numid: int):
+        """Reset counters when starting a new numbering sequence."""
+        # Reset all counters for this numid
+        keys_to_reset = [key for key in self.list_counters.keys() if key[0] == numid]
+        for key in keys_to_reset:
+            self.list_counters[key] = 0
+    def _is_numbered_list(self, docx_obj: DocxDocument, numId: int, ilvl: int) -> bool:
+        """Check if a list is numbered based on its numFmt value."""
+        try:
+            # Access the numbering part of the document
+            if not hasattr(docx_obj, "part") or not hasattr(docx_obj.part, "package"):
+                return False
+            numbering_part = None
+            # Find the numbering part
+            for part in docx_obj.part.package.parts:
+                if "numbering" in part.partname:
+                    numbering_part = part
+                    break
+            if numbering_part is None:
+                return False
+            # Parse the numbering XML
+            numbering_root = numbering_part.element
+            namespaces = {
+                "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+            }
+            # Find the numbering definition with the given numId
+            num_xpath = f".//w:num[@w:numId='{numId}']"
+            num_element = numbering_root.find(num_xpath, namespaces=namespaces)
+            if num_element is None:
+                return False
+            # Get the abstractNumId from the num element
+            abstract_num_id_elem = num_element.find(
+                ".//w:abstractNumId", namespaces=namespaces
+            )
+            if abstract_num_id_elem is None:
+                return False
+            abstract_num_id = abstract_num_id_elem.get(
+                "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
+            )
+            if abstract_num_id is None:
+                return False
+            # Find the abstract numbering definition
+            abstract_num_xpath = (
+                f".//w:abstractNum[@w:abstractNumId='{abstract_num_id}']"
+            )
+            abstract_num_element = numbering_root.find(
+                abstract_num_xpath, namespaces=namespaces
+            )
+            if abstract_num_element is None:
+                return False
+            # Find the level definition for the given ilvl
+            lvl_xpath = f".//w:lvl[@w:ilvl='{ilvl}']"
+            lvl_element = abstract_num_element.find(lvl_xpath, namespaces=namespaces)
+            if lvl_element is None:
+                return False
+            # Get the numFmt element
+            num_fmt_element = lvl_element.find(".//w:numFmt", namespaces=namespaces)
+            if num_fmt_element is None:
+                return False
+            num_fmt = num_fmt_element.get(
+                "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
+            )
+            # Numbered formats include: decimal, lowerRoman, upperRoman, lowerLetter, upperLetter
+            # Bullet formats include: bullet
+            numbered_formats = {
+                "decimal",
+                "lowerRoman",
+                "upperRoman",
+                "lowerLetter",
+                "upperLetter",
+                "decimalZero",
+            }
+            return num_fmt in numbered_formats
+        except Exception as e:
+            _log.debug(f"Error determining if list is numbered: {e}")
+            return False
     def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
         parts = self._split_text_and_number(style_label)
@@ -713,8 +817,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         # Common styles for bullet and numbered lists.
         # "List Bullet", "List Number", "List Paragraph"
         # Identify whether list is a numbered list or not
-        # is_numbered = "List Bullet" not in paragraph.style.name
-        is_numbered = False
         p_style_id, p_level = self._get_label_and_level(paragraph)
         numid, ilevel = self._get_numId_and_ilvl(paragraph)
@@ -727,6 +829,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             and ilevel is not None
             and p_style_id not in ["Title", "Heading"]
         ):
+            # Check if this is actually a numbered list by examining the numFmt
+            is_numbered = self._is_numbered_list(docx_obj, numid, ilevel)
             self._add_list_item(
                 doc=doc,
                 numid=numid,
@@ -983,15 +1088,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         if self._prev_numid() is None:  # Open new list
             self.level_at_new_list = level
+            # Reset counters for the new numbering sequence
+            self._reset_list_counters_for_new_sequence(numid)
             self.parents[level] = doc.add_list_group(
                 name="list", parent=self.parents[level - 1]
             )
             # Set marker and enumerated arguments if this is an enumeration element.
-            self.listIter += 1
             if is_numbered:
-                enum_marker = str(self.listIter) + "."
-                is_numbered = True
+                counter = self._get_list_counter(numid, ilevel)
+                enum_marker = str(counter) + "."
+            else:
+                enum_marker = ""
             self._add_formatted_list_item(
                 doc, elements, enum_marker, is_numbered, level
             )
@@ -1005,16 +1114,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 self.level_at_new_list + prev_indent + 1,
                 self.level_at_new_list + ilevel + 1,
             ):
-                self.listIter = 0
                 self.parents[i] = doc.add_list_group(
                     name="list", parent=self.parents[i - 1]
                 )
             # TODO: Set marker and enumerated arguments if this is an enumeration element.
-            self.listIter += 1
             if is_numbered:
-                enum_marker = str(self.listIter) + "."
-                is_numbered = True
+                counter = self._get_list_counter(numid, ilevel)
+                enum_marker = str(counter) + "."
+            else:
+                enum_marker = ""
             self._add_formatted_list_item(
                 doc,
                 elements,
@@ -1033,10 +1142,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     self.parents[k] = None
             # TODO: Set marker and enumerated arguments if this is an enumeration element.
-            self.listIter += 1
             if is_numbered:
-                enum_marker = str(self.listIter) + "."
-                is_numbered = True
+                counter = self._get_list_counter(numid, ilevel)
+                enum_marker = str(counter) + "."
+            else:
+                enum_marker = ""
             self._add_formatted_list_item(
                 doc,
                 elements,
@@ -1044,14 +1154,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 is_numbered,
                 self.level_at_new_list + ilevel,
             )
-            self.listIter = 0
         elif self._prev_numid() == numid or prev_indent == ilevel:
             # TODO: Set marker and enumerated arguments if this is an enumeration element.
-            self.listIter += 1
             if is_numbered:
-                enum_marker = str(self.listIter) + "."
-                is_numbered = True
+                counter = self._get_list_counter(numid, ilevel)
+                enum_marker = str(counter) + "."
+            else:
+                enum_marker = ""
             self._add_formatted_list_item(
                 doc, elements, enum_marker, is_numbered, level - 1
             )

docling 2.45.0__tar.gz → 2.47.0__tar.gz

docling 2.45.0__tar.gz → 2.47.0__tar.gz