PyPI - docling-core - Versions diffs - 2.18.1__py3-none-any.whl → 2.19.1__py3-none-any.whl - Mend

docling-core: 2.18.1 (py3-none-any.whl) → 2.19.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling-core might be problematic. Click here for more details.

Files changed (9)

docling_core/transforms/chunker/hierarchical_chunker.py (changed)

@@ -19,6 +19,7 @@ from docling_core.search.package import VERSION_PATTERN
 from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
 from docling_core.types import DoclingDocument as DLDocument
 from docling_core.types.doc.document import (
+    CodeItem,
     DocItem,
     DocumentOrigin,
     LevelNumber,
@@ -199,8 +200,10 @@ class HierarchicalChunker(BaseChunker):
                         heading_by_level.pop(k, None)
                     continue
-                if isinstance(item, TextItem) or (
-                    (not self.merge_list_items) and isinstance(item, ListItem)
+                if (
+                    isinstance(item, TextItem)
+                    or ((not self.merge_list_items) and isinstance(item, ListItem))
+                    or isinstance(item, CodeItem)
                 ):
                     text = item.text
                 elif isinstance(item, TableItem):

docling_core/types/doc/document.py (changed)

@@ -75,6 +75,14 @@ DEFAULT_EXPORT_LABELS = {
     DocItemLabel.PAGE_FOOTER,
 }
+DOCUMENT_TOKENS_EXPORT_LABELS = DEFAULT_EXPORT_LABELS.copy()
+DOCUMENT_TOKENS_EXPORT_LABELS.update(
+    [
+        DocItemLabel.FOOTNOTE,
+        DocItemLabel.CAPTION,
+    ]
+)
 class BasePictureData(BaseModel):
     """BasePictureData."""
@@ -564,9 +572,8 @@ class DocItem(
         self,
         doc: "DoclingDocument",
         new_line: str,
-        xsize: int = 100,
-        ysize: int = 100,
-        add_page_index: bool = True,
+        xsize: int = 500,
+        ysize: int = 500,
     ) -> str:
         """Get the location string for the BaseCell."""
         if not len(self.prov):
@@ -576,17 +583,12 @@ class DocItem(
         for prov in self.prov:
             page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
-            page_i = -1
-            if add_page_index:
-                page_i = prov.page_no
             loc_str = DocumentToken.get_location(
-                bbox=prov.bbox.to_bottom_left_origin(page_h).as_tuple(),
+                bbox=prov.bbox.to_top_left_origin(page_h).as_tuple(),
                 page_w=page_w,
                 page_h=page_h,
                 xsize=xsize,
                 ysize=ysize,
-                page_i=page_i,
             )
             location += f"{loc_str}{new_line}"
@@ -641,57 +643,40 @@ class TextItem(DocItem):
     def export_to_document_tokens(
         self,
         doc: "DoclingDocument",
-        new_line: str = "\n",
-        xsize: int = 100,
-        ysize: int = 100,
+        new_line: str = "",
+        xsize: int = 500,
+        ysize: int = 500,
         add_location: bool = True,
         add_content: bool = True,
-        add_page_index: bool = True,
     ):
         r"""Export text element to document tokens format.
         :param doc: "DoclingDocument":
-        :param new_line: str:  (Default value = "\n")
-        :param xsize: int:  (Default value = 100)
-        :param ysize: int:  (Default value = 100)
+        :param new_line: str (Default value = "")
+        :param xsize: int:  (Default value = 500)
+        :param ysize: int:  (Default value = 500)
         :param add_location: bool:  (Default value = True)
         :param add_content: bool:  (Default value = True)
-        :param add_page_index: bool:  (Default value = True)
         """
-        body = f"<{self.label.value}>"
-        # TODO: This must be done through an explicit mapping.
-        # assert DocumentToken.is_known_token(
-        #    body
-        # ), f"failed DocumentToken.is_known_token({body})"
+        body = f"<{self.label.value}>{new_line}"
         if add_location:
             body += self.get_location_tokens(
                 doc=doc,
-                new_line="",
+                new_line=new_line,
                 xsize=xsize,
                 ysize=ysize,
-                add_page_index=add_page_index,
             )
         if add_content and self.text is not None:
-            body += self.text.strip()
+            body += f"{self.text.strip()}{new_line}"
-        body += f"</{self.label.value}>{new_line}"
+        body += f"</{self.label.value}>\n"
         return body
-class CodeItem(TextItem):
-    """CodeItem."""
-    label: typing.Literal[DocItemLabel.CODE] = (
-        DocItemLabel.CODE  # type: ignore[assignment]
-    )
-    code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN
 class SectionHeaderItem(TextItem):
     """SectionItem."""
@@ -703,25 +688,23 @@ class SectionHeaderItem(TextItem):
     def export_to_document_tokens(
         self,
         doc: "DoclingDocument",
-        new_line: str = "\n",
-        xsize: int = 100,
-        ysize: int = 100,
+        new_line: str = "",
+        xsize: int = 500,
+        ysize: int = 500,
         add_location: bool = True,
         add_content: bool = True,
-        add_page_index: bool = True,
     ):
         r"""Export text element to document tokens format.
         :param doc: "DoclingDocument":
-        :param new_line: str:  (Default value = "\n")
-        :param xsize: int:  (Default value = 100)
-        :param ysize: int:  (Default value = 100)
+        :param new_line: str (Default value = "")
+        :param xsize: int:  (Default value = 500)
+        :param ysize: int:  (Default value = 500)
         :param add_location: bool:  (Default value = True)
         :param add_content: bool:  (Default value = True)
-        :param add_page_index: bool:  (Default value = True)
         """
-        body = f"<{self.label.value}_level_{self.level}>"
+        body = f"<{self.label.value}_level_{self.level}>{new_line}"
         # TODO: This must be done through an explicit mapping.
         # assert DocumentToken.is_known_token(
@@ -731,16 +714,15 @@ class SectionHeaderItem(TextItem):
         if add_location:
             body += self.get_location_tokens(
                 doc=doc,
-                new_line="",
+                new_line=new_line,
                 xsize=xsize,
                 ysize=ysize,
-                add_page_index=add_page_index,
             )
         if add_content and self.text is not None:
-            body += self.text.strip()
+            body += f"{self.text.strip()}{new_line}"
-        body += f"</{self.label.value}_level_{self.level}>{new_line}"
+        body += f"</{self.label.value}_level_{self.level}>\n"
         return body
@@ -785,6 +767,51 @@ class FloatingItem(DocItem):
         return super().get_image(doc=doc)
+class CodeItem(FloatingItem, TextItem):
+    """CodeItem."""
+    label: typing.Literal[DocItemLabel.CODE] = (
+        DocItemLabel.CODE  # type: ignore[assignment]
+    )
+    code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN
+    def export_to_document_tokens(
+        self,
+        doc: "DoclingDocument",
+        new_line: str = "",
+        xsize: int = 500,
+        ysize: int = 500,
+        add_location: bool = True,
+        add_content: bool = True,
+    ):
+        r"""Export text element to document tokens format.
+        :param doc: "DoclingDocument":
+        :param new_line: str (Default value = "")
+        :param xsize: int:  (Default value = 500)
+        :param ysize: int:  (Default value = 500)
+        :param add_location: bool:  (Default value = True)
+        :param add_content: bool:  (Default value = True)
+        """
+        body = f"<{self.label.value}{new_line}"
+        if add_location:
+            body += self.get_location_tokens(
+                doc=doc,
+                new_line=new_line,
+                xsize=xsize,
+                ysize=ysize,
+            )
+        if add_content and self.text is not None:
+            body += f"<_{self.code_language.value}_>{self.text}{new_line}"
+        body += f"</{self.label.value}\n"
+        return body
 class PictureItem(FloatingItem):
     """PictureItem."""
@@ -931,47 +958,62 @@ class PictureItem(FloatingItem):
     def export_to_document_tokens(
         self,
         doc: "DoclingDocument",
-        new_line: str = "\n",
-        xsize: int = 100,
-        ysize: int = 100,
+        new_line: str = "",
+        xsize: int = 500,
+        ysize: int = 500,
         add_location: bool = True,
         add_caption: bool = True,
         add_content: bool = True,  # not used at the moment
-        add_page_index: bool = True,
     ):
         r"""Export picture to document tokens format.
         :param doc: "DoclingDocument":
-        :param new_line: str:  (Default value = "\n")
-        :param xsize: int:  (Default value = 100)
-        :param ysize: int:  (Default value = 100)
+        :param new_line: str (Default value = "")
+        :param xsize: int:  (Default value = 500)
+        :param ysize: int:  (Default value = 500)
         :param add_location: bool:  (Default value = True)
         :param add_caption: bool:  (Default value = True)
         :param add_content: bool:  (Default value = True)
-        :param # not used at the momentadd_page_index: bool:  (Default value = True)
+        :param # not used at the moment
         """
-        body = f"{DocumentToken.BEG_FIGURE.value}{new_line}"
+        body = f"<{self.label.value}>{new_line}"
         if add_location:
             body += self.get_location_tokens(
                 doc=doc,
                 new_line=new_line,
                 xsize=xsize,
                 ysize=ysize,
-                add_page_index=add_page_index,
             )
+        classifications = [
+            ann
+            for ann in self.annotations
+            if isinstance(ann, PictureClassificationData)
+        ]
+        if len(classifications) > 0:
+            # ! TODO: currently this code assumes class_name is of type 'str'
+            # ! TODO: when it will change to an ENUM --> adapt code
+            predicted_class = classifications[0].predicted_classes[0].class_name
+            body += DocumentToken.get_picture_classification_token(predicted_class)
         if add_caption and len(self.captions):
             text = self.caption_text(doc)
             if len(text):
-                body += f"{DocumentToken.BEG_CAPTION.value}"
+                body += f"<{DocItemLabel.CAPTION.value}>"
+                for caption in self.captions:
+                    body += caption.resolve(doc).get_location_tokens(
+                        doc=doc,
+                        new_line=new_line,
+                        xsize=xsize,
+                        ysize=ysize,
+                    )
                 body += f"{text.strip()}"
-                body += f"{DocumentToken.END_CAPTION.value}"
+                body += f"</{DocItemLabel.CAPTION.value}>"
                 body += f"{new_line}"
-        body += f"{DocumentToken.END_FIGURE.value}{new_line}"
+        body += f"</{self.label.value}>\n"
         return body
@@ -1143,8 +1185,8 @@ class TableItem(FloatingItem):
         doc: "DoclingDocument",
         add_cell_location: bool = True,
         add_cell_text: bool = True,
-        xsize: int = 100,
-        ysize: int = 100,
+        xsize: int = 500,
+        ysize: int = 500,
     ) -> str:
         """Export the table as OTSL."""
         # Possible OTSL tokens...
@@ -1194,7 +1236,6 @@ class TableItem(FloatingItem):
                         page_h=page_h,
                         xsize=xsize,
                         ysize=ysize,
-                        page_i=page_no,
                     )
                 if rowstart == i and colstart == j:
@@ -1234,33 +1275,29 @@ class TableItem(FloatingItem):
     def export_to_document_tokens(
         self,
         doc: "DoclingDocument",
-        new_line: str = "\n",
-        xsize: int = 100,
-        ysize: int = 100,
+        new_line: str = "",
+        xsize: int = 500,
+        ysize: int = 500,
         add_location: bool = True,
-        add_caption: bool = True,
-        add_content: bool = True,
         add_cell_location: bool = True,
-        add_cell_label: bool = True,
         add_cell_text: bool = True,
-        add_page_index: bool = True,
+        add_caption: bool = True,
     ):
         r"""Export table to document tokens format.
         :param doc: "DoclingDocument":
-        :param new_line: str:  (Default value = "\n")
-        :param xsize: int:  (Default value = 100)
-        :param ysize: int:  (Default value = 100)
+        :param new_line: str (Default value = "")
+        :param xsize: int:  (Default value = 500)
+        :param ysize: int:  (Default value = 500)
         :param add_location: bool:  (Default value = True)
-        :param add_caption: bool:  (Default value = True)
-        :param add_content: bool:  (Default value = True)
         :param add_cell_location: bool:  (Default value = True)
-        :param add_cell_label: bool:  (Default value = True)
         :param add_cell_text: bool:  (Default value = True)
-        :param add_page_index: bool:  (Default value = True)
+        :param add_caption: bool:  (Default value = True)
         """
-        body = f"{DocumentToken.BEG_TABLE.value}{new_line}"
+        otsl_tag = DocumentToken.OTSL.value
+        body = f"<{otsl_tag}>{new_line}"
         if add_location:
             body += self.get_location_tokens(
@@ -1268,76 +1305,27 @@ class TableItem(FloatingItem):
                 new_line=new_line,
                 xsize=xsize,
                 ysize=ysize,
-                add_page_index=add_page_index,
             )
+        body += self.export_to_otsl(doc, add_cell_location, add_cell_text, xsize, ysize)
         if add_caption and len(self.captions):
             text = self.caption_text(doc)
             if len(text):
-                body += f"{DocumentToken.BEG_CAPTION.value}"
+                body += f"<{DocItemLabel.CAPTION.value}>"
+                for caption in self.captions:
+                    body += caption.resolve(doc).get_location_tokens(
+                        doc=doc,
+                        new_line=new_line,
+                        xsize=xsize,
+                        ysize=ysize,
+                    )
                 body += f"{text.strip()}"
-                body += f"{DocumentToken.END_CAPTION.value}"
+                body += f"</{DocItemLabel.CAPTION.value}>"
                 body += f"{new_line}"
-        if add_content and len(self.data.table_cells) > 0:
-            for i, row in enumerate(self.data.grid):
-                body += f"<row_{i}>"
-                for j, col in enumerate(row):
-                    text = ""
-                    if add_cell_text:
-                        text = col.text.strip()
-                    cell_loc = ""
-                    if (
-                        col.bbox is not None
-                        and add_cell_location
-                        and add_page_index
-                        and len(self.prov) > 0
-                    ):
-                        page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
-                        cell_loc = DocumentToken.get_location(
-                            bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
-                            page_w=page_w,
-                            page_h=page_h,
-                            xsize=xsize,
-                            ysize=ysize,
-                            page_i=self.prov[0].page_no,
-                        )
-                    elif (
-                        col.bbox is not None
-                        and add_cell_location
-                        and not add_page_index
-                        and len(self.prov) > 0
-                    ):
-                        page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
-                        cell_loc = DocumentToken.get_location(
-                            bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
-                            page_w=page_w,
-                            page_h=page_h,
-                            xsize=xsize,
-                            ysize=ysize,
-                            page_i=-1,
-                        )
-                    cell_label = ""
-                    if add_cell_label:
-                        if col.column_header:
-                            cell_label = "<col_header>"
-                        elif col.row_header:
-                            cell_label = "<row_header>"
-                        elif col.row_section:
-                            cell_label = "<row_section>"
-                        else:
-                            cell_label = "<body>"
-                    body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
-                body += f"</row_{i}>{new_line}"
-        body += f"{DocumentToken.END_TABLE.value}{new_line}"
+        body += f"</{otsl_tag}>\n"
         return body
@@ -1777,6 +1765,7 @@ class DoclingDocument(BaseModel):
         text: str,
         code_language: Optional[CodeLanguageLabel] = None,
         orig: Optional[str] = None,
+        caption: Optional[Union[TextItem, RefItem]] = None,
         prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
@@ -1786,6 +1775,8 @@ class DoclingDocument(BaseModel):
         :param text: str:
         :param code_language: Optional[str]: (Default value = None)
         :param orig: Optional[str]:  (Default value = None)
+        :param caption: Optional[Union[TextItem:
+        :param RefItem]]:  (Default value = None)
         :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
@@ -1809,6 +1800,8 @@ class DoclingDocument(BaseModel):
             code_item.content_layer = content_layer
         if prov:
             code_item.prov.append(prov)
+        if caption:
+            code_item.captions.append(caption.get_ref())
         self.texts.append(code_item)
         parent.children.append(RefItem(cref=cref))
@@ -1927,6 +1920,7 @@ class DoclingDocument(BaseModel):
                     traverse_pictures=traverse_pictures,
                     page_no=page_no,
                     _level=_level + 1,
+                    included_content_layers=included_content_layers,
                 )
     def _clear_picture_pil_cache(self):
@@ -2132,6 +2126,7 @@ class DoclingDocument(BaseModel):
         indent: int = 4,
         text_width: int = -1,
         page_no: Optional[int] = None,
+        included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
     ):
         """Save to markdown."""
         artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
@@ -2155,6 +2150,7 @@ class DoclingDocument(BaseModel):
             indent=indent,
             text_width=text_width,
             page_no=page_no,
+            included_content_layers=included_content_layers,
         )
         with open(filename, "w", encoding="utf-8") as fw:
@@ -2173,6 +2169,7 @@ class DoclingDocument(BaseModel):
         indent: int = 4,
         text_width: int = -1,
         page_no: Optional[int] = None,
+        included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
     ) -> str:
         r"""Serialize to Markdown.
@@ -2254,7 +2251,12 @@ class DoclingDocument(BaseModel):
             mdtexts.append(text)
         for ix, (item, level) in enumerate(
-            self.iterate_items(self.body, with_groups=True, page_no=page_no)
+            self.iterate_items(
+                self.body,
+                with_groups=True,
+                page_no=page_no,
+                included_content_layers=included_content_layers,
+            )
         ):
             # If we've moved to a lower level, we're exiting one or more groups
             if level < previous_level:
@@ -2423,6 +2425,7 @@ class DoclingDocument(BaseModel):
         page_no: Optional[int] = None,
         html_lang: str = "en",
         html_head: str = _HTML_DEFAULT_HEAD,
+        included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
     ):
         """Save to HTML."""
         artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
@@ -2443,6 +2446,7 @@ class DoclingDocument(BaseModel):
             page_no=page_no,
             html_lang=html_lang,
             html_head=html_head,
+            included_content_layers=included_content_layers,
         )
         with open(filename, "w", encoding="utf-8") as fw:
@@ -2490,6 +2494,7 @@ class DoclingDocument(BaseModel):
         page_no: Optional[int] = None,
         html_lang: str = "en",
         html_head: str = _HTML_DEFAULT_HEAD,
+        included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
     ) -> str:
         r"""Serialize to HTML."""
@@ -2531,7 +2536,12 @@ class DoclingDocument(BaseModel):
             return text
         for ix, (item, curr_level) in enumerate(
-            self.iterate_items(self.body, with_groups=True, page_no=page_no)
+            self.iterate_items(
+                self.body,
+                with_groups=True,
+                page_no=page_no,
+                included_content_layers=included_content_layers,
+            )
         ):
             # If we've moved to a lower level, we're exiting one or more groups
             if curr_level < prev_level and len(in_ordered_list) > 0:
@@ -2708,22 +2718,18 @@ class DoclingDocument(BaseModel):
     def save_as_document_tokens(
         self,
         filename: Path,
-        delim: str = "\n\n",
+        delim: str = "",
         from_element: int = 0,
         to_element: int = sys.maxsize,
-        labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
-        xsize: int = 100,
-        ysize: int = 100,
+        labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
+        xsize: int = 500,
+        ysize: int = 500,
         add_location: bool = True,
         add_content: bool = True,
         add_page_index: bool = True,
         # table specific flags
         add_table_cell_location: bool = False,
-        add_table_cell_label: bool = True,
         add_table_cell_text: bool = True,
-        # specifics
-        page_no: Optional[int] = None,
-        with_groups: bool = True,
     ):
         r"""Save the document content to a DocumentToken format."""
         out = self.export_to_document_tokens(
@@ -2738,198 +2744,230 @@ class DoclingDocument(BaseModel):
             add_page_index=add_page_index,
             # table specific flags
             add_table_cell_location=add_table_cell_location,
-            add_table_cell_label=add_table_cell_label,
             add_table_cell_text=add_table_cell_text,
-            # specifics
-            page_no=page_no,
-            with_groups=with_groups,
         )
         with open(filename, "w", encoding="utf-8") as fw:
             fw.write(out)
-    def export_to_document_tokens(
+    def export_to_document_tokens(  # noqa: C901
         self,
-        delim: str = "\n",
+        delim: str = "",
         from_element: int = 0,
         to_element: int = sys.maxsize,
-        labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
-        xsize: int = 100,
-        ysize: int = 100,
+        labels: set[DocItemLabel] = DOCUMENT_TOKENS_EXPORT_LABELS,
+        xsize: int = 500,
+        ysize: int = 500,
         add_location: bool = True,
         add_content: bool = True,
         add_page_index: bool = True,
         # table specific flags
         add_table_cell_location: bool = False,
-        add_table_cell_label: bool = True,
         add_table_cell_text: bool = True,
-        # specifics
-        page_no: Optional[int] = None,
-        with_groups: bool = True,
-        newline: bool = True,
     ) -> str:
         r"""Exports the document content to a DocumentToken format.
         Operates on a slice of the document's body as defined through arguments
         from_element and to_element; defaulting to the whole main_text.
-        :param delim: str:  (Default value = "\n\n")
+        :param delim: str:  (Default value = "")
         :param from_element: int:  (Default value = 0)
         :param to_element: Optional[int]:  (Default value = None)
         :param labels: set[DocItemLabel]
-        :param xsize: int:  (Default value = 100)
-        :param ysize: int:  (Default value = 100)
+        :param xsize: int:  (Default value = 500)
+        :param ysize: int:  (Default value = 500)
         :param add_location: bool:  (Default value = True)
         :param add_content: bool:  (Default value = True)
         :param add_page_index: bool:  (Default value = True)
         :param # table specific flagsadd_table_cell_location: bool
-        :param add_table_cell_label: bool:  (Default value = True)
         :param add_table_cell_text: bool:  (Default value = True)
         :returns: The content of the document formatted as a DocTags string.
         :rtype: str
         """
-        def close_lists(
-            curr_level: int,
-            prev_level: int,
-            in_ordered_list: List[bool],
-            result: str,
-            delim: str,
-        ):
-            if len(in_ordered_list) == 0:
-                return (in_ordered_list, result)
-            while curr_level < prev_level and len(in_ordered_list) > 0:
-                if in_ordered_list[-1]:
-                    result += f"</ordered_list>{delim}"
+        def _close_lists(
+            current_level: int,
+            previous_level: int,
+            ordered_list_stack: List[bool],
+            output_parts: List[str],
+        ) -> List[bool]:
+            """Close open list tags until the nesting level matches item's level."""
+            while current_level < previous_level and ordered_list_stack:
+                last_is_ordered = ordered_list_stack.pop()
+                if last_is_ordered:
+                    output_parts.append(f"</{DocumentToken.ORDERED_LIST.value}>\n")
                 else:
-                    result += f"</unordered_list>{delim}"
-                prev_level -= 1
-                in_ordered_list.pop()  # = in_ordered_list[:-1]
-            return (in_ordered_list, result)
-        if newline:
-            delim = "\n"
-        else:
-            delim = ""
-        prev_level = 0  # Track the previous item's level
-        in_ordered_list: List[bool] = []  # False
-        result = f"{DocumentToken.BEG_DOCUMENT.value}{delim}"
-        for ix, (item, curr_level) in enumerate(
-            self.iterate_items(self.body, with_groups=True)
+                    output_parts.append(f"</{DocumentToken.UNORDERED_LIST.value}>\n")
+                previous_level -= 1
+            return ordered_list_stack
+        def _add_page_break_if_needed(
+            output_parts: List[str],
+            item,
+            prev_page_no,
+            page_break_enabled: bool,
         ):
-            # If we've moved to a lower level, we're exiting one or more groups
-            if curr_level < prev_level and len(in_ordered_list) > 0:
-                # Calculate how many levels we've exited
-                # level_difference = previous_level - level
-                # Decrement list_nesting_level for each list group we've exited
-                # list_nesting_level = max(0, list_nesting_level - level_difference)
-                in_ordered_list, result = close_lists(
-                    curr_level=curr_level,
-                    prev_level=prev_level,
-                    in_ordered_list=in_ordered_list,
-                    result=result,
-                    delim=delim,
+            """Inserts a page-break token.
+            Inserts a page-break token if the item's page number is different
+            from the previous item and page breaks are enabled.
+            Returns the updated output_parts list and the current page number.
+            """
+            if not page_break_enabled:
+                return output_parts, prev_page_no
+            if not item.prov:
+                return output_parts, prev_page_no
+            current_page_no = item.prov[0].page_no
+            if prev_page_no is None:
+                return output_parts, current_page_no
+            if current_page_no != prev_page_no:
+                output_parts.append(f"<{DocumentToken.PAGE_BREAK.value}>\n")
+            return output_parts, current_page_no
+        def _get_standalone_captions(document_body):
+            """Identify captions that are not attached to any table or figure."""
+            all_captions = set()
+            matched_captions = set()
+            for item, _ in self.iterate_items(document_body, with_groups=True):
+                if item.label == DocItemLabel.CAPTION:
+                    all_captions.update([item.self_ref])
+                if item.label in [DocItemLabel.PICTURE, DocItemLabel.TABLE]:
+                    matched_captions.update([caption.cref for caption in item.captions])
+            return all_captions - matched_captions
+        # Initialization
+        output_parts: List[str] = []
+        ordered_list_stack: List[bool] = []
+        previous_level = 0
+        previous_page_no = None
+        # Precompute standalone captions
+        standalone_captions = _get_standalone_captions(self.body)
+        # Begin document
+        output_parts.append(f"<{DocumentToken.DOCUMENT.value}>{delim}")
+        for ix, (item, current_level) in enumerate(
+            self.iterate_items(
+                self.body,
+                with_groups=True,
+                included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE},
+            )
+        ):
+            # Close lists if we've moved to a lower nesting level
+            if current_level < previous_level and ordered_list_stack:
+                ordered_list_stack = _close_lists(
+                    current_level, previous_level, ordered_list_stack, output_parts
                 )
+            previous_level = current_level
-            prev_level = curr_level  # Update previous_level for next iteration
-            if ix < from_element or to_element <= ix:
-                continue  # skip as many items as you want
-            if (isinstance(item, DocItem)) and (item.label not in labels):
-                continue  # skip any label that is not whitelisted
-            if isinstance(item, GroupItem) and item.label in [
-                GroupLabel.ORDERED_LIST,
-            ]:
+            # Skip items outside the specified element range
+            if ix < from_element or ix >= to_element:
+                continue
-                result += f"<ordered_list>{delim}"
-                in_ordered_list.append(True)
+            # Skip items whose label is not in the allowed set
+            if isinstance(item, DocItem) and (item.label not in labels):
+                continue
-            elif isinstance(item, GroupItem) and item.label in [
-                GroupLabel.LIST,
-            ]:
+            # Skip captions that are not standalone as they will be included below
+            # by the export functions of Table and Picture
+            if (
+                isinstance(item, TextItem)
+                and item.label == DocItemLabel.CAPTION
+                and item.self_ref not in standalone_captions
+            ):
+                continue
-                result += f"<unordered_list>{delim}"
-                in_ordered_list.append(False)
+            # Handle list groups
+            if isinstance(item, GroupItem):
+                if item.label == GroupLabel.ORDERED_LIST:
+                    output_parts.append(f"<{DocumentToken.ORDERED_LIST.value}>{delim}")
+                    ordered_list_stack.append(True)
+                elif item.label == GroupLabel.LIST:
+                    output_parts.append(
+                        f"<{DocumentToken.UNORDERED_LIST.value}>{delim}"
+                    )
+                    ordered_list_stack.append(False)
+                continue
-            elif isinstance(item, SectionHeaderItem):
+            # For other item types, optionally insert page-break if the page changed
+            output_parts, previous_page_no = _add_page_break_if_needed(
+                output_parts, item, previous_page_no, add_page_index
+            )
-                result += item.export_to_document_tokens(
-                    doc=self,
-                    new_line=delim,
-                    xsize=xsize,
-                    ysize=ysize,
-                    add_location=add_location,
-                    add_content=add_content,
-                    add_page_index=add_page_index,
+            if isinstance(item, SectionHeaderItem):
+                output_parts.append(
+                    item.export_to_document_tokens(
+                        doc=self,
+                        new_line=delim,
+                        xsize=xsize,
+                        ysize=ysize,
+                        add_location=add_location,
+                        add_content=add_content,
+                    )
                 )
-            elif isinstance(item, CodeItem) and (item.label in labels):
-                result += item.export_to_document_tokens(
-                    doc=self,
-                    new_line=delim,
-                    xsize=xsize,
-                    ysize=ysize,
-                    add_location=add_location,
-                    add_content=add_content,
-                    add_page_index=add_page_index,
+            elif isinstance(item, CodeItem):
+                output_parts.append(
+                    item.export_to_document_tokens(
+                        doc=self,
+                        new_line=delim,
+                        xsize=xsize,
+                        ysize=ysize,
+                        add_location=add_location,
+                        add_content=add_content,
+                    )
                 )
-            elif isinstance(item, TextItem) and (item.label in labels):
-                result += item.export_to_document_tokens(
-                    doc=self,
-                    new_line=delim,
-                    xsize=xsize,
-                    ysize=ysize,
-                    add_location=add_location,
-                    add_content=add_content,
-                    add_page_index=add_page_index,
+            elif isinstance(item, TextItem):
+                output_parts.append(
+                    item.export_to_document_tokens(
+                        doc=self,
+                        new_line=delim,
+                        xsize=xsize,
+                        ysize=ysize,
+                        add_location=add_location,
+                        add_content=add_content,
+                    )
                 )
-            elif isinstance(item, TableItem) and (item.label in labels):
-                result += item.export_to_document_tokens(
-                    doc=self,
-                    new_line=delim,
-                    xsize=xsize,
-                    ysize=ysize,
-                    add_caption=True,
-                    add_location=add_location,
-                    add_content=add_content,
-                    add_cell_location=add_table_cell_location,
-                    add_cell_label=add_table_cell_label,
-                    add_cell_text=add_table_cell_text,
-                    add_page_index=add_page_index,
+            elif isinstance(item, TableItem):
+                output_parts.append(
+                    item.export_to_document_tokens(
+                        doc=self,
+                        new_line=delim,
+                        xsize=xsize,
+                        ysize=ysize,
+                        add_location=add_location,
+                        add_cell_location=add_table_cell_location,
+                        add_cell_text=add_table_cell_text,
+                        add_caption=True,
+                    )
                 )
-            elif isinstance(item, PictureItem) and (item.label in labels):
-                result += item.export_to_document_tokens(
-                    doc=self,
-                    new_line=delim,
-                    xsize=xsize,
-                    ysize=ysize,
-                    add_caption=True,
-                    add_location=add_location,
-                    add_content=add_content,
-                    add_page_index=add_page_index,
+            elif isinstance(item, PictureItem):
+                output_parts.append(
+                    item.export_to_document_tokens(
+                        doc=self,
+                        new_line=delim,
+                        xsize=xsize,
+                        ysize=ysize,
+                        add_caption=True,
+                        add_location=add_location,
+                        add_content=add_content,
+                    )
                 )
-        result += DocumentToken.END_DOCUMENT.value
+        # End any lists that might still be open
+        ordered_list_stack = _close_lists(
+            0, previous_level, ordered_list_stack, output_parts
+        )
-        return result
+        # End document
+        output_parts.append(f"</{DocumentToken.DOCUMENT.value}>")
+        return "".join(output_parts)
     def _export_to_indented_text(
         self, indent="  ", max_text_len: int = -1, explicit_tables: bool = False

docling_core/types/doc/labels.py CHANGED Viewed

@@ -111,7 +111,7 @@ class PictureClassificationLabel(str, Enum):
     SIGNATURE = "signature"
     STAMP = "stamp"
     QR_CODE = "qr_code"
-    BAR_CODE = "bat_code"
+    BAR_CODE = "bar_code"
     SCREENSHOT = "screenshot"
     # Geology/Geography

docling_core/types/doc/tokens.py CHANGED Viewed

@@ -8,13 +8,15 @@
 from enum import Enum
 from typing import Tuple
+from docling_core.types.doc.labels import PictureClassificationLabel
 class TableToken(Enum):
     """Class to represent an LLM friendly representation of a Table."""
     CELL_LABEL_COLUMN_HEADER = "<column_header>"
     CELL_LABEL_ROW_HEADER = "<row_header>"
-    CELL_LABEL_SECTION_HEADERE = "<section_header>"
+    CELL_LABEL_SECTION_HEADER = "<shed>"
     CELL_LABEL_DATA = "<data>"
     OTSL_ECEL = "<ecel>"  # empty cell
@@ -42,83 +44,30 @@ class TableToken(Enum):
 class DocumentToken(Enum):
     """Class to represent an LLM friendly representation of a Document."""
-    BEG_DOCUMENT = "<document>"
-    END_DOCUMENT = "</document>"
-    BEG_TITLE = "<title>"
-    END_TITLE = "</title>"
-    BEG_ABSTRACT = "<abstract>"
-    END_ABSTRACT = "</abstract>"
-    BEG_DOI = "<doi>"
-    END_DOI = "</doi>"
-    BEG_DATE = "<date>"
-    END_DATE = "</date>"
-    BEG_AUTHORS = "<authors>"
-    END_AUTHORS = "</authors>"
-    BEG_AUTHOR = "<author>"
-    END_AUTHOR = "</author>"
-    BEG_AFFILIATIONS = "<affiliations>"
-    END_AFFILIATIONS = "</affiliations>"
-    BEG_AFFILIATION = "<affiliation>"
-    END_AFFILIATION = "</affiliation>"
-    BEG_HEADER = "<section-header>"
-    END_HEADER = "</section-header>"
-    BEG_TEXT = "<text>"
-    END_TEXT = "</text>"
-    BEG_PARAGRAPH = "<paragraph>"
-    END_PARAGRAPH = "</paragraph>"
-    BEG_TABLE = "<table>"
-    END_TABLE = "</table>"
-    BEG_FIGURE = "<figure>"
-    END_FIGURE = "</figure>"
-    BEG_CAPTION = "<caption>"
-    END_CAPTION = "</caption>"
-    BEG_EQUATION = "<equation>"
-    END_EQUATION = "</equation>"
-    BEG_LIST = "<list>"
-    END_LIST = "</list>"
-    BEG_LISTITEM = "<list-item>"
-    END_LISTITEM = "</list-item>"
-    BEG_LOCATION = "<location>"
-    END_LOCATION = "</location>"
-    BEG_GROUP = "<group>"
-    END_GROUP = "</group>"
+    DOCUMENT = "doctag"
+    OTSL = "otsl"
+    ORDERED_LIST = "ordered_list"
+    UNORDERED_LIST = "unordered_list"
+    LOC = "loc_"
+    PAGE_BREAK = "page_break"
     @classmethod
     def get_special_tokens(
         cls,
-        max_rows: int = 100,
-        max_cols: int = 100,
-        max_pages: int = 1000,
         page_dimension: Tuple[int, int] = (100, 100),
     ):
         """Function to get all special document tokens."""
         special_tokens = [token.value for token in cls]
-        # Adding dynamically generated row and col tokens
-        for i in range(0, max_rows + 1):
-            special_tokens += [f"<row_{i}>", f"</row_{i}>"]
-        for i in range(0, max_cols + 1):
-            special_tokens += [f"<col_{i}>", f"</col_{i}>"]
-        for i in range(6):
-            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
-        # FIXME: this is synonym of section header
         for i in range(6):
-            special_tokens += [f"<subtitle-level-{i}>", f"</subtitle-level-{i}>"]
+            special_tokens += [
+                f"<section_header_level_{i}>",
+                f"</section_header_level_{i}>",
+            ]
-        # Adding dynamically generated page-tokens
-        for i in range(0, max_pages + 1):
-            special_tokens.append(f"<page_{i}>")
-            special_tokens.append(f"</page_{i}>")
+        # Add dynamically picture classification tokens
+        for _, member in PictureClassificationLabel.__members__.items():
+            special_tokens.append(f"<{member}>")
         # Adding dynamically generated location-tokens
         for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
@@ -132,25 +81,9 @@ class DocumentToken(Enum):
         return label in DocumentToken.get_special_tokens()
     @staticmethod
-    def get_row_token(row: int, beg=bool) -> str:
-        """Function to get page tokens."""
-        if beg:
-            return f"<row_{row}>"
-        else:
-            return f"</row_{row}>"
-    @staticmethod
-    def get_col_token(col: int, beg=bool) -> str:
-        """Function to get page tokens."""
-        if beg:
-            return f"<col_{col}>"
-        else:
-            return f"</col_{col}>"
-    @staticmethod
-    def get_page_token(page: int):
-        """Function to get page tokens."""
-        return f"<page_{page}>"
+    def get_picture_classification_token(classification: str) -> str:
+        """Function to get picture classification tokens."""
+        return f"<{classification}>"
     @staticmethod
     def get_location_token(val: float, rnorm: int = 100):
@@ -172,7 +105,6 @@ class DocumentToken(Enum):
         page_h: float,
         xsize: int = 100,
         ysize: int = 100,
-        page_i: int = -1,
     ):
         """Get the location string give bbox and page-dim."""
         assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
@@ -183,17 +115,11 @@ class DocumentToken(Enum):
         x1 = bbox[2] / page_w
         y1 = bbox[3] / page_h
-        page_tok = ""
-        if page_i != -1:
-            page_tok = DocumentToken.get_page_token(page=page_i)
         x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
         y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
         x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
         y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)
-        loc_str = f"{DocumentToken.BEG_LOCATION.value}"
-        loc_str += f"{page_tok}{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
-        loc_str += f"{DocumentToken.END_LOCATION.value}"
+        loc_str = f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
         return loc_str

{docling_core-2.18.1.dist-info → docling_core-2.19.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-core
-Version: 2.18.1
+Version: 2.19.1
 Summary: A python library to define and validate data types in Docling.
 Home-page: https://ds4sd.github.io/
 License: MIT

{docling_core-2.18.1.dist-info → docling_core-2.19.1.dist-info}/RECORD RENAMED Viewed

@@ -18,15 +18,15 @@ docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75
 docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9ACDd57ds,106
 docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
 docling_core/transforms/chunker/base.py,sha256=BSWTiFOsF5YaZaZJZY8nwIdOXb9uufJMRIds7LxRNh8,2546
-docling_core/transforms/chunker/hierarchical_chunker.py,sha256=cy3sE9w_7l-uoIEUcfnZlQweDHUoyAJTQ6IkzxxVjFY,8052
+docling_core/transforms/chunker/hierarchical_chunker.py,sha256=MStDUDtzFGc6j8v9AkcAnnSHTDxdoiVrp8FTmRdGqU8,8138
 docling_core/transforms/chunker/hybrid_chunker.py,sha256=kokjDdxjc_gygOokQwYFVnHv2NjWTgf9uex8o0ole7w,9876
 docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
 docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
 docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
 docling_core/types/doc/base.py,sha256=lMRNq1DUK7K26L2VNZRqFaItCSZ6m9BdYTVaJA98PZQ,11495
-docling_core/types/doc/document.py,sha256=Rn2hA0LPpnt7tGJOD2ME6t5x8R42mttPFD2Ks2cbvVU,102698
-docling_core/types/doc/labels.py,sha256=8Luymal9SKXTwyqq1ONKiUTxuMo_nRMYfBkRPFkdSSo,5306
-docling_core/types/doc/tokens.py,sha256=GMtm5TsNljBPaMYkgmD3WWZmC0FHqKF9imKEEySz4ps,6020
+docling_core/types/doc/document.py,sha256=t1nk1GeR5_YvZhuWUVZkkBekp89vFB4RBtMuwD3Acw4,104373
+docling_core/types/doc/labels.py,sha256=cqH4DGN9lgZns6gOtL5urzZzUPGOjHJ75xQbIKSh_h8,5306
+docling_core/types/doc/tokens.py,sha256=i73PXkmqXCLsQ5SddnJX8L9e_Ub2_K_DYSE-VE8NDq0,3925
 docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
 docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
 docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
@@ -56,8 +56,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
 docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
 docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
 docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
-docling_core-2.18.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
-docling_core-2.18.1.dist-info/METADATA,sha256=XV0FP0Uuqjra5G7O6ENd-0FwCc6TK6XasiXbHnCSGEA,5803
-docling_core-2.18.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling_core-2.18.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
-docling_core-2.18.1.dist-info/RECORD,,
+docling_core-2.19.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
+docling_core-2.19.1.dist-info/METADATA,sha256=Uz-AUOD2_itxSEVxatsPbCQ0pFBE3fMX-gXx0YLmsKw,5803
+docling_core-2.19.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling_core-2.19.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
+docling_core-2.19.1.dist-info/RECORD,,

{docling_core-2.18.1.dist-info → docling_core-2.19.1.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling_core-2.18.1.dist-info → docling_core-2.19.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{docling_core-2.18.1.dist-info → docling_core-2.19.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

docling-core 2.18.1__py3-none-any.whl → 2.19.1__py3-none-any.whl

Potentially problematic release.

docling-core 2.18.1__py3-none-any.whl → 2.19.1__py3-none-any.whl