PyPI - docling-core - Versions diffs - 2.38.2__py3-none-any.whl → 2.40.0__py3-none-any.whl - Mend

docling-core: 2.38.2 (py3-none-any.whl) → 2.40.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling-core might be problematic. Click here for more details.

Files changed (19)

docling_core/transforms/chunker/hierarchical_chunker.py CHANGED Viewed

@@ -35,11 +35,10 @@ from docling_core.types.doc.document import (
     DocumentOrigin,
     InlineGroup,
     LevelNumber,
-    OrderedList,
+    ListGroup,
     SectionHeaderItem,
     TableItem,
     TitleItem,
-    UnorderedList,
 )
 _VERSION: Final = "1.0.0"
@@ -240,7 +239,7 @@ class HierarchicalChunker(BaseChunker):
                     heading_by_level.pop(k, None)
                 continue
             elif (
-                isinstance(item, (OrderedList, UnorderedList, InlineGroup, DocItem))
+                isinstance(item, (ListGroup, InlineGroup, DocItem))
                 and item.self_ref not in visited
             ):
                 ser_res = my_doc_ser.serialize(item=item, visited=visited)

docling_core/transforms/serializer/base.py CHANGED Viewed

@@ -17,12 +17,11 @@ from docling_core.types.doc.document import (
     FormItem,
     InlineGroup,
     KeyValueItem,
+    ListGroup,
     NodeItem,
-    OrderedList,
     PictureItem,
     TableItem,
     TextItem,
-    UnorderedList,
 )
@@ -128,7 +127,7 @@ class BaseListSerializer(ABC):
     def serialize(
         self,
         *,
-        item: Union[UnorderedList, OrderedList],
+        item: ListGroup,
         doc_serializer: "BaseDocSerializer",
         doc: DoclingDocument,
         **kwargs: Any,

docling_core/transforms/serializer/common.py CHANGED Viewed

@@ -39,8 +39,8 @@ from docling_core.types.doc.document import (
     FormItem,
     InlineGroup,
     KeyValueItem,
+    ListGroup,
     NodeItem,
-    OrderedList,
     PictureClassificationData,
     PictureDataType,
     PictureItem,
@@ -49,7 +49,6 @@ from docling_core.types.doc.document import (
     TableAnnotationType,
     TableItem,
     TextItem,
-    UnorderedList,
 )
 from docling_core.types.doc.labels import DocItemLabel
@@ -89,7 +88,7 @@ def _iterate_items(
     ):
         if add_page_breaks:
             if (
-                isinstance(item, (UnorderedList, OrderedList, InlineGroup))
+                isinstance(item, (ListGroup, InlineGroup))
                 and item.self_ref not in my_visited
             ):
                 # if group starts with new page, yield page break before group node
@@ -316,7 +315,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
         ########
         # groups
         ########
-        if isinstance(item, (UnorderedList, OrderedList)):
+        if isinstance(item, ListGroup):
             part = self.list_serializer.serialize(
                 item=item,
                 doc_serializer=self,

docling_core/transforms/serializer/doctags.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """Define classes for Doctags serialization."""
 from enum import Enum
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional
 from pydantic import BaseModel
 from typing_extensions import override
@@ -34,9 +34,9 @@ from docling_core.types.doc.document import (
     FormItem,
     InlineGroup,
     KeyValueItem,
+    ListGroup,
     ListItem,
     NodeItem,
-    OrderedList,
     PictureClassificationData,
     PictureItem,
     PictureMoleculeData,
@@ -44,7 +44,6 @@ from docling_core.types.doc.document import (
     ProvenanceItem,
     TableItem,
     TextItem,
-    UnorderedList,
 )
 from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel
 from docling_core.types.doc.tokens import DocumentToken
@@ -376,7 +375,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
     def serialize(
         self,
         *,
-        item: Union[UnorderedList, OrderedList],
+        item: ListGroup,
         doc_serializer: "BaseDocSerializer",
         doc: DoclingDocument,
         list_level: int = 0,
@@ -406,7 +405,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
             text_res = f"{text_res}{delim}"
             wrap_tag = (
                 DocumentToken.ORDERED_LIST.value
-                if isinstance(item, OrderedList)
+                if item.first_item_is_enumerated(doc)
                 else DocumentToken.UNORDERED_LIST.value
             )
             text_res = _wrap(text=text_res, wrap_tag=wrap_tag)

docling_core/transforms/serializer/html.py CHANGED Viewed

@@ -58,9 +58,9 @@ from docling_core.types.doc.document import (
     ImageRef,
     InlineGroup,
     KeyValueItem,
+    ListGroup,
     ListItem,
     NodeItem,
-    OrderedList,
     PictureClassificationData,
     PictureItem,
     PictureMoleculeData,
@@ -70,7 +70,6 @@ from docling_core.types.doc.document import (
     TableItem,
     TextItem,
     TitleItem,
-    UnorderedList,
 )
 from docling_core.types.doc.labels import DocItemLabel
 from docling_core.types.doc.utils import (
@@ -117,6 +116,8 @@ class HTMLParams(CommonParams):
     include_annotations: bool = True
+    show_original_list_item_marker: bool = True
 class HTMLTextSerializer(BaseModel, BaseTextSerializer):
     """HTML-specific text item serializer."""
@@ -162,7 +163,19 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
         elif isinstance(item, ListItem):
             # List items are handled by list serializer
             text_inner = self._prepare_content(item.text)
-            text = get_html_tag_with_text_direction(html_tag="li", text=text_inner)
+            text = (
+                get_html_tag_with_text_direction(
+                    html_tag="li",
+                    text=text_inner,
+                    attrs=(
+                        {"style": f"list-style-type: '{item.marker} ';"}
+                        if params.show_original_list_item_marker and item.marker
+                        else {}
+                    ),
+                )
+                if text_inner
+                else ""
+            )
         elif is_inline_scope:
             text = self._prepare_content(item.text)
@@ -680,7 +693,7 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
     def serialize(
         self,
         *,
-        item: Union[UnorderedList, OrderedList],
+        item: ListGroup,
         doc_serializer: "BaseDocSerializer",
         doc: DoclingDocument,
         list_level: int = 0,
@@ -690,7 +703,7 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
     ) -> SerializationResult:
         """Serializes a list to HTML."""
         my_visited: set[str] = visited if visited is not None else set()
+        params = HTMLParams(**kwargs)
         # Get all child parts
         parts = doc_serializer.get_parts(
             item=item,
@@ -706,17 +719,51 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
                 (
                     p.text
                     if (
-                        (p.text.startswith("<li>") and p.text.endswith("</li>"))
-                        or (p.text.startswith("<ol>") and p.text.endswith("</ol>"))
-                        or (p.text.startswith("<ul>") and p.text.endswith("</ul>"))
+                        (
+                            p.text.startswith(("<li>", "<li "))
+                            and p.text.endswith("</li>")
+                        )
+                        or (
+                            p.text.startswith(("<ol>", "<ol "))
+                            and p.text.endswith("</ol>")
+                        )
+                        or (
+                            p.text.startswith(("<ul>", "<ul "))
+                            and p.text.endswith("</ul>")
+                        )
+                    )
+                    else (
+                        get_html_tag_with_text_direction(
+                            html_tag="li",
+                            text=p.text,
+                            attrs=(
+                                {
+                                    "style": f"list-style-type: '{grandparent_item.marker} ';"
+                                }
+                                if params.show_original_list_item_marker
+                                and grandparent_item.marker
+                                else {}
+                            ),
+                        )
+                        if p.spans
+                        and p.spans[0].item.parent
+                        and isinstance(
+                            (parent_item := p.spans[0].item.parent.resolve(doc)),
+                            InlineGroup,
+                        )
+                        and parent_item.parent
+                        and isinstance(
+                            (grandparent_item := parent_item.parent.resolve(doc)),
+                            ListItem,
+                        )
+                        else f"<li>{p.text}</li>"
                     )
-                    else f"<li>{p.text}</li>"
                 )
                 for p in parts
             ]
         )
         if text_res:
-            tag = "ol" if isinstance(item, OrderedList) else "ul"
+            tag = "ol" if item.first_item_is_enumerated(doc) else "ul"
             text_res = f"<{tag}>\n{text_res}\n</{tag}>"
         return create_ser_result(text=text_res, span_source=parts)

docling_core/transforms/serializer/markdown.py CHANGED Viewed

@@ -7,6 +7,7 @@
 import html
 import re
 import textwrap
+from enum import Enum
 from pathlib import Path
 from typing import Any, Optional, Union
@@ -31,7 +32,6 @@ from docling_core.transforms.serializer.common import (
     CommonParams,
     DocSerializer,
     _get_annotation_text,
-    _PageBreakSerResult,
     create_ser_result,
 )
 from docling_core.types.doc.base import ImageRefMode
@@ -48,8 +48,9 @@ from docling_core.types.doc.document import (
     ImageRef,
     InlineGroup,
     KeyValueItem,
+    ListGroup,
+    ListItem,
     NodeItem,
-    OrderedList,
     PictureClassificationData,
     PictureItem,
     PictureMoleculeData,
@@ -58,7 +59,6 @@ from docling_core.types.doc.document import (
     TableItem,
     TextItem,
     TitleItem,
-    UnorderedList,
 )
@@ -79,6 +79,14 @@ def _get_annotation_ser_result(
     )
+class OrigListItemMarkerMode(str, Enum):
+    """Display mode for original list item marker."""
+    NEVER = "never"
+    ALWAYS = "always"
+    AUTO = "auto"
 class MarkdownParams(CommonParams):
     """Markdown-specific serialization parameters."""
@@ -93,6 +101,8 @@ class MarkdownParams(CommonParams):
     escape_html: bool = True
     include_annotations: bool = True
     mark_annotations: bool = False
+    orig_list_item_marker_mode: OrigListItemMarkerMode = OrigListItemMarkerMode.AUTO
+    ensure_valid_list_item_marker: bool = True
 class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
@@ -117,7 +127,7 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
         escape_html = True
         escape_underscores = True
         processing_pending = True
-        if isinstance(item, (TitleItem, SectionHeaderItem)):
+        if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
             # case where processing/formatting should be applied first (in inner scope)
             processing_pending = False
             if (
@@ -127,7 +137,7 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
                     (child_group := item.children[0].resolve(doc)), InlineGroup
                 )
             ):
-                # case of heading with inline
+                # case of inline within heading / list item
                 ser_res = doc_serializer.serialize(item=child_group)
                 text = ser_res.text
                 for span in ser_res.spans:
@@ -140,8 +150,55 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
                     formatting=item.formatting,
                     hyperlink=item.hyperlink,
                 )
-            num_hashes = 1 if isinstance(item, TitleItem) else item.level + 1
-            text_part = f"{num_hashes * '#'} {text}"
+            if isinstance(item, ListItem):
+                pieces: list[str] = []
+                case_auto = (
+                    params.orig_list_item_marker_mode == OrigListItemMarkerMode.AUTO
+                    and bool(re.search(r"[a-zA-Z0-9]", item.marker))
+                )
+                case_already_valid = (
+                    params.ensure_valid_list_item_marker
+                    and params.orig_list_item_marker_mode
+                    != OrigListItemMarkerMode.NEVER
+                    and (
+                        item.marker in ["-", "*", "+"]
+                        or re.fullmatch(r"\d+\.", item.marker)
+                    )
+                )
+                # wrap with outer marker (if applicable)
+                if params.ensure_valid_list_item_marker and not case_already_valid:
+                    assert item.parent and isinstance(
+                        (list_group := item.parent.resolve(doc)), ListGroup
+                    )
+                    if list_group.first_item_is_enumerated(doc) and (
+                        params.orig_list_item_marker_mode != OrigListItemMarkerMode.AUTO
+                        or not item.marker
+                    ):
+                        pos = -1
+                        for i, child in enumerate(list_group.children):
+                            if child.resolve(doc) == item:
+                                pos = i
+                                break
+                        md_marker = f"{pos + 1}."
+                    else:
+                        md_marker = "-"
+                    pieces.append(md_marker)
+                # include original marker (if applicable)
+                if item.marker and (
+                    params.orig_list_item_marker_mode == OrigListItemMarkerMode.ALWAYS
+                    or case_auto
+                    or case_already_valid
+                ):
+                    pieces.append(item.marker)
+                pieces.append(text)
+                text_part = " ".join(pieces)
+            else:
+                num_hashes = 1 if isinstance(item, TitleItem) else item.level + 1
+                text_part = f"{num_hashes * '#'} {text}"
         elif isinstance(item, CodeItem):
             text_part = f"`{text}`" if is_inline_scope else f"```\n{text}\n```"
             escape_html = False
@@ -452,7 +509,7 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
     def serialize(
         self,
         *,
-        item: Union[UnorderedList, OrderedList],
+        item: ListGroup,
         doc_serializer: "BaseDocSerializer",
         doc: DoclingDocument,
         list_level: int = 0,
@@ -473,27 +530,24 @@ class MarkdownListSerializer(BaseModel, BaseListSerializer):
         sep = "\n"
         my_parts: list[SerializationResult] = []
         for p in parts:
-            if p.text and p.text[0] == " " and my_parts:
-                my_parts[-1].text = sep.join([my_parts[-1].text, p.text])  # update last
+            if (
+                my_parts
+                and p.text
+                and p.spans
+                and p.spans[0].item.parent
+                and isinstance(p.spans[0].item.parent.resolve(doc), InlineGroup)
+            ):
+                my_parts[-1].text = f"{my_parts[-1].text}{p.text}"  # append to last
                 my_parts[-1].spans.extend(p.spans)
             else:
                 my_parts.append(p)
         indent_str = list_level * params.indent * " "
-        is_ol = isinstance(item, OrderedList)
         text_res = sep.join(
             [
                 # avoid additional marker on already evaled sublists
-                (
-                    c.text
-                    if c.text and c.text[0] == " "
-                    else (
-                        f"{indent_str}"
-                        f"{'' if isinstance(c, _PageBreakSerResult) else (f'{i + 1}. ' if is_ol else '- ')}"  # noqa: E501
-                        f"{c.text}"
-                    )
-                )
-                for i, c in enumerate(my_parts)
+                (c.text if c.text and c.text[0] == " " else f"{indent_str}{c.text}")
+                for c in my_parts
             ]
         )
         return create_ser_result(text=text_res, span_source=my_parts)

docling_core/types/doc/__init__.py CHANGED Viewed

@@ -32,6 +32,7 @@ from .document import (
     ImageRef,
     InlineGroup,
     KeyValueItem,
+    ListGroup,
     ListItem,
     MiscAnnotation,
     NodeItem,

docling_core/types/doc/document.py CHANGED Viewed

@@ -54,7 +54,7 @@ _logger = logging.getLogger(__name__)
 Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
 LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
-CURRENT_VERSION: Final = "1.4.0"
+CURRENT_VERSION: Final = "1.5.0"
 DEFAULT_EXPORT_LABELS = {
     DocItemLabel.TITLE,
@@ -133,12 +133,6 @@ class MiscAnnotation(BaseAnnotation):
     content: Dict[str, Any]
-# deprecated aliases:
-BasePictureData = BaseAnnotation
-PictureDescriptionData = DescriptionAnnotation
-PictureMiscData = MiscAnnotation
 class ChartLine(BaseModel):
     """Represents a line in a line chart.
@@ -737,9 +731,11 @@ class ProvenanceItem(BaseModel):
 class ContentLayer(str, Enum):
     """ContentLayer."""
-    BODY = "body"
-    FURNITURE = "furniture"
-    BACKGROUND = "background"
+    BODY = "body"  # main content of the document
+    FURNITURE = "furniture"  # eg page-headers/footers
+    BACKGROUND = "background"  # eg watermarks
+    INVISIBLE = "invisible"  # hidden or invisible text
+    NOTES = "notes"  # author/speaker notes, corrections, etc
 DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY}
@@ -860,12 +856,27 @@ class GroupItem(NodeItem):  # Container type, can't be a leaf node
     label: GroupLabel = GroupLabel.UNSPECIFIED
-class UnorderedList(GroupItem):
-    """UnorderedList."""
+class ListGroup(GroupItem):
+    """ListGroup."""
     label: typing.Literal[GroupLabel.LIST] = GroupLabel.LIST  # type: ignore[assignment]
+    @field_validator("label", mode="before")
+    @classmethod
+    def patch_ordered(cls, value):
+        """patch_ordered."""
+        return GroupLabel.LIST if value == GroupLabel.ORDERED_LIST else value
+    def first_item_is_enumerated(self, doc: "DoclingDocument"):
+        """Whether the first list item is enumerated."""
+        return (
+            len(self.children) > 0
+            and isinstance(first_child := self.children[0].resolve(doc), ListItem)
+            and first_child.enumerated
+        )
+@deprecated("Use ListGroup instead.")
 class OrderedList(GroupItem):
     """OrderedList."""
@@ -1752,7 +1763,7 @@ class DoclingDocument(BaseModel):
     )  # List[RefItem] = []
     body: GroupItem = GroupItem(name="_root_", self_ref="#/body")  # List[RefItem] = []
-    groups: List[Union[OrderedList, UnorderedList, InlineGroup, GroupItem]] = []
+    groups: List[Union[ListGroup, InlineGroup, GroupItem]] = []
     texts: List[
         Union[TitleItem, SectionHeaderItem, ListItem, CodeItem, FormulaItem, TextItem]
     ] = []
@@ -1938,7 +1949,7 @@ class DoclingDocument(BaseModel):
             self.form_items.append(item)
-        elif isinstance(item, (UnorderedList, OrderedList, InlineGroup)):
+        elif isinstance(item, (ListGroup, InlineGroup)):
             item_label = "groups"
             item_index = len(self.groups)
@@ -2160,16 +2171,16 @@ class DoclingDocument(BaseModel):
     # TODO: refactor add* methods below
     ###################################
-    def add_ordered_list(
+    def add_list_group(
         self,
         name: Optional[str] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
-    ) -> GroupItem:
-        """add_ordered_list."""
+    ) -> ListGroup:
+        """add_list_group."""
         _parent = parent or self.body
         cref = f"#/groups/{len(self.groups)}"
-        group = OrderedList(self_ref=cref, parent=_parent.get_ref())
+        group = ListGroup(self_ref=cref, parent=_parent.get_ref())
         if name is not None:
             group.name = name
         if content_layer:
@@ -2179,6 +2190,21 @@ class DoclingDocument(BaseModel):
         _parent.children.append(RefItem(cref=cref))
         return group
+    @deprecated("Use add_list_group() instead.")
+    def add_ordered_list(
+        self,
+        name: Optional[str] = None,
+        parent: Optional[NodeItem] = None,
+        content_layer: Optional[ContentLayer] = None,
+    ) -> GroupItem:
+        """add_ordered_list."""
+        return self.add_list_group(
+            name=name,
+            parent=parent,
+            content_layer=content_layer,
+        )
+    @deprecated("Use add_list_group() instead.")
     def add_unordered_list(
         self,
         name: Optional[str] = None,
@@ -2186,25 +2212,18 @@ class DoclingDocument(BaseModel):
         content_layer: Optional[ContentLayer] = None,
     ) -> GroupItem:
         """add_unordered_list."""
-        _parent = parent or self.body
-        cref = f"#/groups/{len(self.groups)}"
-        group = UnorderedList(self_ref=cref, parent=_parent.get_ref())
-        if name is not None:
-            group.name = name
-        if content_layer:
-            group.content_layer = content_layer
-        self.groups.append(group)
-        _parent.children.append(RefItem(cref=cref))
-        return group
+        return self.add_list_group(
+            name=name,
+            parent=parent,
+            content_layer=content_layer,
+        )
     def add_inline_group(
         self,
         name: Optional[str] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
-        # marker: Optional[UnorderedList.ULMarker] = None,
-    ) -> GroupItem:
+    ) -> InlineGroup:
         """add_inline_group."""
         _parent = parent or self.body
         cref = f"#/groups/{len(self.groups)}"
@@ -2232,14 +2251,8 @@ class DoclingDocument(BaseModel):
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
-        if label == GroupLabel.LIST:
-            return self.add_unordered_list(
-                name=name,
-                parent=parent,
-                content_layer=content_layer,
-            )
-        elif label == GroupLabel.ORDERED_LIST:
-            return self.add_ordered_list(
+        if label in [GroupLabel.LIST, GroupLabel.ORDERED_LIST]:
+            return self.add_list_group(
                 name=name,
                 parent=parent,
                 content_layer=content_layer,
@@ -2291,17 +2304,16 @@ class DoclingDocument(BaseModel):
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
-        if not isinstance(parent, (OrderedList, UnorderedList)):
-            warnings.warn("ListItem's parent must be a list group.", DeprecationWarning)
-        if not parent:
-            parent = self.body
+        if not isinstance(parent, ListGroup):
+            warnings.warn(
+                "ListItem parent must be a list group, creating one on the fly.",
+                DeprecationWarning,
+            )
+            parent = self.add_list_group(parent=parent)
         if not orig:
             orig = text
-        marker = marker or "-"
         text_index = len(self.texts)
         cref = f"#/texts/{text_index}"
         list_item = ListItem(
@@ -2310,7 +2322,7 @@ class DoclingDocument(BaseModel):
             self_ref=cref,
             parent=parent.get_ref(),
             enumerated=enumerated,
-            marker=marker,
+            marker=marker or "",
             formatting=formatting,
             hyperlink=hyperlink,
         )
@@ -2864,7 +2876,7 @@ class DoclingDocument(BaseModel):
             if (
                 root_is_picture
                 and not traverse_pictures
-                and isinstance(child, DocItem)
+                and isinstance(child, NodeItem)
                 and child.self_ref not in allowed_pic_refs
             ):
                 continue
@@ -4056,18 +4068,18 @@ class DoclingDocument(BaseModel):
                     DocumentToken.ORDERED_LIST.value,
                     DocumentToken.UNORDERED_LIST.value,
                 ]:
-                    list_label = GroupLabel.LIST
+                    GroupLabel.LIST
                     enum_marker = ""
                     enum_value = 0
                     if tag_name == DocumentToken.ORDERED_LIST.value:
-                        list_label = GroupLabel.ORDERED_LIST
+                        GroupLabel.ORDERED_LIST
                     list_item_pattern = (
                         rf"<(?P<tag>{DocItemLabel.LIST_ITEM})>.*?</(?P=tag)>"
                     )
                     li_pattern = re.compile(list_item_pattern, re.DOTALL)
                     # Add list group:
-                    new_list = doc.add_group(label=list_label, name="list")
+                    new_list = doc.add_list_group(name="list")
                     # Pricess list items
                     for li_match in li_pattern.finditer(full_chunk):
                         enum_value += 1
@@ -4385,17 +4397,17 @@ class DoclingDocument(BaseModel):
     @field_validator("version")
     @classmethod
     def check_version_is_compatible(cls, v: str) -> str:
-        """Check if this document version is compatible with current version."""
-        current_match = re.match(VERSION_PATTERN, CURRENT_VERSION)
+        """Check if this document version is compatible with SDK schema version."""
+        sdk_match = re.match(VERSION_PATTERN, CURRENT_VERSION)
         doc_match = re.match(VERSION_PATTERN, v)
         if (
             doc_match is None
-            or current_match is None
-            or doc_match["major"] != current_match["major"]
-            or doc_match["minor"] > current_match["minor"]
+            or sdk_match is None
+            or doc_match["major"] != sdk_match["major"]
+            or doc_match["minor"] > sdk_match["minor"]
         ):
             raise ValueError(
-                f"incompatible version {v} with schema version {CURRENT_VERSION}"
+                f"Doc version {v} incompatible with SDK schema version {CURRENT_VERSION}"
             )
         else:
             return CURRENT_VERSION
@@ -4425,9 +4437,7 @@ class DoclingDocument(BaseModel):
         ):
             if isinstance(item, ListItem) and (
                 item.parent is None
-                or not isinstance(
-                    item.parent.resolve(doc=self), (OrderedList, UnorderedList)
-                )
+                or not isinstance(item.parent.resolve(doc=self), ListGroup)
             ):
                 if isinstance(prev, ListItem) and (
                     prev.parent is None or prev.parent.resolve(self) == self.body
@@ -4440,11 +4450,7 @@ class DoclingDocument(BaseModel):
         for curr_list_items in reversed(misplaced_list_items):
             # add group
-            new_group = (
-                OrderedList(self_ref="#")
-                if curr_list_items[0].enumerated
-                else UnorderedList(self_ref="#")
-            )
+            new_group = ListGroup(self_ref="#")
             self.insert_item_before_sibling(
                 new_item=new_group,
                 sibling=curr_list_items[0],
@@ -4531,3 +4537,10 @@ class DoclingDocument(BaseModel):
         self.key_value_items = item_lists["key_value_items"]  # type: ignore
         self.form_items = item_lists["form_items"]  # type: ignore
         self.body = new_body
+# deprecated aliases (kept for backwards compatibility):
+BasePictureData = BaseAnnotation
+PictureDescriptionData = DescriptionAnnotation
+PictureMiscData = MiscAnnotation
+UnorderedList = ListGroup

docling_core/types/doc/labels.py CHANGED Viewed

@@ -77,7 +77,7 @@ class GroupLabel(str, Enum):
     LIST = (
         "list"  # group label for list container (not the list-items) (e.g. HTML <ul/>)
     )
-    ORDERED_LIST = "ordered_list"  # List with enumeration (e.g. HTML <ol/>)
+    ORDERED_LIST = "ordered_list"  # deprecated
     CHAPTER = "chapter"
     SECTION = "section"
     SHEET = "sheet"

docling_core/types/doc/page.py CHANGED Viewed

@@ -122,6 +122,8 @@ class BoundingRectangle(BaseModel):
         p_1 = ((self.r_x1 + self.r_x2) / 2.0, (self.r_y1 + self.r_y2) / 2.0)
         delta_x, delta_y = p_1[0] - p_0[0], p_1[1] - p_0[1]
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            delta_y = -delta_y
         if abs(delta_y) < 1.0e-3:
             angle = 0.0
@@ -131,8 +133,7 @@ class BoundingRectangle(BaseModel):
             angle = math.atan(delta_y / delta_x)
         if delta_x < 0:
             angle += np.pi
-        if angle < 0:
-            angle += 2 * np.pi
+        angle = angle % (2 * np.pi)
         return angle
     @property

docling_core/types/doc/utils.py CHANGED Viewed

@@ -5,8 +5,10 @@
 """Utils for document types."""
+import html
 import unicodedata
 from pathlib import Path
+from typing import Optional
 def relative_path(src: Path, target: Path) -> Path:
@@ -49,14 +51,23 @@ def relative_path(src: Path, target: Path) -> Path:
     return Path(*up_segments, *down_segments)
-def get_html_tag_with_text_direction(html_tag: str, text: str) -> str:
+def get_html_tag_with_text_direction(
+    html_tag: str, text: str, attrs: Optional[dict] = None
+) -> str:
     """Form the HTML element with tag, text, and optional dir attribute."""
-    text_dir = get_text_direction(text)
-    if text_dir == "ltr":
-        return f"<{html_tag}>{text}</{html_tag}>"
-    else:
-        return f'<{html_tag} dir="{text_dir}">{text}</{html_tag}>'
+    my_attrs = attrs or {}
+    if (dir := my_attrs.get("dir")) is not None and dir != "ltr":
+        my_attrs["dir"] = get_text_direction(text)
+    pieces: list[str] = [html_tag]
+    if my_attrs:
+        attrs_str = " ".join(
+            [
+                f'{html.escape(k, quote=False)}="{html.escape(my_attrs[k], quote=False)}"'
+                for k in my_attrs
+            ]
+        )
+        pieces.append(attrs_str)
+    return f"<{' '.join(pieces)}>{text}</{html_tag}>"
 def get_text_direction(text: str) -> str:

docling_core/utils/file.py CHANGED Viewed

@@ -6,6 +6,7 @@
 """File-related utilities."""
 import importlib
+import re
 import tempfile
 from io import BytesIO
 from pathlib import Path
@@ -76,6 +77,32 @@ def resolve_source_to_stream(
             agent_name = f"docling-core/{importlib.metadata.version('docling-core')}"
             req_headers["user-agent"] = agent_name
+        # Google Docs, Files, PDF URLs, Spreadsheets, Presentations: convert to export URL
+        google_doc_id = re.search(
+            r"google\.com\/(file|document|spreadsheets|presentation)\/d\/([\w-]+)",
+            str(http_url),
+        )
+        if google_doc_id:
+            doc_type = google_doc_id.group(1)
+            doc_id = google_doc_id.group(2)
+            if doc_type == "file":
+                http_url = TypeAdapter(AnyHttpUrl).validate_python(
+                    f"https://drive.google.com/uc?export=download&id={doc_id}"
+                )
+            elif doc_type == "document":
+                http_url = TypeAdapter(AnyHttpUrl).validate_python(
+                    f"https://docs.google.com/document/d/{doc_id}/export?format=docx"
+                )
+            elif doc_type == "spreadsheets":
+                http_url = TypeAdapter(AnyHttpUrl).validate_python(
+                    f"https://docs.google.com/spreadsheets/d/{doc_id}/export?format=xlsx"
+                )
+            elif doc_type == "presentation":
+                http_url = TypeAdapter(AnyHttpUrl).validate_python(
+                    f"https://docs.google.com/presentation/d/{doc_id}/export?format=pptx"
+                )
         # fetch the page
         res = requests.get(http_url, stream=True, headers=req_headers)
         res.raise_for_status()

docling_core/utils/legacy.py CHANGED Viewed

@@ -26,7 +26,6 @@ from docling_core.types.doc import (
     TextItem,
 )
 from docling_core.types.doc.document import ContentLayer, GroupItem, ListItem, TableData
-from docling_core.types.doc.labels import GroupLabel
 from docling_core.types.legacy_doc.base import (
     BaseCell,
     BaseText,
@@ -486,7 +485,7 @@ def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument:  # no
                 item_type in "list-item-level-1" or item.name in {"list", "list-item"}
             ):
                 if current_list is None:
-                    current_list = doc.add_group(label=GroupLabel.LIST, name="list")
+                    current_list = doc.add_list_group(name="list")
             else:
                 current_list = None

{docling_core-2.38.2.dist-info → docling_core-2.40.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling-core
-Version: 2.38.2
+Version: 2.40.0
 Summary: A python library to define and validate data types in Docling.
 Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>

{docling_core-2.38.2.dist-info → docling_core-2.40.0.dist-info}/RECORD RENAMED Viewed

@@ -19,19 +19,19 @@ docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75
 docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9ACDd57ds,106
 docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
 docling_core/transforms/chunker/base.py,sha256=kJaRrGQynglG9wpy0IaAYTf4MKheWH5BAPzx4LE9yIg,2824
-docling_core/transforms/chunker/hierarchical_chunker.py,sha256=7Fpwwsn2BoiR12KGPrn8fU1uuhqBLp85MRLMF0aIsL8,8281
+docling_core/transforms/chunker/hierarchical_chunker.py,sha256=uDf-qGiIT_4JUEg9NOdzvDqAPOTqycKJ-jEpDkV3jJU,8243
 docling_core/transforms/chunker/hybrid_chunker.py,sha256=xjkz8hy3tXXzkJzf7QMFOEq_v8V7Jcs9tCY0Mxjge74,12548
 docling_core/transforms/chunker/tokenizer/__init__.py,sha256=-bhXOTpoI7SYk7vn47z8Ek-RZFjJk4TfZawxsFuNHnE,34
 docling_core/transforms/chunker/tokenizer/base.py,sha256=2gOBQPYJYC0iWXOgMG3DiNP7xEBtii7DYcib0iECq5o,575
 docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=aZ_RNQIzcNkAHGHZw3SBCoqJHM2Ihb65eiM29O9BR6o,2506
 docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
 docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
-docling_core/transforms/serializer/base.py,sha256=ZFIiZeplL-QbBs9EDUb1awqxapQ23PsApVetJtAs7Vs,6891
-docling_core/transforms/serializer/common.py,sha256=RO2KWl3sZq_PIvzWzuGJTWntKjLOAy3n17cgZi84AAs,19163
-docling_core/transforms/serializer/doctags.py,sha256=PuAExlP-2HxcDSP_R_phtYQU0yKBW94RrPgb85IUxck,19905
-docling_core/transforms/serializer/html.py,sha256=SZgQa0QnknEoRwMFLdgmVsLQqLF2rQl3D7XyEZzUHCE,37151
+docling_core/transforms/serializer/base.py,sha256=s3Anl_3-QJM1t29Bz-iOgLhAcfG3BZuwZqdYTi5Xfr0,6846
+docling_core/transforms/serializer/common.py,sha256=Dkw9axJqU2qlZuEFRDa6Av11PIL2ejOOOCAahtoK9sA,19106
+docling_core/transforms/serializer/doctags.py,sha256=TD0yAm1qSVy-GsE6svpUAI-Yqjcf2rrTZ3ac9YU3gbE,19858
+docling_core/transforms/serializer/html.py,sha256=oxnUhszRPBINiK1tq2dwf5QjTCrIV_q15vsrPVqBeME,38988
 docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
-docling_core/transforms/serializer/markdown.py,sha256=2wV0ydqWKSm-HAW94gF0IRBpjWgoqUjL4JHRYS8DDgY,21803
+docling_core/transforms/serializer/markdown.py,sha256=VwonuAkuOPmQM7ibDIGvQBHOqhTcTJ_t187fLQQiNPo,23951
 docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
 docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
 docling_core/transforms/visualizer/layout_visualizer.py,sha256=zHzQTWcy-z1J2BcsjvakLkrp8pgStgnxhDl8YqIAotY,8035
@@ -39,13 +39,13 @@ docling_core/transforms/visualizer/reading_order_visualizer.py,sha256=muqmaxOBao
 docling_core/transforms/visualizer/table_visualizer.py,sha256=iJPjk-XQSSCH3oujcjPMz-redAwNNHseZ41lFyd-u3k,8097
 docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
 docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
-docling_core/types/doc/__init__.py,sha256=pchsIq-9FH_kCTyuyDdB8L4yV77pmnxPwT7399xrqxI,1626
+docling_core/types/doc/__init__.py,sha256=8hOhm5W9mArf3zwgfoMxDs1pHizhLFSAZlLu1tPBBRk,1641
 docling_core/types/doc/base.py,sha256=ndXquBrOKTFQApIJ5s2-zstj3xlVKRbJDSId0KOQnUg,14817
-docling_core/types/doc/document.py,sha256=0e-v_N2ALA66aUZduK1Rii_PcKjffxNKWR9V8Lp0clg,156894
-docling_core/types/doc/labels.py,sha256=JiciRK7_DOkebsrfQ6PVCvS__TsKgWn1ANk84BeB14k,7359
-docling_core/types/doc/page.py,sha256=GV9UnGCvvqs6KD_ac3hF6b_NH6M6IevsL5iSt8WWVCI,41221
+docling_core/types/doc/document.py,sha256=9-n0tngXLTRVAkqGHe3bDSh1OJbBt87EW2nV8GdOGME,157406
+docling_core/types/doc/labels.py,sha256=-W1-LW6z0J9F9ExJqR0Wd1WeqWTaY3Unm-j1UkQGlC4,7330
+docling_core/types/doc/page.py,sha256=J_4ThNhrdhrfPtNMBTDHi-CQBvraejAwUaqVjyDeeeI,41288
 docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
-docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
+docling_core/types/doc/utils.py,sha256=JpAi7x9DHksFlIj_gRJPcSZOHa8AHvVPEO_K9aSnw4c,2608
 docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
 docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
 docling_core/types/io/__init__.py,sha256=7QYvFRaDE0AzBg8e7tvsVNlLBbCbAbQ9rP2TU8aXR1k,350
@@ -68,15 +68,15 @@ docling_core/types/rec/statement.py,sha256=YwcV4CbVaAbzNwh14yJ_6Py3Ww0XnUJrEEUiK
 docling_core/types/rec/subject.py,sha256=PRCERGTMs4YhR3_Ne6jogkm41zYg8uUWb1yFpM7atm4,2572
 docling_core/utils/__init__.py,sha256=VauNNpWRHG0_ISKrsy5-gTxicrdQZSau6qMfuMl3iqk,120
 docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,874
-docling_core/utils/file.py,sha256=GzX0pclvewwPoqHJSaVUuULzSJwJgkCUwgKgJ7G5ohQ,5628
+docling_core/utils/file.py,sha256=CSNclJGL2OwLIc8DQFdoLxr22FUc4_UC7zS6pNrFfkQ,6858
 docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
 docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
-docling_core/utils/legacy.py,sha256=DrI3QGoL755ZCIoKHF74-pTWm8R0zfFo2C2vB5dT2aY,24463
+docling_core/utils/legacy.py,sha256=5lghO48OEcV9V51tRnH3YSKgLtdqhr-Q5C_OcJZ8TOs,24392
 docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
 docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
-docling_core-2.38.2.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
-docling_core-2.38.2.dist-info/METADATA,sha256=E1ONe70u3yt98iAtpnlfsS9hTV4Cpx8Kn5Q0Zz6o_XY,6453
-docling_core-2.38.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-docling_core-2.38.2.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
-docling_core-2.38.2.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
-docling_core-2.38.2.dist-info/RECORD,,
+docling_core-2.40.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
+docling_core-2.40.0.dist-info/METADATA,sha256=A6_Wz_CJzmHa20USMUgQPDMpN5-S3f8VpNrx7ns1SXo,6453
+docling_core-2.40.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+docling_core-2.40.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
+docling_core-2.40.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
+docling_core-2.40.0.dist-info/RECORD,,

{docling_core-2.38.2.dist-info → docling_core-2.40.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{docling_core-2.38.2.dist-info → docling_core-2.40.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{docling_core-2.38.2.dist-info → docling_core-2.40.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{docling_core-2.38.2.dist-info → docling_core-2.40.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

docling-core 2.38.2__py3-none-any.whl → 2.40.0__py3-none-any.whl

Potentially problematic release.

docling-core 2.38.2__py3-none-any.whl → 2.40.0__py3-none-any.whl