PyPI - docling-core - Versions diffs - 2.33.1__py3-none-any.whl → 2.34.0__py3-none-any.whl - Mend

docling-core 2.33.1 (py3-none-any.whl) → 2.34.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling-core might be problematic. Click here for more details.

Files changed (12)

docling_core/transforms/serializer/base.py CHANGED Viewed

@@ -202,6 +202,16 @@ class BaseDocSerializer(ABC):
         """Hook for strikethrough formatting serialization."""
         ...
+    @abstractmethod
+    def serialize_subscript(self, text: str, **kwargs: Any) -> str:
+        """Hook for subscript formatting serialization."""
+        ...
+    @abstractmethod
+    def serialize_superscript(self, text: str, **kwargs: Any) -> str:
+        """Hook for superscript formatting serialization."""
+        ...
     @abstractmethod
     def serialize_hyperlink(
         self,
@@ -239,6 +249,15 @@ class BaseDocSerializer(ABC):
         """Serialize the item's captions."""
         ...
+    @abstractmethod
+    def serialize_annotations(
+        self,
+        item: DocItem,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serialize the item's annotations."""
+        ...
     @abstractmethod
     def get_excluded_refs(self, **kwargs: Any) -> set[str]:
         """Get references to excluded items."""
@@ -257,3 +276,18 @@ class BaseSerializerProvider(ABC):
     def get_serializer(self, doc: DoclingDocument) -> BaseDocSerializer:
         """Get a the associated serializer."""
         ...
+class BaseAnnotationSerializer(ABC):
+    """Base class for annotation serializers."""
+    @abstractmethod
+    def serialize(
+        self,
+        *,
+        item: DocItem,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serializes the passed annotation."""
+        ...

docling_core/transforms/serializer/common.py CHANGED Viewed

@@ -15,6 +15,7 @@ from pydantic import AnyUrl, BaseModel, ConfigDict, NonNegativeInt, computed_fie
 from typing_extensions import Self, override
 from docling_core.transforms.serializer.base import (
+    BaseAnnotationSerializer,
     BaseDocSerializer,
     BaseFallbackSerializer,
     BaseFormSerializer,
@@ -30,6 +31,7 @@ from docling_core.transforms.serializer.base import (
 from docling_core.types.doc.document import (
     DOCUMENT_TOKENS_EXPORT_LABELS,
     ContentLayer,
+    DescriptionAnnotation,
     DocItem,
     DoclingDocument,
     FloatingItem,
@@ -41,9 +43,10 @@ from docling_core.types.doc.document import (
     OrderedList,
     PictureClassificationData,
     PictureDataType,
-    PictureDescriptionData,
     PictureItem,
     PictureMoleculeData,
+    Script,
+    TableAnnotationType,
     TableItem,
     TextItem,
     UnorderedList,
@@ -122,7 +125,9 @@ def _iterate_items(
         yield item
-def _get_picture_annotation_text(annotation: PictureDataType) -> Optional[str]:
+def _get_annotation_text(
+    annotation: Union[PictureDataType, TableAnnotationType],
+) -> Optional[str]:
     result = None
     if isinstance(annotation, PictureClassificationData):
         predicted_class = (
@@ -132,7 +137,7 @@ def _get_picture_annotation_text(annotation: PictureDataType) -> Optional[str]:
         )
         if predicted_class is not None:
             result = predicted_class.replace("_", " ")
-    elif isinstance(annotation, PictureDescriptionData):
+    elif isinstance(annotation, DescriptionAnnotation):
         result = annotation.text
     elif isinstance(annotation, PictureMoleculeData):
         result = annotation.smi
@@ -211,6 +216,8 @@ class DocSerializer(BaseModel, BaseDocSerializer):
     list_serializer: BaseListSerializer
     inline_serializer: BaseInlineSerializer
+    annotation_serializer: BaseAnnotationSerializer
     params: CommonParams = CommonParams()
     _excluded_refs_cache: dict[str, set[str]] = {}
@@ -449,6 +456,10 @@ class DocSerializer(BaseModel, BaseDocSerializer):
                 res = self.serialize_underline(text=res)
             if formatting.strikethrough:
                 res = self.serialize_strikethrough(text=res)
+            if formatting.script == Script.SUB:
+                res = self.serialize_subscript(text=res)
+            elif formatting.script == Script.SUPER:
+                res = self.serialize_superscript(text=res)
         if params.include_hyperlinks and hyperlink:
             res = self.serialize_hyperlink(text=res, hyperlink=hyperlink)
         return res
@@ -473,6 +484,16 @@ class DocSerializer(BaseModel, BaseDocSerializer):
         """Hook for strikethrough formatting serialization."""
         return text
+    @override
+    def serialize_subscript(self, text: str, **kwargs: Any) -> str:
+        """Hook for subscript formatting serialization."""
+        return text
+    @override
+    def serialize_superscript(self, text: str, **kwargs: Any) -> str:
+        """Hook for superscript formatting serialization."""
+        return text
     @override
     def serialize_hyperlink(
         self,
@@ -505,6 +526,19 @@ class DocSerializer(BaseModel, BaseDocSerializer):
             text_res = ""
         return create_ser_result(text=text_res, span_source=results)
+    @override
+    def serialize_annotations(
+        self,
+        item: DocItem,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serialize the item's annotations."""
+        return self.annotation_serializer.serialize(
+            item=item,
+            doc=self.doc,
+            **kwargs,
+        )
     def _get_applicable_pages(self) -> Optional[list[int]]:
         pages = {
             item.prov[0].page_no: ...

docling_core/transforms/serializer/doctags.py CHANGED Viewed

@@ -7,6 +7,7 @@ from pydantic import BaseModel
 from typing_extensions import override
 from docling_core.transforms.serializer.base import (
+    BaseAnnotationSerializer,
     BaseDocSerializer,
     BaseFallbackSerializer,
     BaseFormSerializer,
@@ -17,12 +18,14 @@ from docling_core.transforms.serializer.base import (
     BaseTableSerializer,
     BaseTextSerializer,
     SerializationResult,
+    Span,
 )
 from docling_core.transforms.serializer.common import (
     CommonParams,
     DocSerializer,
     create_ser_result,
 )
+from docling_core.types.doc.base import BoundingBox
 from docling_core.types.doc.document import (
     CodeItem,
     DocItem,
@@ -38,6 +41,7 @@ from docling_core.types.doc.document import (
     PictureItem,
     PictureMoleculeData,
     PictureTabularChartData,
+    ProvenanceItem,
     TableItem,
     TextItem,
     UnorderedList,
@@ -414,6 +418,39 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
 class DocTagsInlineSerializer(BaseInlineSerializer):
     """DocTags-specific inline group serializer."""
+    def _get_inline_location_tags(
+        self, doc: DoclingDocument, item: InlineGroup, params: DocTagsParams
+    ) -> SerializationResult:
+        prov: Optional[ProvenanceItem] = None
+        boxes: list[BoundingBox] = []
+        doc_items: list[DocItem] = []
+        for it, _ in doc.iterate_items(root=item):
+            if isinstance(it, DocItem):
+                for prov in it.prov:
+                    boxes.append(prov.bbox)
+                    doc_items.append(it)
+        if prov is None:
+            return create_ser_result()
+        bbox = BoundingBox.enclosing_bbox(boxes=boxes)
+        # using last seen prov as reference for page dims
+        page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
+        loc_str = DocumentToken.get_location(
+            bbox=bbox.to_top_left_origin(page_h).as_tuple(),
+            page_w=page_w,
+            page_h=page_h,
+            xsize=params.xsize,
+            ysize=params.ysize,
+        )
+        return SerializationResult(
+            text=loc_str,
+            spans=[Span(item=it) for it in doc_items],
+        )
     @override
     def serialize(
         self,
@@ -428,12 +465,23 @@ class DocTagsInlineSerializer(BaseInlineSerializer):
         """Serializes the passed item."""
         my_visited = visited if visited is not None else set()
         params = DocTagsParams(**kwargs)
-        parts = doc_serializer.get_parts(
-            item=item,
-            list_level=list_level,
-            is_inline_scope=True,
-            visited=my_visited,
-            **kwargs,
+        parts: List[SerializationResult] = []
+        if params.add_location:
+            inline_loc_tags_ser_res = self._get_inline_location_tags(
+                doc=doc,
+                item=item,
+                params=params,
+            )
+            parts.append(inline_loc_tags_ser_res)
+            params.add_location = False  # suppress children location serialization
+        parts.extend(
+            doc_serializer.get_parts(
+                item=item,
+                list_level=list_level,
+                is_inline_scope=True,
+                visited=my_visited,
+                **{**kwargs, **params.model_dump()},
+            )
         )
         wrap_tag = DocumentToken.INLINE.value
         delim = _get_delim(params=params)
@@ -460,6 +508,15 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
         return create_ser_result()
+class DocTagsAnnotationSerializer(BaseAnnotationSerializer):
+    """DocTags-specific annotation serializer."""
+    @override
+    def serialize(self, *, item: DocItem, **kwargs: Any) -> SerializationResult:
+        """Serializes the item's annotations."""
+        return create_ser_result()
 class DocTagsDocSerializer(DocSerializer):
     """DocTags-specific document serializer."""
@@ -473,6 +530,8 @@ class DocTagsDocSerializer(DocSerializer):
     list_serializer: BaseListSerializer = DocTagsListSerializer()
     inline_serializer: BaseInlineSerializer = DocTagsInlineSerializer()
+    annotation_serializer: BaseAnnotationSerializer = DocTagsAnnotationSerializer()
     params: DocTagsParams = DocTagsParams()
     @override

docling_core/transforms/serializer/html.py CHANGED Viewed

@@ -21,6 +21,7 @@ from pydantic import AnyUrl, BaseModel
 from typing_extensions import override
 from docling_core.transforms.serializer.base import (
+    BaseAnnotationSerializer,
     BaseDocSerializer,
     BaseFallbackSerializer,
     BaseFormSerializer,
@@ -35,7 +36,7 @@ from docling_core.transforms.serializer.base import (
 from docling_core.transforms.serializer.common import (
     CommonParams,
     DocSerializer,
-    _get_picture_annotation_text,
+    _get_annotation_text,
     create_ser_result,
 )
 from docling_core.transforms.serializer.html_styles import (
@@ -47,6 +48,7 @@ from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.document import (
     CodeItem,
     ContentLayer,
+    DescriptionAnnotation,
     DocItem,
     DoclingDocument,
     FloatingItem,
@@ -59,7 +61,9 @@ from docling_core.types.doc.document import (
     ListItem,
     NodeItem,
     OrderedList,
+    PictureClassificationData,
     PictureItem,
+    PictureMoleculeData,
     PictureTabularChartData,
     SectionHeaderItem,
     TableCell,
@@ -758,14 +762,7 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
     """HTML-specific fallback serializer."""
     @override
-    def serialize(
-        self,
-        *,
-        item: NodeItem,
-        doc_serializer: "BaseDocSerializer",
-        doc: DoclingDocument,
-        **kwargs: Any,
-    ) -> SerializationResult:
+    def serialize(self, *, item: NodeItem, **kwargs: Any) -> SerializationResult:
         """Fallback serializer for items not handled by other serializers."""
         if isinstance(item, DocItem):
             return create_ser_result(
@@ -777,6 +774,42 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
             return create_ser_result()
+class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
+    """HTML-specific annotation serializer."""
+    def serialize(
+        self,
+        *,
+        item: DocItem,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serializes the passed annotation to HTML format."""
+        res_parts: list[SerializationResult] = []
+        for ann in item.get_annotations():
+            if isinstance(
+                ann,
+                (PictureClassificationData, DescriptionAnnotation, PictureMoleculeData),
+            ):
+                if ann_text := _get_annotation_text(ann):
+                    text_dir = get_text_direction(ann_text)
+                    dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
+                    ann_ser_res = create_ser_result(
+                        text=(
+                            f'<div data-annotation-kind="{ann.kind}"{dir_str}>'
+                            f"{html.escape(ann_text)}"
+                            f"</div>"
+                        ),
+                        span_source=item,
+                    )
+                    res_parts.append(ann_ser_res)
+        return create_ser_result(
+            text=" ".join([r.text for r in res_parts if r.text]),
+            span_source=res_parts,
+        )
 class HTMLDocSerializer(DocSerializer):
     """HTML-specific document serializer."""
@@ -790,6 +823,8 @@ class HTMLDocSerializer(DocSerializer):
     list_serializer: BaseListSerializer = HTMLListSerializer()
     inline_serializer: BaseInlineSerializer = HTMLInlineSerializer()
+    annotation_serializer: BaseAnnotationSerializer = HTMLAnnotationSerializer()
     params: HTMLParams = HTMLParams()
     @override
@@ -812,6 +847,16 @@ class HTMLDocSerializer(DocSerializer):
         """Apply HTML-specific strikethrough serialization."""
         return f"<del>{text}</del>"
+    @override
+    def serialize_subscript(self, text: str, **kwargs: Any) -> str:
+        """Apply HTML-specific subscript serialization."""
+        return f"<sub>{text}</sub>"
+    @override
+    def serialize_superscript(self, text: str, **kwargs: Any) -> str:
+        """Apply HTML-specific superscript serialization."""
+        return f"<sup>{text}</sup>"
     @override
     def serialize_hyperlink(
         self,
@@ -968,20 +1013,13 @@ class HTMLDocSerializer(DocSerializer):
                     results.append(cap_ser_res)
         if params.include_annotations and item.self_ref not in excluded_refs:
-            if isinstance(item, PictureItem):
-                for ann in item.annotations:
-                    if ann_text := _get_picture_annotation_text(annotation=ann):
-                        text_dir = get_text_direction(ann_text)
-                        dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
-                        ann_ser_res = create_ser_result(
-                            text=(
-                                f'<div data-annotation-kind="{ann.kind}"{dir_str}>'
-                                f"{html.escape(ann_text)}"
-                                f"</div>"
-                            ),
-                            span_source=item,
-                        )
-                        results.append(ann_ser_res)
+            if isinstance(item, (PictureItem, TableItem)):
+                ann_res = self.serialize_annotations(
+                    item=item,
+                    **kwargs,
+                )
+                if ann_res.text:
+                    results.append(ann_res)
         text_res = params.caption_delim.join([r.text for r in results])
         if text_res:

docling_core/transforms/serializer/markdown.py CHANGED Viewed

@@ -15,6 +15,7 @@ from tabulate import tabulate
 from typing_extensions import override
 from docling_core.transforms.serializer.base import (
+    BaseAnnotationSerializer,
     BaseDocSerializer,
     BaseFallbackSerializer,
     BaseFormSerializer,
@@ -29,7 +30,7 @@ from docling_core.transforms.serializer.base import (
 from docling_core.transforms.serializer.common import (
     CommonParams,
     DocSerializer,
-    _get_picture_annotation_text,
+    _get_annotation_text,
     _PageBreakSerResult,
     create_ser_result,
 )
@@ -37,6 +38,7 @@ from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.document import (
     CodeItem,
     ContentLayer,
+    DescriptionAnnotation,
     DocItem,
     DoclingDocument,
     FloatingItem,
@@ -48,7 +50,9 @@ from docling_core.types.doc.document import (
     KeyValueItem,
     NodeItem,
     OrderedList,
+    PictureClassificationData,
     PictureItem,
+    PictureMoleculeData,
     PictureTabularChartData,
     SectionHeaderItem,
     TableItem,
@@ -58,6 +62,23 @@ from docling_core.types.doc.document import (
 )
+def _get_annotation_ser_result(
+    ann_kind: str, ann_text: str, mark_annotation: bool, doc_item: DocItem
+):
+    return create_ser_result(
+        text=(
+            (
+                f'<!--<annotation kind="{ann_kind}">-->'
+                f"{ann_text}"
+                f"<!--<annotation/>-->"
+            )
+            if mark_annotation
+            else ann_text
+        ),
+        span_source=doc_item,
+    )
 class MarkdownParams(CommonParams):
     """Markdown-specific serialization parameters."""
@@ -136,6 +157,49 @@ class MarkdownTextSerializer(BaseModel, BaseTextSerializer):
         return create_ser_result(text=text, span_source=res_parts)
+class MarkdownAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
+    """Markdown-specific annotation serializer."""
+    def serialize(
+        self,
+        *,
+        item: DocItem,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serialize the item's annotations."""
+        params = MarkdownParams(**kwargs)
+        res_parts: list[SerializationResult] = []
+        for ann in item.get_annotations():
+            if isinstance(
+                ann,
+                (
+                    PictureClassificationData,
+                    DescriptionAnnotation,
+                    PictureMoleculeData,
+                ),
+            ):
+                if ann_text := _get_annotation_text(ann):
+                    ann_res = create_ser_result(
+                        text=(
+                            (
+                                f'<!--<annotation kind="{ann.kind}">-->'
+                                f"{ann_text}"
+                                f"<!--<annotation/>-->"
+                            )
+                            if params.mark_annotations
+                            else ann_text
+                        ),
+                        span_source=item,
+                    )
+                    res_parts.append(ann_res)
+        return create_ser_result(
+            text="\n\n".join([r.text for r in res_parts if r.text]),
+            span_source=item,
+        )
 class MarkdownTableSerializer(BaseTableSerializer):
     """Markdown-specific table item serializer."""
@@ -149,6 +213,7 @@ class MarkdownTableSerializer(BaseTableSerializer):
         **kwargs: Any,
     ) -> SerializationResult:
         """Serializes the passed item."""
+        params = MarkdownParams(**kwargs)
         res_parts: list[SerializationResult] = []
         cap_res = doc_serializer.serialize_captions(
@@ -159,6 +224,16 @@ class MarkdownTableSerializer(BaseTableSerializer):
             res_parts.append(cap_res)
         if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
+            if params.include_annotations:
+                ann_res = doc_serializer.serialize_annotations(
+                    item=item,
+                    **kwargs,
+                )
+                if ann_res.text:
+                    res_parts.append(ann_res)
             rows = [
                 [
                     # make sure that md tables are not broken
@@ -214,22 +289,12 @@ class MarkdownPictureSerializer(BasePictureSerializer):
         if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
             if params.include_annotations:
-                for ann in item.annotations:
-                    if ann_text := _get_picture_annotation_text(annotation=ann):
-                        ann_ser_res = create_ser_result(
-                            text=(
-                                (
-                                    f'<!--<annotation kind="{ann.kind}">-->'
-                                    f"{ann_text}"
-                                    f"<!--<annotation/>-->"
-                                )
-                                if params.mark_annotations
-                                else ann_text
-                            ),
-                            span_source=item,
-                        )
-                        res_parts.append(ann_ser_res)
+                ann_res = doc_serializer.serialize_annotations(
+                    item=item,
+                    **kwargs,
+                )
+                if ann_res.text:
+                    res_parts.append(ann_res)
             img_res = self._serialize_image_part(
                 item=item,
@@ -257,7 +322,7 @@ class MarkdownPictureSerializer(BasePictureSerializer):
                     res_parts.append(
                         create_ser_result(text=md_table_content, span_source=item)
                     )
-        text_res = "\n\n".join([r.text for r in res_parts])
+        text_res = "\n\n".join([r.text for r in res_parts if r.text])
         return create_ser_result(text=text_res, span_source=res_parts)
@@ -471,6 +536,8 @@ class MarkdownDocSerializer(DocSerializer):
     list_serializer: BaseListSerializer = MarkdownListSerializer()
     inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer()
+    annotation_serializer: BaseAnnotationSerializer = MarkdownAnnotationSerializer()
     params: MarkdownParams = MarkdownParams()
     @override

docling_core/types/doc/document.py CHANGED Viewed

@@ -15,7 +15,7 @@ import warnings
 from enum import Enum
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
+from typing import Any, Dict, Final, List, Literal, Optional, Sequence, Tuple, Union
 from urllib.parse import unquote
 import pandas as pd
@@ -30,6 +30,7 @@ from pydantic import (
     computed_field,
     field_validator,
     model_validator,
+    validate_call,
 )
 from tabulate import tabulate
 from typing_extensions import Annotated, Self, deprecated
@@ -53,7 +54,7 @@ _logger = logging.getLogger(__name__)
 Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
 LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
-CURRENT_VERSION: Final = "1.3.0"
+CURRENT_VERSION: Final = "1.4.0"
 DEFAULT_EXPORT_LABELS = {
     DocItemLabel.TITLE,
@@ -85,8 +86,8 @@ DOCUMENT_TOKENS_EXPORT_LABELS.update(
 )
-class BasePictureData(BaseModel):
-    """BasePictureData."""
+class BaseAnnotation(BaseModel):
+    """Base class for all annotation types."""
     kind: str
@@ -98,7 +99,7 @@ class PictureClassificationClass(BaseModel):
     confidence: float
-class PictureClassificationData(BasePictureData):
+class PictureClassificationData(BaseAnnotation):
     """PictureClassificationData."""
     kind: Literal["classification"] = "classification"
@@ -106,19 +107,18 @@ class PictureClassificationData(BasePictureData):
     predicted_classes: List[PictureClassificationClass]
-class PictureDescriptionData(BasePictureData):
-    """PictureDescriptionData."""
+class DescriptionAnnotation(BaseAnnotation):
+    """DescriptionAnnotation."""
     kind: Literal["description"] = "description"
     text: str
     provenance: str
-class PictureMoleculeData(BaseModel):
+class PictureMoleculeData(BaseAnnotation):
     """PictureMoleculeData."""
     kind: Literal["molecule_data"] = "molecule_data"
     smi: str
     confidence: float
     class_name: str
@@ -126,13 +126,19 @@ class PictureMoleculeData(BaseModel):
     provenance: str
-class PictureMiscData(BaseModel):
-    """PictureMiscData."""
+class MiscAnnotation(BaseAnnotation):
+    """MiscAnnotation."""
     kind: Literal["misc"] = "misc"
     content: Dict[str, Any]
+# deprecated aliases:
+BasePictureData = BaseAnnotation
+PictureDescriptionData = DescriptionAnnotation
+PictureMiscData = MiscAnnotation
 class ChartLine(BaseModel):
     """Represents a line in a line chart.
@@ -196,7 +202,7 @@ class ChartPoint(BaseModel):
     value: Tuple[float, float]
-class PictureChartData(BaseModel):
+class PictureChartData(BaseAnnotation):
     """Base class for picture chart data.
     Attributes:
@@ -381,10 +387,10 @@ class PictureTabularChartData(PictureChartData):
 PictureDataType = Annotated[
     Union[
+        DescriptionAnnotation,
+        MiscAnnotation,
         PictureClassificationData,
-        PictureDescriptionData,
         PictureMoleculeData,
-        PictureMiscData,
         PictureTabularChartData,
         PictureLineChartData,
         PictureBarChartData,
@@ -818,6 +824,18 @@ class DocItem(
         )
         return page_image.crop(crop_bbox.as_tuple())
+    def get_annotations(self) -> Sequence[BaseAnnotation]:
+        """Get the annotations of this DocItem."""
+        return []
+class Script(str, Enum):
+    """Text script position."""
+    BASELINE = "baseline"
+    SUB = "sub"
+    SUPER = "super"
 class Formatting(BaseModel):
     """Formatting."""
@@ -826,6 +844,7 @@ class Formatting(BaseModel):
     italic: bool = False
     underline: bool = False
     strikethrough: bool = False
+    script: Script = Script.BASELINE
 class TextItem(DocItem):
@@ -1182,6 +1201,19 @@ class PictureItem(FloatingItem):
         text = serializer.serialize(item=self).text
         return text
+    def get_annotations(self) -> Sequence[BaseAnnotation]:
+        """Get the annotations of this PictureItem."""
+        return self.annotations
+TableAnnotationType = Annotated[
+    Union[
+        DescriptionAnnotation,
+        MiscAnnotation,
+    ],
+    Field(discriminator="kind"),
+]
 class TableItem(FloatingItem):
     """TableItem."""
@@ -1192,6 +1224,8 @@ class TableItem(FloatingItem):
         DocItemLabel.TABLE,
     ] = DocItemLabel.TABLE
+    annotations: List[TableAnnotationType] = []
     def export_to_dataframe(self) -> pd.DataFrame:
         """Export the table as a Pandas DataFrame."""
         if self.data.num_rows == 0 or self.data.num_cols == 0:
@@ -1438,6 +1472,15 @@ class TableItem(FloatingItem):
         text = serializer.serialize(item=self).text
         return text
+    @validate_call
+    def add_annotation(self, annotation: TableAnnotationType) -> None:
+        """Add an annotation to the table."""
+        self.annotations.append(annotation)
+    def get_annotations(self) -> Sequence[BaseAnnotation]:
+        """Get the annotations of this TableItem."""
+        return self.annotations
 class GraphCell(BaseModel):
     """GraphCell."""
@@ -1776,6 +1819,18 @@ class DoclingDocument(BaseModel):
             item.parent = parent_ref
             self.form_items.append(item)
+        elif isinstance(item, (UnorderedList, OrderedList, InlineGroup)):
+            item_label = "groups"
+            item_index = len(self.groups)
+            cref = f"#/{item_label}/{item_index}"
+            item.self_ref = cref
+            item.parent = parent_ref
+            self.groups.append(item)
         else:
             raise ValueError(f"Item {item} is not supported for insertion")
@@ -2111,8 +2166,8 @@ class DoclingDocument(BaseModel):
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
-        if not parent:
-            parent = self.body
+        if not isinstance(parent, (OrderedList, UnorderedList)):
+            raise ValueError("ListItem's parent must be a list group")
         if not orig:
             orig = text
@@ -2267,6 +2322,7 @@ class DoclingDocument(BaseModel):
         parent: Optional[NodeItem] = None,
         label: DocItemLabel = DocItemLabel.TABLE,
         content_layer: Optional[ContentLayer] = None,
+        annotations: Optional[list[TableAnnotationType]] = None,
     ):
         """add_table.
@@ -2284,7 +2340,11 @@ class DoclingDocument(BaseModel):
         cref = f"#/tables/{table_index}"
         tbl_item = TableItem(
-            label=label, data=data, self_ref=cref, parent=parent.get_ref()
+            label=label,
+            data=data,
+            self_ref=cref,
+            parent=parent.get_ref(),
+            annotations=annotations or [],
         )
         if prov:
             tbl_item.prov.append(prov)
@@ -2301,7 +2361,7 @@ class DoclingDocument(BaseModel):
     def add_picture(
         self,
-        annotations: List[PictureDataType] = [],
+        annotations: Optional[List[PictureDataType]] = None,
         image: Optional[ImageRef] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
         prov: Optional[ProvenanceItem] = None,
@@ -2310,7 +2370,7 @@ class DoclingDocument(BaseModel):
     ):
         """add_picture.
-        :param data: List[PictureData]: (Default value = [])
+        :param data: Optional[List[PictureData]]: (Default value = None)
         :param caption: Optional[Union[TextItem:
         :param RefItem]]:  (Default value = None)
         :param prov: Optional[ProvenanceItem]:  (Default value = None)
@@ -2324,7 +2384,7 @@ class DoclingDocument(BaseModel):
         fig_item = PictureItem(
             label=DocItemLabel.PICTURE,
-            annotations=annotations,
+            annotations=annotations or [],
             image=image,
             self_ref=cref,
             parent=parent.get_ref(),
@@ -3589,6 +3649,52 @@ class DoclingDocument(BaseModel):
             return (GraphData(cells=cells, links=links), overall_prov)
+        def _add_text(
+            full_chunk: str,
+            bbox: Optional[BoundingBox],
+            pg_width: int,
+            pg_height: int,
+            page_no: int,
+            tag_name: str,
+            doc_label: DocItemLabel,
+            doc: DoclingDocument,
+            parent: Optional[NodeItem],
+        ):
+            # For everything else, treat as text
+            text_content = extract_inner_text(full_chunk)
+            element_prov = (
+                ProvenanceItem(
+                    bbox=bbox.resize_by_scale(pg_width, pg_height),
+                    charspan=(0, len(text_content)),
+                    page_no=page_no,
+                )
+                if bbox
+                else None
+            )
+            content_layer = ContentLayer.BODY
+            if tag_name in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
+                content_layer = ContentLayer.FURNITURE
+            if doc_label == DocItemLabel.SECTION_HEADER:
+                # Extract level from tag_name (e.g. "section_level_header_1" -> 1)
+                level = int(tag_name.split("_")[-1])
+                doc.add_heading(
+                    text=text_content,
+                    level=level,
+                    prov=element_prov,
+                    parent=parent,
+                    content_layer=content_layer,
+                )
+            else:
+                doc.add_text(
+                    label=doc_label,
+                    text=text_content,
+                    prov=element_prov,
+                    parent=parent,
+                    content_layer=content_layer,
+                )
         # doc = DoclingDocument(name="Document")
         for pg_idx, doctag_page in enumerate(doctag_document.pages):
             page_doctags = doctag_page.tokens
@@ -3623,7 +3729,7 @@ class DoclingDocument(BaseModel):
             tag_pattern = (
                 rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
                 rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
-                rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
+                rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|{GroupLabel.INLINE}|"
                 rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
                 rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
                 rf"{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
@@ -3648,7 +3754,7 @@ class DoclingDocument(BaseModel):
                     # no closing tag; only the existence of the item is recovered
                     full_chunk = f"<{tag_name}></{tag_name}>"
-                doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
+                doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.TEXT)
                 if tag_name == DocumentToken.OTSL.value:
                     table_data = parse_table_content(full_chunk)
@@ -3671,6 +3777,24 @@ class DoclingDocument(BaseModel):
                     else:
                         doc.add_table(data=table_data, caption=caption)
+                elif tag_name == GroupLabel.INLINE:
+                    inline_group = doc.add_inline_group()
+                    content = match.group("content")
+                    common_bbox = extract_bounding_box(content)
+                    for item_match in pattern.finditer(content):
+                        item_tag = item_match.group("tag")
+                        _add_text(
+                            full_chunk=item_match.group(0),
+                            bbox=common_bbox,
+                            pg_width=pg_width,
+                            pg_height=pg_height,
+                            page_no=page_no,
+                            tag_name=item_tag,
+                            doc_label=tag_to_doclabel.get(item_tag, DocItemLabel.TEXT),
+                            doc=doc,
+                            parent=inline_group,
+                        )
                 elif tag_name in [DocItemLabel.PICTURE, DocItemLabel.CHART]:
                     caption, caption_bbox = extract_caption(full_chunk)
                     table_data = None
@@ -3820,38 +3944,17 @@ class DoclingDocument(BaseModel):
                         )
                 else:
                     # For everything else, treat as text
-                    text_content = extract_inner_text(full_chunk)
-                    element_prov = (
-                        ProvenanceItem(
-                            bbox=bbox.resize_by_scale(pg_width, pg_height),
-                            charspan=(0, len(text_content)),
-                            page_no=page_no,
-                        )
-                        if bbox
-                        else None
+                    _add_text(
+                        full_chunk=full_chunk,
+                        bbox=bbox,
+                        pg_width=pg_width,
+                        pg_height=pg_height,
+                        page_no=page_no,
+                        tag_name=tag_name,
+                        doc_label=doc_label,
+                        doc=doc,
+                        parent=None,
                     )
-                    content_layer = ContentLayer.BODY
-                    if tag_name in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
-                        content_layer = ContentLayer.FURNITURE
-                    if doc_label == DocItemLabel.SECTION_HEADER:
-                        # Extract level from tag_name (e.g. "section_level_header_1" -> 1)
-                        level = int(tag_name.split("_")[-1])
-                        doc.add_heading(
-                            text=text_content,
-                            level=level,
-                            prov=element_prov,
-                            content_layer=content_layer,
-                        )
-                    else:
-                        doc.add_text(
-                            label=doc_label,
-                            text=text_content,
-                            prov=element_prov,
-                            content_layer=content_layer,
-                        )
         return doc
     @deprecated("Use save_as_doctags instead.")
@@ -4149,3 +4252,58 @@ class DoclingDocument(BaseModel):
                 raise ValueError("Document hierachy is inconsistent.")
         return d
+    @model_validator(mode="after")
+    def validate_misplaced_list_items(self):
+        """Regroup list items lacking a list parent under new list groups."""
+        # Two passes: first collect runs of consecutive misplaced list items in
+        # iteration order, then replace each run with a new OrderedList or
+        # UnorderedList holding re-created copies of its items. Returns self.
+        #
+        # find list items without list parent, putting successive ones together
+        misplaced_list_items: list[list[ListItem]] = []
+        prev: Optional[NodeItem] = None
+        for item, _ in self.iterate_items(
+            traverse_pictures=True,
+            included_content_layers={c for c in ContentLayer},
+            with_groups=True,  # so that we can distinguish neighboring lists
+        ):
+            if isinstance(item, ListItem) and (
+                item.parent is None
+                or not isinstance(
+                    item.parent.resolve(doc=self), (OrderedList, UnorderedList)
+                )
+            ):
+                # NOTE(review): `prev` is whatever was iterated last, so it may be
+                # a correctly parented ListItem; the else-branch would then extend
+                # the latest run or index an empty list (IndexError) — confirm
+                # the traversal order rules this out.
+                if prev is None or not isinstance(prev, ListItem):  # start a new run
+                    misplaced_list_items.append([item])
+                else:
+                    misplaced_list_items[-1].append(item)
+            prev = item
+        # Runs are handled last-to-first — presumably so that edits do not
+        # disturb runs still to be processed; TODO confirm.
+        for curr_list_items in reversed(misplaced_list_items):
+            # create the replacement group: ordered iff the run's first item is
+            # enumerated (self_ref="#" looks like a placeholder — presumably
+            # reassigned on insertion; TODO confirm)
+            new_group = (
+                OrderedList(self_ref="#")
+                if curr_list_items[0].enumerated
+                else UnorderedList(self_ref="#")
+            )
+            self.insert_item_before_sibling(
+                new_item=new_group,
+                sibling=curr_list_items[0],
+            )
+            # delete list items from document (should not be affected by group addition)
+            self.delete_items(node_items=curr_list_items)
+            # re-create each item under the new group from the fields listed
+            # below; only the first provenance entry (li.prov[0]) is carried over
+            for li in curr_list_items:
+                self.add_list_item(
+                    text=li.text,
+                    enumerated=li.enumerated,
+                    marker=li.marker,
+                    orig=li.orig,
+                    prov=li.prov[0] if li.prov else None,
+                    parent=new_group,
+                    content_layer=li.content_layer,
+                    formatting=li.formatting,
+                    hyperlink=li.hyperlink,
+                )
+        return self

{docling_core-2.33.1.dist-info → docling_core-2.34.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling-core
-Version: 2.33.1
+Version: 2.34.0
 Summary: A python library to define and validate data types in Docling.
 Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>

{docling_core-2.33.1.dist-info → docling_core-2.34.0.dist-info}/RECORD RENAMED Viewed

@@ -26,12 +26,12 @@ docling_core/transforms/chunker/tokenizer/base.py,sha256=2gOBQPYJYC0iWXOgMG3DiNP
 docling_core/transforms/chunker/tokenizer/huggingface.py,sha256=aZ_RNQIzcNkAHGHZw3SBCoqJHM2Ihb65eiM29O9BR6o,2506
 docling_core/transforms/chunker/tokenizer/openai.py,sha256=zt2kwcC-r8MafeEG0CESab8E4RIC9aaFXxxnxOGyTMA,918
 docling_core/transforms/serializer/__init__.py,sha256=CECQlMoCDUxkg4RAUdC3itA3I3qFhKhe2HcYghN6_xw,105
-docling_core/transforms/serializer/base.py,sha256=9bgpWA0oMmZNRc3yIuZVnu5bJ1glClBsswtVF1vYwMI,6046
-docling_core/transforms/serializer/common.py,sha256=mkajw0QRL--WgVL42Vlp2e2PuUQVh79D6EKP4_3YKy0,18112
-docling_core/transforms/serializer/doctags.py,sha256=mEmRWVuebcG5pZcR1_HX146cyUk0_FjaLQtMXSgh9hs,17870
-docling_core/transforms/serializer/html.py,sha256=_HN1WFKH_WJkxtZrmvm1a6-UDxsEGt_ChWdUysS1qjY,35843
+docling_core/transforms/serializer/base.py,sha256=ZFIiZeplL-QbBs9EDUb1awqxapQ23PsApVetJtAs7Vs,6891
+docling_core/transforms/serializer/common.py,sha256=WP-qO-woidrKyvZ56m0vlKMysoLrMzzZtHSCIwsl3ek,19119
+docling_core/transforms/serializer/doctags.py,sha256=PuAExlP-2HxcDSP_R_phtYQU0yKBW94RrPgb85IUxck,19905
+docling_core/transforms/serializer/html.py,sha256=KiywrroYBS3yk07gQizlmk3oqkXg_NpFwE0VF31_Z-I,37112
 docling_core/transforms/serializer/html_styles.py,sha256=-jBwS4EU7yfKoz0GSoxhwx90OmIKieO6TwPw57IuxcA,4692
-docling_core/transforms/serializer/markdown.py,sha256=ussKqIptiKPTCRNjy3edjap4DOsy52no-FLSeAyv9S0,18759
+docling_core/transforms/serializer/markdown.py,sha256=wfMNrjA4wMehWLCejAhEN1eQPRixUO1SyL6ojkKkzZY,20614
 docling_core/transforms/visualizer/__init__.py,sha256=gUfF25yiJ_KO46ZIUNqZQOZGy2PLx6gnnr6AZYxKHXI,35
 docling_core/transforms/visualizer/base.py,sha256=aEF7b3rHq6DVdX8zDYEPoq55BHDYe4Hh_97lBdcW4lY,555
 docling_core/transforms/visualizer/layout_visualizer.py,sha256=ulXxWGIl69-HMKDPFk_XKgNCgQeDNc969PVt_X0-drA,7823
@@ -40,7 +40,7 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
 docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
 docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
 docling_core/types/doc/base.py,sha256=ndXquBrOKTFQApIJ5s2-zstj3xlVKRbJDSId0KOQnUg,14817
-docling_core/types/doc/document.py,sha256=rdevCAZDpMPzPlZmAtiucvBM8h_AjuIZpQDaqjpknl0,142796
+docling_core/types/doc/document.py,sha256=VKZg1VT-H8gTXybgY6lRlcKKR3f6mFDB9UzcrLtII5I,148197
 docling_core/types/doc/labels.py,sha256=vp4h3e7AmBvezRmgrfuPehjAHTZOufphErLB4ENhdME,7171
 docling_core/types/doc/page.py,sha256=1JMPwglaTITBvg959L_pcWPb-fXoDYGh-e_tGZMzVMQ,41060
 docling_core/types/doc/tokens.py,sha256=z22l9J81_sg9CYMvOuLmPuLsNT7h_s7wao2UT89DvI8,9278
@@ -73,9 +73,9 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
 docling_core/utils/legacy.py,sha256=DrI3QGoL755ZCIoKHF74-pTWm8R0zfFo2C2vB5dT2aY,24463
 docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
 docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
-docling_core-2.33.1.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
-docling_core-2.33.1.dist-info/METADATA,sha256=tib261Wc010Z2y6_lgKcXdO2OKPG8pdf2n1CoIYSDBA,6453
-docling_core-2.33.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-docling_core-2.33.1.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
-docling_core-2.33.1.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
-docling_core-2.33.1.dist-info/RECORD,,
+docling_core-2.34.0.dist-info/licenses/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
+docling_core-2.34.0.dist-info/METADATA,sha256=853af3C8OZrbXzZqYFhfDfu-gtG4m7my-6wqzCir_cg,6453
+docling_core-2.34.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+docling_core-2.34.0.dist-info/entry_points.txt,sha256=ER4zROQWkFMHIrY-oqY5E4HeCcCIg8dLkNztYGxdb7c,59
+docling_core-2.34.0.dist-info/top_level.txt,sha256=O-tcXpGiurlud-1ZxMq1b-OmrfAVA4sajcgWU32RtfA,13
+docling_core-2.34.0.dist-info/RECORD,,

{docling_core-2.33.1.dist-info → docling_core-2.34.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{docling_core-2.33.1.dist-info → docling_core-2.34.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{docling_core-2.33.1.dist-info → docling_core-2.34.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{docling_core-2.33.1.dist-info → docling_core-2.34.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

docling-core 2.33.1__py3-none-any.whl → 2.34.0__py3-none-any.whl

Potentially problematic release.

docling-core 2.33.1__py3-none-any.whl → 2.34.0__py3-none-any.whl