docling-core 1.7.1__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of docling-core might be problematic.
- docling_core/transforms/chunker/__init__.py +2 -8
- docling_core/transforms/chunker/base.py +27 -40
- docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
- docling_core/types/__init__.py +12 -8
- docling_core/types/doc/__init__.py +25 -0
- docling_core/types/doc/base.py +136 -451
- docling_core/types/doc/document.py +1288 -559
- docling_core/types/{experimental → doc}/labels.py +4 -1
- docling_core/types/legacy_doc/__init__.py +6 -0
- docling_core/types/legacy_doc/base.py +485 -0
- docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
- docling_core/types/legacy_doc/document.py +715 -0
- docling_core/types/rec/subject.py +1 -1
- docling_core/utils/generate_docs.py +82 -0
- docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
- docling_core/utils/validators.py +3 -3
- {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/METADATA +11 -11
- {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/RECORD +24 -31
- docling_core-2.0.0.dist-info/entry_points.txt +5 -0
- docling_core/transforms/id_generator/__init__.py +0 -12
- docling_core/transforms/id_generator/base.py +0 -30
- docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
- docling_core/transforms/id_generator/uuid_generator.py +0 -34
- docling_core/transforms/metadata_extractor/__init__.py +0 -13
- docling_core/transforms/metadata_extractor/base.py +0 -59
- docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
- docling_core/types/experimental/__init__.py +0 -30
- docling_core/types/experimental/base.py +0 -167
- docling_core/types/experimental/document.py +0 -1192
- docling_core/utils/ds_generate_docs.py +0 -144
- docling_core-1.7.1.dist-info/entry_points.txt +0 -5
- /docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
- {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/LICENSE +0 -0
- {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/WHEEL +0 -0
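
The headline change in this list is the rewrite of docling_core/types/doc/document.py around a new DoclingDocument model, while the legacy CCS models move under docling_core/types/legacy_doc. As a rough orientation before the diff below, here is a minimal sketch of how a document might be assembled and exported with the 2.0.0 API; the import paths and call signatures are read off the diff of document.py rather than taken from documentation, so treat them as assumptions.

from docling_core.types.doc.document import DoclingDocument, TableCell, TableData
from docling_core.types.doc.labels import DocItemLabel

# Build a document tree: headings, paragraphs, list items and tables are
# appended to doc.body and linked through JSON-pointer style references.
doc = DoclingDocument(name="sample")
doc.add_heading(text="Overview", level=1)
doc.add_text(label=DocItemLabel.PARAGRAPH, text="docling-core 2.0.0 introduces DoclingDocument.")
doc.add_list_item(text="hierarchical body/furniture trees")
doc.add_list_item(text="typed items for text, tables and pictures")

# Tables are described by a sparse list of TableCell spans plus grid dimensions.
cells = [
    TableCell(text="name", column_header=True,
              start_row_offset_idx=0, end_row_offset_idx=1,
              start_col_offset_idx=0, end_col_offset_idx=1),
    TableCell(text="docling-core",
              start_row_offset_idx=1, end_row_offset_idx=2,
              start_col_offset_idx=0, end_col_offset_idx=1),
]
doc.add_table(data=TableData(num_rows=2, num_cols=1, table_cells=cells))

print(doc.export_to_markdown())
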
docling_core/types/doc/document.py
@@ -1,452 +1,1116 @@
-#
-# Copyright IBM Corp. 2024 - 2024
-# SPDX-License-Identifier: MIT
-#
-
 """Models for the Docling Document data type."""

-
-
+import base64
+import mimetypes
+import re
+import typing
+from io import BytesIO
+from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union

+import pandas as pd
+from PIL import Image as PILImage
 from pydantic import (
-
+    AnyUrl,
     BaseModel,
+    ConfigDict,
     Field,
-
-
+    StringConstraints,
+    computed_field,
+    field_validator,
     model_validator,
 )
 from tabulate import tabulate
+from typing_extensions import Annotated, Self

-from docling_core.search.
-from docling_core.types.base import
-
-
-
-    DescriptionAdvancedT,
-    DescriptionAnalyticsT,
-    FileInfoObject,
-    Identifier,
-    IdentifierTypeT,
-    LanguageT,
-    Log,
-)
-from docling_core.types.doc.base import (
-    BaseCell,
-    BaseText,
-    BitmapObject,
-    Figure,
-    PageDimensions,
-    PageReference,
-    Ref,
-    S3Data,
-    Table,
-)
-from docling_core.types.doc.tokens import DocumentToken
-from docling_core.utils.alias import AliasModel
+from docling_core.search.package import VERSION_PATTERN
+from docling_core.types.base import _JSON_POINTER_REGEX
+from docling_core.types.doc import BoundingBox, Size
+from docling_core.types.doc.labels import DocItemLabel, GroupLabel
+from docling_core.types.legacy_doc.tokens import DocumentToken

+Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
+LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
+CURRENT_VERSION: Final = "1.0.0"

-
-
+DEFAULT_EXPORT_LABELS = {
+    DocItemLabel.TITLE,
+    DocItemLabel.DOCUMENT_INDEX,
+    DocItemLabel.SECTION_HEADER,
+    DocItemLabel.PARAGRAPH,
+    DocItemLabel.CAPTION,
+    DocItemLabel.TABLE,
+    DocItemLabel.PICTURE,
+    DocItemLabel.FORMULA,
+    DocItemLabel.CHECKBOX_UNSELECTED,
+    DocItemLabel.CHECKBOX_SELECTED,
+    DocItemLabel.TEXT,
+    DocItemLabel.LIST_ITEM,
+    DocItemLabel.CODE,
+}

52
|
-
author: Optional[list[StrictStr]] = None
|
|
53
|
-
keywords: Optional[str] = None
|
|
54
|
-
subject: Optional[str] = None
|
|
55
|
-
title: Optional[StrictStr] = None
|
|
56
|
-
creation_date: Optional[str] = None # datetime
|
|
57
51
|
|
|
52
|
+
class BasePictureData(BaseModel):
|
|
53
|
+
"""BasePictureData."""
|
|
58
54
|
|
|
59
|
-
|
|
60
|
-
"""File info object."""
|
|
55
|
+
kind: str
|
|
61
56
|
|
|
62
|
-
num_pages: Optional[int] = Field(default=None, alias="#-pages")
|
|
63
57
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
alias="collection-name",
|
|
67
|
-
json_schema_extra=es_field(type="keyword", ignore_above=8191),
|
|
68
|
-
)
|
|
69
|
-
description: Optional[CCSFileInfoDescription] = Field(
|
|
70
|
-
default=None, json_schema_extra=es_field(suppress=True)
|
|
71
|
-
)
|
|
72
|
-
page_hashes: Optional[list[PageReference]] = Field(
|
|
73
|
-
default=None, alias="page-hashes"
|
|
74
|
-
)
|
|
58
|
+
class PictureClassificationClass(BaseModel):
|
|
59
|
+
"""PictureClassificationData."""
|
|
75
60
|
|
|
61
|
+
class_name: str
|
|
62
|
+
confidence: float
|
|
76
63
|
|
|
77
|
-
class Affiliation(BaseModel, extra="forbid"):
|
|
78
|
-
"""Affiliation."""
|
|
79
|
-
|
|
80
|
-
name: str = Field(
|
|
81
|
-
...,
|
|
82
|
-
json_schema_extra=es_field(
|
|
83
|
-
fields={
|
|
84
|
-
"lower": {
|
|
85
|
-
"normalizer": "lowercase_asciifolding",
|
|
86
|
-
"type": "keyword",
|
|
87
|
-
"ignore_above": 8191,
|
|
88
|
-
},
|
|
89
|
-
"keyword": {"type": "keyword", "ignore_above": 8191},
|
|
90
|
-
},
|
|
91
|
-
),
|
|
92
|
-
)
|
|
93
|
-
id: Optional[str] = Field(
|
|
94
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
95
|
-
)
|
|
96
|
-
source: Optional[str] = Field(
|
|
97
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
98
|
-
)
|
|
99
64
|
|
|
65
|
+
class PictureClassificationData(BasePictureData):
|
|
66
|
+
"""PictureClassificationData."""
|
|
100
67
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
name: str = Field(
|
|
105
|
-
...,
|
|
106
|
-
json_schema_extra=es_field(
|
|
107
|
-
type="text",
|
|
108
|
-
fields={
|
|
109
|
-
"lower": {
|
|
110
|
-
"normalizer": "lowercase_asciifolding",
|
|
111
|
-
"type": "keyword",
|
|
112
|
-
"ignore_above": 8191,
|
|
113
|
-
},
|
|
114
|
-
"keyword": {"type": "keyword", "ignore_above": 8191},
|
|
115
|
-
},
|
|
116
|
-
),
|
|
117
|
-
)
|
|
118
|
-
id: Optional[str] = Field(
|
|
119
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
120
|
-
)
|
|
121
|
-
source: Optional[str] = Field(
|
|
122
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
123
|
-
)
|
|
124
|
-
affiliations: Optional[list[Affiliation]] = None
|
|
68
|
+
kind: Literal["classification"] = "classification"
|
|
69
|
+
provenance: str
|
|
70
|
+
predicted_classes: List[PictureClassificationClass]
|
|
125
71
|
|
|
126
72
|
|
|
127
|
-
class
|
|
128
|
-
"""
|
|
73
|
+
class PictureDescriptionData(BasePictureData):
|
|
74
|
+
"""PictureDescriptionData."""
|
|
129
75
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
)
|
|
134
|
-
name: StrictStr = Field(
|
|
135
|
-
json_schema_extra=es_field(type="keyword", ignore_above=8191),
|
|
136
|
-
description="Name of the publication.",
|
|
137
|
-
)
|
|
138
|
-
alternate_names: Optional[list[StrictStr]] = Field(
|
|
139
|
-
default=None,
|
|
140
|
-
json_schema_extra=es_field(type="text"),
|
|
141
|
-
title="Alternate Names",
|
|
142
|
-
description="Other names or abbreviations of this publication.",
|
|
143
|
-
)
|
|
144
|
-
type: Optional[list[StrictStr]] = Field(
|
|
145
|
-
default=None,
|
|
146
|
-
json_schema_extra=es_field(type="keyword", ignore_above=8191),
|
|
147
|
-
description="Type of publication (journal article, conference, review,...).",
|
|
148
|
-
)
|
|
149
|
-
pages: Optional[StrictStr] = Field(
|
|
150
|
-
default=None,
|
|
151
|
-
json_schema_extra=es_field(type="text"),
|
|
152
|
-
description="Page range in the publication.",
|
|
153
|
-
)
|
|
154
|
-
issue: Optional[StrictStr] = Field(
|
|
155
|
-
default=None,
|
|
156
|
-
json_schema_extra=es_field(type="keyword", ignore_above=8191),
|
|
157
|
-
description="Publication issue (issue number).",
|
|
158
|
-
)
|
|
159
|
-
volume: Optional[StrictStr] = Field(
|
|
160
|
-
default=None,
|
|
161
|
-
json_schema_extra=es_field(type="keyword", ignore_above=8191),
|
|
162
|
-
description="Publication volume.",
|
|
163
|
-
)
|
|
164
|
-
url: Optional[AnyHttpUrl] = Field(
|
|
165
|
-
default=None,
|
|
166
|
-
json_schema_extra=es_field(type="keyword", ignore_above=8191),
|
|
167
|
-
description="URL on the publication site.",
|
|
168
|
-
)
|
|
76
|
+
kind: Literal["description"] = "description"
|
|
77
|
+
text: str
|
|
78
|
+
provenance: str
|
|
169
79
|
|
|
170
80
|
|
|
171
|
-
class
|
|
172
|
-
"""
|
|
81
|
+
class PictureMoleculeData(BaseModel):
|
|
82
|
+
"""PictureMoleculeData."""
|
|
173
83
|
|
|
174
|
-
|
|
175
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
176
|
-
)
|
|
177
|
-
text: Optional[StrictStr] = None
|
|
84
|
+
kind: Literal["molecule_data"] = "molecule_data"
|
|
178
85
|
|
|
86
|
+
smi: str
|
|
87
|
+
confidence: float
|
|
88
|
+
class_name: str
|
|
89
|
+
segmentation: List[Tuple[float, float]]
|
|
90
|
+
provenance: str
|
|
179
91
|
|
|
180
|
-
class CCSDocumentDescription(
|
|
181
|
-
AliasModel,
|
|
182
|
-
Generic[
|
|
183
|
-
DescriptionAdvancedT,
|
|
184
|
-
DescriptionAnalyticsT,
|
|
185
|
-
IdentifierTypeT,
|
|
186
|
-
LanguageT,
|
|
187
|
-
CollectionNameTypeT,
|
|
188
|
-
],
|
|
189
|
-
):
|
|
190
|
-
"""Description in document."""
|
|
191
|
-
|
|
192
|
-
title: Optional[StrictStr] = None
|
|
193
|
-
abstract: Optional[list[StrictStr]] = None
|
|
194
|
-
authors: Optional[list[Author]] = None
|
|
195
|
-
affiliations: Optional[list[Affiliation]] = None
|
|
196
|
-
subjects: Optional[list[str]] = Field(
|
|
197
|
-
default=None,
|
|
198
|
-
json_schema_extra=es_field(
|
|
199
|
-
fields={"keyword": {"ignore_above": 8191, "type": "keyword"}}
|
|
200
|
-
),
|
|
201
|
-
)
|
|
202
|
-
keywords: Optional[list[str]] = Field(
|
|
203
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
204
|
-
)
|
|
205
|
-
publication_date: Optional[datetime] = None
|
|
206
|
-
languages: Optional[list[LanguageT]] = Field(
|
|
207
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
208
|
-
)
|
|
209
|
-
license_: Optional[DescriptionLicense] = Field(default=None, alias="license")
|
|
210
|
-
publishers: Optional[list[StrictStr]] = Field(
|
|
211
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
212
|
-
)
|
|
213
|
-
url_refs: Optional[list[str]] = Field(
|
|
214
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
215
|
-
)
|
|
216
|
-
references: Optional[list[Identifier[IdentifierTypeT]]] = None
|
|
217
|
-
publication: Optional[list[Publication]] = Field(
|
|
218
|
-
default=None, description="List of publication journals or venues."
|
|
219
|
-
)
|
|
220
|
-
reference_count: Optional[NonNegativeInt] = Field(
|
|
221
|
-
default=None,
|
|
222
|
-
title="Reference Count",
|
|
223
|
-
description="Total number of documents referenced by this document.",
|
|
224
|
-
json_schema_extra=es_field(type="integer"),
|
|
225
|
-
)
|
|
226
|
-
citation_count: Optional[NonNegativeInt] = Field(
|
|
227
|
-
default=None,
|
|
228
|
-
title="Citation Count",
|
|
229
|
-
description=(
|
|
230
|
-
"Total number of citations that this document has received (number "
|
|
231
|
-
"of documents in whose bibliography this document appears)."
|
|
232
|
-
),
|
|
233
|
-
json_schema_extra=es_field(type="integer"),
|
|
234
|
-
)
|
|
235
|
-
citation_date: Optional[datetime] = Field(
|
|
236
|
-
default=None,
|
|
237
|
-
title="Citation Count Date",
|
|
238
|
-
description="Last update date of the citation count.",
|
|
239
|
-
)
|
|
240
|
-
advanced: Optional[DescriptionAdvancedT] = None
|
|
241
|
-
analytics: Optional[DescriptionAnalyticsT] = None
|
|
242
|
-
logs: list[Log]
|
|
243
|
-
collection: Optional[CollectionDocumentInfo[CollectionNameTypeT]] = Field(
|
|
244
|
-
default=None, description="The collection information of this document."
|
|
245
|
-
)
|
|
246
|
-
acquisition: Optional[Acquisition] = Field(
|
|
247
|
-
default=None,
|
|
248
|
-
description=(
|
|
249
|
-
"Information on how the document was obtained, for data governance"
|
|
250
|
-
" purposes."
|
|
251
|
-
),
|
|
252
|
-
)
|
|
253
92
|
|
|
93
|
+
class PictureMiscData(BaseModel):
|
|
94
|
+
"""PictureMiscData."""
|
|
254
95
|
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
"""Minimal model for a document."""
|
|
266
|
-
|
|
267
|
-
name: StrictStr = Field(alias="_name")
|
|
268
|
-
obj_type: Optional[StrictStr] = Field("document", alias="type")
|
|
269
|
-
description: CCSDocumentDescription[
|
|
270
|
-
DescriptionAdvancedT,
|
|
271
|
-
DescriptionAnalyticsT,
|
|
272
|
-
IdentifierTypeT,
|
|
273
|
-
LanguageT,
|
|
274
|
-
CollectionNameTypeT,
|
|
275
|
-
]
|
|
276
|
-
file_info: FileInfoObject = Field(alias="file-info")
|
|
277
|
-
main_text: Optional[list[Union[Ref, BaseText]]] = Field(
|
|
278
|
-
default=None, alias="main-text"
|
|
279
|
-
)
|
|
280
|
-
figures: Optional[list[Figure]] = None
|
|
281
|
-
tables: Optional[list[Table]] = None
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
class CCSDocument(
|
|
285
|
-
MinimalDocument,
|
|
286
|
-
Generic[
|
|
287
|
-
DescriptionAdvancedT,
|
|
288
|
-
DescriptionAnalyticsT,
|
|
289
|
-
IdentifierTypeT,
|
|
290
|
-
LanguageT,
|
|
291
|
-
CollectionNameTypeT,
|
|
96
|
+
kind: Literal["misc"] = "misc"
|
|
97
|
+
content: Dict[str, Any]
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
PictureDataType = Annotated[
|
|
101
|
+
Union[
|
|
102
|
+
PictureClassificationData,
|
|
103
|
+
PictureDescriptionData,
|
|
104
|
+
PictureMoleculeData,
|
|
105
|
+
PictureMiscData,
|
|
292
106
|
],
|
|
293
|
-
)
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
107
|
+
Field(discriminator="kind"),
|
|
108
|
+
]
|
|
109
|
+
|
|
110
|
+
|
|
+class TableCell(BaseModel):
+    """TableCell."""
+
+    bbox: Optional[BoundingBox] = None
+    row_span: int = 1
+    col_span: int = 1
+    start_row_offset_idx: int
+    end_row_offset_idx: int
+    start_col_offset_idx: int
+    end_col_offset_idx: int
+    text: str
+    column_header: bool = False
+    row_header: bool = False
+    row_section: bool = False

     @model_validator(mode="before")
     @classmethod
-    def
-    """
-    if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def from_dict_format(cls, data: Any) -> Any:
+        """from_dict_format."""
+        if isinstance(data, Dict):
+            # Check if this is a native BoundingBox or a bbox from docling-ibm-models
+            if (
+                # "bbox" not in data
+                # or data["bbox"] is None
+                # or isinstance(data["bbox"], BoundingBox)
+                "text"
+                in data
+            ):
+                return data
+            text = data["bbox"].get("token", "")
+            if not len(text):
+                text_cells = data.pop("text_cell_bboxes", None)
+                if text_cells:
+                    for el in text_cells:
+                        text += el["token"] + " "
+
+                text = text.strip()
+            data["text"] = text

-
-            descr = data["description"].get(key)
-            if descr is not None and not isinstance(descr, list):
-                if isinstance(descr, dict):
-                    data["description"][key] = [descr]
-                else:
-                    data["description"].pop(key)
+        return data

-        if data.get("main-text"):
-            for item in data["main-text"]:
-                if ref := item.pop("__ref", None):
-                    item["$ref"] = ref

-
+class TableData(BaseModel):  # TBD
+    """BaseTableData."""

+    table_cells: List[TableCell] = []
+    num_rows: int = 0
+    num_cols: int = 0

-
-
-
-
-
-
-
-
-
-
-
+    @computed_field  # type: ignore
+    @property
+    def grid(
+        self,
+    ) -> List[List[TableCell]]:
+        """grid."""
+        # Initialise empty table data grid (only empty cells)
+        table_data = [
+            [
+                TableCell(
+                    text="",
+                    start_row_offset_idx=i,
+                    end_row_offset_idx=i + 1,
+                    start_col_offset_idx=j,
+                    end_col_offset_idx=j + 1,
+                )
+                for j in range(self.num_cols)
+            ]
+            for i in range(self.num_rows)
+        ]
+
+        # Overwrite cells in table data for which there is actual cell content.
+        for cell in self.table_cells:
+            for i in range(
+                min(cell.start_row_offset_idx, self.num_rows),
+                min(cell.end_row_offset_idx, self.num_rows),
+            ):
+                for j in range(
+                    min(cell.start_col_offset_idx, self.num_cols),
+                    min(cell.end_col_offset_idx, self.num_cols),
+                ):
+                    table_data[i][j] = cell
+
+        return table_data

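The grid property above densifies the sparse table_cells list into a full num_rows × num_cols matrix, repeating spanned cells and padding uncovered positions with empty cells; the TableItem export methods later in the diff consume this grid. A small sketch, using the same assumed import path as the earlier example:

from docling_core.types.doc.document import TableCell, TableData

data = TableData(
    num_rows=2,
    num_cols=2,
    table_cells=[
        TableCell(  # header cell spanning both columns
            text="Header", col_span=2, column_header=True,
            start_row_offset_idx=0, end_row_offset_idx=1,
            start_col_offset_idx=0, end_col_offset_idx=2,
        ),
        TableCell(
            text="a",
            start_row_offset_idx=1, end_row_offset_idx=2,
            start_col_offset_idx=0, end_col_offset_idx=1,
        ),
    ],
)

# The spanning header cell appears at both (0, 0) and (0, 1); the position
# (1, 1) that no cell covers stays an empty placeholder cell.
print([[c.text for c in row] for row in data.grid])
# [['Header', 'Header'], ['a', '']]
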
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
195
|
+
|
|
196
|
+
class DocumentOrigin(BaseModel):
|
|
197
|
+
"""FileSource."""
|
|
198
|
+
|
|
199
|
+
mimetype: str # the mimetype of the original file
|
|
200
|
+
binary_hash: Uint64 # the binary hash of the original file.
|
|
201
|
+
# TODO: Change to be Uint64 and provide utility method to generate
|
|
202
|
+
|
|
203
|
+
filename: str # The name of the original file, including extension, without path.
|
|
204
|
+
# Could stem from filesystem, source URI, Content-Disposition header, ...
|
|
205
|
+
|
|
206
|
+
uri: Optional[AnyUrl] = (
|
|
207
|
+
None # any possible reference to a source file,
|
|
208
|
+
# from any file handler protocol (e.g. https://, file://, s3://)
|
|
366
209
|
)
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
LanguageT,
|
|
375
|
-
CollectionNameTypeT,
|
|
210
|
+
|
|
211
|
+
_extra_mimetypes: typing.ClassVar[List[str]] = [
|
|
212
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
213
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
|
|
214
|
+
"application/vnd.openxmlformats-officedocument.presentationml.template",
|
|
215
|
+
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
|
|
216
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
376
217
|
]
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
)
|
|
381
|
-
|
|
382
|
-
|
|
218
|
+
|
|
219
|
+
@field_validator("binary_hash", mode="before")
|
|
220
|
+
@classmethod
|
|
221
|
+
def parse_hex_string(cls, value):
|
|
222
|
+
"""parse_hex_string."""
|
|
223
|
+
if isinstance(value, str):
|
|
224
|
+
try:
|
|
225
|
+
# Convert hex string to an integer
|
|
226
|
+
hash_int = Uint64(value, 16)
|
|
227
|
+
# Mask to fit within 64 bits (unsigned)
|
|
228
|
+
return (
|
|
229
|
+
hash_int & 0xFFFFFFFFFFFFFFFF
|
|
230
|
+
) # TODO be sure it doesn't clip uint64 max
|
|
231
|
+
except ValueError:
|
|
232
|
+
raise ValueError(f"Invalid sha256 hexdigest: {value}")
|
|
233
|
+
return value # If already an int, return it as is.
|
|
234
|
+
|
|
235
|
+
@field_validator("mimetype")
|
|
236
|
+
@classmethod
|
|
237
|
+
def validate_mimetype(cls, v):
|
|
238
|
+
"""validate_mimetype."""
|
|
239
|
+
# Check if the provided MIME type is valid using mimetypes module
|
|
240
|
+
if v not in mimetypes.types_map.values() and v not in cls._extra_mimetypes:
|
|
241
|
+
raise ValueError(f"'{v}' is not a valid MIME type")
|
|
242
|
+
return v
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
class RefItem(BaseModel):
|
|
246
|
+
"""RefItem."""
|
|
247
|
+
|
|
248
|
+
cref: str = Field(alias="$ref", pattern=_JSON_POINTER_REGEX)
|
|
249
|
+
|
|
250
|
+
# This method makes RefItem compatible with DocItem
|
|
251
|
+
def get_ref(self):
|
|
252
|
+
"""get_ref."""
|
|
253
|
+
return self
|
|
254
|
+
|
|
255
|
+
model_config = ConfigDict(
|
|
256
|
+
populate_by_name=True,
|
|
383
257
|
)
|
|
384
|
-
page_footers: Optional[list[BaseText]] = Field(default=None, alias="page-footers")
|
|
385
|
-
page_headers: Optional[list[BaseText]] = Field(default=None, alias="page-headers")
|
|
386
|
-
s3_data: Optional[S3Data] = Field(default=None, alias="_s3_data")
|
|
387
|
-
identifiers: Optional[list[Identifier[IdentifierTypeT]]] = None
|
|
388
258
|
|
|
389
|
-
|
|
259
|
+
def resolve(self, doc: "DoclingDocument"):
|
|
260
|
+
"""resolve."""
|
|
261
|
+
path_components = self.cref.split("/")
|
|
262
|
+
if (num_comps := len(path_components)) == 3:
|
|
263
|
+
_, path, index_str = path_components
|
|
264
|
+
index = int(index_str)
|
|
265
|
+
obj = doc.__getattribute__(path)[index]
|
|
266
|
+
elif num_comps == 2:
|
|
267
|
+
_, path = path_components
|
|
268
|
+
obj = doc.__getattribute__(path)
|
|
269
|
+
else:
|
|
270
|
+
raise RuntimeError(f"Unsupported number of path components: {num_comps}")
|
|
271
|
+
return obj
|
|
272
|
+
|
|
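RefItem is the glue of the new model: parent, child and caption links are JSON-pointer-like strings into the document's top-level arrays, and resolve() walks them. A hedged illustration, reusing the import paths assumed in the earlier sketches:

from docling_core.types.doc.document import DoclingDocument, RefItem
from docling_core.types.doc.labels import DocItemLabel

doc = DoclingDocument(name="demo")
para = doc.add_text(label=DocItemLabel.PARAGRAPH, text="Hello")

ref = RefItem(cref="#/texts/0")     # three components -> doc.texts[0]
assert ref.resolve(doc) is para

body_ref = RefItem(cref="#/body")   # two components -> doc.body
assert body_ref.resolve(doc) is doc.body
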
273
|
+
|
|
274
|
+
class ImageRef(BaseModel):
|
|
275
|
+
"""ImageRef."""
|
|
276
|
+
|
|
277
|
+
mimetype: str
|
|
278
|
+
dpi: int
|
|
279
|
+
size: Size
|
|
280
|
+
uri: AnyUrl
|
|
281
|
+
_pil: Optional[PILImage.Image] = None
|
|
282
|
+
|
|
283
|
+
@property
|
|
284
|
+
def pil_image(self) -> PILImage.Image:
|
|
285
|
+
"""Return the PIL Image."""
|
|
286
|
+
if self._pil is not None:
|
|
287
|
+
return self._pil
|
|
288
|
+
|
|
289
|
+
if str(self.uri).startswith("data:"):
|
|
290
|
+
encoded_img = str(self.uri).split(",")[1]
|
|
291
|
+
decoded_img = base64.b64decode(encoded_img)
|
|
292
|
+
self._pil = PILImage.open(BytesIO(decoded_img))
|
|
293
|
+
else:
|
|
294
|
+
self._pil = PILImage.open(str(self.uri))
|
|
295
|
+
|
|
296
|
+
return self._pil
|
|
297
|
+
|
|
298
|
+
@field_validator("mimetype")
|
|
390
299
|
@classmethod
|
|
391
|
-
def
|
|
392
|
-
"""
|
|
393
|
-
if
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
if ref := item.pop("__ref", None):
|
|
398
|
-
item["$ref"] = ref
|
|
300
|
+
def validate_mimetype(cls, v):
|
|
301
|
+
"""validate_mimetype."""
|
|
302
|
+
# Check if the provided MIME type is valid using mimetypes module
|
|
303
|
+
if v not in mimetypes.types_map.values():
|
|
304
|
+
raise ValueError(f"'{v}' is not a valid MIME type")
|
|
305
|
+
return v
|
|
399
306
|
|
|
400
|
-
|
|
307
|
+
@classmethod
|
|
308
|
+
def from_pil(cls, image: PILImage.Image, dpi: int) -> Self:
|
|
309
|
+
"""Construct ImageRef from a PIL Image."""
|
|
310
|
+
buffered = BytesIO()
|
|
311
|
+
image.save(buffered, format="PNG")
|
|
312
|
+
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
|
313
|
+
img_uri = f"data:image/png;base64,{img_str}"
|
|
314
|
+
return cls(
|
|
315
|
+
mimetype="image/png",
|
|
316
|
+
dpi=dpi,
|
|
317
|
+
size=Size(width=image.width, height=image.height),
|
|
318
|
+
uri=img_uri,
|
|
319
|
+
_pil=image,
|
|
320
|
+
)
|
|
401
321
|
|
|
402
|
-
def _resolve_ref(self, item: Ref) -> Optional[Union[BaseCell, BaseText]]:
|
|
403
|
-
"""Return the resolved reference.
|
|
404
322
|
|
|
405
|
-
|
|
406
|
-
|
|
323
|
+
class ProvenanceItem(BaseModel):
|
|
324
|
+
"""ProvenanceItem."""
|
|
325
|
+
|
|
326
|
+
page_no: int
|
|
327
|
+
bbox: BoundingBox
|
|
328
|
+
charspan: Tuple[int, int]
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
class NodeItem(BaseModel):
|
|
332
|
+
"""NodeItem."""
|
|
333
|
+
|
|
334
|
+
self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
|
|
335
|
+
parent: Optional[RefItem] = None
|
|
336
|
+
children: List[RefItem] = []
|
|
337
|
+
|
|
338
|
+
model_config = ConfigDict(extra="forbid")
|
|
339
|
+
|
|
340
|
+
def get_ref(self):
|
|
341
|
+
"""get_ref."""
|
|
342
|
+
return RefItem(cref=self.self_ref)
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
class GroupItem(NodeItem): # Container type, can't be a leaf node
|
|
346
|
+
"""GroupItem."""
|
|
347
|
+
|
|
348
|
+
name: str = (
|
|
349
|
+
"group" # Name of the group, e.g. "Introduction Chapter",
|
|
350
|
+
# "Slide 5", "Navigation menu list", ...
|
|
351
|
+
)
|
|
352
|
+
label: GroupLabel = GroupLabel.UNSPECIFIED
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
class DocItem(
|
|
356
|
+
NodeItem
|
|
357
|
+
): # Base type for any element that carries content, can be a leaf node
|
|
358
|
+
"""DocItem."""
|
|
359
|
+
|
|
360
|
+
label: DocItemLabel
|
|
361
|
+
prov: List[ProvenanceItem] = []
|
|
362
|
+
|
|
363
|
+
def get_location_tokens(
|
|
364
|
+
self,
|
|
365
|
+
doc: "DoclingDocument",
|
|
366
|
+
new_line: str,
|
|
367
|
+
xsize: int = 100,
|
|
368
|
+
ysize: int = 100,
|
|
369
|
+
add_page_index: bool = True,
|
|
370
|
+
) -> str:
|
|
371
|
+
"""Get the location string for the BaseCell."""
|
|
372
|
+
if not len(self.prov):
|
|
373
|
+
return ""
|
|
374
|
+
|
|
375
|
+
location = ""
|
|
376
|
+
for prov in self.prov:
|
|
377
|
+
page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
|
|
378
|
+
|
|
379
|
+
page_i = -1
|
|
380
|
+
if add_page_index:
|
|
381
|
+
page_i = prov.page_no
|
|
382
|
+
|
|
383
|
+
loc_str = DocumentToken.get_location(
|
|
384
|
+
bbox=prov.bbox.to_bottom_left_origin(page_h).as_tuple(),
|
|
385
|
+
page_w=page_w,
|
|
386
|
+
page_h=page_h,
|
|
387
|
+
xsize=xsize,
|
|
388
|
+
ysize=ysize,
|
|
389
|
+
page_i=page_i,
|
|
390
|
+
)
|
|
391
|
+
location += f"{loc_str}{new_line}"
|
|
392
|
+
|
|
393
|
+
return location
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
class TextItem(DocItem):
|
|
397
|
+
"""TextItem."""
|
|
398
|
+
|
|
399
|
+
orig: str # untreated representation
|
|
400
|
+
text: str # sanitized representation
|
|
401
|
+
|
|
402
|
+
def export_to_document_tokens(
|
|
403
|
+
self,
|
|
404
|
+
doc: "DoclingDocument",
|
|
405
|
+
new_line: str = "\n",
|
|
406
|
+
xsize: int = 100,
|
|
407
|
+
ysize: int = 100,
|
|
408
|
+
add_location: bool = True,
|
|
409
|
+
add_content: bool = True,
|
|
410
|
+
add_page_index: bool = True,
|
|
411
|
+
):
|
|
412
|
+
r"""Export text element to document tokens format.
|
|
413
|
+
|
|
414
|
+
:param doc: "DoclingDocument":
|
|
415
|
+
:param new_line: str: (Default value = "\n")
|
|
416
|
+
:param xsize: int: (Default value = 100)
|
|
417
|
+
:param ysize: int: (Default value = 100)
|
|
418
|
+
:param add_location: bool: (Default value = True)
|
|
419
|
+
:param add_content: bool: (Default value = True)
|
|
420
|
+
:param add_page_index: bool: (Default value = True)
|
|
421
|
+
|
|
407
422
|
"""
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
# NOTE: currently only resolves refs explicitely, such that we can make
|
|
411
|
-
# assumptions on ref parts
|
|
412
|
-
if item.obj_type == "table" and self.tables:
|
|
413
|
-
parts = item.ref.split("/")
|
|
414
|
-
result = self.tables[int(parts[2])]
|
|
415
|
-
elif item.obj_type == "figure" and self.figures:
|
|
416
|
-
parts = item.ref.split("/")
|
|
417
|
-
result = self.figures[int(parts[2])]
|
|
418
|
-
elif item.obj_type == "equation" and self.equations:
|
|
419
|
-
parts = item.ref.split("/")
|
|
420
|
-
result = self.equations[int(parts[2])]
|
|
421
|
-
elif item.obj_type == "footnote" and self.footnotes:
|
|
422
|
-
parts = item.ref.split("/")
|
|
423
|
-
result = self.footnotes[int(parts[2])]
|
|
423
|
+
body = f"<{self.label.value}>"
|
|
424
424
|
|
|
425
|
-
|
|
425
|
+
# TODO: This must be done through an explicit mapping.
|
|
426
|
+
# assert DocumentToken.is_known_token(
|
|
427
|
+
# body
|
|
428
|
+
# ), f"failed DocumentToken.is_known_token({body})"
|
|
429
|
+
|
|
430
|
+
if add_location:
|
|
431
|
+
body += self.get_location_tokens(
|
|
432
|
+
doc=doc,
|
|
433
|
+
new_line="",
|
|
434
|
+
xsize=xsize,
|
|
435
|
+
ysize=ysize,
|
|
436
|
+
add_page_index=add_page_index,
|
|
437
|
+
)
|
|
438
|
+
|
|
439
|
+
if add_content and self.text is not None:
|
|
440
|
+
body += self.text.strip()
|
|
441
|
+
|
|
442
|
+
body += f"</{self.label.value}>{new_line}"
|
|
443
|
+
|
|
444
|
+
return body
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
class SectionHeaderItem(TextItem):
|
|
448
|
+
"""SectionItem."""
|
|
449
|
+
|
|
450
|
+
label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
|
|
451
|
+
level: LevelNumber
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
class ListItem(TextItem):
|
|
455
|
+
"""SectionItem."""
|
|
456
|
+
|
|
457
|
+
label: typing.Literal[DocItemLabel.LIST_ITEM] = DocItemLabel.LIST_ITEM
|
|
458
|
+
enumerated: bool = False
|
|
459
|
+
marker: str # The bullet or number symbol that prefixes this list item
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
class FloatingItem(DocItem):
|
|
463
|
+
"""FloatingItem."""
|
|
464
|
+
|
|
465
|
+
captions: List[RefItem] = []
|
|
466
|
+
references: List[RefItem] = []
|
|
467
|
+
footnotes: List[RefItem] = []
|
|
468
|
+
image: Optional[ImageRef] = None
|
|
469
|
+
|
|
470
|
+
def caption_text(self, doc: "DoclingDocument") -> str:
|
|
471
|
+
"""Computes the caption as a single text."""
|
|
472
|
+
text = ""
|
|
473
|
+
for cap in self.captions:
|
|
474
|
+
text += cap.resolve(doc).text
|
|
475
|
+
return text
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
class PictureItem(FloatingItem):
|
|
479
|
+
"""PictureItem."""
|
|
480
|
+
|
|
481
|
+
label: typing.Literal[DocItemLabel.PICTURE] = DocItemLabel.PICTURE
|
|
482
|
+
|
|
483
|
+
annotations: List[PictureDataType] = []
|
|
484
|
+
|
|
485
|
+
def export_to_document_tokens(
|
|
486
|
+
self,
|
|
487
|
+
doc: "DoclingDocument",
|
|
488
|
+
new_line: str = "\n",
|
|
489
|
+
xsize: int = 100,
|
|
490
|
+
ysize: int = 100,
|
|
491
|
+
add_location: bool = True,
|
|
492
|
+
add_caption: bool = True,
|
|
493
|
+
add_content: bool = True, # not used at the moment
|
|
494
|
+
add_page_index: bool = True,
|
|
495
|
+
):
|
|
496
|
+
r"""Export picture to document tokens format.
|
|
497
|
+
|
|
498
|
+
:param doc: "DoclingDocument":
|
|
499
|
+
:param new_line: str: (Default value = "\n")
|
|
500
|
+
:param xsize: int: (Default value = 100)
|
|
501
|
+
:param ysize: int: (Default value = 100)
|
|
502
|
+
:param add_location: bool: (Default value = True)
|
|
503
|
+
:param add_caption: bool: (Default value = True)
|
|
504
|
+
:param add_content: bool: (Default value = True)
|
|
505
|
+
:param # not used at the momentadd_page_index: bool: (Default value = True)
|
|
506
|
+
|
|
507
|
+
"""
|
|
508
|
+
body = f"{DocumentToken.BEG_FIGURE.value}{new_line}"
|
|
509
|
+
|
|
510
|
+
if add_location:
|
|
511
|
+
body += self.get_location_tokens(
|
|
512
|
+
doc=doc,
|
|
513
|
+
new_line=new_line,
|
|
514
|
+
xsize=xsize,
|
|
515
|
+
ysize=ysize,
|
|
516
|
+
add_page_index=add_page_index,
|
|
517
|
+
)
|
|
518
|
+
|
|
519
|
+
if add_caption and len(self.captions):
|
|
520
|
+
text = self.caption_text(doc)
|
|
521
|
+
|
|
522
|
+
if len(text):
|
|
523
|
+
body += f"{DocumentToken.BEG_CAPTION.value}"
|
|
524
|
+
body += f"{text.strip()}"
|
|
525
|
+
body += f"{DocumentToken.END_CAPTION.value}"
|
|
526
|
+
body += f"{new_line}"
|
|
527
|
+
|
|
528
|
+
body += f"{DocumentToken.END_FIGURE.value}{new_line}"
|
|
529
|
+
|
|
530
|
+
return body
|
|
426
531
|
|
|
427
|
-
def get_map_to_page_dimensions(self):
|
|
428
|
-
"""Get a map from page-index (start at 1) to page-dim [width, height]."""
|
|
429
|
-
pagedims = {}
|
|
430
532
|
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
pagedims[_.page] = [_.width, _.height]
|
|
533
|
+
class TableItem(FloatingItem):
|
|
534
|
+
"""TableItem."""
|
|
434
535
|
|
|
435
|
-
|
|
536
|
+
data: TableData
|
|
537
|
+
label: typing.Literal[DocItemLabel.TABLE] = DocItemLabel.TABLE
|
|
538
|
+
|
|
539
|
+
def export_to_dataframe(self) -> pd.DataFrame:
|
|
540
|
+
"""Export the table as a Pandas DataFrame."""
|
|
541
|
+
if self.data.num_rows == 0 or self.data.num_cols == 0:
|
|
542
|
+
return pd.DataFrame()
|
|
543
|
+
|
|
544
|
+
# Count how many rows are column headers
|
|
545
|
+
num_headers = 0
|
|
546
|
+
for i, row in enumerate(self.data.grid):
|
|
547
|
+
if len(row) == 0:
|
|
548
|
+
raise RuntimeError(
|
|
549
|
+
f"Invalid table. {len(row)=} but {self.data.num_cols=}."
|
|
550
|
+
)
|
|
551
|
+
|
|
552
|
+
any_header = False
|
|
553
|
+
for cell in row:
|
|
554
|
+
if cell.column_header:
|
|
555
|
+
any_header = True
|
|
556
|
+
break
|
|
557
|
+
|
|
558
|
+
if any_header:
|
|
559
|
+
num_headers += 1
|
|
560
|
+
else:
|
|
561
|
+
break
|
|
562
|
+
|
|
563
|
+
# Create the column names from all col_headers
|
|
564
|
+
columns: Optional[List[str]] = None
|
|
565
|
+
if num_headers > 0:
|
|
566
|
+
columns = ["" for _ in range(self.data.num_cols)]
|
|
567
|
+
for i in range(num_headers):
|
|
568
|
+
for j, cell in enumerate(self.data.grid[i]):
|
|
569
|
+
col_name = cell.text
|
|
570
|
+
if columns[j] != "":
|
|
571
|
+
col_name = f".{col_name}"
|
|
572
|
+
columns[j] += col_name
|
|
573
|
+
|
|
574
|
+
# Create table data
|
|
575
|
+
table_data = [
|
|
576
|
+
[cell.text for cell in row] for row in self.data.grid[num_headers:]
|
|
577
|
+
]
|
|
578
|
+
|
|
579
|
+
# Create DataFrame
|
|
580
|
+
df = pd.DataFrame(table_data, columns=columns)
|
|
581
|
+
|
|
582
|
+
return df
|
|
583
|
+
|
|
584
|
+
def export_to_markdown(self) -> str:
|
|
585
|
+
"""Export the table as markdown."""
|
|
586
|
+
table = []
|
|
587
|
+
for row in self.data.grid:
|
|
588
|
+
tmp = []
|
|
589
|
+
for col in row:
|
|
590
|
+
tmp.append(col.text)
|
|
591
|
+
table.append(tmp)
|
|
592
|
+
|
|
593
|
+
md_table = ""
|
|
594
|
+
if len(table) > 1 and len(table[0]) > 0:
|
|
595
|
+
try:
|
|
596
|
+
md_table = tabulate(table[1:], headers=table[0], tablefmt="github")
|
|
597
|
+
except ValueError:
|
|
598
|
+
md_table = tabulate(
|
|
599
|
+
table[1:],
|
|
600
|
+
headers=table[0],
|
|
601
|
+
tablefmt="github",
|
|
602
|
+
disable_numparse=True,
|
|
603
|
+
)
|
|
604
|
+
return md_table
|
|
605
|
+
|
|
606
|
+
def export_to_html(self) -> str:
|
|
607
|
+
"""Export the table as html."""
|
|
608
|
+
body = ""
|
|
609
|
+
nrows = self.data.num_rows
|
|
610
|
+
ncols = self.data.num_cols
|
|
611
|
+
|
|
612
|
+
if not len(self.data.table_cells):
|
|
613
|
+
return ""
|
|
614
|
+
for i in range(nrows):
|
|
615
|
+
body += "<tr>"
|
|
616
|
+
for j in range(ncols):
|
|
617
|
+
cell: TableCell = self.data.grid[i][j]
|
|
618
|
+
|
|
619
|
+
rowspan, rowstart = (
|
|
620
|
+
cell.row_span,
|
|
621
|
+
cell.start_row_offset_idx,
|
|
622
|
+
)
|
|
623
|
+
colspan, colstart = (
|
|
624
|
+
cell.col_span,
|
|
625
|
+
cell.start_col_offset_idx,
|
|
626
|
+
)
|
|
627
|
+
|
|
628
|
+
if rowstart != i:
|
|
629
|
+
continue
|
|
630
|
+
if colstart != j:
|
|
631
|
+
continue
|
|
632
|
+
|
|
633
|
+
content = cell.text.strip()
|
|
634
|
+
celltag = "td"
|
|
635
|
+
if cell.column_header:
|
|
636
|
+
celltag = "th"
|
|
637
|
+
|
|
638
|
+
opening_tag = f"{celltag}"
|
|
639
|
+
if rowspan > 1:
|
|
640
|
+
opening_tag += f' rowspan="{rowspan}"'
|
|
641
|
+
if colspan > 1:
|
|
642
|
+
opening_tag += f' colspan="{colspan}"'
|
|
643
|
+
|
|
644
|
+
body += f"<{opening_tag}>{content}</{celltag}>"
|
|
645
|
+
body += "</tr>"
|
|
646
|
+
body = f"<table>{body}</table>"
|
|
647
|
+
|
|
648
|
+
return body
|
|
649
|
+
|
|
650
|
+
def export_to_document_tokens(
|
|
651
|
+
self,
|
|
652
|
+
doc: "DoclingDocument",
|
|
653
|
+
new_line: str = "\n",
|
|
654
|
+
xsize: int = 100,
|
|
655
|
+
ysize: int = 100,
|
|
656
|
+
add_location: bool = True,
|
|
657
|
+
add_caption: bool = True,
|
|
658
|
+
add_content: bool = True,
|
|
659
|
+
add_cell_location: bool = True,
|
|
660
|
+
add_cell_label: bool = True,
|
|
661
|
+
add_cell_text: bool = True,
|
|
662
|
+
add_page_index: bool = True,
|
|
663
|
+
):
|
|
664
|
+
r"""Export table to document tokens format.
|
|
665
|
+
|
|
666
|
+
:param doc: "DoclingDocument":
|
|
667
|
+
:param new_line: str: (Default value = "\n")
|
|
668
|
+
:param xsize: int: (Default value = 100)
|
|
669
|
+
:param ysize: int: (Default value = 100)
|
|
670
|
+
:param add_location: bool: (Default value = True)
|
|
671
|
+
:param add_caption: bool: (Default value = True)
|
|
672
|
+
:param add_content: bool: (Default value = True)
|
|
673
|
+
:param add_cell_location: bool: (Default value = True)
|
|
674
|
+
:param add_cell_label: bool: (Default value = True)
|
|
675
|
+
:param add_cell_text: bool: (Default value = True)
|
|
676
|
+
:param add_page_index: bool: (Default value = True)
|
|
677
|
+
|
|
678
|
+
"""
|
|
679
|
+
body = f"{DocumentToken.BEG_TABLE.value}{new_line}"
|
|
680
|
+
|
|
681
|
+
if add_location:
|
|
682
|
+
body += self.get_location_tokens(
|
|
683
|
+
doc=doc,
|
|
684
|
+
new_line=new_line,
|
|
685
|
+
xsize=xsize,
|
|
686
|
+
ysize=ysize,
|
|
687
|
+
add_page_index=add_page_index,
|
|
688
|
+
)
|
|
689
|
+
|
|
690
|
+
if add_caption and len(self.captions):
|
|
691
|
+
text = self.caption_text(doc)
|
|
692
|
+
|
|
693
|
+
if len(text):
|
|
694
|
+
body += f"{DocumentToken.BEG_CAPTION.value}"
|
|
695
|
+
body += f"{text.strip()}"
|
|
696
|
+
body += f"{DocumentToken.END_CAPTION.value}"
|
|
697
|
+
body += f"{new_line}"
|
|
698
|
+
|
|
699
|
+
if add_content and len(self.data.table_cells) > 0:
|
|
700
|
+
for i, row in enumerate(self.data.grid):
|
|
701
|
+
body += f"<row_{i}>"
|
|
702
|
+
for j, col in enumerate(row):
|
|
703
|
+
|
|
704
|
+
text = ""
|
|
705
|
+
if add_cell_text:
|
|
706
|
+
text = col.text.strip()
|
|
707
|
+
|
|
708
|
+
cell_loc = ""
|
|
709
|
+
if (
|
|
710
|
+
col.bbox is not None
|
|
711
|
+
and add_cell_location
|
|
712
|
+
and add_page_index
|
|
713
|
+
and len(self.prov) > 0
|
|
714
|
+
):
|
|
715
|
+
page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
|
|
716
|
+
cell_loc = DocumentToken.get_location(
|
|
717
|
+
bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
|
|
718
|
+
page_w=page_w,
|
|
719
|
+
page_h=page_h,
|
|
720
|
+
xsize=xsize,
|
|
721
|
+
ysize=ysize,
|
|
722
|
+
page_i=self.prov[0].page_no,
|
|
723
|
+
)
|
|
724
|
+
elif (
|
|
725
|
+
col.bbox is not None
|
|
726
|
+
and add_cell_location
|
|
727
|
+
and not add_page_index
|
|
728
|
+
and len(self.prov) > 0
|
|
729
|
+
):
|
|
730
|
+
page_w, page_h = doc.pages[self.prov[0].page_no].size.as_tuple()
|
|
731
|
+
|
|
732
|
+
cell_loc = DocumentToken.get_location(
|
|
733
|
+
bbox=col.bbox.to_bottom_left_origin(page_h).as_tuple(),
|
|
734
|
+
page_w=page_w,
|
|
735
|
+
page_h=page_h,
|
|
736
|
+
xsize=xsize,
|
|
737
|
+
ysize=ysize,
|
|
738
|
+
page_i=-1,
|
|
739
|
+
)
|
|
740
|
+
|
|
741
|
+
cell_label = ""
|
|
742
|
+
if add_cell_label:
|
|
743
|
+
if col.column_header:
|
|
744
|
+
cell_label = "<col_header>"
|
|
745
|
+
elif col.row_header:
|
|
746
|
+
cell_label = "<row_header>"
|
|
747
|
+
elif col.row_section:
|
|
748
|
+
cell_label = "<row_section>"
|
|
749
|
+
else:
|
|
750
|
+
cell_label = "<body>"
|
|
751
|
+
|
|
752
|
+
body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
|
|
753
|
+
|
|
754
|
+
body += f"</row_{i}>{new_line}"
|
|
755
|
+
|
|
756
|
+
body += f"{DocumentToken.END_TABLE.value}{new_line}"
|
|
757
|
+
|
|
758
|
+
return body
|
|
759
|
+
|
|
760
|
+
|
|
761
|
+
class KeyValueItem(DocItem):
|
|
762
|
+
"""KeyValueItem."""
|
|
763
|
+
|
|
764
|
+
|
|
765
|
+
ContentItem = Union[
|
|
766
|
+
TextItem, SectionHeaderItem, ListItem, PictureItem, TableItem, KeyValueItem
|
|
767
|
+
]
|
|
768
|
+
|
|
769
|
+
|
|
770
|
+
class PageItem(BaseModel):
|
|
771
|
+
"""PageItem."""
|
|
772
|
+
|
|
773
|
+
# A page carries separate root items for furniture and body,
|
|
774
|
+
# only referencing items on the page
|
|
775
|
+
size: Size
|
|
776
|
+
image: Optional[ImageRef] = None
|
|
777
|
+
page_no: int
|
|
778
|
+
|
|
779
|
+
|
|
780
|
+
class DoclingDocument(BaseModel):
|
|
781
|
+
"""DoclingDocument."""
|
|
782
|
+
|
|
783
|
+
schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
|
|
784
|
+
version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
|
|
785
|
+
CURRENT_VERSION
|
|
786
|
+
)
|
|
787
|
+
name: str # The working name of this document, without extensions
|
|
788
|
+
# (could be taken from originating doc, or just "Untitled 1")
|
|
789
|
+
origin: Optional[DocumentOrigin] = (
|
|
790
|
+
None # DoclingDocuments may specify an origin (converted to DoclingDocument).
|
|
791
|
+
# This is optional, e.g. a DoclingDocument could also be entirely
|
|
792
|
+
# generated from synthetic data.
|
|
793
|
+
)
|
|
794
|
+
|
|
795
|
+
furniture: GroupItem = GroupItem(
|
|
796
|
+
name="_root_", self_ref="#/furniture"
|
|
797
|
+
) # List[RefItem] = []
|
|
798
|
+
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
|
|
799
|
+
|
|
800
|
+
groups: List[GroupItem] = []
|
|
801
|
+
texts: List[Union[SectionHeaderItem, ListItem, TextItem]] = []
|
|
802
|
+
pictures: List[PictureItem] = []
|
|
803
|
+
tables: List[TableItem] = []
|
|
804
|
+
key_value_items: List[KeyValueItem] = []
|
|
805
|
+
|
|
806
|
+
pages: Dict[int, PageItem] = {} # empty as default
|
|
807
|
+
|
|
808
|
+
def add_group(
|
|
809
|
+
self,
|
|
810
|
+
label: Optional[GroupLabel] = None,
|
|
811
|
+
name: Optional[str] = None,
|
|
812
|
+
parent: Optional[GroupItem] = None,
|
|
813
|
+
) -> GroupItem:
|
|
814
|
+
"""add_group.
|
|
815
|
+
|
|
816
|
+
:param label: Optional[GroupLabel]: (Default value = None)
|
|
817
|
+
:param name: Optional[str]: (Default value = None)
|
|
818
|
+
:param parent: Optional[GroupItem]: (Default value = None)
|
|
819
|
+
|
|
820
|
+
"""
|
|
821
|
+
if not parent:
|
|
822
|
+
parent = self.body
|
|
823
|
+
|
|
824
|
+
group_index = len(self.groups)
|
|
825
|
+
cref = f"#/groups/{group_index}"
|
|
826
|
+
|
|
827
|
+
group = GroupItem(self_ref=cref, parent=parent.get_ref())
|
|
828
|
+
if name is not None:
|
|
829
|
+
group.name = name
|
|
830
|
+
if label is not None:
|
|
831
|
+
group.label = label
|
|
832
|
+
|
|
833
|
+
self.groups.append(group)
|
|
834
|
+
parent.children.append(RefItem(cref=cref))
|
|
835
|
+
|
|
836
|
+
return group
|
|
837
|
+
|
|
838
|
+
def add_list_item(
|
|
839
|
+
self,
|
|
840
|
+
text: str,
|
|
841
|
+
enumerated: bool = False,
|
|
842
|
+
marker: Optional[str] = None,
|
|
843
|
+
orig: Optional[str] = None,
|
|
844
|
+
prov: Optional[ProvenanceItem] = None,
|
|
845
|
+
parent: Optional[GroupItem] = None,
|
|
846
|
+
):
|
|
847
|
+
"""add_paragraph.
|
|
848
|
+
|
|
849
|
+
:param label: str:
|
|
850
|
+
:param text: str:
|
|
851
|
+
:param orig: Optional[str]: (Default value = None)
|
|
852
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
853
|
+
:param parent: Optional[GroupItem]: (Default value = None)
|
|
854
|
+
|
|
855
|
+
"""
|
|
856
|
+
if not parent:
|
|
857
|
+
parent = self.body
|
|
858
|
+
|
|
859
|
+
if not orig:
|
|
860
|
+
orig = text
|
|
861
|
+
|
|
862
|
+
marker = marker or "-"
|
|
863
|
+
|
|
864
|
+
text_index = len(self.texts)
|
|
865
|
+
cref = f"#/texts/{text_index}"
|
|
866
|
+
list_item = ListItem(
|
|
867
|
+
text=text,
|
|
868
|
+
orig=orig,
|
|
869
|
+
self_ref=cref,
|
|
870
|
+
parent=parent.get_ref(),
|
|
871
|
+
enumerated=enumerated,
|
|
872
|
+
marker=marker,
|
|
873
|
+
)
|
|
874
|
+
if prov:
|
|
875
|
+
list_item.prov.append(prov)
|
|
876
|
+
|
|
877
|
+
self.texts.append(list_item)
|
|
878
|
+
parent.children.append(RefItem(cref=cref))
|
|
879
|
+
|
|
880
|
+
return list_item
|
|
881
|
+
|
|
882
|
+
def add_text(
|
|
883
|
+
self,
|
|
884
|
+
label: DocItemLabel,
|
|
885
|
+
text: str,
|
|
886
|
+
orig: Optional[str] = None,
|
|
887
|
+
prov: Optional[ProvenanceItem] = None,
|
|
888
|
+
parent: Optional[GroupItem] = None,
|
|
889
|
+
):
|
|
890
|
+
"""add_paragraph.
|
|
891
|
+
|
|
892
|
+
:param label: str:
|
|
893
|
+
:param text: str:
|
|
894
|
+
:param orig: Optional[str]: (Default value = None)
|
|
895
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
896
|
+
:param parent: Optional[GroupItem]: (Default value = None)
|
|
897
|
+
|
|
898
|
+
"""
|
|
899
|
+
if not parent:
|
|
900
|
+
parent = self.body
|
|
901
|
+
|
|
902
|
+
if not orig:
|
|
903
|
+
orig = text
|
|
904
|
+
|
|
905
|
+
text_index = len(self.texts)
|
|
906
|
+
cref = f"#/texts/{text_index}"
|
|
907
|
+
text_item = TextItem(
|
|
908
|
+
label=label,
|
|
909
|
+
text=text,
|
|
910
|
+
orig=orig,
|
|
911
|
+
self_ref=cref,
|
|
912
|
+
parent=parent.get_ref(),
|
|
913
|
+
)
|
|
914
|
+
if prov:
|
|
915
|
+
text_item.prov.append(prov)
|
|
916
|
+
|
|
917
|
+
self.texts.append(text_item)
|
|
918
|
+
parent.children.append(RefItem(cref=cref))
|
|
919
|
+
|
|
920
|
+
return text_item
|
|
921
|
+
|
|
922
|
+
def add_table(
|
|
923
|
+
self,
|
|
924
|
+
data: TableData,
|
|
925
|
+
caption: Optional[Union[TextItem, RefItem]] = None, # This is not cool yet.
|
|
926
|
+
prov: Optional[ProvenanceItem] = None,
|
|
927
|
+
parent: Optional[GroupItem] = None,
|
|
928
|
+
):
|
|
929
|
+
"""add_table.
|
|
930
|
+
|
|
931
|
+
:param data: BaseTableData:
|
|
932
|
+
:param caption: Optional[Union[TextItem:
|
|
933
|
+
:param RefItem]]: (Default value = None)
|
|
934
|
+
:param # This is not cool yet.prov: Optional[ProvenanceItem]
|
|
935
|
+
:param parent: Optional[GroupItem]: (Default value = None)
|
|
936
|
+
|
|
937
|
+
"""
|
|
938
|
+
if not parent:
|
|
939
|
+
parent = self.body
|
|
940
|
+
|
|
941
|
+
table_index = len(self.tables)
|
|
942
|
+
cref = f"#/tables/{table_index}"
|
|
943
|
+
|
|
944
|
+
tbl_item = TableItem(
|
|
945
|
+
label=DocItemLabel.TABLE, data=data, self_ref=cref, parent=parent.get_ref()
|
|
946
|
+
)
|
|
947
|
+
if prov:
|
|
948
|
+
tbl_item.prov.append(prov)
|
|
949
|
+
if caption:
|
|
950
|
+
tbl_item.captions.append(caption.get_ref())
|
|
951
|
+
|
|
952
|
+
self.tables.append(tbl_item)
|
|
953
|
+
parent.children.append(RefItem(cref=cref))
|
|
954
|
+
|
|
955
|
+
return tbl_item
|
|
956
|
+
|
|
957
|
+
def add_picture(
|
|
958
|
+
self,
|
|
959
|
+
annotations: List[PictureDataType] = [],
|
|
960
|
+
image: Optional[ImageRef] = None,
|
|
961
|
+
caption: Optional[Union[TextItem, RefItem]] = None,
|
|
962
|
+
prov: Optional[ProvenanceItem] = None,
|
|
963
|
+
parent: Optional[GroupItem] = None,
|
|
964
|
+
):
|
|
965
|
+
"""add_picture.
|
|
966
|
+
|
|
967
|
+
:param data: List[PictureData]: (Default value = [])
|
|
968
|
+
:param caption: Optional[Union[TextItem:
|
|
969
|
+
:param RefItem]]: (Default value = None)
|
|
970
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
971
|
+
:param parent: Optional[GroupItem]: (Default value = None)
|
|
972
|
+
|
|
973
|
+
"""
|
|
974
|
+
if not parent:
|
|
975
|
+
parent = self.body
|
|
976
|
+
|
|
977
|
+
picture_index = len(self.pictures)
|
|
978
|
+
cref = f"#/pictures/{picture_index}"
|
|
979
|
+
|
|
980
|
+
fig_item = PictureItem(
|
|
981
|
+
label=DocItemLabel.PICTURE,
|
|
982
|
+
annotations=annotations,
|
|
983
|
+
image=image,
|
|
984
|
+
self_ref=cref,
|
|
985
|
+
parent=parent.get_ref(),
|
|
986
|
+
)
|
|
987
|
+
if prov:
|
|
988
|
+
fig_item.prov.append(prov)
|
|
989
|
+
if caption:
|
|
990
|
+
fig_item.captions.append(caption.get_ref())
|
|
991
|
+
|
|
992
|
+
self.pictures.append(fig_item)
|
|
993
|
+
parent.children.append(RefItem(cref=cref))
|
|
994
|
+
|
|
995
|
+
return fig_item
|
|
996
|
+
|
|
997
|
+
def add_heading(
|
|
998
|
+
self,
|
|
999
|
+
text: str,
|
|
1000
|
+
orig: Optional[str] = None,
|
|
1001
|
+
level: LevelNumber = 1,
|
|
1002
|
+
prov: Optional[ProvenanceItem] = None,
|
|
1003
|
+
parent: Optional[GroupItem] = None,
|
|
1004
|
+
):
|
|
1005
|
+
"""add_heading.
|
|
1006
|
+
|
|
1007
|
+
:param label: DocItemLabel:
|
|
1008
|
+
:param text: str:
|
|
1009
|
+
:param orig: Optional[str]: (Default value = None)
|
|
1010
|
+
:param level: LevelNumber: (Default value = 1)
|
|
1011
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1012
|
+
:param parent: Optional[GroupItem]: (Default value = None)
|
|
1013
|
+
|
|
1014
|
+
"""
|
|
1015
|
+
if not parent:
|
|
1016
|
+
parent = self.body
|
|
1017
|
+
|
|
1018
|
+
if not orig:
|
|
1019
|
+
orig = text
|
|
1020
|
+
|
|
1021
|
+
text_index = len(self.texts)
|
|
1022
|
+
cref = f"#/texts/{text_index}"
|
|
1023
|
+
section_header_item = SectionHeaderItem(
|
|
1024
|
+
level=level,
|
|
1025
|
+
text=text,
|
|
1026
|
+
orig=orig,
|
|
1027
|
+
self_ref=cref,
|
|
1028
|
+
parent=parent.get_ref(),
|
|
1029
|
+
)
|
|
1030
|
+
if prov:
|
|
1031
|
+
section_header_item.prov.append(prov)
|
|
1032
|
+
|
|
1033
|
+
self.texts.append(section_header_item)
|
|
1034
|
+
parent.children.append(RefItem(cref=cref))
|
|
1035
|
+
|
|
1036
|
+
return section_header_item
|
|
1037
|
+
|
|
1038
|
+
def num_pages(self):
|
|
1039
|
+
"""num_pages."""
|
|
1040
|
+
return len(self.pages.values())
|
|
1041
|
+
|
|
1042
|
+
def validate_tree(self, root) -> bool:
|
|
1043
|
+
"""validate_tree."""
|
|
1044
|
+
res = []
|
|
1045
|
+
for child_ref in root.children:
|
|
1046
|
+
child = child_ref.resolve(self)
|
|
1047
|
+
if child.parent.resolve(self) != root:
|
|
1048
|
+
return False
|
|
1049
|
+
res.append(self.validate_tree(child))
|
|
1050
|
+
|
|
1051
|
+
return all(res) or len(res) == 0
|
|
1052
|
+
|
|
1053
|
+
def iterate_items(
|
|
1054
|
+
self,
|
|
1055
|
+
root: Optional[NodeItem] = None,
|
|
1056
|
+
with_groups: bool = False,
|
|
1057
|
+
traverse_pictures: bool = True,
|
|
1058
|
+
page_no: Optional[int] = None,
|
|
1059
|
+
_level: int = 0, # fixed parameter, carries through the node nesting level
|
|
1060
|
+
) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
|
|
1061
|
+
"""iterate_elements.
|
|
1062
|
+
|
|
1063
|
+
:param root: Optional[NodeItem]: (Default value = None)
|
|
1064
|
+
:param with_groups: bool: (Default value = False)
|
|
1065
|
+
:param traverse_pictures: bool: (Default value = True)
|
|
1066
|
+
:param page_no: Optional[int]: (Default value = None)
|
|
1067
|
+
:param _level: (Default value = 0)
|
|
1068
|
+
:param # fixed parameter:
|
|
1069
|
+
:param carries through the node nesting level:
|
|
1070
|
+
"""
|
|
1071
|
+
if not root:
|
|
1072
|
+
root = self.body
|
|
1073
|
+
|
|
1074
|
+
if not isinstance(root, GroupItem) or with_groups:
|
|
1075
|
+
if isinstance(root, DocItem):
|
|
1076
|
+
if page_no is not None:
|
|
1077
|
+
for prov in root.prov:
|
|
1078
|
+
if prov.page_no == page_no:
|
|
1079
|
+
yield root, _level
|
|
1080
|
+
else:
|
|
1081
|
+
yield root, _level
|
|
1082
|
+
else:
|
|
1083
|
+
yield root, _level
|
|
1084
|
+
|
|
1085
|
+
# Traverse children
|
|
1086
|
+
for child_ref in root.children:
|
|
1087
|
+
child = child_ref.resolve(self)
|
|
1088
|
+
|
|
1089
|
+
if isinstance(child, NodeItem):
|
|
1090
|
+
# If the child is a NodeItem, recursively traverse it
|
|
1091
|
+
if not isinstance(child, PictureItem) or traverse_pictures:
|
|
1092
|
+
yield from self.iterate_items(
|
|
1093
|
+
child, _level=_level + 1, with_groups=with_groups
|
|
1094
|
+
)
|
|
1095
|
+
|
|
1096
|
+
def print_element_tree(self):
|
|
1097
|
+
"""print_element_tree."""
|
|
1098
|
+
for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
|
|
1099
|
+
if isinstance(item, GroupItem):
|
|
1100
|
+
print(" " * level, f"{ix}: {item.label.value} with name={item.name}")
|
|
1101
|
+
elif isinstance(item, DocItem):
|
|
1102
|
+
print(" " * level, f"{ix}: {item.label.value}")
|
|
1103
|
+
|
|
1104
|
+
def export_to_dict(self) -> Dict:
|
|
1105
|
+
"""export_to_dict."""
|
|
1106
|
+
return self.model_dump(mode="json", by_alias=True, exclude_none=True)
|
|
436
1107
|
|
|
437
1108
|
def export_to_markdown( # noqa: C901
|
|
438
1109
|
self,
|
|
439
1110
|
delim: str = "\n\n",
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
"title",
|
|
444
|
-
"subtitle-level-1",
|
|
445
|
-
"paragraph",
|
|
446
|
-
"caption",
|
|
447
|
-
"table",
|
|
448
|
-
"figure",
|
|
449
|
-
],
|
|
1111
|
+
from_element: int = 0,
|
|
1112
|
+
to_element: Optional[int] = None,
|
|
1113
|
+
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
|
|
450
1114
|
strict_text: bool = False,
|
|
451
1115
|
image_placeholder: str = "<!-- image -->",
|
|
452
1116
|
) -> str:
|
|
@@ -455,66 +1119,77 @@ class ExportedCCSDocument(
         Operates on a slice of the document's main_text as defined through arguments
         main_text_start and main_text_stop; defaulting to the whole main_text.
 
-
-            delim (str, optional): Delimiter to use when concatenating the various
+        :param delim: Delimiter to use when concatenating the various
             Markdown parts. Defaults to "\n\n".
-
+        :type delim: str
+        :param from_element: Body slicing start index (inclusive).
             Defaults to 0.
-
+        :type from_element: int
+        :param to_element: Body slicing stop index
             (exclusive). Defaults to None.
-
-
-
-
-
-
-
-
-
+        :type to_element: Optional[int]
+        :param delim: str: (Default value = "\n\n")
+        :param from_element: int: (Default value = 0)
+        :param to_element: Optional[int]: (Default value = None)
+        :param labels: set[DocItemLabel]
+        :param "subtitle-level-1":
+        :param "paragraph":
+        :param "caption":
+        :param "table":
+        :param "Text":
+        :param "text":
+        :param ]:
+        :param strict_text: bool: (Default value = False)
+        :param image_placeholder str: (Default value = "<!-- image -->")
+            the placeholder to include to position images in the markdown.
+        :returns: The exported Markdown representation.
+        :rtype: str
         """
         has_title = False
         prev_text = ""
         md_texts: list[str] = []
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # collect all captions embedded in table and figure objects
+        # to avoid repeating them
+        embedded_captions = set()
+        skip_count = 0
+        for ix, (item, level) in enumerate(self.iterate_items(self.body)):
+            if skip_count < from_element:
+                skip_count += 1
+                continue  # skip as many items as you want
+
+            if to_element and ix >= to_element:
+                break
+
+            if (
+                isinstance(item, (TableItem, PictureItem))
+                and len(item.captions) > 0
+                and item.label in labels
+            ):
+                caption = item.caption_text(self)
+                if caption:
+                    embedded_captions.add(caption)
+
+        skip_count = 0
+        for ix, (item, level) in enumerate(self.iterate_items(self.body)):
+            if skip_count < from_element:
+                skip_count += 1
+                continue  # skip as many items as you want
+
+            if to_element and ix >= to_element:
+                break
+
+            markdown_text = ""
+
+            if isinstance(item, DocItem):
+                item_type = item.label
+
+                if isinstance(item, TextItem) and item_type in labels:
                     text = item.text
 
                     # skip captions of they are embedded in the actual
                     # floating object
-                    if item_type ==
+                    if item_type == DocItemLabel.CAPTION and text in embedded_captions:
                         continue
 
                     # ignore repeated text
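The first pass above collects the captions already attached to tables and pictures, so that the second pass can skip the matching standalone caption items instead of emitting them twice. Both passes apply the same slicing semantics; a small sketch of the equivalent iteration (illustrative only; doc is assumed to be a DoclingDocument instance):

    # from_element is inclusive and to_element is exclusive, counted over the
    # reading-order traversal of doc.body.
    for ix, (item, level) in enumerate(doc.iterate_items(doc.body)):
        if ix < 2:      # behaves like from_element=2
            continue
        if ix >= 5:     # behaves like to_element=5
            break
        print(ix, type(item).__name__, level)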
@@ -524,7 +1199,7 @@ class ExportedCCSDocument(
                     prev_text = text
 
                     # first title match
-                    if item_type ==
+                    if item_type == DocItemLabel.TITLE and not has_title:
                         if strict_text:
                             markdown_text = f"{text}"
                         else:
@@ -532,78 +1207,89 @@ class ExportedCCSDocument(
                         has_title = True
 
                     # secondary titles
-                    elif item_type in {
-
-
+                    elif item_type in {
+                        DocItemLabel.TITLE,
+                        DocItemLabel.SECTION_HEADER,
+                    } or (has_title and item_type == DocItemLabel.TITLE):
                         if strict_text:
                             markdown_text = f"{text}"
                         else:
                             markdown_text = f"## {text}"
 
+                    # secondary titles
+                    elif isinstance(item, ListItem):
+                        if item.enumerated:
+                            marker = item.marker
+                        else:
+                            marker = "-"
+
+                        markdown_text = f"{marker} {text}"
+
                     # normal text
                     else:
                         markdown_text = text
 
-                elif (
-
-                    and item.data
-                    and item_type in main_text_labels
-                ):
+                elif isinstance(item, TableItem) and item.data and item_type in labels:
+                    parts = []
 
-
-
-
-
-
-
-                        table.append(tmp)
-
-                    if len(table) > 1 and len(table[0]) > 0:
-                        try:
-                            md_table = tabulate(
-                                table[1:], headers=table[0], tablefmt="github"
-                            )
-                        except ValueError:
-                            md_table = tabulate(
-                                table[1:],
-                                headers=table[0],
-                                tablefmt="github",
-                                disable_numparse=True,
-                            )
-
-                    markdown_text = ""
-                    if item.text:
-                        markdown_text = item.text
+                    # Compute the caption
+                    if caption := item.caption_text(self):
+                        parts.append(caption)
+                        parts.append("\n")
+
+                    # Rendered the item
                     if not strict_text:
-
+                        md_table = item.export_to_markdown()
+                        if md_table:
+                            parts.append(item.export_to_markdown())
+
+                    # Combine parts
+                    markdown_text = "\n".join(parts)
 
-                elif isinstance(item,
+                elif isinstance(item, PictureItem) and item_type in labels:
+                    parts = []
 
-
-                    if item.
-
+                    # Compute the caption
+                    if caption := item.caption_text(self):
+                        parts.append(caption)
+                        parts.append("\n")
+
+                    # Rendered the item
                     if not strict_text:
-
+                        parts.append(f"{image_placeholder}")
+
+                    # Combine parts
+                    markdown_text = "\n".join(parts)
 
-
-
+            if markdown_text:
+                md_texts.append(markdown_text)
 
         result = delim.join(md_texts)
         return result
 
+    def export_to_text(  # noqa: C901
+        self,
+        delim: str = "\n\n",
+        from_element: int = 0,
+        to_element: Optional[int] = None,
+        labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
+    ) -> str:
+        """export_to_text."""
+        return self.export_to_markdown(
+            delim,
+            from_element,
+            to_element,
+            labels,
+            strict_text=True,
+            image_placeholder="",
+        )
+
     def export_to_document_tokens(
         self,
         delim: str = "\n\n",
-
-
-
-            "title",
-            "subtitle-level-1",
-            "paragraph",
-            "caption",
-            "table",
-            "figure",
-        ],
+        from_element: int = 0,
+        to_element: Optional[int] = None,
+        labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
         xsize: int = 100,
         ysize: int = 100,
         add_location: bool = True,
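export_to_text() is now a thin wrapper around export_to_markdown(): it forwards delim, from_element, to_element and labels, and forces strict_text=True with an empty image placeholder, so headings, list markers, tables and image placeholders are dropped from the output. A minimal sketch (illustrative only; doc is an assumed DoclingDocument instance):

    plain = doc.export_to_text(from_element=0, to_element=None)

    # With default arguments, the wrapper and the explicit call below
    # produce the same string.
    assert plain == doc.export_to_markdown(strict_text=True, image_placeholder="")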
@@ -616,11 +1302,23 @@ class ExportedCCSDocument(
     ) -> str:
         r"""Exports the document content to an DocumentToken format.
 
-        Operates on a slice of the document's
-
-
-
+        Operates on a slice of the document's body as defined through arguments
+        from_element and to_element; defaulting to the whole main_text.
+
+        :param delim: str: (Default value = "\n\n")
+        :param from_element: int: (Default value = 0)
+        :param to_element: Optional[int]: (Default value = None)
+        :param labels: set[DocItemLabel]
+        :param xsize: int: (Default value = 100)
+        :param ysize: int: (Default value = 100)
+        :param add_location: bool: (Default value = True)
+        :param add_content: bool: (Default value = True)
+        :param add_page_index: bool: (Default value = True)
+        :param # table specific flagsadd_table_cell_location: bool
+        :param add_table_cell_label: bool: (Default value = True)
+        :param add_table_cell_text: bool: (Default value = True)
+        :returns: The content of the document formatted as a DocTags string.
+        :rtype: str
         """
         new_line = ""
         if delim:
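The DocTags export takes the same from_element/to_element/labels slicing as the Markdown export, plus a grid (xsize, ysize) used when emitting location tokens and table-specific flags that control whether cell locations, cell labels and cell text are included. A usage sketch (illustrative only; doc is an assumed DoclingDocument instance):

    doctags = doc.export_to_document_tokens(
        xsize=100,      # location tokens use a 100 x 100 grid by default
        ysize=100,
        add_location=True,
        add_content=True,
        add_page_index=True,
        add_table_cell_location=False,
        add_table_cell_label=True,
        add_table_cell_text=True,
    )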
@@ -630,82 +1328,113 @@ class ExportedCCSDocument(
 
         # pagedims = self.get_map_to_page_dimensions()
 
-
-
+        skip_count = 0
+        for ix, (item, level) in enumerate(self.iterate_items(self.body)):
+            if skip_count < from_element:
+                skip_count += 1
+                continue  # skip as many items as you want
 
-
-
-                if isinstance(orig_item, Ref)
-                else orig_item
-            )
+            if to_element and ix >= to_element:
+                break
 
-
-
+            if not isinstance(item, DocItem):
+                continue
 
-
+            prov = item.prov
 
-
-            page_w = 0.0
-            page_h = 0.0
+            page_i = -1
 
-
-                add_location
-                and self.page_dimensions is not None
-                and prov is not None
-                and len(prov) > 0
-            ):
+            if add_location and len(self.pages) and len(prov) > 0:
 
-
-
+                page_i = prov[0].page_no
+                page_dim = self.pages[page_i].size
 
-
-
+                float(page_dim.width)
+                float(page_dim.height)
 
-
-
+            item_type = item.label
+            if isinstance(item, TextItem) and (item_type in labels):
 
-
-
-
-
-
-
-
-
-
-            )
+                doctags += item.export_to_document_tokens(
+                    doc=self,
+                    new_line=new_line,
+                    xsize=xsize,
+                    ysize=ysize,
+                    add_location=add_location,
+                    add_content=add_content,
+                    add_page_index=add_page_index,
+                )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
+            elif isinstance(item, TableItem) and (item_type in labels):
+
+                doctags += item.export_to_document_tokens(
+                    doc=self,
+                    new_line=new_line,
+                    xsize=xsize,
+                    ysize=ysize,
+                    add_caption=True,
+                    add_location=add_location,
+                    add_content=add_content,
+                    add_cell_location=add_table_cell_location,
+                    add_cell_label=add_table_cell_label,
+                    add_cell_text=add_table_cell_text,
+                    add_page_index=add_page_index,
+                )
 
-
-
-
-
-
-
-
-
-
-
-
-
-            )
+            elif isinstance(item, PictureItem) and (item_type in labels):
+
+                doctags += item.export_to_document_tokens(
+                    doc=self,
+                    new_line=new_line,
+                    xsize=xsize,
+                    ysize=ysize,
+                    add_caption=True,
+                    add_location=add_location,
+                    add_content=add_content,
+                    add_page_index=add_page_index,
+                )
 
         doctags += DocumentToken.END_DOCUMENT.value
 
         return doctags
+
+    def add_page(
+        self, page_no: int, size: Size, image: Optional[ImageRef] = None
+    ) -> PageItem:
+        """add_page.
+
+        :param page_no: int:
+        :param size: Size:
+
+        """
+        pitem = PageItem(page_no=page_no, size=size, image=image)
+
+        self.pages[page_no] = pitem
+        return pitem
+
+    @field_validator("version")
+    @classmethod
+    def check_version_is_compatible(cls, v: str) -> str:
+        """Check if this document version is compatible with current version."""
+        current_match = re.match(VERSION_PATTERN, CURRENT_VERSION)
+        doc_match = re.match(VERSION_PATTERN, v)
+        if (
+            doc_match is None
+            or current_match is None
+            or doc_match["major"] != current_match["major"]
+            or doc_match["minor"] > current_match["minor"]
+        ):
+            raise ValueError(
+                f"incompatible version {v} with schema version {CURRENT_VERSION}"
+            )
+        else:
+            return CURRENT_VERSION
+
+    @model_validator(mode="after")  # type: ignore
+    @classmethod
+    def validate_document(cls, d: "DoclingDocument"):
+        """validate_document."""
+        if not d.validate_tree(d.body) or not d.validate_tree(d.furniture):
+            raise ValueError("Document hierachy is inconsistent.")
+
+        return d