PyPI - docling-core - Versions diffs - 2.8.0__py3-none-any.whl → 2.10.0__py3-none-any.whl - Mend

docling-core: 2.8.0 (py3-none-any.whl) → 2.10.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling-core might be problematic. Click here for more details.

Files changed (11) hide show

docling_core/cli/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """CLI package."""

docling_core/cli/view.py ADDED Viewed

@@ -0,0 +1,68 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+"""CLI for docling viewer."""
+import importlib
+import tempfile
+import webbrowser
+from pathlib import Path
+from typing import Annotated, Optional
+import typer
+from docling_core.types.doc import DoclingDocument
+from docling_core.types.doc.base import ImageRefMode
+from docling_core.utils.file import resolve_source_to_path
+# Typer application object for the viewer CLI; the `view` command below is
+# registered on it via @app.command.
+app = typer.Typer(
+    name="Docling",
+    no_args_is_help=True,
+    add_completion=False,
+    pretty_exceptions_enable=False,
+)
+def version_callback(value: bool):
+    """Callback for version inspection.
+    Prints the installed docling-core version and exits the CLI.
+    :param value: bool: truthy when ``--version`` was given; otherwise the
+        callback does nothing.
+    :raises typer.Exit: after the version has been printed.
+    """
+    if value:
+        # A bare ``import importlib`` does not guarantee that the
+        # ``importlib.metadata`` submodule is loaded, so import it explicitly
+        # (under a distinct local name to avoid shadowing ``importlib``).
+        from importlib import metadata as importlib_metadata
+        docling_core_version = importlib_metadata.version("docling-core")
+        print(f"Docling Core version: {docling_core_version}")
+        raise typer.Exit()
+@app.command(no_args_is_help=True)
+def view(
+    source: Annotated[
+        str,
+        typer.Argument(
+            ...,
+            metavar="source",
+            help="Docling JSON file to view.",
+        ),
+    ],
+    version: Annotated[
+        Optional[bool],
+        typer.Option(
+            "--version",
+            callback=version_callback,
+            is_eager=True,
+            help="Show version information.",
+        ),
+    ] = None,
+):
+    """Display a Docling JSON file on the default browser."""
+    # NOTE: the docstring above is left as a single line on purpose; it is the
+    # command's docstring and may be surfaced as CLI help text.
+    # Flow: resolve the source to a local path, load the document, export it
+    # to HTML with embedded images, write it to out.html inside a fresh
+    # temporary directory (which is not removed afterwards) and open it.
+    path = resolve_source_to_path(source=source)
+    doc = DoclingDocument.load_from_json(filename=path)
+    target_path = Path(tempfile.mkdtemp()) / "out.html"
+    html_output = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
+    # Write UTF-8 explicitly: the platform default encoding may be unable to
+    # represent every character of the exported document.
+    target_path.write_text(html_output, encoding="utf-8")
+    # as_uri() builds a well-formed file URI on every platform, whereas
+    # prefixing "file://" to the path string is malformed for Windows drive
+    # paths and leaves special characters unescaped.
+    webbrowser.open(url=target_path.resolve().as_uri())
+# Command object obtained from the Typer app via typer.main.get_command,
+# exposed at module level under the name click_app.
+click_app = typer.main.get_command(app)
+# Run the Typer app when this module is executed as a script.
+if __name__ == "__main__":
+    app()

docling_core/transforms/chunker/hybrid_chunker.py CHANGED Viewed

@@ -44,7 +44,9 @@ class HybridChunker(BaseChunker):
     model_config = ConfigDict(arbitrary_types_allowed=True)
-    tokenizer: Union[PreTrainedTokenizerBase, str]
+    tokenizer: Union[PreTrainedTokenizerBase, str] = (
+        "sentence-transformers/all-MiniLM-L6-v2"
+    )
     max_tokens: int = None  # type: ignore[assignment]
     merge_peers: bool = True
@@ -96,6 +98,7 @@ class HybridChunker(BaseChunker):
             doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1],
             headings=doc_chunk.meta.headings,
             captions=doc_chunk.meta.captions,
+            origin=doc_chunk.meta.origin,
         )
         new_chunk = DocChunk(text=window_text, meta=meta)
         return new_chunk
@@ -242,6 +245,7 @@ class HybridChunker(BaseChunker):
                         doc_items=window_items,
                         headings=current_headings_and_captions[0],
                         captions=current_headings_and_captions[1],
+                        origin=chunk.meta.origin,
                     )
                     new_chunk = DocChunk(
                         text=window_text,

docling_core/types/doc/document.py CHANGED Viewed

@@ -49,7 +49,6 @@ DEFAULT_EXPORT_LABELS = {
     DocItemLabel.DOCUMENT_INDEX,
     DocItemLabel.SECTION_HEADER,
     DocItemLabel.PARAGRAPH,
-    DocItemLabel.CAPTION,
     DocItemLabel.TABLE,
     DocItemLabel.PICTURE,
     DocItemLabel.FORMULA,
@@ -58,6 +57,7 @@ DEFAULT_EXPORT_LABELS = {
     DocItemLabel.TEXT,
     DocItemLabel.LIST_ITEM,
     DocItemLabel.CODE,
+    DocItemLabel.REFERENCE,
 }
@@ -380,6 +380,7 @@ class DocumentOrigin(BaseModel):
         "application/vnd.openxmlformats-officedocument.presentationml.template",
         "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
         "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
         "text/asciidoc",
         "text/markdown",
     ]
@@ -445,7 +446,7 @@ class ImageRef(BaseModel):
     mimetype: str
     dpi: int
     size: Size
-    uri: Union[AnyUrl, Path]
+    uri: Union[AnyUrl, Path] = Field(union_mode="left_to_right")
     _pil: Optional[PILImage.Image] = None
     @property
@@ -592,6 +593,21 @@ class DocItem(
 class TextItem(DocItem):
     """TextItem."""
+    label: typing.Literal[
+        DocItemLabel.CAPTION,
+        DocItemLabel.CHECKBOX_SELECTED,
+        DocItemLabel.CHECKBOX_UNSELECTED,
+        DocItemLabel.CODE,
+        DocItemLabel.FOOTNOTE,
+        DocItemLabel.FORMULA,
+        DocItemLabel.PAGE_FOOTER,
+        DocItemLabel.PAGE_HEADER,
+        DocItemLabel.PARAGRAPH,
+        DocItemLabel.REFERENCE,
+        DocItemLabel.TEXT,
+        DocItemLabel.TITLE,
+    ]
     orig: str  # untreated representation
     text: str  # sanitized representation
@@ -643,8 +659,10 @@ class TextItem(DocItem):
 class SectionHeaderItem(TextItem):
     """SectionItem."""
-    label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
-    level: LevelNumber
+    label: typing.Literal[DocItemLabel.SECTION_HEADER] = (
+        DocItemLabel.SECTION_HEADER  # type: ignore[assignment]
+    )
+    level: LevelNumber = 1
     def export_to_document_tokens(
         self,
@@ -694,9 +712,11 @@ class SectionHeaderItem(TextItem):
 class ListItem(TextItem):
     """SectionItem."""
-    label: typing.Literal[DocItemLabel.LIST_ITEM] = DocItemLabel.LIST_ITEM
+    label: typing.Literal[DocItemLabel.LIST_ITEM] = (
+        DocItemLabel.LIST_ITEM  # type: ignore[assignment]
+    )
     enumerated: bool = False
-    marker: str  # The bullet or number symbol that prefixes this list item
+    marker: str = "-"  # The bullet or number symbol that prefixes this list item
 class FloatingItem(DocItem):
@@ -922,7 +942,10 @@ class TableItem(FloatingItem):
     """TableItem."""
     data: TableData
-    label: typing.Literal[DocItemLabel.TABLE] = DocItemLabel.TABLE
+    label: typing.Literal[
+        DocItemLabel.DOCUMENT_INDEX,
+        DocItemLabel.TABLE,
+    ] = DocItemLabel.TABLE
     def export_to_dataframe(self) -> pd.DataFrame:
         """Export the table as a Pandas DataFrame."""
@@ -1271,9 +1294,19 @@ class TableItem(FloatingItem):
 class KeyValueItem(DocItem):
     """KeyValueItem."""
+    label: typing.Literal[DocItemLabel.KEY_VALUE_REGION] = DocItemLabel.KEY_VALUE_REGION
-ContentItem = Union[
-    TextItem, SectionHeaderItem, ListItem, PictureItem, TableItem, KeyValueItem
+ContentItem = Annotated[
+    Union[
+        TextItem,
+        SectionHeaderItem,
+        ListItem,
+        PictureItem,
+        TableItem,
+        KeyValueItem,
+    ],
+    Field(discriminator="label"),
 ]
@@ -1375,13 +1408,13 @@ class DoclingDocument(BaseModel):
         self,
         label: Optional[GroupLabel] = None,
         name: Optional[str] = None,
-        parent: Optional[GroupItem] = None,
+        parent: Optional[NodeItem] = None,
     ) -> GroupItem:
         """add_group.
         :param label: Optional[GroupLabel]:  (Default value = None)
         :param name: Optional[str]:  (Default value = None)
-        :param parent: Optional[GroupItem]:  (Default value = None)
+        :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -1408,7 +1441,7 @@ class DoclingDocument(BaseModel):
         marker: Optional[str] = None,
         orig: Optional[str] = None,
         prov: Optional[ProvenanceItem] = None,
-        parent: Optional[GroupItem] = None,
+        parent: Optional[NodeItem] = None,
     ):
         """add_list_item.
@@ -1416,7 +1449,7 @@ class DoclingDocument(BaseModel):
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param prov: Optional[ProvenanceItem]:  (Default value = None)
-        :param parent: Optional[GroupItem]:  (Default value = None)
+        :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -1451,7 +1484,7 @@ class DoclingDocument(BaseModel):
         text: str,
         orig: Optional[str] = None,
         prov: Optional[ProvenanceItem] = None,
-        parent: Optional[GroupItem] = None,
+        parent: Optional[NodeItem] = None,
     ):
         """add_text.
@@ -1459,7 +1492,7 @@ class DoclingDocument(BaseModel):
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param prov: Optional[ProvenanceItem]:  (Default value = None)
-        :param parent: Optional[GroupItem]:  (Default value = None)
+        :param parent: Optional[NodeItem]:  (Default value = None)
         """
         # Catch a few cases that are in principle allowed
@@ -1503,15 +1536,16 @@ class DoclingDocument(BaseModel):
         data: TableData,
         caption: Optional[Union[TextItem, RefItem]] = None,  # This is not cool yet.
         prov: Optional[ProvenanceItem] = None,
-        parent: Optional[GroupItem] = None,
+        parent: Optional[NodeItem] = None,
+        label: DocItemLabel = DocItemLabel.TABLE,
     ):
         """add_table.
-        :param data: BaseTableData:
-        :param caption: Optional[Union[TextItem:
-        :param RefItem]]:  (Default value = None)
-        :param # This is not cool yet.prov: Optional[ProvenanceItem]
-        :param parent: Optional[GroupItem]:  (Default value = None)
+        :param data: TableData:
+        :param caption: Optional[Union[TextItem, RefItem]]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param parent: Optional[NodeItem]:  (Default value = None)
+        :param label: DocItemLabel:  (Default value = DocItemLabel.TABLE)
         """
         if not parent:
@@ -1521,7 +1555,7 @@ class DoclingDocument(BaseModel):
         cref = f"#/tables/{table_index}"
         tbl_item = TableItem(
-            label=DocItemLabel.TABLE, data=data, self_ref=cref, parent=parent.get_ref()
+            label=label, data=data, self_ref=cref, parent=parent.get_ref()
         )
         if prov:
             tbl_item.prov.append(prov)
@@ -1539,7 +1573,7 @@ class DoclingDocument(BaseModel):
         image: Optional[ImageRef] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
         prov: Optional[ProvenanceItem] = None,
-        parent: Optional[GroupItem] = None,
+        parent: Optional[NodeItem] = None,
     ):
         """add_picture.
@@ -1547,7 +1581,7 @@ class DoclingDocument(BaseModel):
         :param caption: Optional[Union[TextItem:
         :param RefItem]]:  (Default value = None)
         :param prov: Optional[ProvenanceItem]:  (Default value = None)
-        :param parent: Optional[GroupItem]:  (Default value = None)
+        :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
             parent = self.body
@@ -1577,14 +1611,14 @@ class DoclingDocument(BaseModel):
         text: str,
         orig: Optional[str] = None,
         prov: Optional[ProvenanceItem] = None,
-        parent: Optional[GroupItem] = None,
+        parent: Optional[NodeItem] = None,
     ):
         """add_title.
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param prov: Optional[ProvenanceItem]:  (Default value = None)
-        :param parent: Optional[GroupItem]:  (Default value = None)
+        :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
             parent = self.body
@@ -1615,7 +1649,7 @@ class DoclingDocument(BaseModel):
         orig: Optional[str] = None,
         level: LevelNumber = 1,
         prov: Optional[ProvenanceItem] = None,
-        parent: Optional[GroupItem] = None,
+        parent: Optional[NodeItem] = None,
     ):
         """add_heading.
@@ -1624,7 +1658,7 @@ class DoclingDocument(BaseModel):
         :param orig: Optional[str]:  (Default value = None)
         :param level: LevelNumber:  (Default value = 1)
         :param prov: Optional[ProvenanceItem]:  (Default value = None)
-        :param parent: Optional[GroupItem]:  (Default value = None)
+        :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
             parent = self.body
@@ -1668,7 +1702,7 @@ class DoclingDocument(BaseModel):
         self,
         root: Optional[NodeItem] = None,
         with_groups: bool = False,
-        traverse_pictures: bool = True,
+        traverse_pictures: bool = False,
         page_no: Optional[int] = None,
         _level: int = 0,  # fixed parameter, carries through the node nesting level
     ) -> typing.Iterable[Tuple[NodeItem, int]]:  # tuple of node and level
@@ -1685,30 +1719,31 @@ class DoclingDocument(BaseModel):
         if not root:
             root = self.body
+        # Yield non-group items or group items when with_groups=True
         if not isinstance(root, GroupItem) or with_groups:
             if isinstance(root, DocItem):
-                if page_no is not None:
-                    for prov in root.prov:
-                        if prov.page_no == page_no:
-                            yield root, _level
-                else:
+                if page_no is None or any(
+                    prov.page_no == page_no for prov in root.prov
+                ):
                     yield root, _level
             else:
                 yield root, _level
+        # Handle picture traversal - only traverse children if requested
+        if isinstance(root, PictureItem) and not traverse_pictures:
+            return
         # Traverse children
         for child_ref in root.children:
             child = child_ref.resolve(self)
             if isinstance(child, NodeItem):
-                # If the child is a NodeItem, recursively traverse it
-                if not isinstance(child, PictureItem) or traverse_pictures:
-                    yield from self.iterate_items(
-                        child,
-                        _level=_level + 1,
-                        with_groups=with_groups,
-                        page_no=page_no,
-                    )
+                yield from self.iterate_items(
+                    child,
+                    with_groups=with_groups,
+                    traverse_pictures=traverse_pictures,
+                    page_no=page_no,
+                    _level=_level + 1,
+                )
     def _clear_picture_pil_cache(self):
         """Clear cache storage of all images."""
@@ -1864,7 +1899,7 @@ class DoclingDocument(BaseModel):
         """
         with open(filename, "r") as f:
-            return cls.model_validate(json.loads(f.read()))
+            return cls.model_validate_json(f.read())
     def save_as_yaml(
         self,
@@ -2053,10 +2088,6 @@ class DoclingDocument(BaseModel):
                 text = f"```\n{item.text}\n```\n"
                 mdtexts.append(text)
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
-                # captions are printed in picture and table ... skipping for now
-                continue
             elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
                 in_list = True
                 # Calculate indent based on list_nesting_level
@@ -2115,10 +2146,30 @@ class DoclingDocument(BaseModel):
         # Bold, Italic, or Bold-Italic
         # Hence, any underscore that we print into Markdown is coming from document text
         # That means we need to escape it, to properly reflect content in the markdown
+        # However, we need to preserve underscores in image URLs
+        # to maintain their validity
+        # For example: ![image](path/to_image.png) should remain unchanged
         def escape_underscores(text):
-            # Replace "_" with "\_" only if it's not already escaped
-            escaped_text = re.sub(r"(?<!\\)_", r"\_", text)
-            return escaped_text
+            """Escape underscores but leave them intact in the URL.."""
+            # Firstly, identify all the URL patterns.
+            url_pattern = r"!\[.*?\]\((.*?)\)"
+            parts = []
+            last_end = 0
+            for match in re.finditer(url_pattern, text):
+                # Text to add before the URL (needs to be escaped)
+                before_url = text[last_end : match.start()]
+                parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
+                # Add the full URL part (do not escape)
+                parts.append(match.group(0))
+                last_end = match.end()
+            # Add the final part of the text (which needs to be escaped)
+            if last_end < len(text):
+                parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
+            return "".join(parts)
         mdtext = escape_underscores(mdtext)
@@ -2328,10 +2379,6 @@ class DoclingDocument(BaseModel):
                 text = f"<pre>{item.text}</pre>"
                 html_texts.append(text)
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
-                # captions are printed in picture and table ... skipping for now
-                continue
             elif isinstance(item, ListItem):
                 text = f"<li>{item.text}</li>"
@@ -2533,10 +2580,6 @@ class DoclingDocument(BaseModel):
                 result += f"<unordered_list>{delim}"
                 in_ordered_list.append(False)
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
-                # captions are printed in picture and table ... skipping for now
-                continue
             elif isinstance(item, SectionHeaderItem):
                 result += item.export_to_document_tokens(
@@ -2642,10 +2685,6 @@ class DoclingDocument(BaseModel):
                     indent * level + f"item-{i} at level {level}: {item.label}: {text}"
                 )
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
-                # captions are printed in picture and table ... skipping for now
-                continue
             elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
                 text = get_text(text=item.text, max_text_len=max_text_len)

docling_core/types/legacy_doc/base.py CHANGED Viewed

@@ -140,6 +140,7 @@ class BaseCell(AliasModel):
     obj_type: str = Field(
         alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
     )
+    payload: Optional[dict] = None
     def get_location_tokens(
         self,

docling_core/utils/legacy.py ADDED Viewed

@@ -0,0 +1,633 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+"""Utilities for converting between legacy and new document format."""
+import hashlib
+import uuid
+from pathlib import Path
+from typing import Dict, Optional, Union
+from docling_core.types.doc import (
+    BoundingBox,
+    CoordOrigin,
+    DocItem,
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    PictureItem,
+    ProvenanceItem,
+    SectionHeaderItem,
+    Size,
+    TableCell,
+    TableItem,
+    TextItem,
+)
+from docling_core.types.doc.document import GroupItem, ListItem, TableData
+from docling_core.types.doc.labels import GroupLabel
+from docling_core.types.legacy_doc.base import (
+    BaseCell,
+    BaseText,
+    Figure,
+    GlmTableCell,
+    PageDimensions,
+    PageReference,
+    Prov,
+    Ref,
+)
+from docling_core.types.legacy_doc.base import Table as DsSchemaTable
+from docling_core.types.legacy_doc.base import TableCell as DsTableCell
+from docling_core.types.legacy_doc.document import (
+    CCSDocumentDescription as DsDocumentDescription,
+)
+from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
+from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
+def _create_hash(string: str):
+    """Return the hexadecimal SHA-256 digest of ``string`` encoded as UTF-8."""
+    utf8_bytes = string.encode("utf-8")
+    return hashlib.sha256(utf8_bytes).hexdigest()
+def doc_item_label_to_legacy_type(label: DocItemLabel):
+    """Convert the DocItemLabel to the legacy type.
+    Labels without an explicit entry in the table fall back to ``label.value``.
+    """
+    legacy_type_by_label = {
+        DocItemLabel.TITLE: "title",
+        DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
+        DocItemLabel.SECTION_HEADER: "subtitle-level-1",
+        DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
+        DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
+        DocItemLabel.CAPTION: "caption",
+        DocItemLabel.PAGE_HEADER: "page-header",
+        DocItemLabel.PAGE_FOOTER: "page-footer",
+        DocItemLabel.FOOTNOTE: "footnote",
+        DocItemLabel.TABLE: "table",
+        DocItemLabel.FORMULA: "equation",
+        DocItemLabel.LIST_ITEM: "paragraph",
+        DocItemLabel.CODE: "paragraph",
+        DocItemLabel.PICTURE: "figure",
+        DocItemLabel.TEXT: "paragraph",
+        DocItemLabel.PARAGRAPH: "paragraph",
+    }
+    # Look up first and only read label.value on a miss, so the fallback
+    # attribute is never touched for mapped labels.
+    try:
+        return legacy_type_by_label[label]
+    except KeyError:
+        return label.value
+def doc_item_label_to_legacy_name(label: DocItemLabel):
+    """Convert the DocItemLabel to the legacy name.
+    Labels without an explicit entry in the table fall back to ``label.value``.
+    """
+    legacy_name_by_label = {
+        DocItemLabel.CAPTION: "Caption",
+        DocItemLabel.FOOTNOTE: "Footnote",
+        DocItemLabel.FORMULA: "Formula",
+        DocItemLabel.LIST_ITEM: "List-item",
+        DocItemLabel.PAGE_FOOTER: "Page-footer",
+        DocItemLabel.PAGE_HEADER: "Page-header",
+        DocItemLabel.PICTURE: "Picture",
+        DocItemLabel.SECTION_HEADER: "Section-header",
+        DocItemLabel.TABLE: "Table",
+        DocItemLabel.TEXT: "Text",
+        DocItemLabel.TITLE: "Title",
+        DocItemLabel.DOCUMENT_INDEX: "Document Index",
+        DocItemLabel.CODE: "Code",
+        DocItemLabel.CHECKBOX_SELECTED: "Checkbox-Selected",
+        DocItemLabel.CHECKBOX_UNSELECTED: "Checkbox-Unselected",
+        DocItemLabel.FORM: "Form",
+        DocItemLabel.KEY_VALUE_REGION: "Key-Value Region",
+        DocItemLabel.PARAGRAPH: "paragraph",
+    }
+    # Look up first and only read label.value on a miss, so the fallback
+    # attribute is never touched for mapped labels.
+    try:
+        return legacy_name_by_label[label]
+    except KeyError:
+        return label.value
+def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "file"):
+    """Convert a DoclingDocument to the legacy format.
+    Text-like items become BaseText entries in main_text; tables and pictures
+    become Ref entries in main_text plus an entry in tables / figures.
+    :param doc: DoclingDocument: the document to convert.
+    :param fallback_filaname: str: filename stored in the legacy file info
+        when ``doc.origin`` is None.  (Default value = "file")
+    :returns: the assembled legacy document (DsDocument).
+    """
+    # The legacy document name is always passed as an empty string.
+    title = ""
+    desc: DsDocumentDescription = DsDocumentDescription(logs=[])
+    # The legacy document hash is a SHA-256 over the string form of the
+    # origin's binary hash; without an origin it is derived from a random
+    # UUID and therefore differs between calls.
+    if doc.origin is not None:
+        document_hash = _create_hash(str(doc.origin.binary_hash))
+        filename = doc.origin.filename
+    else:
+        document_hash = _create_hash(str(uuid.uuid4()))
+        filename = fallback_filaname
+    # One PageReference per page: the hash input uses page_no - 1, while the
+    # `page` field keeps page_no unchanged.
+    page_hashes = [
+        PageReference(
+            hash=_create_hash(document_hash + ":" + str(p.page_no - 1)),
+            page=p.page_no,
+            model="default",
+        )
+        for p in doc.pages.values()
+    ]
+    file_info = DsFileInfoObject(
+        filename=filename,
+        document_hash=document_hash,
+        num_pages=len(doc.pages),
+        page_hashes=page_hashes,
+    )
+    # Output containers. Only main_text, tables and figures are filled by
+    # this function; equations, footnotes, page_headers and page_footers are
+    # handed to the legacy document empty.
+    main_text: list[Union[Ref, BaseText]] = []
+    tables: list[DsSchemaTable] = []
+    figures: list[Figure] = []
+    equations: list[BaseCell] = []
+    footnotes: list[BaseText] = []
+    page_headers: list[BaseText] = []
+    page_footers: list[BaseText] = []
+    # TODO: populate page_headers page_footers from doc.furniture
+    # First pass: collect the caption texts attached to tables and pictures.
+    embedded_captions = set()
+    for ix, (item, level) in enumerate(doc.iterate_items(doc.body)):
+        if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
+            caption = item.caption_text(doc)
+            if caption:
+                embedded_captions.add(caption)
+    # Second pass: translate every DocItem into its legacy representation.
+    for item, level in doc.iterate_items():
+        if isinstance(item, DocItem):
+            item_type = item.label
+            if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
+                # List items with a marker get the marker prepended to the text.
+                if isinstance(item, ListItem) and item.marker:
+                    text = f"{item.marker} {item.text}"
+                else:
+                    text = item.text
+                # Can be empty.
+                # The span length is taken from item.text, i.e. without any
+                # marker prefix that was added to `text` above.
+                prov = [
+                    Prov(
+                        bbox=p.bbox.as_tuple(),
+                        page=p.page_no,
+                        span=[0, len(item.text)],
+                    )
+                    for p in item.prov
+                ]
+                main_text.append(
+                    BaseText(
+                        text=text,
+                        obj_type=doc_item_label_to_legacy_type(item.label),
+                        name=doc_item_label_to_legacy_name(item.label),
+                        prov=prov,
+                    )
+                )
+                # skip captions if they are embedded in the actual
+                # floating object
+                # NOTE(review): this `continue` runs after the append above and
+                # is the last statement of the branch, so it currently skips
+                # nothing; embedded captions are still added to main_text.
+                # Presumably the check was meant to precede the append -
+                # confirm the intended behavior before changing it.
+                if item_type == DocItemLabel.CAPTION and text in embedded_captions:
+                    continue
+            elif isinstance(item, TableItem) and item.data:
+                # The Ref points at the index this table will get in `tables`.
+                index = len(tables)
+                ref_str = f"#/tables/{index}"
+                main_text.append(
+                    Ref(
+                        name=doc_item_label_to_legacy_name(item.label),
+                        obj_type=doc_item_label_to_legacy_type(item.label),
+                        ref=ref_str,
+                    ),
+                )
+                # Initialise empty table data grid (only empty cells)
+                table_data = [
+                    [
+                        DsTableCell(
+                            text="",
+                            # bbox=[0,0,0,0],
+                            spans=[[i, j]],
+                            obj_type="body",
+                        )
+                        for j in range(item.data.num_cols)
+                    ]
+                    for i in range(item.data.num_rows)
+                ]
+                # Overwrite cells in table data for which there is actual cell content.
+                # Row/column offsets are clamped to the table dimensions, and a
+                # cell covering several grid positions is written to each one.
+                for cell in item.data.table_cells:
+                    for i in range(
+                        min(cell.start_row_offset_idx, item.data.num_rows),
+                        min(cell.end_row_offset_idx, item.data.num_rows),
+                    ):
+                        for j in range(
+                            min(cell.start_col_offset_idx, item.data.num_cols),
+                            min(cell.end_col_offset_idx, item.data.num_cols),
+                        ):
+                            # Cell type precedence: column header, then row
+                            # header, then row section, else body.
+                            celltype = "body"
+                            if cell.column_header:
+                                celltype = "col_header"
+                            elif cell.row_header:
+                                celltype = "row_header"
+                            elif cell.row_section:
+                                celltype = "row_section"
+                            # Yields every [row, col] position covered by the
+                            # cell, clamped to the table dimensions. The helper
+                            # is redefined on each iteration of this inner loop.
+                            def _make_spans(cell: TableCell, table_item: TableItem):
+                                for rspan in range(
+                                    min(
+                                        cell.start_row_offset_idx,
+                                        table_item.data.num_rows,
+                                    ),
+                                    min(
+                                        cell.end_row_offset_idx,
+                                        table_item.data.num_rows,
+                                    ),
+                                ):
+                                    for cspan in range(
+                                        min(
+                                            cell.start_col_offset_idx,
+                                            table_item.data.num_cols,
+                                        ),
+                                        min(
+                                            cell.end_col_offset_idx,
+                                            table_item.data.num_cols,
+                                        ),
+                                    ):
+                                        yield [rspan, cspan]
+                            spans = list(_make_spans(cell, item))
+                            # row_span / col_span carry the unclamped offsets.
+                            table_data[i][j] = GlmTableCell(
+                                text=cell.text,
+                                bbox=(
+                                    cell.bbox.as_tuple()
+                                    if cell.bbox is not None
+                                    else None
+                                ),  # check if this is bottom-left
+                                spans=spans,
+                                obj_type=celltype,
+                                col=j,
+                                row=i,
+                                row_header=cell.row_header,
+                                row_section=cell.row_section,
+                                col_header=cell.column_header,
+                                row_span=[
+                                    cell.start_row_offset_idx,
+                                    cell.end_row_offset_idx,
+                                ],
+                                col_span=[
+                                    cell.start_col_offset_idx,
+                                    cell.end_col_offset_idx,
+                                ],
+                            )
+                # Compute the caption
+                caption = item.caption_text(doc)
+                # The caption becomes the legacy table's text; the provenance
+                # span is always [0, 0] for tables.
+                tables.append(
+                    DsSchemaTable(
+                        text=caption,
+                        num_cols=item.data.num_cols,
+                        num_rows=item.data.num_rows,
+                        obj_type=doc_item_label_to_legacy_type(item.label),
+                        data=table_data,
+                        prov=[
+                            Prov(
+                                bbox=p.bbox.as_tuple(),
+                                page=p.page_no,
+                                span=[0, 0],
+                            )
+                            for p in item.prov
+                        ],
+                    )
+                )
+            elif isinstance(item, PictureItem):
+                # The Ref points at the index this picture will get in `figures`.
+                index = len(figures)
+                ref_str = f"#/figures/{index}"
+                main_text.append(
+                    Ref(
+                        name=doc_item_label_to_legacy_name(item.label),
+                        obj_type=doc_item_label_to_legacy_type(item.label),
+                        ref=ref_str,
+                    ),
+                )
+                # Compute the caption
+                caption = item.caption_text(doc)
+                # The caption becomes the figure's text and its length defines
+                # the provenance span.
+                figures.append(
+                    Figure(
+                        prov=[
+                            Prov(
+                                bbox=p.bbox.as_tuple(),
+                                page=p.page_no,
+                                span=[0, len(caption)],
+                            )
+                            for p in item.prov
+                        ],
+                        obj_type=doc_item_label_to_legacy_type(item.label),
+                        text=caption,
+                        # data=[[]],
+                    )
+                )
+    # Page sizes are carried over one-to-one from doc.pages.
+    page_dimensions = [
+        PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
+        for p in doc.pages.values()
+    ]
+    legacy_doc: DsDocument = DsDocument(
+        name=title,
+        description=desc,
+        file_info=file_info,
+        main_text=main_text,
+        equations=equations,
+        footnotes=footnotes,
+        page_headers=page_headers,
+        page_footers=page_footers,
+        tables=tables,
+        figures=figures,
+        page_dimensions=page_dimensions,
+    )
+    return legacy_doc
+def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument:  # noqa: C901
+    """Convert a legacy document to DoclingDocument.
+    It is known that the following content will not be preserved in the transformation:
+    - name of labels (upper vs lower case)
+    - caption of figures are not in main-text anymore
+    - s3_data removed
+    - model metadata removed
+    - logs removed
+    - document hash cannot be preserved
+    """
+    def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]:
+        """Create a new provenance from a legacy item."""
+        prov: Optional[ProvenanceItem] = None
+        if item.prov is not None and len(item.prov) > 0:
+            prov = ProvenanceItem(
+                page_no=int(item.prov[0].page),
+                charspan=tuple(item.prov[0].span),
+                bbox=BoundingBox.from_tuple(
+                    tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT
+                ),
+            )
+        return prov
+    origin = DocumentOrigin(
+        mimetype="application/pdf",
+        filename=legacy_doc.file_info.filename,
+        binary_hash=legacy_doc.file_info.document_hash,
+    )
+    doc_name = Path(origin.filename).stem
+    doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)
+    # define pages
+    if legacy_doc.page_dimensions is not None:
+        for page_dim in legacy_doc.page_dimensions:
+            page_no = int(page_dim.page)
+            size = Size(width=page_dim.width, height=page_dim.height)
+            doc.add_page(page_no=page_no, size=size)
+    # page headers
+    if legacy_doc.page_headers is not None:
+        for text_item in legacy_doc.page_headers:
+            if text_item.text is None:
+                continue
+            prov = _transform_prov(text_item)
+            doc.add_text(
+                label=DocItemLabel.PAGE_HEADER,
+                text=text_item.text,
+                parent=doc.furniture,
+            )
+    # page footers
+    if legacy_doc.page_footers is not None:
+        for text_item in legacy_doc.page_footers:
+            if text_item.text is None:
+                continue
+            prov = _transform_prov(text_item)
+            doc.add_text(
+                label=DocItemLabel.PAGE_FOOTER,
+                text=text_item.text,
+                parent=doc.furniture,
+            )
+    # footnotes
+    if legacy_doc.footnotes is not None:
+        for text_item in legacy_doc.footnotes:
+            if text_item.text is None:
+                continue
+            prov = _transform_prov(text_item)
+            doc.add_text(
+                label=DocItemLabel.FOOTNOTE, text=text_item.text, parent=doc.furniture
+            )
+    # main-text content
+    if legacy_doc.main_text is not None:
+        item: Optional[Union[BaseCell, BaseText]]
+        # collect all captions embedded in table and figure objects
+        # to avoid repeating them
+        embedded_captions: Dict[str, int] = {}
+        for ix, orig_item in enumerate(legacy_doc.main_text):
+            item = (
+                legacy_doc._resolve_ref(orig_item)
+                if isinstance(orig_item, Ref)
+                else orig_item
+            )
+            if item is None:
+                continue
+            if isinstance(item, (DsSchemaTable, Figure)) and item.text:
+                embedded_captions[item.text] = ix
+        # build lookup from floating objects to their caption item
+        floating_to_caption: Dict[int, BaseText] = {}
+        for ix, orig_item in enumerate(legacy_doc.main_text):
+            item = (
+                legacy_doc._resolve_ref(orig_item)
+                if isinstance(orig_item, Ref)
+                else orig_item
+            )
+            if item is None:
+                continue
+            item_type = item.obj_type.lower()
+            if (
+                isinstance(item, BaseText)
+                and (
+                    item_type == "caption"
+                    or (item.name is not None and item.name.lower() == "caption")
+                )
+                and item.text in embedded_captions
+            ):
+                floating_ix = embedded_captions[item.text]
+                floating_to_caption[floating_ix] = item
+        # main loop iteration
+        current_list: Optional[GroupItem] = None
+        for ix, orig_item in enumerate(legacy_doc.main_text):
+            item = (
+                legacy_doc._resolve_ref(orig_item)
+                if isinstance(orig_item, Ref)
+                else orig_item
+            )
+            if item is None:
+                continue
+            prov = _transform_prov(item)
+            item_type = item.obj_type.lower()
+            # if a group is needed, add it
+            if isinstance(item, BaseText) and (
+                item_type in "list-item-level-1" or item.name in {"list", "list-item"}
+            ):
+                if current_list is None:
+                    current_list = doc.add_group(label=GroupLabel.LIST, name="list")
+            else:
+                current_list = None
+            # add the document item in the document
+            if isinstance(item, BaseText):
+                text = item.text if item.text is not None else ""
+                label_name = item.name if item.name is not None else "text"
+                if item_type == "caption":
+                    if text in embedded_captions:
+                        # skip captions if they are embedded in the actual
+                        # floating objects
+                        continue
+                    else:
+                        # captions without a related object are inserted as text
+                        doc.add_text(label=DocItemLabel.TEXT, text=text, prov=prov)
+                # first title match
+                if item_type == "title":
+                    doc.add_title(text=text, prov=prov)
+                # secondary titles
+                elif item_type in {
+                    "subtitle-level-1",
+                }:
+                    doc.add_heading(text=text, prov=prov)
+                # list item
+                elif item_type in "list-item-level-1" or label_name in {
+                    "list",
+                    "list-item",
+                }:
+                    # TODO: Infer if this is a numbered or a bullet list item
+                    doc.add_list_item(
+                        text=text, enumerated=False, prov=prov, parent=current_list
+                    )
+                # normal text
+                else:
+                    label = DocItemLabel.TEXT
+                    normalized_label_name = label_name.replace("-", "_")
+                    if normalized_label_name is not None:
+                        try:
+                            label = DocItemLabel(normalized_label_name)
+                        except ValueError:
+                            pass
+                    doc.add_text(label=label, text=text, prov=prov)
+            elif isinstance(item, DsSchemaTable):
+                table_data = TableData(num_cols=item.num_cols, num_rows=item.num_rows)
+                if item.data is not None:
+                    seen_spans = set()
+                    for row_ix, row in enumerate(item.data):
+                        for col_ix, orig_cell_data in enumerate(row):
+                            cell_bbox: Optional[BoundingBox] = (
+                                BoundingBox.from_tuple(
+                                    tuple(orig_cell_data.bbox),
+                                    origin=CoordOrigin.BOTTOMLEFT,
+                                )
+                                if orig_cell_data.bbox is not None
+                                else None
+                            )
+                            cell = TableCell(
+                                start_row_offset_idx=row_ix,
+                                end_row_offset_idx=row_ix + 1,
+                                start_col_offset_idx=col_ix,
+                                end_col_offset_idx=col_ix + 1,
+                                text=orig_cell_data.text,
+                                bbox=cell_bbox,
+                                column_header=(orig_cell_data.obj_type == "col_header"),
+                                row_header=(orig_cell_data.obj_type == "row_header"),
+                                row_section=(orig_cell_data.obj_type == "row_section"),
+                            )
+                            if orig_cell_data.spans is not None:
+                                # convert to a tuple of tuples for hashing
+                                spans_tuple = tuple(
+                                    tuple(span) for span in orig_cell_data.spans
+                                )
+                                # skip repeated spans
+                                if spans_tuple in seen_spans:
+                                    continue
+                                seen_spans.add(spans_tuple)
+                                cell.start_row_offset_idx = min(
+                                    s[0] for s in spans_tuple
+                                )
+                                cell.end_row_offset_idx = (
+                                    max(s[0] for s in spans_tuple) + 1
+                                )
+                                cell.start_col_offset_idx = min(
+                                    s[1] for s in spans_tuple
+                                )
+                                cell.end_col_offset_idx = (
+                                    max(s[1] for s in spans_tuple) + 1
+                                )
+                                cell.row_span = (
+                                    cell.end_row_offset_idx - cell.start_row_offset_idx
+                                )
+                                cell.col_span = (
+                                    cell.end_col_offset_idx - cell.start_col_offset_idx
+                                )
+                            table_data.table_cells.append(cell)
+                new_item = doc.add_table(data=table_data, prov=prov)
+                if (caption_item := floating_to_caption.get(ix)) is not None:
+                    if caption_item.text is not None:
+                        caption_prov = _transform_prov(caption_item)
+                        caption = doc.add_text(
+                            label=DocItemLabel.CAPTION,
+                            text=caption_item.text,
+                            prov=caption_prov,
+                            parent=new_item,
+                        )
+                        new_item.captions.append(caption.get_ref())
+            elif isinstance(item, Figure):
+                new_item = doc.add_picture(prov=prov)
+                if (caption_item := floating_to_caption.get(ix)) is not None:
+                    if caption_item.text is not None:
+                        caption_prov = _transform_prov(caption_item)
+                        caption = doc.add_text(
+                            label=DocItemLabel.CAPTION,
+                            text=caption_item.text,
+                            prov=caption_prov,
+                            parent=new_item,
+                        )
+                        new_item.captions.append(caption.get_ref())
+            # equations
+            elif (
+                isinstance(item, BaseCell)
+                and item.text is not None
+                and item_type in {"formula", "equation"}
+            ):
+                doc.add_text(label=DocItemLabel.FORMULA, text=item.text, prov=prov)
+    return doc

{docling_core-2.8.0.dist-info → docling_core-2.10.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-core
-Version: 2.8.0
+Version: 2.10.0
 Summary: A python library to define and validate data types in Docling.
 Home-page: https://ds4sd.github.io/
 License: MIT
@@ -35,6 +35,7 @@ Requires-Dist: pyyaml (>=5.1,<7.0.0)
 Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking"
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
 Requires-Dist: transformers (>=4.34.0,<5.0.0) ; extra == "chunking"
+Requires-Dist: typer (>=0.12.5,<0.13.0)
 Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
 Project-URL: Repository, https://github.com/DS4SD/docling-core
 Description-Content-Type: text/markdown

{docling_core-2.8.0.dist-info → docling_core-2.10.0.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,6 @@
 docling_core/__init__.py,sha256=D0afxif-BMUrgx2cYk1cwxiwATRYaGXsIMk_z4nw1Vs,90
+docling_core/cli/__init__.py,sha256=C63yWifzpA0IV7YWDatpAdrhoV8zjqxAKv0xMf09VdM,19
+docling_core/cli/view.py,sha256=bhxvPQWIJVo2g_pRL0GjQwjDw-jdiRXp1-BTbG849go,1746
 docling_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling_core/resources/schemas/doc/ANN.json,sha256=04U5j-PU9m5w7IagJ_rHcAx7qUtLkUuaWZO9GuYHnTA,4202
 docling_core/resources/schemas/doc/DOC.json,sha256=9tVKpCqDGGq3074Nn5qlUCdTN-5k1Q0ri_scJblwnLE,6686
@@ -17,12 +19,12 @@ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9AC
 docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
 docling_core/transforms/chunker/base.py,sha256=PZl6QN41cZseTPkTwPzysDHYYFb6DwDSKw0QVSiFfG0,2541
 docling_core/transforms/chunker/hierarchical_chunker.py,sha256=cy3sE9w_7l-uoIEUcfnZlQweDHUoyAJTQ6IkzxxVjFY,8052
-docling_core/transforms/chunker/hybrid_chunker.py,sha256=LUzlqtTbXfhY40bhBVGtjEMZXFWRz1XH53OGqBh2Z3Y,11224
+docling_core/transforms/chunker/hybrid_chunker.py,sha256=9bGhjr4vzpXbOMLCydCl81r1HbzMuMlo9ABfXyLRtd4,11375
 docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
 docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
 docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
 docling_core/types/doc/base.py,sha256=_ttU8QI8wXDTQRUnN5n7L6D9wYFVLSAibxlFoMbgAsk,4557
-docling_core/types/doc/document.py,sha256=FoEm1GFV2JeXdxtj-ZINe7S_b_rZZjSKOSa72J16ork,90522
+docling_core/types/doc/document.py,sha256=9t6FPvrxT9gKtUaYMP_Kyhz_izo2p6TQX_LlG2Fj5hY,91593
 docling_core/types/doc/labels.py,sha256=A8vWP82VAeXO1rlCO0oDKo_Hb8uDeQe0myOTY3P03hk,1596
 docling_core/types/doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
 docling_core/types/doc/utils.py,sha256=YDOh_ZD1Y7OmCEDdCLJ_MO5K3HA67nc_acfhOK6WztU,1439
@@ -30,7 +32,7 @@ docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6
 docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
 docling_core/types/io/__init__.py,sha256=7QYvFRaDE0AzBg8e7tvsVNlLBbCbAbQ9rP2TU8aXR1k,350
 docling_core/types/legacy_doc/__init__.py,sha256=Pzj_8rft6SJTVTCHgXRwHtuZjL6LK_6dcBWjikL9biY,125
-docling_core/types/legacy_doc/base.py,sha256=l8NKCuORUQ1ebjdGWpj6b30oQEvtErLsIHKQHbbJiPg,14683
+docling_core/types/legacy_doc/base.py,sha256=aBKBunw6M6nvEq4lqP1cfFWK3GpGa6PXwNQqbvcJ3dU,14718
 docling_core/types/legacy_doc/doc_ann.py,sha256=CIQHW8yzu70bsMR9gtu7dqe4oz603Tq2eDDt9sh-tYo,1203
 docling_core/types/legacy_doc/doc_ocr.py,sha256=FfFqHAyMSbFt5cKeE7QLcxS0qUweBilBJoN9CH2TsQs,1394
 docling_core/types/legacy_doc/doc_raw.py,sha256=LrvQ9DhNjBRy98p_F9PUyHZeTGAxMKWqJzY4WJ7v-xs,3895
@@ -51,10 +53,11 @@ docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,8
 docling_core/utils/file.py,sha256=GzX0pclvewwPoqHJSaVUuULzSJwJgkCUwgKgJ7G5ohQ,5628
 docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
 docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
+docling_core/utils/legacy.py,sha256=xfp7U0JqjI60K3loWiNTk8w08_KfCUzTb2MNULBOIz4,24396
 docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
 docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
-docling_core-2.8.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
-docling_core-2.8.0.dist-info/METADATA,sha256=HNRaSRjkC-DkeOvguUK82YRbCUDYir4cuSG6-qqKT1U,5703
-docling_core-2.8.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling_core-2.8.0.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
-docling_core-2.8.0.dist-info/RECORD,,
+docling_core-2.10.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
+docling_core-2.10.0.dist-info/METADATA,sha256=2Xr2MRaXihKpNdNhAwfZT973ffbX7GGs19ylGCBwfe4,5744
+docling_core-2.10.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling_core-2.10.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
+docling_core-2.10.0.dist-info/RECORD,,

{docling_core-2.8.0.dist-info → docling_core-2.10.0.dist-info}/entry_points.txt RENAMED Viewed

@@ -1,4 +1,5 @@
 [console_scripts]
+docling-view=docling_core.cli.view:app
 generate_docs=docling_core.utils.generate_docs:main
 generate_jsonschema=docling_core.utils.generate_jsonschema:main
 validate=docling_core.utils.validate:main

{docling_core-2.8.0.dist-info → docling_core-2.10.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling_core-2.8.0.dist-info → docling_core-2.10.0.dist-info}/WHEEL RENAMED Viewed

File without changes

docling-core 2.8.0__py3-none-any.whl → 2.10.0__py3-none-any.whl

Potentially problematic release.

docling-core 2.8.0py3-none-any.whl → 2.10.0py3-none-any.whl