PyPI - docling-core - Versions diffs - 2.9.0__py3-none-any.whl → 2.10.0__py3-none-any.whl - Mend

docling-core 2.9.0__py3-none-any.whl → 2.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling-core might be problematic. Click here for more details.

Files changed (10)

docling_core/cli/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """CLI package."""

docling_core/cli/view.py ADDED Viewed

@@ -0,0 +1,68 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+"""CLI for docling viewer."""
+import importlib
+import tempfile
+import webbrowser
+from pathlib import Path
+from typing import Annotated, Optional
+import typer
+from docling_core.types.doc import DoclingDocument
+from docling_core.types.doc.base import ImageRefMode
+from docling_core.utils.file import resolve_source_to_path
+app = typer.Typer(
+    name="Docling",
+    no_args_is_help=True,
+    add_completion=False,
+    pretty_exceptions_enable=False,
+)
+def version_callback(value: bool):
+    """Callback for version inspection.
+
+    Prints the installed ``docling-core`` distribution version and stops the
+    CLI when the flag is set; does nothing otherwise.
+
+    :param value: bool: whether the ``--version`` flag was supplied
+    :raises typer.Exit: after the version has been printed
+    """
+    if value:
+        # ``import importlib`` alone does not guarantee that the ``metadata``
+        # submodule is loaded, so import it explicitly before using it.
+        from importlib.metadata import version as package_version
+
+        docling_core_version = package_version("docling-core")
+        print(f"Docling Core version: {docling_core_version}")
+        raise typer.Exit()
+@app.command(no_args_is_help=True)
+def view(
+    source: Annotated[
+        str,
+        typer.Argument(
+            ...,
+            metavar="source",
+            help="Docling JSON file to view.",
+        ),
+    ],
+    version: Annotated[
+        Optional[bool],
+        typer.Option(
+            "--version",
+            callback=version_callback,
+            is_eager=True,
+            help="Show version information.",
+        ),
+    ] = None,
+):
+    """Display a Docling JSON file on the default browser."""
+    # NOTE: the docstring above doubles as the CLI help text, so further
+    # documentation lives in comments.
+    # `version` is never read here; the eager option callback handles it.
+    path = resolve_source_to_path(source=source)
+    doc = DoclingDocument.load_from_json(filename=path)
+    html_output = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
+    # The temporary directory is left in place (not cleaned up here) so the
+    # browser can still read the file after this command returns.
+    target_path = Path(tempfile.mkdtemp()) / "out.html"
+    # Write explicitly as UTF-8: the locale default encoding may be unable to
+    # represent non-ASCII document text.
+    target_path.write_text(html_output, encoding="utf-8")
+    # as_uri() yields a well-formed, percent-encoded file URL on every
+    # platform (a hand-built "file://C:\..." is not valid on Windows).
+    webbrowser.open(url=target_path.resolve().as_uri())
+click_app = typer.main.get_command(app)
+if __name__ == "__main__":
+    app()

docling_core/transforms/chunker/hybrid_chunker.py CHANGED Viewed

@@ -44,7 +44,9 @@ class HybridChunker(BaseChunker):
     model_config = ConfigDict(arbitrary_types_allowed=True)
-    tokenizer: Union[PreTrainedTokenizerBase, str]
+    tokenizer: Union[PreTrainedTokenizerBase, str] = (
+        "sentence-transformers/all-MiniLM-L6-v2"
+    )
     max_tokens: int = None  # type: ignore[assignment]
     merge_peers: bool = True
@@ -96,6 +98,7 @@ class HybridChunker(BaseChunker):
             doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1],
             headings=doc_chunk.meta.headings,
             captions=doc_chunk.meta.captions,
+            origin=doc_chunk.meta.origin,
         )
         new_chunk = DocChunk(text=window_text, meta=meta)
         return new_chunk
@@ -242,6 +245,7 @@ class HybridChunker(BaseChunker):
                         doc_items=window_items,
                         headings=current_headings_and_captions[0],
                         captions=current_headings_and_captions[1],
+                        origin=chunk.meta.origin,
                     )
                     new_chunk = DocChunk(
                         text=window_text,

docling_core/types/doc/document.py CHANGED Viewed

@@ -49,7 +49,6 @@ DEFAULT_EXPORT_LABELS = {
     DocItemLabel.DOCUMENT_INDEX,
     DocItemLabel.SECTION_HEADER,
     DocItemLabel.PARAGRAPH,
-    DocItemLabel.CAPTION,
     DocItemLabel.TABLE,
     DocItemLabel.PICTURE,
     DocItemLabel.FORMULA,
@@ -58,6 +57,7 @@ DEFAULT_EXPORT_LABELS = {
     DocItemLabel.TEXT,
     DocItemLabel.LIST_ITEM,
     DocItemLabel.CODE,
+    DocItemLabel.REFERENCE,
 }
@@ -593,6 +593,21 @@ class DocItem(
 class TextItem(DocItem):
     """TextItem."""
+    label: typing.Literal[
+        DocItemLabel.CAPTION,
+        DocItemLabel.CHECKBOX_SELECTED,
+        DocItemLabel.CHECKBOX_UNSELECTED,
+        DocItemLabel.CODE,
+        DocItemLabel.FOOTNOTE,
+        DocItemLabel.FORMULA,
+        DocItemLabel.PAGE_FOOTER,
+        DocItemLabel.PAGE_HEADER,
+        DocItemLabel.PARAGRAPH,
+        DocItemLabel.REFERENCE,
+        DocItemLabel.TEXT,
+        DocItemLabel.TITLE,
+    ]
     orig: str  # untreated representation
     text: str  # sanitized representation
@@ -644,8 +659,10 @@ class TextItem(DocItem):
 class SectionHeaderItem(TextItem):
     """SectionItem."""
-    label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
-    level: LevelNumber
+    label: typing.Literal[DocItemLabel.SECTION_HEADER] = (
+        DocItemLabel.SECTION_HEADER  # type: ignore[assignment]
+    )
+    level: LevelNumber = 1
     def export_to_document_tokens(
         self,
@@ -695,9 +712,11 @@ class SectionHeaderItem(TextItem):
 class ListItem(TextItem):
     """SectionItem."""
-    label: typing.Literal[DocItemLabel.LIST_ITEM] = DocItemLabel.LIST_ITEM
+    label: typing.Literal[DocItemLabel.LIST_ITEM] = (
+        DocItemLabel.LIST_ITEM  # type: ignore[assignment]
+    )
     enumerated: bool = False
-    marker: str  # The bullet or number symbol that prefixes this list item
+    marker: str = "-"  # The bullet or number symbol that prefixes this list item
 class FloatingItem(DocItem):
@@ -923,7 +942,10 @@ class TableItem(FloatingItem):
     """TableItem."""
     data: TableData
-    label: typing.Literal[DocItemLabel.TABLE] = DocItemLabel.TABLE
+    label: typing.Literal[
+        DocItemLabel.DOCUMENT_INDEX,
+        DocItemLabel.TABLE,
+    ] = DocItemLabel.TABLE
     def export_to_dataframe(self) -> pd.DataFrame:
         """Export the table as a Pandas DataFrame."""
@@ -1272,9 +1294,19 @@ class TableItem(FloatingItem):
 class KeyValueItem(DocItem):
     """KeyValueItem."""
+    label: typing.Literal[DocItemLabel.KEY_VALUE_REGION] = DocItemLabel.KEY_VALUE_REGION
-ContentItem = Union[
-    TextItem, SectionHeaderItem, ListItem, PictureItem, TableItem, KeyValueItem
+ContentItem = Annotated[
+    Union[
+        TextItem,
+        SectionHeaderItem,
+        ListItem,
+        PictureItem,
+        TableItem,
+        KeyValueItem,
+    ],
+    Field(discriminator="label"),
 ]
@@ -1376,13 +1408,13 @@ class DoclingDocument(BaseModel):
         self,
         label: Optional[GroupLabel] = None,
         name: Optional[str] = None,
-        parent: Optional[GroupItem] = None,
+        parent: Optional[NodeItem] = None,
     ) -> GroupItem:
         """add_group.
         :param label: Optional[GroupLabel]:  (Default value = None)
         :param name: Optional[str]:  (Default value = None)
-        :param parent: Optional[GroupItem]:  (Default value = None)
+        :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -1409,7 +1441,7 @@ class DoclingDocument(BaseModel):
         marker: Optional[str] = None,
         orig: Optional[str] = None,
         prov: Optional[ProvenanceItem] = None,
-        parent: Optional[GroupItem] = None,
+        parent: Optional[NodeItem] = None,
     ):
         """add_list_item.
@@ -1417,7 +1449,7 @@ class DoclingDocument(BaseModel):
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param prov: Optional[ProvenanceItem]:  (Default value = None)
-        :param parent: Optional[GroupItem]:  (Default value = None)
+        :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -1452,7 +1484,7 @@ class DoclingDocument(BaseModel):
         text: str,
         orig: Optional[str] = None,
         prov: Optional[ProvenanceItem] = None,
-        parent: Optional[GroupItem] = None,
+        parent: Optional[NodeItem] = None,
     ):
         """add_text.
@@ -1460,7 +1492,7 @@ class DoclingDocument(BaseModel):
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param prov: Optional[ProvenanceItem]:  (Default value = None)
-        :param parent: Optional[GroupItem]:  (Default value = None)
+        :param parent: Optional[NodeItem]:  (Default value = None)
         """
         # Catch a few cases that are in principle allowed
@@ -1504,15 +1536,16 @@ class DoclingDocument(BaseModel):
         data: TableData,
         caption: Optional[Union[TextItem, RefItem]] = None,  # This is not cool yet.
         prov: Optional[ProvenanceItem] = None,
-        parent: Optional[GroupItem] = None,
+        parent: Optional[NodeItem] = None,
+        label: DocItemLabel = DocItemLabel.TABLE,
     ):
         """add_table.
-        :param data: BaseTableData:
-        :param caption: Optional[Union[TextItem:
-        :param RefItem]]:  (Default value = None)
-        :param # This is not cool yet.prov: Optional[ProvenanceItem]
-        :param parent: Optional[GroupItem]:  (Default value = None)
+        :param data: TableData:
+        :param caption: Optional[Union[TextItem, RefItem]]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param parent: Optional[NodeItem]:  (Default value = None)
+        :param label: DocItemLabel:  (Default value = DocItemLabel.TABLE)
         """
         if not parent:
@@ -1522,7 +1555,7 @@ class DoclingDocument(BaseModel):
         cref = f"#/tables/{table_index}"
         tbl_item = TableItem(
-            label=DocItemLabel.TABLE, data=data, self_ref=cref, parent=parent.get_ref()
+            label=label, data=data, self_ref=cref, parent=parent.get_ref()
         )
         if prov:
             tbl_item.prov.append(prov)
@@ -1540,7 +1573,7 @@ class DoclingDocument(BaseModel):
         image: Optional[ImageRef] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
         prov: Optional[ProvenanceItem] = None,
-        parent: Optional[GroupItem] = None,
+        parent: Optional[NodeItem] = None,
     ):
         """add_picture.
@@ -1548,7 +1581,7 @@ class DoclingDocument(BaseModel):
         :param caption: Optional[Union[TextItem:
         :param RefItem]]:  (Default value = None)
         :param prov: Optional[ProvenanceItem]:  (Default value = None)
-        :param parent: Optional[GroupItem]:  (Default value = None)
+        :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
             parent = self.body
@@ -1578,14 +1611,14 @@ class DoclingDocument(BaseModel):
         text: str,
         orig: Optional[str] = None,
         prov: Optional[ProvenanceItem] = None,
-        parent: Optional[GroupItem] = None,
+        parent: Optional[NodeItem] = None,
     ):
         """add_title.
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param prov: Optional[ProvenanceItem]:  (Default value = None)
-        :param parent: Optional[GroupItem]:  (Default value = None)
+        :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
             parent = self.body
@@ -1616,7 +1649,7 @@ class DoclingDocument(BaseModel):
         orig: Optional[str] = None,
         level: LevelNumber = 1,
         prov: Optional[ProvenanceItem] = None,
-        parent: Optional[GroupItem] = None,
+        parent: Optional[NodeItem] = None,
     ):
         """add_heading.
@@ -1625,7 +1658,7 @@ class DoclingDocument(BaseModel):
         :param orig: Optional[str]:  (Default value = None)
         :param level: LevelNumber:  (Default value = 1)
         :param prov: Optional[ProvenanceItem]:  (Default value = None)
-        :param parent: Optional[GroupItem]:  (Default value = None)
+        :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
             parent = self.body
@@ -2055,10 +2088,6 @@ class DoclingDocument(BaseModel):
                 text = f"```\n{item.text}\n```\n"
                 mdtexts.append(text)
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
-                # captions are printed in picture and table ... skipping for now
-                continue
             elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
                 in_list = True
                 # Calculate indent based on list_nesting_level
@@ -2350,10 +2379,6 @@ class DoclingDocument(BaseModel):
                 text = f"<pre>{item.text}</pre>"
                 html_texts.append(text)
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
-                # captions are printed in picture and table ... skipping for now
-                continue
             elif isinstance(item, ListItem):
                 text = f"<li>{item.text}</li>"
@@ -2555,10 +2580,6 @@ class DoclingDocument(BaseModel):
                 result += f"<unordered_list>{delim}"
                 in_ordered_list.append(False)
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
-                # captions are printed in picture and table ... skipping for now
-                continue
             elif isinstance(item, SectionHeaderItem):
                 result += item.export_to_document_tokens(
@@ -2664,10 +2685,6 @@ class DoclingDocument(BaseModel):
                     indent * level + f"item-{i} at level {level}: {item.label}: {text}"
                 )
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
-                # captions are printed in picture and table ... skipping for now
-                continue
             elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
                 text = get_text(text=item.text, max_text_len=max_text_len)

docling_core/utils/legacy.py CHANGED Viewed

@@ -7,19 +7,26 @@
 import hashlib
 import uuid
-from typing import Union
+from pathlib import Path
+from typing import Dict, Optional, Union
 from docling_core.types.doc import (
+    BoundingBox,
+    CoordOrigin,
     DocItem,
     DocItemLabel,
     DoclingDocument,
+    DocumentOrigin,
     PictureItem,
+    ProvenanceItem,
     SectionHeaderItem,
+    Size,
     TableCell,
     TableItem,
     TextItem,
 )
-from docling_core.types.doc.document import ListItem
+from docling_core.types.doc.document import GroupItem, ListItem, TableData
+from docling_core.types.doc.labels import GroupLabel
 from docling_core.types.legacy_doc.base import (
     BaseCell,
     BaseText,
@@ -342,5 +349,285 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f
     return legacy_doc
-# def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument:
-#     """Convert a legacy document to DoclingDocument."""
+def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument:  # noqa: C901
+    """Convert a legacy document to DoclingDocument.
+
+    Pages, page headers, page footers, footnotes and the main-text items
+    (titles, headings, list items, plain text, tables, figures and formulas)
+    are copied into a new DoclingDocument. A caption whose text is embedded in
+    a table or figure is attached to that object instead of being emitted as a
+    stand-alone text item.
+
+    It is known that the following content will not be preserved in the transformation:
+    - name of labels (upper vs lower case)
+    - caption of figures are not in main-text anymore
+    - s3_data removed
+    - model metadata removed
+    - logs removed
+    - document hash cannot be preserved
+
+    :param legacy_doc: DsDocument: the legacy document to convert
+    :returns: the newly built DoclingDocument
+    """
+    def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]:
+        """Create a new provenance from a legacy item."""
+        if item.prov is None or len(item.prov) == 0:
+            return None
+        # Only the first legacy provenance entry is carried over.
+        first_prov = item.prov[0]
+        return ProvenanceItem(
+            page_no=int(first_prov.page),
+            charspan=tuple(first_prov.span),
+            bbox=BoundingBox.from_tuple(
+                tuple(first_prov.bbox), origin=CoordOrigin.BOTTOMLEFT
+            ),
+        )
+    def _resolve(orig_item):
+        """Return the target of a Ref, or the item itself when it is not a Ref."""
+        if isinstance(orig_item, Ref):
+            return legacy_doc._resolve_ref(orig_item)
+        return orig_item
+    # The origin mimetype is hard-coded to PDF for every legacy document.
+    origin = DocumentOrigin(
+        mimetype="application/pdf",
+        filename=legacy_doc.file_info.filename,
+        binary_hash=legacy_doc.file_info.document_hash,
+    )
+    doc_name = Path(origin.filename).stem
+    doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)
+    def _attach_caption(floating_item, caption_item: Optional[BaseText]) -> None:
+        """Add the caption as a child text of a table/picture and link it."""
+        if caption_item is None or caption_item.text is None:
+            return
+        caption = doc.add_text(
+            label=DocItemLabel.CAPTION,
+            text=caption_item.text,
+            prov=_transform_prov(caption_item),
+            parent=floating_item,
+        )
+        floating_item.captions.append(caption.get_ref())
+    # define pages
+    if legacy_doc.page_dimensions is not None:
+        for page_dim in legacy_doc.page_dimensions:
+            page_no = int(page_dim.page)
+            size = Size(width=page_dim.width, height=page_dim.height)
+            doc.add_page(page_no=page_no, size=size)
+    # page headers, page footers and footnotes all go into the furniture tree
+    for furniture_items, furniture_label in (
+        (legacy_doc.page_headers, DocItemLabel.PAGE_HEADER),
+        (legacy_doc.page_footers, DocItemLabel.PAGE_FOOTER),
+        (legacy_doc.footnotes, DocItemLabel.FOOTNOTE),
+    ):
+        if furniture_items is None:
+            continue
+        for text_item in furniture_items:
+            if text_item.text is None:
+                continue
+            # The provenance was previously computed but never passed on,
+            # which silently dropped page/bbox information for these items.
+            doc.add_text(
+                label=furniture_label,
+                text=text_item.text,
+                prov=_transform_prov(text_item),
+                parent=doc.furniture,
+            )
+    # main-text content
+    if legacy_doc.main_text is not None:
+        item: Optional[Union[BaseCell, BaseText]]
+        # collect all captions embedded in table and figure objects
+        # to avoid repeating them
+        embedded_captions: Dict[str, int] = {}
+        for ix, orig_item in enumerate(legacy_doc.main_text):
+            item = _resolve(orig_item)
+            if item is None:
+                continue
+            if isinstance(item, (DsSchemaTable, Figure)) and item.text:
+                embedded_captions[item.text] = ix
+        # build lookup from floating objects to their caption item
+        floating_to_caption: Dict[int, BaseText] = {}
+        for ix, orig_item in enumerate(legacy_doc.main_text):
+            item = _resolve(orig_item)
+            if item is None:
+                continue
+            item_type = item.obj_type.lower()
+            if (
+                isinstance(item, BaseText)
+                and (
+                    item_type == "caption"
+                    or (item.name is not None and item.name.lower() == "caption")
+                )
+                and item.text in embedded_captions
+            ):
+                floating_ix = embedded_captions[item.text]
+                floating_to_caption[floating_ix] = item
+        # main loop iteration
+        current_list: Optional[GroupItem] = None
+        for ix, orig_item in enumerate(legacy_doc.main_text):
+            item = _resolve(orig_item)
+            if item is None:
+                continue
+            prov = _transform_prov(item)
+            item_type = item.obj_type.lower()
+            # NOTE(review): `in "list-item-level-1"` is a substring test, so any
+            # substring of that literal (including "") matches. A set
+            # membership test was probably intended; behaviour is kept as-is
+            # until the expected legacy type values are confirmed.
+            is_list_item = isinstance(item, BaseText) and (
+                item_type in "list-item-level-1" or item.name in {"list", "list-item"}
+            )
+            # if a group is needed, add it; consecutive list items share one
+            # group, any other item closes the current group
+            if is_list_item:
+                if current_list is None:
+                    current_list = doc.add_group(label=GroupLabel.LIST, name="list")
+            else:
+                current_list = None
+            # add the document item in the document
+            if isinstance(item, BaseText):
+                text = item.text if item.text is not None else ""
+                label_name = item.name if item.name is not None else "text"
+                if item_type == "caption":
+                    # Captions embedded in a floating object are attached when
+                    # that object is converted, so they are skipped here.
+                    # Captions without a related object are inserted as text.
+                    # This branch is exclusive: falling through to the
+                    # branches below would add the same caption a second time.
+                    if text not in embedded_captions:
+                        doc.add_text(label=DocItemLabel.TEXT, text=text, prov=prov)
+                # first title match
+                elif item_type == "title":
+                    doc.add_title(text=text, prov=prov)
+                # secondary titles
+                elif item_type in {
+                    "subtitle-level-1",
+                }:
+                    doc.add_heading(text=text, prov=prov)
+                # list item
+                elif is_list_item:
+                    # TODO: Infer if this is a numbered or a bullet list item
+                    doc.add_list_item(
+                        text=text, enumerated=False, prov=prov, parent=current_list
+                    )
+                # normal text
+                else:
+                    normalized_label_name = label_name.replace("-", "_")
+                    try:
+                        label = DocItemLabel(normalized_label_name)
+                    except ValueError:
+                        # unknown legacy names fall back to plain text
+                        label = DocItemLabel.TEXT
+                    doc.add_text(label=label, text=text, prov=prov)
+            elif isinstance(item, DsSchemaTable):
+                table_data = TableData(num_cols=item.num_cols, num_rows=item.num_rows)
+                if item.data is not None:
+                    seen_spans = set()
+                    for row_ix, row in enumerate(item.data):
+                        for col_ix, orig_cell_data in enumerate(row):
+                            cell_bbox: Optional[BoundingBox] = (
+                                BoundingBox.from_tuple(
+                                    tuple(orig_cell_data.bbox),
+                                    origin=CoordOrigin.BOTTOMLEFT,
+                                )
+                                if orig_cell_data.bbox is not None
+                                else None
+                            )
+                            # default to a 1x1 cell at its grid position
+                            cell = TableCell(
+                                start_row_offset_idx=row_ix,
+                                end_row_offset_idx=row_ix + 1,
+                                start_col_offset_idx=col_ix,
+                                end_col_offset_idx=col_ix + 1,
+                                text=orig_cell_data.text,
+                                bbox=cell_bbox,
+                                column_header=(orig_cell_data.obj_type == "col_header"),
+                                row_header=(orig_cell_data.obj_type == "row_header"),
+                                row_section=(orig_cell_data.obj_type == "row_section"),
+                            )
+                            if orig_cell_data.spans is not None:
+                                # convert to a tuple of tuples for hashing
+                                spans_tuple = tuple(
+                                    tuple(span) for span in orig_cell_data.spans
+                                )
+                                # skip repeated spans: a merged cell is listed
+                                # once per grid position it covers
+                                if spans_tuple in seen_spans:
+                                    continue
+                                seen_spans.add(spans_tuple)
+                                # widen the cell to the bounding range of its spans
+                                cell.start_row_offset_idx = min(
+                                    s[0] for s in spans_tuple
+                                )
+                                cell.end_row_offset_idx = (
+                                    max(s[0] for s in spans_tuple) + 1
+                                )
+                                cell.start_col_offset_idx = min(
+                                    s[1] for s in spans_tuple
+                                )
+                                cell.end_col_offset_idx = (
+                                    max(s[1] for s in spans_tuple) + 1
+                                )
+                                cell.row_span = (
+                                    cell.end_row_offset_idx - cell.start_row_offset_idx
+                                )
+                                cell.col_span = (
+                                    cell.end_col_offset_idx - cell.start_col_offset_idx
+                                )
+                            table_data.table_cells.append(cell)
+                new_item = doc.add_table(data=table_data, prov=prov)
+                _attach_caption(new_item, floating_to_caption.get(ix))
+            elif isinstance(item, Figure):
+                new_item = doc.add_picture(prov=prov)
+                _attach_caption(new_item, floating_to_caption.get(ix))
+            # equations
+            elif (
+                isinstance(item, BaseCell)
+                and item.text is not None
+                and item_type in {"formula", "equation"}
+            ):
+                doc.add_text(label=DocItemLabel.FORMULA, text=item.text, prov=prov)
+    return doc

{docling_core-2.9.0.dist-info → docling_core-2.10.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-core
-Version: 2.9.0
+Version: 2.10.0
 Summary: A python library to define and validate data types in Docling.
 Home-page: https://ds4sd.github.io/
 License: MIT
@@ -35,6 +35,7 @@ Requires-Dist: pyyaml (>=5.1,<7.0.0)
 Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking"
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
 Requires-Dist: transformers (>=4.34.0,<5.0.0) ; extra == "chunking"
+Requires-Dist: typer (>=0.12.5,<0.13.0)
 Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
 Project-URL: Repository, https://github.com/DS4SD/docling-core
 Description-Content-Type: text/markdown

{docling_core-2.9.0.dist-info → docling_core-2.10.0.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,6 @@
 docling_core/__init__.py,sha256=D0afxif-BMUrgx2cYk1cwxiwATRYaGXsIMk_z4nw1Vs,90
+docling_core/cli/__init__.py,sha256=C63yWifzpA0IV7YWDatpAdrhoV8zjqxAKv0xMf09VdM,19
+docling_core/cli/view.py,sha256=bhxvPQWIJVo2g_pRL0GjQwjDw-jdiRXp1-BTbG849go,1746
 docling_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling_core/resources/schemas/doc/ANN.json,sha256=04U5j-PU9m5w7IagJ_rHcAx7qUtLkUuaWZO9GuYHnTA,4202
 docling_core/resources/schemas/doc/DOC.json,sha256=9tVKpCqDGGq3074Nn5qlUCdTN-5k1Q0ri_scJblwnLE,6686
@@ -17,12 +19,12 @@ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9AC
 docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
 docling_core/transforms/chunker/base.py,sha256=PZl6QN41cZseTPkTwPzysDHYYFb6DwDSKw0QVSiFfG0,2541
 docling_core/transforms/chunker/hierarchical_chunker.py,sha256=cy3sE9w_7l-uoIEUcfnZlQweDHUoyAJTQ6IkzxxVjFY,8052
-docling_core/transforms/chunker/hybrid_chunker.py,sha256=LUzlqtTbXfhY40bhBVGtjEMZXFWRz1XH53OGqBh2Z3Y,11224
+docling_core/transforms/chunker/hybrid_chunker.py,sha256=9bGhjr4vzpXbOMLCydCl81r1HbzMuMlo9ABfXyLRtd4,11375
 docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
 docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
 docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
 docling_core/types/doc/base.py,sha256=_ttU8QI8wXDTQRUnN5n7L6D9wYFVLSAibxlFoMbgAsk,4557
-docling_core/types/doc/document.py,sha256=nyyQWikflk2XRJSB2b-V2MEMMvEok0g35v9iEyIODj8,91521
+docling_core/types/doc/document.py,sha256=9t6FPvrxT9gKtUaYMP_Kyhz_izo2p6TQX_LlG2Fj5hY,91593
 docling_core/types/doc/labels.py,sha256=A8vWP82VAeXO1rlCO0oDKo_Hb8uDeQe0myOTY3P03hk,1596
 docling_core/types/doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
 docling_core/types/doc/utils.py,sha256=YDOh_ZD1Y7OmCEDdCLJ_MO5K3HA67nc_acfhOK6WztU,1439
@@ -51,11 +53,11 @@ docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,8
 docling_core/utils/file.py,sha256=GzX0pclvewwPoqHJSaVUuULzSJwJgkCUwgKgJ7G5ohQ,5628
 docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
 docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
-docling_core/utils/legacy.py,sha256=mncL2r2PL5rVXTXhgOArYGVwXs0PWaJ4RxuCRMfNxac,12814
+docling_core/utils/legacy.py,sha256=xfp7U0JqjI60K3loWiNTk8w08_KfCUzTb2MNULBOIz4,24396
 docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
 docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
-docling_core-2.9.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
-docling_core-2.9.0.dist-info/METADATA,sha256=P7s_dSFfZ_lvmwRFJRCOnxvR3iavYGX-3kzthwAs2vk,5703
-docling_core-2.9.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling_core-2.9.0.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
-docling_core-2.9.0.dist-info/RECORD,,
+docling_core-2.10.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
+docling_core-2.10.0.dist-info/METADATA,sha256=2Xr2MRaXihKpNdNhAwfZT973ffbX7GGs19ylGCBwfe4,5744
+docling_core-2.10.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling_core-2.10.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
+docling_core-2.10.0.dist-info/RECORD,,

{docling_core-2.9.0.dist-info → docling_core-2.10.0.dist-info}/entry_points.txt RENAMED Viewed

@@ -1,4 +1,5 @@
 [console_scripts]
+docling-view=docling_core.cli.view:app
 generate_docs=docling_core.utils.generate_docs:main
 generate_jsonschema=docling_core.utils.generate_jsonschema:main
 validate=docling_core.utils.validate:main

{docling_core-2.9.0.dist-info → docling_core-2.10.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling_core-2.9.0.dist-info → docling_core-2.10.0.dist-info}/WHEEL RENAMED Viewed

File without changes

docling-core 2.9.0__py3-none-any.whl → 2.10.0__py3-none-any.whl

Potentially problematic release.

docling-core 2.9.0__py3-none-any.whl → 2.10.0__py3-none-any.whl