PyPI - docling - Versions diffs - 1.10.0__tar.gz → 1.12.0__tar.gz - Mend

docling 1.10.0.tar.gz → 1.12.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

{docling-1.10.0 → docling-1.12.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 1.10.0
+Version: 1.12.0
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -19,21 +19,34 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Provides-Extra: examples
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=0.21.0,<0.22.0)
-Requires-Dist: docling-core (>=1.1.3,<2.0.0)
-Requires-Dist: docling-ibm-models (>=1.1.3,<2.0.0)
+Requires-Dist: docling-core (>=1.3.0,<2.0.0)
+Requires-Dist: docling-ibm-models (>=1.1.7,<2.0.0)
 Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
+Requires-Dist: langchain-huggingface (>=0.0.3,<0.0.4) ; extra == "examples"
+Requires-Dist: langchain-milvus (>=0.1.4,<0.2.0) ; extra == "examples"
+Requires-Dist: langchain-text-splitters (>=0.2.4,<0.3.0) ; extra == "examples"
+Requires-Dist: llama-index-embeddings-huggingface (>=0.3.1,<0.4.0) ; extra == "examples"
+Requires-Dist: llama-index-llms-huggingface-api (>=0.2.0,<0.3.0) ; extra == "examples"
+Requires-Dist: llama-index-vector-stores-milvus (>=0.2.1,<0.3.0) ; extra == "examples"
 Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
 Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
+Requires-Dist: python-dotenv (>=1.0.1,<2.0.0) ; extra == "examples"
 Requires-Dist: requests (>=2.32.3,<3.0.0)
 Requires-Dist: rtree (>=1.3.0,<2.0.0)
 Requires-Dist: scipy (>=1.14.1,<2.0.0)
+Requires-Dist: torch (>=2.2.2,<2.3.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
+Requires-Dist: torch (>=2.2.2,<3.0.0) ; sys_platform != "darwin" or platform_machine != "x86_64"
+Requires-Dist: torchvision (>=0,<1) ; sys_platform != "darwin" or platform_machine != "x86_64"
+Requires-Dist: torchvision (>=0.17.2,<0.18.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
+Requires-Dist: typer (>=0.12.5,<0.13.0)
 Project-URL: Repository, https://github.com/DS4SD/docling
 Description-Content-Type: text/markdown
@@ -62,8 +75,7 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
 * 📑 Understands detailed page layout, reading order and recovers table structures
 * 📝 Extracts metadata from the document, such as title, authors, references and language
 * 🔍 Optionally applies OCR (use with scanned PDFs)
-For RAG, check out [Quackling](https://github.com/DS4SD/quackling) to get the most out of your docs, be it using LlamaIndex, LangChain or your pipeline.
+* 🤖 Integrates easily with LLM app / RAG frameworks like 🦙 LlamaIndex and 🦜🔗 LangChain
 ## Installation
@@ -182,6 +194,10 @@ results = doc_converter.convert(conv_input)
 You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
+### RAG
+Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
+- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
+- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
 ## Technical report

{docling-1.10.0 → docling-1.12.0}/README.md RENAMED Viewed

@@ -23,8 +23,7 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
 * 📑 Understands detailed page layout, reading order and recovers table structures
 * 📝 Extracts metadata from the document, such as title, authors, references and language
 * 🔍 Optionally applies OCR (use with scanned PDFs)
-For RAG, check out [Quackling](https://github.com/DS4SD/quackling) to get the most out of your docs, be it using LlamaIndex, LangChain or your pipeline.
+* 🤖 Integrates easily with LLM app / RAG frameworks like 🦙 LlamaIndex and 🦜🔗 LangChain
 ## Installation
@@ -143,6 +142,10 @@ results = doc_converter.convert(conv_input)
 You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
+### RAG
+Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
+- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
+- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
 ## Technical report

{docling-1.10.0 → docling-1.12.0}/docling/backend/abstract_backend.py RENAMED Viewed

@@ -1,10 +1,13 @@
 from abc import ABC, abstractmethod
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Iterable, Optional, Union
+from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
 from PIL import Image
+if TYPE_CHECKING:
+    from docling.datamodel.base_models import BoundingBox, Cell, PageSize
 class PdfPageBackend(ABC):
@@ -17,12 +20,12 @@ class PdfPageBackend(ABC):
         pass
     @abstractmethod
-    def get_bitmap_rects(self, scale: int = 1) -> Iterable["BoundingBox"]:
+    def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
         pass
     @abstractmethod
     def get_page_image(
-        self, scale: int = 1, cropbox: Optional["BoundingBox"] = None
+        self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
     ) -> Image.Image:
         pass

{docling-1.10.0 → docling-1.12.0}/docling/backend/docling_parse_backend.py RENAMED Viewed

@@ -2,7 +2,7 @@ import logging
 import random
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, Optional, Union
+from typing import Iterable, List, Optional, Union
 import pypdfium2 as pdfium
 from docling_parse.docling_parse import pdf_parser
@@ -22,7 +22,6 @@ class DoclingParsePageBackend(PdfPageBackend):
         self._ppage = page_obj
         parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
-        self._dpage = None
         self.valid = "pages" in parsed_page
         if self.valid:
             self._dpage = parsed_page["pages"][0]
@@ -68,7 +67,7 @@ class DoclingParsePageBackend(PdfPageBackend):
         return text_piece
     def get_text_cells(self) -> Iterable[Cell]:
-        cells = []
+        cells: List[Cell] = []
         cell_counter = 0
         if not self.valid:
@@ -130,7 +129,7 @@ class DoclingParsePageBackend(PdfPageBackend):
         return cells
-    def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
         AREA_THRESHOLD = 32 * 32
         for i in range(len(self._dpage["images"])):
@@ -145,7 +144,7 @@ class DoclingParsePageBackend(PdfPageBackend):
                 yield cropbox
     def get_page_image(
-        self, scale: int = 1, cropbox: Optional[BoundingBox] = None
+        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
         page_size = self.get_size()

{docling-1.10.0 → docling-1.12.0}/docling/backend/pypdfium2_backend.py RENAMED Viewed

@@ -7,7 +7,7 @@ from typing import Iterable, List, Optional, Union
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 from PIL import Image, ImageDraw
-from pypdfium2 import PdfPage
+from pypdfium2 import PdfPage, PdfTextPage
 from pypdfium2._helpers.misc import PdfiumError
 from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
@@ -29,12 +29,12 @@ class PyPdfiumPageBackend(PdfPageBackend):
                 exc_info=True,
             )
             self.valid = False
-        self.text_page = None
+        self.text_page: Optional[PdfTextPage] = None
     def is_valid(self) -> bool:
         return self.valid
-    def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
         AREA_THRESHOLD = 32 * 32
         for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
             pos = obj.get_pos()
@@ -189,7 +189,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
         return cells
     def get_page_image(
-        self, scale: int = 1, cropbox: Optional[BoundingBox] = None
+        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
         page_size = self.get_size()

docling-1.12.0/docling/cli/main.py ADDED Viewed

@@ -0,0 +1,257 @@
+import importlib
+import json
+import logging
+import time
+import warnings
+from enum import Enum
+from pathlib import Path
+from typing import Annotated, Iterable, List, Optional
+import typer
+from pydantic import AnyUrl
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.document_converter import DocumentConverter
+warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
+warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
+_log = logging.getLogger(__name__)
+from rich.console import Console
+err_console = Console(stderr=True)
+app = typer.Typer(
+    name="Docling",
+    no_args_is_help=True,
+    add_completion=False,
+    pretty_exceptions_enable=False,
+)
+def version_callback(value: bool):
+    if value:
+        docling_version = importlib.metadata.version("docling")
+        docling_core_version = importlib.metadata.version("docling-core")
+        docling_ibm_models_version = importlib.metadata.version("docling-ibm-models")
+        docling_parse_version = importlib.metadata.version("docling-parse")
+        print(f"Docling version: {docling_version}")
+        print(f"Docling Core version: {docling_core_version}")
+        print(f"Docling IBM Models version: {docling_ibm_models_version}")
+        print(f"Docling Parse version: {docling_parse_version}")
+        raise typer.Exit()
+# Define an enum for the backend options
+class Backend(str, Enum):
+    PYPDFIUM2 = "pypdfium2"
+    DOCLING = "docling"
+def export_documents(
+    conv_results: Iterable[ConversionResult],
+    output_dir: Path,
+    export_json: bool,
+    export_md: bool,
+    export_txt: bool,
+    export_doctags: bool,
+):
+    success_count = 0
+    failure_count = 0
+    for conv_res in conv_results:
+        if conv_res.status == ConversionStatus.SUCCESS:
+            success_count += 1
+            doc_filename = conv_res.input.file.stem
+            # Export Deep Search document JSON format:
+            if export_json:
+                fname = output_dir / f"{doc_filename}.json"
+                with fname.open("w") as fp:
+                    _log.info(f"writing JSON output to {fname}")
+                    fp.write(json.dumps(conv_res.render_as_dict()))
+            # Export Text format:
+            if export_txt:
+                fname = output_dir / f"{doc_filename}.txt"
+                with fname.open("w") as fp:
+                    _log.info(f"writing Text output to {fname}")
+                    fp.write(conv_res.render_as_text())
+            # Export Markdown format:
+            if export_md:
+                fname = output_dir / f"{doc_filename}.md"
+                with fname.open("w") as fp:
+                    _log.info(f"writing Markdown output to {fname}")
+                    fp.write(conv_res.render_as_markdown())
+            # Export Document Tags format:
+            if export_doctags:
+                fname = output_dir / f"{doc_filename}.doctags"
+                with fname.open("w") as fp:
+                    _log.info(f"writing Doc Tags output to {fname}")
+                    fp.write(conv_res.render_as_doctags())
+        else:
+            _log.warning(f"Document {conv_res.input.file} failed to convert.")
+            failure_count += 1
+    _log.info(
+        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
+    )
+@app.command(no_args_is_help=True)
+def convert(
+    input_sources: Annotated[
+        List[Path],
+        typer.Argument(
+            ...,
+            metavar="source",
+            help="PDF files to convert. Directories are also accepted.",
+        ),
+    ],
+    export_json: Annotated[
+        bool,
+        typer.Option(
+            ..., "--json/--no-json", help="If enabled the document is exported as JSON."
+        ),
+    ] = False,
+    export_md: Annotated[
+        bool,
+        typer.Option(
+            ..., "--md/--no-md", help="If enabled the document is exported as Markdown."
+        ),
+    ] = True,
+    export_txt: Annotated[
+        bool,
+        typer.Option(
+            ..., "--txt/--no-txt", help="If enabled the document is exported as Text."
+        ),
+    ] = False,
+    export_doctags: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "--doctags/--no-doctags",
+            help="If enabled the document is exported as Doc Tags.",
+        ),
+    ] = False,
+    ocr: Annotated[
+        bool,
+        typer.Option(
+            ..., help="If enabled, the bitmap content will be processed using OCR."
+        ),
+    ] = True,
+    backend: Annotated[
+        Backend, typer.Option(..., help="The PDF backend to use.")
+    ] = Backend.DOCLING,
+    output: Annotated[
+        Path, typer.Option(..., help="Output directory where results are saved.")
+    ] = Path("."),
+    version: Annotated[
+        Optional[bool],
+        typer.Option(
+            "--version",
+            callback=version_callback,
+            is_eager=True,
+            help="Show version information.",
+        ),
+    ] = None,
+):
+    logging.basicConfig(level=logging.INFO)
+    input_doc_paths: List[Path] = []
+    for source in input_sources:
+        if not source.exists():
+            err_console.print(
+                f"[red]Error: The input file {source} does not exist.[/red]"
+            )
+            raise typer.Abort()
+        elif source.is_dir():
+            input_doc_paths.extend(list(source.glob("**/*.pdf", case_sensitive=False)))
+        else:
+            input_doc_paths.append(source)
+    ###########################################################################
+    # The following sections contain a combination of PipelineOptions
+    # and PDF Backends for various configurations.
+    # Uncomment one section at the time to see the differences in the output.
+    doc_converter = None
+    if backend == Backend.PYPDFIUM2 and not ocr:  # PyPdfium without OCR
+        pipeline_options = PipelineOptions()
+        pipeline_options.do_ocr = False
+        pipeline_options.do_table_structure = True
+        pipeline_options.table_structure_options.do_cell_matching = False
+        doc_converter = DocumentConverter(
+            pipeline_options=pipeline_options,
+            pdf_backend=PyPdfiumDocumentBackend,
+        )
+    elif backend == Backend.PYPDFIUM2.value and ocr:  # PyPdfium with OCR
+        pipeline_options = PipelineOptions()
+        pipeline_options.do_ocr = False
+        pipeline_options.do_table_structure = True
+        pipeline_options.table_structure_options.do_cell_matching = True
+        doc_converter = DocumentConverter(
+            pipeline_options=pipeline_options,
+            pdf_backend=PyPdfiumDocumentBackend,
+        )
+    elif backend == Backend.DOCLING.value and not ocr:  # Docling Parse without OCR
+        pipeline_options = PipelineOptions()
+        pipeline_options.do_ocr = False
+        pipeline_options.do_table_structure = True
+        pipeline_options.table_structure_options.do_cell_matching = True
+        doc_converter = DocumentConverter(
+            pipeline_options=pipeline_options,
+            pdf_backend=DoclingParseDocumentBackend,
+        )
+    elif backend == Backend.DOCLING.value and ocr:  # Docling Parse with OCR
+        pipeline_options = PipelineOptions()
+        pipeline_options.do_ocr = True
+        pipeline_options.do_table_structure = True
+        pipeline_options.table_structure_options.do_cell_matching = True
+        doc_converter = DocumentConverter(
+            pipeline_options=pipeline_options,
+            pdf_backend=DoclingParseDocumentBackend,
+        )
+    ###########################################################################
+    # Define input files
+    input = DocumentConversionInput.from_paths(input_doc_paths)
+    start_time = time.time()
+    conv_results = doc_converter.convert(input)
+    output.mkdir(parents=True, exist_ok=True)
+    export_documents(
+        conv_results,
+        output_dir=output,
+        export_json=export_json,
+        export_md=export_md,
+        export_txt=export_txt,
+        export_doctags=export_doctags,
+    )
+    end_time = time.time() - start_time
+    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
+if __name__ == "__main__":
+    app()

{docling-1.10.0 → docling-1.12.0}/docling/datamodel/base_models.py RENAMED Viewed

@@ -87,7 +87,7 @@ class BoundingBox(BaseModel):
             return (self.l, self.b, self.r, self.t)
     @classmethod
-    def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
+    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
         if origin == CoordOrigin.TOPLEFT:
             l, t, r, b = coord[0], coord[1], coord[2], coord[3]
             if r < l:
@@ -246,7 +246,7 @@ class EquationPrediction(BaseModel):
 class PagePredictions(BaseModel):
-    layout: LayoutPrediction = None
+    layout: Optional[LayoutPrediction] = None
     tablestructure: Optional[TableStructurePrediction] = None
     figures_classification: Optional[FigureClassificationPrediction] = None
     equations_prediction: Optional[EquationPrediction] = None
@@ -267,7 +267,7 @@ class Page(BaseModel):
     page_no: int
     page_hash: Optional[str] = None
     size: Optional[PageSize] = None
-    cells: List[Cell] = None
+    cells: List[Cell] = []
     predictions: PagePredictions = PagePredictions()
     assembled: Optional[AssembledUnit] = None

{docling-1.10.0 → docling-1.12.0}/docling/datamodel/document.py RENAMED Viewed

@@ -11,6 +11,7 @@ from docling_core.types import FileInfoObject as DsFileInfoObject
 from docling_core.types import PageDimensions, PageReference, Prov, Ref
 from docling_core.types import Table as DsSchemaTable
 from docling_core.types import TableCell
+from docling_core.types.doc.base import Figure
 from pydantic import BaseModel
 from typing_extensions import deprecated
@@ -279,7 +280,7 @@ class ConvertedDocument(BaseModel):
                     ),
                 )
                 figures.append(
-                    BaseCell(
+                    Figure(
                         prov=[
                             Prov(
                                 bbox=target_bbox,
@@ -312,8 +313,76 @@ class ConvertedDocument(BaseModel):
     def render_as_dict(self):
         return self.output.model_dump(by_alias=True, exclude_none=True)
-    def render_as_markdown(self):
-        return self.output.export_to_markdown()
+    def render_as_markdown(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+        ],
+        strict_text: bool = False,
+    ):
+        return self.output.export_to_markdown(
+            delim=delim,
+            main_text_start=main_text_start,
+            main_text_stop=main_text_stop,
+            main_text_labels=main_text_labels,
+            strict_text=strict_text,
+        )
+    def render_as_text(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+        ],
+    ):
+        return self.output.export_to_markdown(
+            delim=delim,
+            main_text_start=main_text_start,
+            main_text_stop=main_text_stop,
+            main_text_labels=main_text_labels,
+            strict_text=True,
+        )
+    def render_as_doctags(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+            "figure",
+        ],
+        page_tagging: bool = True,
+        location_tagging: bool = True,
+        location_dimensions: Tuple[int, int] = (100, 100),
+        add_new_line: bool = True,
+    ) -> str:
+        return self.output.export_to_document_tokens(
+            delim=delim,
+            main_text_start=main_text_start,
+            main_text_stop=main_text_stop,
+            main_text_labels=main_text_labels,
+            page_tagging=page_tagging,
+            location_tagging=location_tagging,
+            location_dimensions=location_dimensions,
+            add_new_line=add_new_line,
+        )
     def render_element_images(
         self, element_types: Tuple[PageElement] = (FigureElement,)

{docling-1.10.0 → docling-1.12.0}/docling/pipeline/base_model_pipeline.py RENAMED Viewed

@@ -1,12 +1,12 @@
 from pathlib import Path
-from typing import Iterable
+from typing import Callable, Iterable, List
 from docling.datamodel.base_models import Page, PipelineOptions
 class BaseModelPipeline:
     def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
-        self.model_pipe = []
+        self.model_pipe: List[Callable] = []
         self.artifacts_path = artifacts_path
         self.pipeline_options = pipeline_options

docling-1.12.0/docling/utils/__init__.py ADDED Viewed

File without changes

{docling-1.10.0 → docling-1.12.0}/docling/utils/export.py RENAMED Viewed

@@ -1,10 +1,10 @@
 import logging
-from typing import Any, Dict, Iterable, List, Tuple
+from typing import Any, Dict, Iterable, List, Tuple, Union
-from docling_core.types.doc.base import BaseCell, Ref, Table, TableCell
+from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell
 from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
-from docling.datamodel.document import ConvertedDocument, Page
+from docling.datamodel.document import ConversionResult, Page
 _log = logging.getLogger(__name__)
@@ -15,7 +15,10 @@ def _export_table_to_html(table: Table):
     # to the docling-core package.
     def _get_tablecell_span(cell: TableCell, ix):
-        span = set([s[ix] for s in cell.spans])
+        if cell.spans is None:
+            span = set()
+        else:
+            span = set([s[ix] for s in cell.spans])
         if len(span) == 0:
             return 1, None, None
         return len(span), min(span), max(span)
@@ -24,6 +27,8 @@ def _export_table_to_html(table: Table):
     nrows = table.num_rows
     ncols = table.num_cols
+    if table.data is None:
+        return ""
     for i in range(nrows):
         body += "<tr>"
         for j in range(ncols):
@@ -66,7 +71,7 @@ def _export_table_to_html(table: Table):
 def generate_multimodal_pages(
-    doc_result: ConvertedDocument,
+    doc_result: ConversionResult,
 ) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
     label_to_doclaynet = {
@@ -94,7 +99,7 @@ def generate_multimodal_pages(
     page_no = 0
     start_ix = 0
     end_ix = 0
-    doc_items = []
+    doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []
     doc = doc_result.output
@@ -105,11 +110,11 @@ def generate_multimodal_pages(
             item_type = item.obj_type
             label = label_to_doclaynet.get(item_type, None)
-            if label is None:
+            if label is None or item.prov is None or page.size is None:
                 continue
             bbox = BoundingBox.from_tuple(
-                item.prov[0].bbox, origin=CoordOrigin.BOTTOMLEFT
+                tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT
             )
             new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(
                 page_size=page.size
@@ -137,13 +142,15 @@ def generate_multimodal_pages(
         return segments
     def _process_page_cells(page: Page):
-        cells = []
+        cells: List[dict] = []
+        if page.size is None:
+            return cells
         for cell in page.cells:
             new_bbox = cell.bbox.to_top_left_origin(
                 page_height=page.size.height
             ).normalized(page_size=page.size)
             is_ocr = isinstance(cell, OcrCell)
-            ocr_confidence = cell.confidence if is_ocr else 1.0
+            ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0
             cells.append(
                 {
                     "text": cell.text,
@@ -163,9 +170,15 @@ def generate_multimodal_pages(
         content_md = doc.export_to_markdown(
             main_text_start=start_ix, main_text_stop=end_ix
         )
+        # No page-tagging since we only do 1 page at the time
+        content_dt = doc.export_to_document_tokens(
+            main_text_start=start_ix, main_text_stop=end_ix, page_tagging=False
+        )
-        return content_text, content_md, page_cells, page_segments, page
+        return content_text, content_md, content_dt, page_cells, page_segments, page
+    if doc.main_text is None:
+        return
     for ix, orig_item in enumerate(doc.main_text):
         item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item

{docling-1.10.0 → docling-1.12.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "1.10.0"  # DO NOT EDIT, updated automatically
+version = "1.12.0"  # DO NOT EDIT, updated automatically
 description = "Docling PDF conversion package"
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
@@ -23,8 +23,8 @@ packages = [{include = "docling"}]
 [tool.poetry.dependencies]
 python = "^3.10"
 pydantic = "^2.0.0"
-docling-core = "^1.1.3"
-docling-ibm-models = "^1.1.3"
+docling-core = "^1.3.0"
+docling-ibm-models = "^1.1.7"
 deepsearch-glm = "^0.21.0"
 filetype = "^1.2.0"
 pypdfium2 = "^4.30.0"
@@ -38,6 +38,30 @@ rtree = "^1.3.0"
 scipy = "^1.14.1"
 pyarrow = "^16.1.0"
+#########
+# extras:
+#########
+python-dotenv = { version = "^1.0.1", optional = true }
+llama-index-embeddings-huggingface = { version = "^0.3.1", optional = true }
+llama-index-llms-huggingface-api = { version = "^0.2.0", optional = true }
+llama-index-vector-stores-milvus = { version = "^0.2.1", optional = true }
+langchain-huggingface = { version = "^0.0.3", optional = true}
+langchain-milvus = { version = "^0.1.4", optional = true }
+langchain-text-splitters = { version = "^0.2.4", optional = true }
+##############
+# constraints:
+##############
+torch = [
+  {version = "^2.2.2", optional = true, markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'"},
+  {version = "~2.2.2", optional = true, markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'"}
+]
+torchvision = [
+  {version = "^0", optional = true, markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'"},
+  {version = "~0.17.2", optional = true, markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'"}
+]
+typer = "^0.12.5"
 [tool.poetry.group.dev.dependencies]
 black = {extras = ["jupyter"], version = "^24.4.2"}
 pytest = "^7.2.2"
@@ -51,11 +75,31 @@ pytest-xdist = "^3.3.1"
 types-requests = "^2.31.0.2"
 flake8-pyproject = "^1.2.3"
 pylint = "^2.17.5"
+pandas-stubs = "^2.2.2.240909"
+ipykernel = "^6.29.5"
+ipywidgets = "^8.1.5"
+nbqa = "^1.9.0"
 [tool.poetry.group.examples.dependencies]
 datasets = "^2.21.0"
+[tool.poetry.extras]
+examples = [
+    "python-dotenv",
+    # LlamaIndex examples:
+    "llama-index-embeddings-huggingface",
+    "llama-index-llms-huggingface-api",
+    "llama-index-vector-stores-milvus",
+    # LangChain examples:
+    "langchain-huggingface",
+    "langchain-milvus",
+    "langchain-text-splitters",
+]
+[tool.poetry.scripts]
+docling = "docling.cli.main:app"
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
@@ -76,6 +120,14 @@ pretty = true
 no_implicit_optional = true
 python_version = "3.10"
+[[tool.mypy.overrides]]
+module = [
+    "docling_parse.*",
+    "pypdfium2.*",
+    "networkx.*",
+]
+ignore_missing_imports = true
 [tool.flake8]
 max-line-length = 88
 extend-ignore = ["E203", "E501"]