PyPI - docling - Versions diffs - 1.6.2__py3-none-any.whl → 1.19.0__py3-none-any.whl - Mend

docling 1.6.2__py3-none-any.whl → 1.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

docling/backend/abstract_backend.py +17 -8
docling/backend/docling_parse_backend.py +42 -26
docling/backend/pypdfium2_backend.py +33 -11
docling/cli/__init__.py +0 -0
docling/cli/main.py +253 -0
docling/datamodel/base_models.py +39 -27
docling/datamodel/document.py +115 -17
docling/datamodel/pipeline_options.py +67 -0
docling/document_converter.py +65 -44
docling/models/base_ocr_model.py +4 -4
docling/models/ds_glm_model.py +11 -7
docling/models/easyocr_model.py +19 -4
docling/models/layout_model.py +3 -3
docling/models/table_structure_model.py +18 -2
docling/models/tesseract_ocr_cli_model.py +167 -0
docling/models/tesseract_ocr_model.py +122 -0
docling/pipeline/base_model_pipeline.py +4 -3
docling/pipeline/standard_model_pipeline.py +36 -8
docling/utils/export.py +145 -0
{docling-1.6.2.dist-info → docling-1.19.0.dist-info}/LICENSE +1 -1
docling-1.19.0.dist-info/METADATA +380 -0
docling-1.19.0.dist-info/RECORD +34 -0
docling-1.19.0.dist-info/entry_points.txt +3 -0
docling-1.6.2.dist-info/METADATA +0 -192
docling-1.6.2.dist-info/RECORD +0 -27
{docling-1.6.2.dist-info → docling-1.19.0.dist-info}/WHEEL +0 -0

docling/backend/abstract_backend.py CHANGED Viewed

@@ -1,14 +1,15 @@
 from abc import ABC, abstractmethod
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Iterable, Optional, Union
+from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
 from PIL import Image
+if TYPE_CHECKING:
+    from docling.datamodel.base_models import BoundingBox, Cell, PageSize
 class PdfPageBackend(ABC):
-    def __init__(self, page_obj: Any) -> object:
-        pass
     @abstractmethod
     def get_text_in_rect(self, bbox: "BoundingBox") -> str:
@@ -19,12 +20,12 @@ class PdfPageBackend(ABC):
         pass
     @abstractmethod
-    def get_bitmap_rects(self, scale: int = 1) -> Iterable["BoundingBox"]:
+    def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
         pass
     @abstractmethod
     def get_page_image(
-        self, scale: int = 1, cropbox: Optional["BoundingBox"] = None
+        self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
     ) -> Image.Image:
         pass
@@ -32,6 +33,10 @@ class PdfPageBackend(ABC):
     def get_size(self) -> "PageSize":
         pass
+    @abstractmethod
+    def is_valid(self) -> bool:
+        pass
     @abstractmethod
     def unload(self):
         pass
@@ -39,8 +44,9 @@ class PdfPageBackend(ABC):
 class PdfDocumentBackend(ABC):
     @abstractmethod
-    def __init__(self, path_or_stream: Union[BytesIO, Path]):
-        pass
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        self.path_or_stream = path_or_stream
+        self.document_hash = document_hash
     @abstractmethod
     def load_page(self, page_no: int) -> PdfPageBackend:
@@ -56,4 +62,7 @@ class PdfDocumentBackend(ABC):
     @abstractmethod
     def unload(self):
-        pass
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+        self.path_or_stream = None

docling/backend/docling_parse_backend.py CHANGED Viewed

@@ -1,9 +1,8 @@
 import logging
 import random
-import time
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, Optional, Union
+from typing import Iterable, List, Optional, Union
 import pypdfium2 as pdfium
 from docling_parse.docling_parse import pdf_parser
@@ -17,13 +16,26 @@ _log = logging.getLogger(__name__)
 class DoclingParsePageBackend(PdfPageBackend):
-    def __init__(self, page_obj: PdfPage, docling_page_obj):
-        super().__init__(page_obj)
+    def __init__(
+        self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
+    ):
         self._ppage = page_obj
-        self._dpage = docling_page_obj
-        self.text_page = None
+        parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
+        self.valid = "pages" in parsed_page
+        if self.valid:
+            self._dpage = parsed_page["pages"][0]
+        else:
+            _log.info(
+                f"An error occured when loading page {page_no} of document {document_hash}."
+            )
+    def is_valid(self) -> bool:
+        return self.valid
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        if not self.valid:
+            return ""
         # Find intersecting cells on the page
         text_piece = ""
         page_size = self.get_size()
@@ -55,9 +67,12 @@ class DoclingParsePageBackend(PdfPageBackend):
         return text_piece
     def get_text_cells(self) -> Iterable[Cell]:
-        cells = []
+        cells: List[Cell] = []
         cell_counter = 0
+        if not self.valid:
+            return cells
         page_size = self.get_size()
         parser_width = self._dpage["width"]
@@ -114,7 +129,7 @@ class DoclingParsePageBackend(PdfPageBackend):
         return cells
-    def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
         AREA_THRESHOLD = 32 * 32
         for i in range(len(self._dpage["images"])):
@@ -129,7 +144,7 @@ class DoclingParsePageBackend(PdfPageBackend):
                 yield cropbox
     def get_page_image(
-        self, scale: int = 1, cropbox: Optional[BoundingBox] = None
+        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
         page_size = self.get_size()
@@ -168,40 +183,41 @@ class DoclingParsePageBackend(PdfPageBackend):
     def unload(self):
         self._ppage = None
         self._dpage = None
-        self.text_page = None
 class DoclingParseDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path]):
-        super().__init__(path_or_stream)
-        self._pdoc = pdfium.PdfDocument(path_or_stream)
-        # Parsing cells with docling_parser call
-        parser = pdf_parser()
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        super().__init__(path_or_stream, document_hash)
-        start_pb_time = time.time()
+        self._pdoc = pdfium.PdfDocument(path_or_stream)
+        self.parser = pdf_parser()
+        success = False
         if isinstance(path_or_stream, BytesIO):
-            self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
-        else:
-            self._parser_doc = parser.find_cells(str(path_or_stream))
+            success = self.parser.load_document_from_bytesio(
+                document_hash, path_or_stream
+            )
+        elif isinstance(path_or_stream, Path):
+            success = self.parser.load_document(document_hash, str(path_or_stream))
-        end_pb_time = time.time() - start_pb_time
-        _log.info(
-            f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
-        )
+        if not success:
+            raise RuntimeError(
+                f"docling-parse could not load document {document_hash}."
+            )
     def page_count(self) -> int:
-        return len(self._parser_doc["pages"])
+        return len(self._pdoc)  # To be replaced with docling-parse API
     def load_page(self, page_no: int) -> DoclingParsePageBackend:
         return DoclingParsePageBackend(
-            self._pdoc[page_no], self._parser_doc["pages"][page_no]
+            self.parser, self.document_hash, page_no, self._pdoc[page_no]
         )
     def is_valid(self) -> bool:
         return self.page_count() > 0
     def unload(self):
+        super().unload()
+        self.parser.unload_document(self.document_hash)
         self._pdoc.close()
         self._pdoc = None
-        self._parser_doc = None

docling/backend/pypdfium2_backend.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import logging
 import random
 from io import BytesIO
 from pathlib import Path
@@ -6,19 +7,34 @@ from typing import Iterable, List, Optional, Union
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 from PIL import Image, ImageDraw
-from pypdfium2 import PdfPage
+from pypdfium2 import PdfPage, PdfTextPage
+from pypdfium2._helpers.misc import PdfiumError
 from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
 from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
+_log = logging.getLogger(__name__)
 class PyPdfiumPageBackend(PdfPageBackend):
-    def __init__(self, page_obj: PdfPage):
-        super().__init__(page_obj)
-        self._ppage = page_obj
-        self.text_page = None
+    def __init__(
+        self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
+    ):
+        self.valid = True  # No better way to tell from pypdfium.
+        try:
+            self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
+        except PdfiumError as e:
+            _log.info(
+                f"An exception occured when loading page {page_no} of document {document_hash}.",
+                exc_info=True,
+            )
+            self.valid = False
+        self.text_page: Optional[PdfTextPage] = None
+    def is_valid(self) -> bool:
+        return self.valid
-    def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
         AREA_THRESHOLD = 32 * 32
         for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
             pos = obj.get_pos()
@@ -173,7 +189,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
         return cells
     def get_page_image(
-        self, scale: int = 1, cropbox: Optional[BoundingBox] = None
+        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
         page_size = self.get_size()
@@ -215,19 +231,25 @@ class PyPdfiumPageBackend(PdfPageBackend):
 class PyPdfiumDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path]):
-        super().__init__(path_or_stream)
-        self._pdoc = pdfium.PdfDocument(path_or_stream)
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        super().__init__(path_or_stream, document_hash)
+        try:
+            self._pdoc = pdfium.PdfDocument(path_or_stream)
+        except PdfiumError as e:
+            raise RuntimeError(
+                f"pypdfium could not load document {document_hash}"
+            ) from e
     def page_count(self) -> int:
         return len(self._pdoc)
     def load_page(self, page_no: int) -> PyPdfiumPageBackend:
-        return PyPdfiumPageBackend(self._pdoc[page_no])
+        return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
     def is_valid(self) -> bool:
         return self.page_count() > 0
     def unload(self):
+        super().unload()
         self._pdoc.close()
         self._pdoc = None

docling/cli/__init__.py ADDED Viewed

File without changes

docling/cli/main.py ADDED Viewed

@@ -0,0 +1,253 @@
+import importlib
+import json
+import logging
+import time
+import warnings
+from enum import Enum
+from pathlib import Path
+from typing import Annotated, Iterable, List, Optional
+import typer
+from docling_core.utils.file import resolve_file_source
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.base_models import ConversionStatus
+from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    PipelineOptions,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.document_converter import DocumentConverter
+warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
+warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
+_log = logging.getLogger(__name__)
+from rich.console import Console
+err_console = Console(stderr=True)
+app = typer.Typer(
+    name="Docling",
+    no_args_is_help=True,
+    add_completion=False,
+    pretty_exceptions_enable=False,
+)
+def version_callback(value: bool):
+    if value:
+        docling_version = importlib.metadata.version("docling")
+        docling_core_version = importlib.metadata.version("docling-core")
+        docling_ibm_models_version = importlib.metadata.version("docling-ibm-models")
+        docling_parse_version = importlib.metadata.version("docling-parse")
+        print(f"Docling version: {docling_version}")
+        print(f"Docling Core version: {docling_core_version}")
+        print(f"Docling IBM Models version: {docling_ibm_models_version}")
+        print(f"Docling Parse version: {docling_parse_version}")
+        raise typer.Exit()
+# Define an enum for the backend options
+class Backend(str, Enum):
+    PYPDFIUM2 = "pypdfium2"
+    DOCLING = "docling"
+# Define an enum for the ocr engines
+class OcrEngine(str, Enum):
+    EASYOCR = "easyocr"
+    TESSERACT_CLI = "tesseract_cli"
+    TESSERACT = "tesseract"
+def export_documents(
+    conv_results: Iterable[ConversionResult],
+    output_dir: Path,
+    export_json: bool,
+    export_md: bool,
+    export_txt: bool,
+    export_doctags: bool,
+):
+    success_count = 0
+    failure_count = 0
+    for conv_res in conv_results:
+        if conv_res.status == ConversionStatus.SUCCESS:
+            success_count += 1
+            doc_filename = conv_res.input.file.stem
+            # Export Deep Search document JSON format:
+            if export_json:
+                fname = output_dir / f"{doc_filename}.json"
+                with fname.open("w") as fp:
+                    _log.info(f"writing JSON output to {fname}")
+                    fp.write(json.dumps(conv_res.render_as_dict()))
+            # Export Text format:
+            if export_txt:
+                fname = output_dir / f"{doc_filename}.txt"
+                with fname.open("w") as fp:
+                    _log.info(f"writing Text output to {fname}")
+                    fp.write(conv_res.render_as_text())
+            # Export Markdown format:
+            if export_md:
+                fname = output_dir / f"{doc_filename}.md"
+                with fname.open("w") as fp:
+                    _log.info(f"writing Markdown output to {fname}")
+                    fp.write(conv_res.render_as_markdown())
+            # Export Document Tags format:
+            if export_doctags:
+                fname = output_dir / f"{doc_filename}.doctags"
+                with fname.open("w") as fp:
+                    _log.info(f"writing Doc Tags output to {fname}")
+                    fp.write(conv_res.render_as_doctags())
+        else:
+            _log.warning(f"Document {conv_res.input.file} failed to convert.")
+            failure_count += 1
+    _log.info(
+        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
+    )
+@app.command(no_args_is_help=True)
+def convert(
+    input_sources: Annotated[
+        List[str],
+        typer.Argument(
+            ...,
+            metavar="source",
+            help="PDF files to convert. Can be local file / directory paths or URL.",
+        ),
+    ],
+    export_json: Annotated[
+        bool,
+        typer.Option(
+            ..., "--json/--no-json", help="If enabled the document is exported as JSON."
+        ),
+    ] = False,
+    export_md: Annotated[
+        bool,
+        typer.Option(
+            ..., "--md/--no-md", help="If enabled the document is exported as Markdown."
+        ),
+    ] = True,
+    export_txt: Annotated[
+        bool,
+        typer.Option(
+            ..., "--txt/--no-txt", help="If enabled the document is exported as Text."
+        ),
+    ] = False,
+    export_doctags: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "--doctags/--no-doctags",
+            help="If enabled the document is exported as Doc Tags.",
+        ),
+    ] = False,
+    ocr: Annotated[
+        bool,
+        typer.Option(
+            ..., help="If enabled, the bitmap content will be processed using OCR."
+        ),
+    ] = True,
+    backend: Annotated[
+        Backend, typer.Option(..., help="The PDF backend to use.")
+    ] = Backend.DOCLING,
+    ocr_engine: Annotated[
+        OcrEngine, typer.Option(..., help="The OCR engine to use.")
+    ] = OcrEngine.EASYOCR,
+    output: Annotated[
+        Path, typer.Option(..., help="Output directory where results are saved.")
+    ] = Path("."),
+    version: Annotated[
+        Optional[bool],
+        typer.Option(
+            "--version",
+            callback=version_callback,
+            is_eager=True,
+            help="Show version information.",
+        ),
+    ] = None,
+):
+    logging.basicConfig(level=logging.INFO)
+    input_doc_paths: List[Path] = []
+    for src in input_sources:
+        source = resolve_file_source(source=src)
+        if not source.exists():
+            err_console.print(
+                f"[red]Error: The input file {source} does not exist.[/red]"
+            )
+            raise typer.Abort()
+        elif source.is_dir():
+            input_doc_paths.extend(list(source.glob("**/*.pdf")))
+            input_doc_paths.extend(list(source.glob("**/*.PDF")))
+        else:
+            input_doc_paths.append(source)
+    match backend:
+        case Backend.PYPDFIUM2:
+            do_cell_matching = ocr  # only do cell matching when OCR enabled
+            pdf_backend = PyPdfiumDocumentBackend
+        case Backend.DOCLING:
+            do_cell_matching = True
+            pdf_backend = DoclingParseDocumentBackend
+        case _:
+            raise RuntimeError(f"Unexpected backend type {backend}")
+    match ocr_engine:
+        case OcrEngine.EASYOCR:
+            ocr_options = EasyOcrOptions()
+        case OcrEngine.TESSERACT_CLI:
+            ocr_options = TesseractCliOcrOptions()
+        case OcrEngine.TESSERACT:
+            ocr_options = TesseractOcrOptions()
+        case _:
+            raise RuntimeError(f"Unexpected backend type {backend}")
+    pipeline_options = PipelineOptions(
+        do_ocr=ocr,
+        ocr_options=ocr_options,
+        do_table_structure=True,
+    )
+    pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
+    doc_converter = DocumentConverter(
+        pipeline_options=pipeline_options,
+        pdf_backend=pdf_backend,
+    )
+    # Define input files
+    input = DocumentConversionInput.from_paths(input_doc_paths)
+    start_time = time.time()
+    conv_results = doc_converter.convert(input)
+    output.mkdir(parents=True, exist_ok=True)
+    export_documents(
+        conv_results,
+        output_dir=output,
+        export_json=export_json,
+        export_md=export_md,
+        export_txt=export_txt,
+        export_doctags=export_doctags,
+    )
+    end_time = time.time() - start_time
+    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
+if __name__ == "__main__":
+    app()

docling/datamodel/base_models.py CHANGED Viewed

@@ -9,6 +9,10 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator
 from typing_extensions import Self
 from docling.backend.abstract_backend import PdfPageBackend
+from docling.datamodel.pipeline_options import (  # Must be imported here for backward compatibility.
+    PipelineOptions,
+    TableStructureOptions,
+)
 class ConversionStatus(str, Enum):
@@ -16,7 +20,7 @@ class ConversionStatus(str, Enum):
     STARTED = auto()
     FAILURE = auto()
     SUCCESS = auto()
-    SUCCESS_WITH_ERRORS = auto()
+    PARTIAL_SUCCESS = auto()
 class DocInputType(str, Enum):
@@ -29,6 +33,18 @@ class CoordOrigin(str, Enum):
     BOTTOMLEFT = auto()
+class DoclingComponentType(str, Enum):
+    PDF_BACKEND = auto()
+    MODEL = auto()
+    DOC_ASSEMBLER = auto()
+class ErrorItem(BaseModel):
+    component_type: DoclingComponentType
+    module_name: str
+    error_message: str
 class PageSize(BaseModel):
     width: float = 0.0
     height: float = 0.0
@@ -59,6 +75,15 @@ class BoundingBox(BaseModel):
         return out_bbox
+    def normalized(self, page_size: PageSize) -> "BoundingBox":
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l /= page_size.width
+        out_bbox.r /= page_size.width
+        out_bbox.t /= page_size.height
+        out_bbox.b /= page_size.height
+        return out_bbox
     def as_tuple(self):
         if self.coord_origin == CoordOrigin.TOPLEFT:
             return (self.l, self.t, self.r, self.b)
@@ -66,7 +91,7 @@ class BoundingBox(BaseModel):
             return (self.l, self.b, self.r, self.t)
     @classmethod
-    def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
+    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
         if origin == CoordOrigin.TOPLEFT:
             l, t, r, b = coord[0], coord[1], coord[2], coord[3]
             if r < l:
@@ -85,7 +110,10 @@ class BoundingBox(BaseModel):
             return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
     def area(self) -> float:
-        return (self.r - self.l) * (self.b - self.t)
+        area = (self.r - self.l) * (self.b - self.t)
+        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            area = -area
+        return area
     def intersection_area_with(self, other: "BoundingBox") -> float:
         # Calculate intersection coordinates
@@ -225,19 +253,19 @@ class EquationPrediction(BaseModel):
 class PagePredictions(BaseModel):
-    layout: LayoutPrediction = None
-    tablestructure: TableStructurePrediction = None
-    figures_classification: FigureClassificationPrediction = None
-    equations_prediction: EquationPrediction = None
+    layout: Optional[LayoutPrediction] = None
+    tablestructure: Optional[TableStructurePrediction] = None
+    figures_classification: Optional[FigureClassificationPrediction] = None
+    equations_prediction: Optional[EquationPrediction] = None
 PageElement = Union[TextElement, TableElement, FigureElement]
 class AssembledUnit(BaseModel):
-    elements: List[PageElement]
-    body: List[PageElement]
-    headers: List[PageElement]
+    elements: List[PageElement] = []
+    body: List[PageElement] = []
+    headers: List[PageElement] = []
 class Page(BaseModel):
@@ -246,7 +274,7 @@ class Page(BaseModel):
     page_no: int
     page_hash: Optional[str] = None
     size: Optional[PageSize] = None
-    cells: List[Cell] = None
+    cells: List[Cell] = []
     predictions: PagePredictions = PagePredictions()
     assembled: Optional[AssembledUnit] = None
@@ -277,22 +305,6 @@ class DocumentStream(BaseModel):
     stream: BytesIO
-class TableStructureOptions(BaseModel):
-    do_cell_matching: bool = (
-        True
-        # True:  Matches predictions back to PDF cells. Can break table output if PDF cells
-        #        are merged across table columns.
-        # False: Let table structure model define the text cells, ignore PDF cells.
-    )
-class PipelineOptions(BaseModel):
-    do_table_structure: bool = True  # True: perform table structure extraction
-    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
-    table_structure_options: TableStructureOptions = TableStructureOptions()
 class AssembleOptions(BaseModel):
     keep_page_images: Annotated[
         bool,

docling 1.6.2__py3-none-any.whl → 1.19.0__py3-none-any.whl

docling 1.6.2__py3-none-any.whl → 1.19.0__py3-none-any.whl