PyPI - docling - Versions diffs - 1.19.0__py3-none-any.whl → 2.1.0__py3-none-any.whl - Mend

docling 1.19.0__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

docling/backend/abstract_backend.py +32 -37
docling/backend/docling_parse_backend.py +16 -12
docling/backend/docling_parse_v2_backend.py +240 -0
docling/backend/html_backend.py +425 -0
docling/backend/mspowerpoint_backend.py +375 -0
docling/backend/msword_backend.py +509 -0
docling/backend/pdf_backend.py +78 -0
docling/backend/pypdfium2_backend.py +15 -10
docling/cli/main.py +61 -60
docling/datamodel/base_models.py +73 -193
docling/datamodel/document.py +379 -324
docling/datamodel/pipeline_options.py +16 -0
docling/datamodel/settings.py +1 -0
docling/document_converter.py +215 -252
docling/models/base_model.py +25 -0
docling/models/base_ocr_model.py +19 -6
docling/models/ds_glm_model.py +220 -22
docling/models/easyocr_model.py +45 -40
docling/models/layout_model.py +130 -114
docling/models/page_assemble_model.py +119 -95
docling/models/page_preprocessing_model.py +61 -0
docling/models/table_structure_model.py +122 -111
docling/models/tesseract_ocr_cli_model.py +65 -58
docling/models/tesseract_ocr_model.py +58 -50
docling/pipeline/base_pipeline.py +190 -0
docling/pipeline/simple_pipeline.py +59 -0
docling/pipeline/standard_pdf_pipeline.py +198 -0
docling/utils/export.py +4 -3
docling/utils/layout_utils.py +17 -11
docling-2.1.0.dist-info/METADATA +149 -0
docling-2.1.0.dist-info/RECORD +42 -0
docling/pipeline/base_model_pipeline.py +0 -18
docling/pipeline/standard_model_pipeline.py +0 -66
docling-1.19.0.dist-info/METADATA +0 -380
docling-1.19.0.dist-info/RECORD +0 -34
{docling-1.19.0.dist-info → docling-2.1.0.dist-info}/LICENSE +0 -0
{docling-1.19.0.dist-info → docling-2.1.0.dist-info}/WHEEL +0 -0
{docling-1.19.0.dist-info → docling-2.1.0.dist-info}/entry_points.txt +0 -0

docling/cli/main.py CHANGED Viewed

@@ -5,22 +5,27 @@ import time
 import warnings
 from enum import Enum
 from pathlib import Path
-from typing import Annotated, Iterable, List, Optional
+from typing import Annotated, Dict, Iterable, List, Optional
 import typer
 from docling_core.utils.file import resolve_file_source
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus
-from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.datamodel.base_models import (
+    ConversionStatus,
+    FormatToExtensions,
+    InputFormat,
+    OutputFormat,
+)
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
-    PipelineOptions,
+    OcrOptions,
+    PdfPipelineOptions,
     TesseractCliOcrOptions,
     TesseractOcrOptions,
 )
-from docling.document_converter import DocumentConverter
+from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -87,28 +92,28 @@ def export_documents(
                 fname = output_dir / f"{doc_filename}.json"
                 with fname.open("w") as fp:
                     _log.info(f"writing JSON output to {fname}")
-                    fp.write(json.dumps(conv_res.render_as_dict()))
+                    fp.write(json.dumps(conv_res.document.export_to_dict()))
             # Export Text format:
             if export_txt:
                 fname = output_dir / f"{doc_filename}.txt"
                 with fname.open("w") as fp:
                     _log.info(f"writing Text output to {fname}")
-                    fp.write(conv_res.render_as_text())
+                    fp.write(conv_res.document.export_to_markdown(strict_text=True))
             # Export Markdown format:
             if export_md:
                 fname = output_dir / f"{doc_filename}.md"
                 with fname.open("w") as fp:
                     _log.info(f"writing Markdown output to {fname}")
-                    fp.write(conv_res.render_as_markdown())
+                    fp.write(conv_res.document.export_to_markdown())
             # Export Document Tags format:
             if export_doctags:
                 fname = output_dir / f"{doc_filename}.doctags"
                 with fname.open("w") as fp:
                     _log.info(f"writing Doc Tags output to {fname}")
-                    fp.write(conv_res.render_as_doctags())
+                    fp.write(conv_res.document.export_to_document_tokens())
         else:
             _log.warning(f"Document {conv_res.input.file} failed to convert.")
@@ -129,44 +134,31 @@ def convert(
             help="PDF files to convert. Can be local file / directory paths or URL.",
         ),
     ],
-    export_json: Annotated[
-        bool,
-        typer.Option(
-            ..., "--json/--no-json", help="If enabled the document is exported as JSON."
-        ),
-    ] = False,
-    export_md: Annotated[
-        bool,
-        typer.Option(
-            ..., "--md/--no-md", help="If enabled the document is exported as Markdown."
-        ),
-    ] = True,
-    export_txt: Annotated[
-        bool,
-        typer.Option(
-            ..., "--txt/--no-txt", help="If enabled the document is exported as Text."
-        ),
-    ] = False,
-    export_doctags: Annotated[
-        bool,
-        typer.Option(
-            ...,
-            "--doctags/--no-doctags",
-            help="If enabled the document is exported as Doc Tags.",
-        ),
-    ] = False,
+    from_formats: List[InputFormat] = typer.Option(
+        None,
+        "--from",
+        help="Specify input formats to convert from. Defaults to all formats.",
+    ),
+    to_formats: List[OutputFormat] = typer.Option(
+        None, "--to", help="Specify output formats. Defaults to Markdown."
+    ),
     ocr: Annotated[
         bool,
         typer.Option(
             ..., help="If enabled, the bitmap content will be processed using OCR."
         ),
     ] = True,
-    backend: Annotated[
-        Backend, typer.Option(..., help="The PDF backend to use.")
-    ] = Backend.DOCLING,
     ocr_engine: Annotated[
         OcrEngine, typer.Option(..., help="The OCR engine to use.")
     ] = OcrEngine.EASYOCR,
+    abort_on_error: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "--abort-on-error/--no-abort-on-error",
+            help="If enabled, the bitmap content will be processed using OCR.",
+        ),
+    ] = False,
     output: Annotated[
         Path, typer.Option(..., help="Output directory where results are saved.")
     ] = Path("."),
@@ -182,6 +174,9 @@ def convert(
 ):
     logging.basicConfig(level=logging.INFO)
+    if from_formats is None:
+        from_formats = [e for e in InputFormat]
     input_doc_paths: List[Path] = []
     for src in input_sources:
         source = resolve_file_source(source=src)
@@ -191,48 +186,54 @@ def convert(
             )
             raise typer.Abort()
         elif source.is_dir():
-            input_doc_paths.extend(list(source.glob("**/*.pdf")))
-            input_doc_paths.extend(list(source.glob("**/*.PDF")))
+            for fmt in from_formats:
+                for ext in FormatToExtensions[fmt]:
+                    input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
+                    input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
         else:
             input_doc_paths.append(source)
-    match backend:
-        case Backend.PYPDFIUM2:
-            do_cell_matching = ocr  # only do cell matching when OCR enabled
-            pdf_backend = PyPdfiumDocumentBackend
-        case Backend.DOCLING:
-            do_cell_matching = True
-            pdf_backend = DoclingParseDocumentBackend
-        case _:
-            raise RuntimeError(f"Unexpected backend type {backend}")
+    if to_formats is None:
+        to_formats = [OutputFormat.MARKDOWN]
+    export_json = OutputFormat.JSON in to_formats
+    export_md = OutputFormat.MARKDOWN in to_formats
+    export_txt = OutputFormat.TEXT in to_formats
+    export_doctags = OutputFormat.DOCTAGS in to_formats
     match ocr_engine:
         case OcrEngine.EASYOCR:
-            ocr_options = EasyOcrOptions()
+            ocr_options: OcrOptions = EasyOcrOptions()
         case OcrEngine.TESSERACT_CLI:
             ocr_options = TesseractCliOcrOptions()
         case OcrEngine.TESSERACT:
             ocr_options = TesseractOcrOptions()
         case _:
-            raise RuntimeError(f"Unexpected backend type {backend}")
+            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
-    pipeline_options = PipelineOptions(
+    pipeline_options = PdfPipelineOptions(
         do_ocr=ocr,
         ocr_options=ocr_options,
         do_table_structure=True,
     )
-    pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
+    pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
+    format_options: Dict[InputFormat, FormatOption] = {
+        InputFormat.PDF: PdfFormatOption(
+            pipeline_options=pipeline_options,
+            backend=DoclingParseDocumentBackend,  # pdf_backend
+        )
+    }
     doc_converter = DocumentConverter(
-        pipeline_options=pipeline_options,
-        pdf_backend=pdf_backend,
+        allowed_formats=from_formats,
+        format_options=format_options,
     )
-    # Define input files
-    input = DocumentConversionInput.from_paths(input_doc_paths)
     start_time = time.time()
-    conv_results = doc_converter.convert(input)
+    conv_results = doc_converter.convert_all(
+        input_doc_paths, raises_on_error=abort_on_error
+    )
     output.mkdir(parents=True, exist_ok=True)
     export_documents(

docling/datamodel/base_models.py CHANGED Viewed

@@ -1,18 +1,19 @@
-import copy
-import warnings
 from enum import Enum, auto
 from io import BytesIO
-from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
+from docling_core.types.doc import (
+    BoundingBox,
+    DocItemLabel,
+    PictureDataType,
+    Size,
+    TableCell,
+)
 from PIL.Image import Image
-from pydantic import BaseModel, ConfigDict, Field, model_validator
-from typing_extensions import Self
+from pydantic import BaseModel, ConfigDict
-from docling.backend.abstract_backend import PdfPageBackend
-from docling.datamodel.pipeline_options import (  # Must be imported here for backward compatibility.
-    PipelineOptions,
-    TableStructureOptions,
-)
+if TYPE_CHECKING:
+    from docling.backend.pdf_backend import PdfPageBackend
 class ConversionStatus(str, Enum):
@@ -23,18 +24,61 @@ class ConversionStatus(str, Enum):
     PARTIAL_SUCCESS = auto()
+class InputFormat(str, Enum):
+    DOCX = "docx"
+    PPTX = "pptx"
+    HTML = "html"
+    IMAGE = "image"
+    PDF = "pdf"
+class OutputFormat(str, Enum):
+    MARKDOWN = "md"
+    JSON = "json"
+    TEXT = "text"
+    DOCTAGS = "doctags"
+FormatToExtensions: Dict[InputFormat, List[str]] = {
+    InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
+    InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
+    InputFormat.PDF: ["pdf"],
+    InputFormat.HTML: ["html", "htm", "xhtml"],
+    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
+}
+FormatToMimeType: Dict[InputFormat, Set[str]] = {
+    InputFormat.DOCX: {
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
+    },
+    InputFormat.PPTX: {
+        "application/vnd.openxmlformats-officedocument.presentationml.template",
+        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+    },
+    InputFormat.HTML: {"text/html", "application/xhtml+xml"},
+    InputFormat.IMAGE: {
+        "image/png",
+        "image/jpeg",
+        "image/tiff",
+        "image/gif",
+        "image/bmp",
+    },
+    InputFormat.PDF: {"application/pdf"},
+}
+MimeTypeToFormat = {
+    mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
+}
 class DocInputType(str, Enum):
     PATH = auto()
     STREAM = auto()
-class CoordOrigin(str, Enum):
-    TOPLEFT = auto()
-    BOTTOMLEFT = auto()
 class DoclingComponentType(str, Enum):
-    PDF_BACKEND = auto()
+    DOCUMENT_BACKEND = auto()
     MODEL = auto()
     DOC_ASSEMBLER = auto()
@@ -45,118 +89,6 @@ class ErrorItem(BaseModel):
     error_message: str
-class PageSize(BaseModel):
-    width: float = 0.0
-    height: float = 0.0
-class BoundingBox(BaseModel):
-    l: float  # left
-    t: float  # top
-    r: float  # right
-    b: float  # bottom
-    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
-    @property
-    def width(self):
-        return self.r - self.l
-    @property
-    def height(self):
-        return abs(self.t - self.b)
-    def scaled(self, scale: float) -> "BoundingBox":
-        out_bbox = copy.deepcopy(self)
-        out_bbox.l *= scale
-        out_bbox.r *= scale
-        out_bbox.t *= scale
-        out_bbox.b *= scale
-        return out_bbox
-    def normalized(self, page_size: PageSize) -> "BoundingBox":
-        out_bbox = copy.deepcopy(self)
-        out_bbox.l /= page_size.width
-        out_bbox.r /= page_size.width
-        out_bbox.t /= page_size.height
-        out_bbox.b /= page_size.height
-        return out_bbox
-    def as_tuple(self):
-        if self.coord_origin == CoordOrigin.TOPLEFT:
-            return (self.l, self.t, self.r, self.b)
-        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return (self.l, self.b, self.r, self.t)
-    @classmethod
-    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
-        if origin == CoordOrigin.TOPLEFT:
-            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
-            if r < l:
-                l, r = r, l
-            if b < t:
-                b, t = t, b
-            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
-        elif origin == CoordOrigin.BOTTOMLEFT:
-            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
-            if r < l:
-                l, r = r, l
-            if b > t:
-                b, t = t, b
-            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
-    def area(self) -> float:
-        area = (self.r - self.l) * (self.b - self.t)
-        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            area = -area
-        return area
-    def intersection_area_with(self, other: "BoundingBox") -> float:
-        # Calculate intersection coordinates
-        left = max(self.l, other.l)
-        top = max(self.t, other.t)
-        right = min(self.r, other.r)
-        bottom = min(self.b, other.b)
-        # Calculate intersection dimensions
-        width = right - left
-        height = bottom - top
-        # If the bounding boxes do not overlap, width or height will be negative
-        if width <= 0 or height <= 0:
-            return 0.0
-        return width * height
-    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
-        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return self
-        elif self.coord_origin == CoordOrigin.TOPLEFT:
-            return BoundingBox(
-                l=self.l,
-                r=self.r,
-                t=page_height - self.t,
-                b=page_height - self.b,
-                coord_origin=CoordOrigin.BOTTOMLEFT,
-            )
-    def to_top_left_origin(self, page_height):
-        if self.coord_origin == CoordOrigin.TOPLEFT:
-            return self
-        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
-            return BoundingBox(
-                l=self.l,
-                r=self.r,
-                t=page_height - self.t,  # self.b
-                b=page_height - self.b,  # self.t
-                coord_origin=CoordOrigin.TOPLEFT,
-            )
 class Cell(BaseModel):
     id: int
     text: str
@@ -169,14 +101,14 @@ class OcrCell(Cell):
 class Cluster(BaseModel):
     id: int
-    label: str
+    label: DocItemLabel
     bbox: BoundingBox
     confidence: float = 1.0
     cells: List[Cell] = []
 class BasePageElement(BaseModel):
-    label: str
+    label: DocItemLabel
     id: int
     page_no: int
     cluster: Cluster
@@ -187,37 +119,7 @@ class LayoutPrediction(BaseModel):
     clusters: List[Cluster] = []
-class TableCell(BaseModel):
-    bbox: BoundingBox
-    row_span: int
-    col_span: int
-    start_row_offset_idx: int
-    end_row_offset_idx: int
-    start_col_offset_idx: int
-    end_col_offset_idx: int
-    text: str
-    column_header: bool = False
-    row_header: bool = False
-    row_section: bool = False
-    @model_validator(mode="before")
-    @classmethod
-    def from_dict_format(cls, data: Any) -> Any:
-        if isinstance(data, Dict):
-            text = data["bbox"].get("token", "")
-            if not len(text):
-                text_cells = data.pop("text_cell_bboxes", None)
-                if text_cells:
-                    for el in text_cells:
-                        text += el["token"] + " "
-                text = text.strip()
-            data["text"] = text
-        return data
-class TableElement(BasePageElement):
+class Table(BasePageElement):
     otsl_seq: List[str]
     num_rows: int = 0
     num_cols: int = 0
@@ -225,18 +127,15 @@ class TableElement(BasePageElement):
 class TableStructurePrediction(BaseModel):
-    table_map: Dict[int, TableElement] = {}
-class TextElement(BasePageElement): ...
+    table_map: Dict[int, Table] = {}
-class FigureData(BaseModel):
-    pass
+class TextElement(BasePageElement):
+    text: str
 class FigureElement(BasePageElement):
-    data: Optional[FigureData] = None
+    annotations: List[PictureDataType] = []
     provenance: Optional[str] = None
     predicted_class: Optional[str] = None
     confidence: Optional[float] = None
@@ -259,7 +158,7 @@ class PagePredictions(BaseModel):
     equations_prediction: Optional[EquationPrediction] = None
-PageElement = Union[TextElement, TableElement, FigureElement]
+PageElement = Union[TextElement, Table, FigureElement]
 class AssembledUnit(BaseModel):
@@ -272,13 +171,13 @@ class Page(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
     page_no: int
-    page_hash: Optional[str] = None
-    size: Optional[PageSize] = None
+    # page_hash: Optional[str] = None
+    size: Optional[Size] = None
     cells: List[Cell] = []
     predictions: PagePredictions = PagePredictions()
     assembled: Optional[AssembledUnit] = None
-    _backend: Optional[PdfPageBackend] = (
+    _backend: Optional["PdfPageBackend"] = (
         None  # Internal PDF backend. By default it is cleared during assembling.
     )
     _default_image_scale: float = 1.0  # Default image scale for external usage.
@@ -301,24 +200,5 @@ class Page(BaseModel):
 class DocumentStream(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
-    filename: str
+    name: str
     stream: BytesIO
-class AssembleOptions(BaseModel):
-    keep_page_images: Annotated[
-        bool,
-        Field(
-            deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
-        ),
-    ] = False  # False: page images are removed in the assemble step
-    images_scale: Optional[float] = None  # if set, the scale for generated images
-    @model_validator(mode="after")
-    def set_page_images_from_deprecated(self) -> Self:
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore", DeprecationWarning)
-            default_scale = 1.0
-            if self.keep_page_images and self.images_scale is None:
-                self.images_scale = default_scale
-        return self

docling 1.19.0__py3-none-any.whl → 2.1.0__py3-none-any.whl

docling 1.19.0__py3-none-any.whl → 2.1.0__py3-none-any.whl