PyPI - docling - Versions diffs - 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl - Mend

docling 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

docling/backend/asciidoc_backend.py +1 -1
docling/backend/csv_backend.py +1 -1
docling/backend/docling_parse_backend.py +21 -13
docling/backend/docling_parse_v2_backend.py +20 -12
docling/backend/docling_parse_v4_backend.py +192 -0
docling/backend/docx/__init__.py +0 -0
docling/backend/docx/latex/__init__.py +0 -0
docling/backend/docx/latex/latex_dict.py +271 -0
docling/backend/docx/latex/omml.py +453 -0
docling/backend/html_backend.py +7 -7
docling/backend/md_backend.py +1 -1
docling/backend/msexcel_backend.py +2 -45
docling/backend/mspowerpoint_backend.py +19 -1
docling/backend/msword_backend.py +68 -3
docling/backend/pdf_backend.py +7 -2
docling/backend/pypdfium2_backend.py +52 -30
docling/backend/xml/uspto_backend.py +1 -1
docling/cli/main.py +135 -53
docling/cli/models.py +1 -1
docling/datamodel/base_models.py +8 -10
docling/datamodel/pipeline_options.py +54 -32
docling/document_converter.py +5 -5
docling/models/base_model.py +9 -1
docling/models/base_ocr_model.py +27 -16
docling/models/easyocr_model.py +28 -13
docling/models/factories/__init__.py +27 -0
docling/models/factories/base_factory.py +122 -0
docling/models/factories/ocr_factory.py +11 -0
docling/models/factories/picture_description_factory.py +11 -0
docling/models/hf_mlx_model.py +137 -0
docling/models/ocr_mac_model.py +39 -11
docling/models/page_preprocessing_model.py +4 -0
docling/models/picture_description_api_model.py +20 -3
docling/models/picture_description_base_model.py +19 -3
docling/models/picture_description_vlm_model.py +14 -2
docling/models/plugins/__init__.py +0 -0
docling/models/plugins/defaults.py +28 -0
docling/models/rapid_ocr_model.py +34 -13
docling/models/table_structure_model.py +13 -4
docling/models/tesseract_ocr_cli_model.py +40 -15
docling/models/tesseract_ocr_model.py +37 -12
docling/pipeline/standard_pdf_pipeline.py +25 -78
docling/pipeline/vlm_pipeline.py +78 -398
docling/utils/export.py +8 -6
docling/utils/layout_postprocessor.py +26 -23
docling/utils/visualization.py +1 -1
{docling-2.26.0.dist-info → docling-2.28.0.dist-info}/METADATA +47 -23
docling-2.28.0.dist-info/RECORD +84 -0
{docling-2.26.0.dist-info → docling-2.28.0.dist-info}/entry_points.txt +3 -0
docling-2.26.0.dist-info/RECORD +0 -72
{docling-2.26.0.dist-info → docling-2.28.0.dist-info}/LICENSE +0 -0
{docling-2.26.0.dist-info → docling-2.28.0.dist-info}/WHEEL +0 -0

docling/backend/msword_backend.py CHANGED Viewed

@@ -26,6 +26,7 @@ from PIL import Image, UnidentifiedImageError
 from typing_extensions import override
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.backend.docx.latex.omml import oMath2Latex
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
@@ -260,6 +261,27 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         else:
             return label, None
+    def handle_equations_in_text(self, element, text):
+        only_texts = []
+        only_equations = []
+        texts_and_equations = []
+        for subt in element.iter():
+            tag_name = etree.QName(subt).localname
+            if tag_name == "t" and "math" not in subt.tag:
+                only_texts.append(subt.text)
+                texts_and_equations.append(subt.text)
+            elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
+                latex_equation = str(oMath2Latex(subt))
+                only_equations.append(latex_equation)
+                texts_and_equations.append(latex_equation)
+        if "".join(only_texts).strip() != text.strip():
+            # If we are not able to reconstruct the initial raw text
+            # do not try to parse equations and return the original
+            return text, []
+        return "".join(texts_and_equations), only_equations
     def handle_text_elements(
         self,
         element: BaseOxmlElement,
@@ -268,9 +290,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     ) -> None:
         paragraph = Paragraph(element, docx_obj)
-        if paragraph.text is None:
+        raw_text = paragraph.text
+        text, equations = self.handle_equations_in_text(element=element, text=raw_text)
+        if text is None:
             return
-        text = paragraph.text.strip()
+        text = text.strip()
         # Common styles for bullet and numbered lists.
         # "List Bullet", "List Number", "List Paragraph"
@@ -323,6 +348,46 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         elif "Heading" in p_style_id:
             self.add_header(doc, p_level, text)
+        elif len(equations) > 0:
+            if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
+                # Standalone equation
+                level = self.get_level()
+                doc.add_text(
+                    label=DocItemLabel.FORMULA,
+                    parent=self.parents[level - 1],
+                    text=text,
+                )
+            else:
+                # Inline equation
+                level = self.get_level()
+                inline_equation = doc.add_group(
+                    label=GroupLabel.INLINE, parent=self.parents[level - 1]
+                )
+                text_tmp = text
+                for eq in equations:
+                    if len(text_tmp) == 0:
+                        break
+                    pre_eq_text = text_tmp.split(eq, maxsplit=1)[0]
+                    text_tmp = text_tmp.split(eq, maxsplit=1)[1]
+                    if len(pre_eq_text) > 0:
+                        doc.add_text(
+                            label=DocItemLabel.PARAGRAPH,
+                            parent=inline_equation,
+                            text=pre_eq_text,
+                        )
+                    doc.add_text(
+                        label=DocItemLabel.FORMULA,
+                        parent=inline_equation,
+                        text=eq,
+                    )
+                if len(text_tmp) > 0:
+                    doc.add_text(
+                        label=DocItemLabel.PARAGRAPH,
+                        parent=inline_equation,
+                        text=text_tmp,
+                    )
         elif p_style_id in [
             "Paragraph",
             "Normal",
@@ -539,7 +604,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     end_row_offset_idx=row.grid_cols_before + spanned_idx,
                     start_col_offset_idx=col_idx,
                     end_col_offset_idx=col_idx + cell.grid_span,
-                    col_header=False,
+                    column_header=row.grid_cols_before + row_idx == 0,
                     row_header=False,
                 )
                 data.table_cells.append(table_cell)

docling/backend/pdf_backend.py CHANGED Viewed

@@ -4,10 +4,11 @@ from pathlib import Path
 from typing import Iterable, Optional, Set, Union
 from docling_core.types.doc import BoundingBox, Size
+from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from PIL import Image
 from docling.backend.abstract_backend import PaginatedDocumentBackend
-from docling.datamodel.base_models import Cell, InputFormat
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
@@ -17,7 +18,11 @@ class PdfPageBackend(ABC):
         pass
     @abstractmethod
-    def get_text_cells(self) -> Iterable[Cell]:
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        pass
+    @abstractmethod
+    def get_text_cells(self) -> Iterable[TextCell]:
         pass
     @abstractmethod

docling/backend/pypdfium2_backend.py CHANGED Viewed

@@ -7,12 +7,12 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
+from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfTextPage
 from pypdfium2._helpers.misc import PdfiumError
 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
-from docling.datamodel.base_models import Cell
 from docling.utils.locks import pypdfium2_lock
 if TYPE_CHECKING:
@@ -68,7 +68,10 @@ class PyPdfiumPageBackend(PdfPageBackend):
         return text_piece
-    def get_text_cells(self) -> Iterable[Cell]:
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        return None
+    def get_text_cells(self) -> Iterable[TextCell]:
         with pypdfium2_lock:
             if not self.text_page:
                 self.text_page = self._ppage.get_textpage()
@@ -84,11 +87,19 @@ class PyPdfiumPageBackend(PdfPageBackend):
                 text_piece = self.text_page.get_text_bounded(*rect)
                 x0, y0, x1, y1 = rect
                 cells.append(
-                    Cell(
-                        id=cell_counter,
+                    TextCell(
+                        index=cell_counter,
                         text=text_piece,
-                        bbox=BoundingBox(
-                            l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
+                        orig=text_piece,
+                        from_ocr=False,
+                        rect=BoundingRectangle.from_bounding_box(
+                            BoundingBox(
+                                l=x0,
+                                b=y0,
+                                r=x1,
+                                t=y1,
+                                coord_origin=CoordOrigin.BOTTOMLEFT,
+                            )
                         ).to_top_left_origin(page_size.height),
                     )
                 )
@@ -97,51 +108,56 @@ class PyPdfiumPageBackend(PdfPageBackend):
         # PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
         # The cell merging code below is to clean this up.
         def merge_horizontal_cells(
-            cells: List[Cell],
+            cells: List[TextCell],
             horizontal_threshold_factor: float = 1.0,
             vertical_threshold_factor: float = 0.5,
-        ) -> List[Cell]:
+        ) -> List[TextCell]:
             if not cells:
                 return []
-            def group_rows(cells: List[Cell]) -> List[List[Cell]]:
+            def group_rows(cells: List[TextCell]) -> List[List[TextCell]]:
                 rows = []
                 current_row = [cells[0]]
-                row_top = cells[0].bbox.t
-                row_bottom = cells[0].bbox.b
-                row_height = cells[0].bbox.height
+                row_top = cells[0].rect.to_bounding_box().t
+                row_bottom = cells[0].rect.to_bounding_box().b
+                row_height = cells[0].rect.to_bounding_box().height
                 for cell in cells[1:]:
                     vertical_threshold = row_height * vertical_threshold_factor
                     if (
-                        abs(cell.bbox.t - row_top) <= vertical_threshold
-                        and abs(cell.bbox.b - row_bottom) <= vertical_threshold
+                        abs(cell.rect.to_bounding_box().t - row_top)
+                        <= vertical_threshold
+                        and abs(cell.rect.to_bounding_box().b - row_bottom)
+                        <= vertical_threshold
                     ):
                         current_row.append(cell)
-                        row_top = min(row_top, cell.bbox.t)
-                        row_bottom = max(row_bottom, cell.bbox.b)
+                        row_top = min(row_top, cell.rect.to_bounding_box().t)
+                        row_bottom = max(row_bottom, cell.rect.to_bounding_box().b)
                         row_height = row_bottom - row_top
                     else:
                         rows.append(current_row)
                         current_row = [cell]
-                        row_top = cell.bbox.t
-                        row_bottom = cell.bbox.b
-                        row_height = cell.bbox.height
+                        row_top = cell.rect.to_bounding_box().t
+                        row_bottom = cell.rect.to_bounding_box().b
+                        row_height = cell.rect.to_bounding_box().height
                 if current_row:
                     rows.append(current_row)
                 return rows
-            def merge_row(row: List[Cell]) -> List[Cell]:
+            def merge_row(row: List[TextCell]) -> List[TextCell]:
                 merged = []
                 current_group = [row[0]]
                 for cell in row[1:]:
                     prev_cell = current_group[-1]
-                    avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
+                    avg_height = (
+                        prev_cell.rect.height + cell.rect.to_bounding_box().height
+                    ) / 2
                     if (
-                        cell.bbox.l - prev_cell.bbox.r
+                        cell.rect.to_bounding_box().l
+                        - prev_cell.rect.to_bounding_box().r
                         <= avg_height * horizontal_threshold_factor
                     ):
                         current_group.append(cell)
@@ -154,24 +170,30 @@ class PyPdfiumPageBackend(PdfPageBackend):
                 return merged
-            def merge_group(group: List[Cell]) -> Cell:
+            def merge_group(group: List[TextCell]) -> TextCell:
                 if len(group) == 1:
                     return group[0]
                 merged_text = "".join(cell.text for cell in group)
                 merged_bbox = BoundingBox(
-                    l=min(cell.bbox.l for cell in group),
-                    t=min(cell.bbox.t for cell in group),
-                    r=max(cell.bbox.r for cell in group),
-                    b=max(cell.bbox.b for cell in group),
+                    l=min(cell.rect.to_bounding_box().l for cell in group),
+                    t=min(cell.rect.to_bounding_box().t for cell in group),
+                    r=max(cell.rect.to_bounding_box().r for cell in group),
+                    b=max(cell.rect.to_bounding_box().b for cell in group),
+                )
+                return TextCell(
+                    index=group[0].index,
+                    text=merged_text,
+                    orig=merged_text,
+                    rect=BoundingRectangle.from_bounding_box(merged_bbox),
+                    from_ocr=False,
                 )
-                return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)
             rows = group_rows(cells)
             merged_cells = [cell for row in rows for cell in merge_row(row)]
             for i, cell in enumerate(merged_cells, 1):
-                cell.id = i
+                cell.index = i
             return merged_cells
@@ -181,7 +203,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
             )  # make new image to avoid drawing on the saved ones
             draw = ImageDraw.Draw(image)
             for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
+                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
                 cell_color = (
                     random.randint(30, 140),
                     random.randint(30, 140),

docling/backend/xml/uspto_backend.py CHANGED Viewed

@@ -999,7 +999,7 @@ class PatentUsptoGrantAps(PatentUspto):
                     parent=self.parents[self.level],
                 )
-            last_claim.text += f" {value}" if last_claim.text else value
+            last_claim.text += f" {value.strip()}" if last_claim.text else value.strip()
         elif field == self.Field.CAPTION.value and section in (
             self.Section.SUMMARY.value,

docling/cli/main.py CHANGED Viewed

@@ -9,6 +9,7 @@ import warnings
 from pathlib import Path
 from typing import Annotated, Dict, Iterable, List, Optional, Type
+import rich.table
 import typer
 from docling_core.types.doc import ImageRefMode
 from docling_core.utils.file import resolve_source_to_path
@@ -16,6 +17,7 @@ from pydantic import TypeAdapter
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import (
@@ -29,18 +31,22 @@ from docling.datamodel.pipeline_options import (
     AcceleratorDevice,
     AcceleratorOptions,
     EasyOcrOptions,
-    OcrEngine,
-    OcrMacOptions,
     OcrOptions,
+    PaginatedPipelineOptions,
     PdfBackend,
+    PdfPipeline,
     PdfPipelineOptions,
-    RapidOcrOptions,
     TableFormerMode,
-    TesseractCliOcrOptions,
-    TesseractOcrOptions,
+    VlmModelType,
+    VlmPipelineOptions,
+    granite_vision_vlm_conversion_options,
+    smoldocling_vlm_conversion_options,
+    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+from docling.models.factories import get_ocr_factory
+from docling.pipeline.vlm_pipeline import VlmPipeline
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -48,8 +54,11 @@ warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr
 _log = logging.getLogger(__name__)
 from rich.console import Console
+console = Console()
 err_console = Console(stderr=True)
+ocr_factory_internal = get_ocr_factory(allow_external_plugins=False)
+ocr_engines_enum_internal = ocr_factory_internal.get_enum()
 app = typer.Typer(
     name="Docling",
@@ -77,6 +86,24 @@ def version_callback(value: bool):
         raise typer.Exit()
+def show_external_plugins_callback(value: bool):
+    if value:
+        ocr_factory_all = get_ocr_factory(allow_external_plugins=True)
+        table = rich.table.Table(title="Available OCR engines")
+        table.add_column("Name", justify="right")
+        table.add_column("Plugin")
+        table.add_column("Package")
+        for meta in ocr_factory_all.registered_meta.values():
+            if not meta.module.startswith("docling."):
+                table.add_row(
+                    f"[bold]{meta.kind}[/bold]",
+                    meta.plugin_name,
+                    meta.module.split(".")[0],
+                )
+        rich.print(table)
+        raise typer.Exit()
 def export_documents(
     conv_results: Iterable[ConversionResult],
     output_dir: Path,
@@ -181,6 +208,14 @@ def convert(
             help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
         ),
     ] = ImageRefMode.EMBEDDED,
+    pipeline: Annotated[
+        PdfPipeline,
+        typer.Option(..., help="Choose the pipeline to process PDF or image files."),
+    ] = PdfPipeline.STANDARD,
+    vlm_model: Annotated[
+        VlmModelType,
+        typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
+    ] = VlmModelType.SMOLDOCLING,
     ocr: Annotated[
         bool,
         typer.Option(
@@ -195,8 +230,16 @@ def convert(
         ),
     ] = False,
     ocr_engine: Annotated[
-        OcrEngine, typer.Option(..., help="The OCR engine to use.")
-    ] = OcrEngine.EASYOCR,
+        str,
+        typer.Option(
+            ...,
+            help=(
+                f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
+                f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
+                f"Use the option --show-external-plugins to see the options allowed with external plugins."
+            ),
+        ),
+    ] = EasyOcrOptions.kind,
     ocr_lang: Annotated[
         Optional[str],
         typer.Option(
@@ -240,6 +283,21 @@ def convert(
             ..., help="Must be enabled when using models connecting to remote services."
         ),
     ] = False,
+    allow_external_plugins: Annotated[
+        bool,
+        typer.Option(
+            ..., help="Must be enabled for loading modules from third-party plugins."
+        ),
+    ] = False,
+    show_external_plugins: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            help="List the third-party plugins which are available when the option --allow-external-plugins is set.",
+            callback=show_external_plugins_callback,
+            is_eager=True,
+        ),
+    ] = False,
     abort_on_error: Annotated[
         bool,
         typer.Option(
@@ -367,64 +425,88 @@ def convert(
         export_txt = OutputFormat.TEXT in to_formats
         export_doctags = OutputFormat.DOCTAGS in to_formats
-        if ocr_engine == OcrEngine.EASYOCR:
-            ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
-        elif ocr_engine == OcrEngine.TESSERACT_CLI:
-            ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
-        elif ocr_engine == OcrEngine.TESSERACT:
-            ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
-        elif ocr_engine == OcrEngine.OCRMAC:
-            ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
-        elif ocr_engine == OcrEngine.RAPIDOCR:
-            ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
-        else:
-            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
+        ocr_factory = get_ocr_factory(allow_external_plugins=allow_external_plugins)
+        ocr_options: OcrOptions = ocr_factory.create_options(  # type: ignore
+            kind=ocr_engine,
+            force_full_page_ocr=force_ocr,
+        )
         ocr_lang_list = _split_list(ocr_lang)
         if ocr_lang_list is not None:
             ocr_options.lang = ocr_lang_list
         accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
-        pipeline_options = PdfPipelineOptions(
-            enable_remote_services=enable_remote_services,
-            accelerator_options=accelerator_options,
-            do_ocr=ocr,
-            ocr_options=ocr_options,
-            do_table_structure=True,
-            do_code_enrichment=enrich_code,
-            do_formula_enrichment=enrich_formula,
-            do_picture_description=enrich_picture_description,
-            do_picture_classification=enrich_picture_classes,
-            document_timeout=document_timeout,
-        )
-        pipeline_options.table_structure_options.do_cell_matching = (
-            True  # do_cell_matching
-        )
-        pipeline_options.table_structure_options.mode = table_mode
+        pipeline_options: PaginatedPipelineOptions
+        if pipeline == PdfPipeline.STANDARD:
+            pipeline_options = PdfPipelineOptions(
+                allow_external_plugins=allow_external_plugins,
+                enable_remote_services=enable_remote_services,
+                accelerator_options=accelerator_options,
+                do_ocr=ocr,
+                ocr_options=ocr_options,
+                do_table_structure=True,
+                do_code_enrichment=enrich_code,
+                do_formula_enrichment=enrich_formula,
+                do_picture_description=enrich_picture_description,
+                do_picture_classification=enrich_picture_classes,
+                document_timeout=document_timeout,
+            )
+            pipeline_options.table_structure_options.do_cell_matching = (
+                True  # do_cell_matching
+            )
+            pipeline_options.table_structure_options.mode = table_mode
-        if image_export_mode != ImageRefMode.PLACEHOLDER:
-            pipeline_options.generate_page_images = True
-            pipeline_options.generate_picture_images = (
-                True  # FIXME: to be deprecated in verson 3
+            if image_export_mode != ImageRefMode.PLACEHOLDER:
+                pipeline_options.generate_page_images = True
+                pipeline_options.generate_picture_images = (
+                    True  # FIXME: to be deprecated in verson 3
+                )
+                pipeline_options.images_scale = 2
+            backend: Type[PdfDocumentBackend]
+            if pdf_backend == PdfBackend.DLPARSE_V1:
+                backend = DoclingParseDocumentBackend
+            elif pdf_backend == PdfBackend.DLPARSE_V2:
+                backend = DoclingParseV2DocumentBackend
+            elif pdf_backend == PdfBackend.DLPARSE_V4:
+                backend = DoclingParseV4DocumentBackend  # type: ignore
+            elif pdf_backend == PdfBackend.PYPDFIUM2:
+                backend = PyPdfiumDocumentBackend  # type: ignore
+            else:
+                raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
+            pdf_format_option = PdfFormatOption(
+                pipeline_options=pipeline_options,
+                backend=backend,  # pdf_backend
+            )
+        elif pipeline == PdfPipeline.VLM:
+            pipeline_options = VlmPipelineOptions()
+            if vlm_model == VlmModelType.GRANITE_VISION:
+                pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+            elif vlm_model == VlmModelType.SMOLDOCLING:
+                pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+                if sys.platform == "darwin":
+                    try:
+                        import mlx_vlm
+                        pipeline_options.vlm_options = (
+                            smoldocling_vlm_mlx_conversion_options
+                        )
+                    except ImportError:
+                        _log.warning(
+                            "To run SmolDocling faster, please install mlx-vlm:\n"
+                            "pip install mlx-vlm"
+                        )
+            pdf_format_option = PdfFormatOption(
+                pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
             )
-            pipeline_options.images_scale = 2
         if artifacts_path is not None:
             pipeline_options.artifacts_path = artifacts_path
-        if pdf_backend == PdfBackend.DLPARSE_V1:
-            backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
-        elif pdf_backend == PdfBackend.DLPARSE_V2:
-            backend = DoclingParseV2DocumentBackend
-        elif pdf_backend == PdfBackend.PYPDFIUM2:
-            backend = PyPdfiumDocumentBackend
-        else:
-            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
-        pdf_format_option = PdfFormatOption(
-            pipeline_options=pipeline_options,
-            backend=backend,  # pdf_backend
-        )
         format_options: Dict[InputFormat, FormatOption] = {
             InputFormat.PDF: pdf_format_option,
             InputFormat.IMAGE: pdf_format_option,

docling/cli/models.py CHANGED Viewed

@@ -121,7 +121,7 @@ def download(
             "Using the CLI:",
             f"`docling --artifacts-path={output_dir} FILE`",
             "\n",
-            "Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
+            "Using Python: see the documentation at <https://docling-project.github.io/docling/usage>.",
         )

docling/datamodel/base_models.py CHANGED Viewed

@@ -9,6 +9,7 @@ from docling_core.types.doc import (
     Size,
     TableCell,
 )
+from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from docling_core.types.io import (  # DO ΝΟΤ REMOVE; explicitly exposed from this location
     DocumentStream,
 )
@@ -123,14 +124,10 @@ class ErrorItem(BaseModel):
     error_message: str
-class Cell(BaseModel):
-    id: int
-    text: str
-    bbox: BoundingBox
-class OcrCell(Cell):
-    confidence: float
+# class Cell(BaseModel):
+#    id: int
+#    text: str
+#    bbox: BoundingBox
 class Cluster(BaseModel):
@@ -138,7 +135,7 @@ class Cluster(BaseModel):
     label: DocItemLabel
     bbox: BoundingBox
     confidence: float = 1.0
-    cells: List[Cell] = []
+    cells: List[TextCell] = []
     children: List["Cluster"] = []  # Add child cluster support
@@ -226,7 +223,8 @@ class Page(BaseModel):
     page_no: int
     # page_hash: Optional[str] = None
     size: Optional[Size] = None
-    cells: List[Cell] = []
+    cells: List[TextCell] = []
+    parsed_page: Optional[SegmentedPdfPage] = None
     predictions: PagePredictions = PagePredictions()
     assembled: Optional[AssembledUnit] = None

docling 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl

docling 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl