PyPI - docling - Versions diffs - 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl - Mend

docling 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

docling/backend/asciidoc_backend.py +1 -1
docling/backend/csv_backend.py +1 -1
docling/backend/docling_parse_backend.py +21 -13
docling/backend/docling_parse_v2_backend.py +20 -12
docling/backend/docling_parse_v4_backend.py +192 -0
docling/backend/docx/__init__.py +0 -0
docling/backend/docx/latex/__init__.py +0 -0
docling/backend/docx/latex/latex_dict.py +271 -0
docling/backend/docx/latex/omml.py +453 -0
docling/backend/html_backend.py +7 -7
docling/backend/md_backend.py +1 -1
docling/backend/msexcel_backend.py +2 -45
docling/backend/mspowerpoint_backend.py +19 -1
docling/backend/msword_backend.py +68 -3
docling/backend/pdf_backend.py +7 -2
docling/backend/pypdfium2_backend.py +52 -30
docling/backend/xml/uspto_backend.py +1 -1
docling/cli/main.py +135 -53
docling/cli/models.py +1 -1
docling/datamodel/base_models.py +8 -10
docling/datamodel/pipeline_options.py +54 -32
docling/document_converter.py +5 -5
docling/models/base_model.py +9 -1
docling/models/base_ocr_model.py +27 -16
docling/models/easyocr_model.py +28 -13
docling/models/factories/__init__.py +27 -0
docling/models/factories/base_factory.py +122 -0
docling/models/factories/ocr_factory.py +11 -0
docling/models/factories/picture_description_factory.py +11 -0
docling/models/hf_mlx_model.py +137 -0
docling/models/ocr_mac_model.py +39 -11
docling/models/page_preprocessing_model.py +4 -0
docling/models/picture_description_api_model.py +20 -3
docling/models/picture_description_base_model.py +19 -3
docling/models/picture_description_vlm_model.py +14 -2
docling/models/plugins/__init__.py +0 -0
docling/models/plugins/defaults.py +28 -0
docling/models/rapid_ocr_model.py +34 -13
docling/models/table_structure_model.py +13 -4
docling/models/tesseract_ocr_cli_model.py +40 -15
docling/models/tesseract_ocr_model.py +37 -12
docling/pipeline/standard_pdf_pipeline.py +25 -78
docling/pipeline/vlm_pipeline.py +78 -398
docling/utils/export.py +8 -6
docling/utils/layout_postprocessor.py +26 -23
docling/utils/visualization.py +1 -1
{docling-2.26.0.dist-info → docling-2.28.0.dist-info}/METADATA +47 -23
docling-2.28.0.dist-info/RECORD +84 -0
{docling-2.26.0.dist-info → docling-2.28.0.dist-info}/entry_points.txt +3 -0
docling-2.26.0.dist-info/RECORD +0 -72
{docling-2.26.0.dist-info → docling-2.28.0.dist-info}/LICENSE +0 -0
{docling-2.26.0.dist-info → docling-2.28.0.dist-info}/WHEEL +0 -0

docling/pipeline/vlm_pipeline.py CHANGED Viewed

@@ -1,30 +1,13 @@
-import itertools
 import logging
-import re
 import warnings
 from io import BytesIO
-# from io import BytesIO
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional, Union, cast
-from docling_core.types import DoclingDocument
-from docling_core.types.doc import (
-    BoundingBox,
-    DocItem,
-    DocItemLabel,
-    DoclingDocument,
-    GroupLabel,
-    ImageRef,
-    ImageRefMode,
-    PictureItem,
-    ProvenanceItem,
-    Size,
-    TableCell,
-    TableData,
-    TableItem,
-)
-from docling_core.types.doc.tokens import DocumentToken, TableToken
+# from docling_core.types import DoclingDocument
+from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
+from docling_core.types.doc.document import DocTagsDocument
+from PIL import Image as PILImage
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
@@ -32,11 +15,12 @@ from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import (
-    PdfPipelineOptions,
+    InferenceFramework,
     ResponseFormat,
     VlmPipelineOptions,
 )
 from docling.datamodel.settings import settings
+from docling.models.hf_mlx_model import HuggingFaceMlxModel
 from docling.models.hf_vlm_model import HuggingFaceVlmModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -50,12 +34,6 @@ class VlmPipeline(PaginatedPipeline):
         super().__init__(pipeline_options)
         self.keep_backend = True
-        warnings.warn(
-            "The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
-            category=UserWarning,
-            stacklevel=2,
-        )
         self.pipeline_options: VlmPipelineOptions
         artifacts_path: Optional[Path] = None
@@ -79,14 +57,27 @@ class VlmPipeline(PaginatedPipeline):
         self.keep_images = self.pipeline_options.generate_page_images
-        self.build_pipe = [
-            HuggingFaceVlmModel(
-                enabled=True,  # must be always enabled for this pipeline to make sense.
-                artifacts_path=artifacts_path,
-                accelerator_options=pipeline_options.accelerator_options,
-                vlm_options=self.pipeline_options.vlm_options,
-            ),
-        ]
+        if (
+            self.pipeline_options.vlm_options.inference_framework
+            == InferenceFramework.MLX
+        ):
+            self.build_pipe = [
+                HuggingFaceMlxModel(
+                    enabled=True,  # must be always enabled for this pipeline to make sense.
+                    artifacts_path=artifacts_path,
+                    accelerator_options=pipeline_options.accelerator_options,
+                    vlm_options=self.pipeline_options.vlm_options,
+                ),
+            ]
+        else:
+            self.build_pipe = [
+                HuggingFaceVlmModel(
+                    enabled=True,  # must be always enabled for this pipeline to make sense.
+                    artifacts_path=artifacts_path,
+                    accelerator_options=pipeline_options.accelerator_options,
+                    vlm_options=self.pipeline_options.vlm_options,
+                ),
+            ]
         self.enrichment_pipe = [
             # Other models working on `NodeItem` elements in the DoclingDocument
@@ -100,6 +91,17 @@ class VlmPipeline(PaginatedPipeline):
         return page
+    def extract_text_from_backend(
+        self, page: Page, bbox: Union[BoundingBox, None]
+    ) -> str:
+        # Convert bounding box normalized to 0-100 into page coordinates for cropping
+        text = ""
+        if bbox:
+            if page.size:
+                if page._backend:
+                    text = page._backend.get_text_in_rect(bbox)
+        return text
     def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
         with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
@@ -107,7 +109,45 @@ class VlmPipeline(PaginatedPipeline):
                 self.pipeline_options.vlm_options.response_format
                 == ResponseFormat.DOCTAGS
             ):
-                conv_res.document = self._turn_tags_into_doc(conv_res.pages)
+                doctags_list = []
+                image_list = []
+                for page in conv_res.pages:
+                    predicted_doctags = ""
+                    img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
+                    if page.predictions.vlm_response:
+                        predicted_doctags = page.predictions.vlm_response.text
+                    if page.image:
+                        img = page.image
+                    image_list.append(img)
+                    doctags_list.append(predicted_doctags)
+                doctags_list_c = cast(List[Union[Path, str]], doctags_list)
+                image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
+                doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
+                    doctags_list_c, image_list_c
+                )
+                conv_res.document.load_from_doctags(doctags_doc)
+                # If forced backend text, replace model predicted text with backend one
+                if page.size:
+                    if self.force_backend_text:
+                        scale = self.pipeline_options.images_scale
+                        for element, _level in conv_res.document.iterate_items():
+                            if (
+                                not isinstance(element, TextItem)
+                                or len(element.prov) == 0
+                            ):
+                                continue
+                            crop_bbox = (
+                                element.prov[0]
+                                .bbox.scaled(scale=scale)
+                                .to_top_left_origin(
+                                    page_height=page.size.height * scale
+                                )
+                            )
+                            txt = self.extract_text_from_backend(page, crop_bbox)
+                            element.text = txt
+                            element.orig = txt
             elif (
                 self.pipeline_options.vlm_options.response_format
                 == ResponseFormat.MARKDOWN
@@ -165,366 +205,6 @@ class VlmPipeline(PaginatedPipeline):
         )
         return backend.convert()
-    def _turn_tags_into_doc(self, pages: list[Page]) -> DoclingDocument:
-        ###############################################
-        # Tag definitions and color mappings
-        ###############################################
-        # Maps the recognized tag to a Docling label.
-        # Code items will be given DocItemLabel.CODE
-        tag_to_doclabel = {
-            "title": DocItemLabel.TITLE,
-            "document_index": DocItemLabel.DOCUMENT_INDEX,
-            "otsl": DocItemLabel.TABLE,
-            "section_header_level_1": DocItemLabel.SECTION_HEADER,
-            "checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
-            "checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
-            "text": DocItemLabel.TEXT,
-            "page_header": DocItemLabel.PAGE_HEADER,
-            "page_footer": DocItemLabel.PAGE_FOOTER,
-            "formula": DocItemLabel.FORMULA,
-            "caption": DocItemLabel.CAPTION,
-            "picture": DocItemLabel.PICTURE,
-            "list_item": DocItemLabel.LIST_ITEM,
-            "footnote": DocItemLabel.FOOTNOTE,
-            "code": DocItemLabel.CODE,
-        }
-        # Maps each tag to an associated bounding box color.
-        tag_to_color = {
-            "title": "blue",
-            "document_index": "darkblue",
-            "otsl": "green",
-            "section_header_level_1": "purple",
-            "checkbox_selected": "black",
-            "checkbox_unselected": "gray",
-            "text": "red",
-            "page_header": "orange",
-            "page_footer": "cyan",
-            "formula": "pink",
-            "caption": "magenta",
-            "picture": "yellow",
-            "list_item": "brown",
-            "footnote": "darkred",
-            "code": "lightblue",
-        }
-        def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
-            """Extracts <loc_...> bounding box coords from the chunk, normalized by / 500."""
-            coords = re.findall(r"<loc_(\d+)>", text_chunk)
-            if len(coords) == 4:
-                l, t, r, b = map(float, coords)
-                return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
-            return None
-        def extract_inner_text(text_chunk: str) -> str:
-            """Strips all <...> tags inside the chunk to get the raw text content."""
-            return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
-        def extract_text_from_backend(page: Page, bbox: BoundingBox | None) -> str:
-            # Convert bounding box normalized to 0-100 into page coordinates for cropping
-            text = ""
-            if bbox:
-                if page.size:
-                    bbox.l = bbox.l * page.size.width
-                    bbox.t = bbox.t * page.size.height
-                    bbox.r = bbox.r * page.size.width
-                    bbox.b = bbox.b * page.size.height
-                    if page._backend:
-                        text = page._backend.get_text_in_rect(bbox)
-            return text
-        def otsl_parse_texts(texts, tokens):
-            split_word = TableToken.OTSL_NL.value
-            split_row_tokens = [
-                list(y)
-                for x, y in itertools.groupby(tokens, lambda z: z == split_word)
-                if not x
-            ]
-            table_cells = []
-            r_idx = 0
-            c_idx = 0
-            def count_right(tokens, c_idx, r_idx, which_tokens):
-                span = 0
-                c_idx_iter = c_idx
-                while tokens[r_idx][c_idx_iter] in which_tokens:
-                    c_idx_iter += 1
-                    span += 1
-                    if c_idx_iter >= len(tokens[r_idx]):
-                        return span
-                return span
-            def count_down(tokens, c_idx, r_idx, which_tokens):
-                span = 0
-                r_idx_iter = r_idx
-                while tokens[r_idx_iter][c_idx] in which_tokens:
-                    r_idx_iter += 1
-                    span += 1
-                    if r_idx_iter >= len(tokens):
-                        return span
-                return span
-            for i, text in enumerate(texts):
-                cell_text = ""
-                if text in [
-                    TableToken.OTSL_FCEL.value,
-                    TableToken.OTSL_ECEL.value,
-                    TableToken.OTSL_CHED.value,
-                    TableToken.OTSL_RHED.value,
-                    TableToken.OTSL_SROW.value,
-                ]:
-                    row_span = 1
-                    col_span = 1
-                    right_offset = 1
-                    if text != TableToken.OTSL_ECEL.value:
-                        cell_text = texts[i + 1]
-                        right_offset = 2
-                    # Check next element(s) for lcel / ucel / xcel, set properly row_span, col_span
-                    next_right_cell = ""
-                    if i + right_offset < len(texts):
-                        next_right_cell = texts[i + right_offset]
-                    next_bottom_cell = ""
-                    if r_idx + 1 < len(split_row_tokens):
-                        if c_idx < len(split_row_tokens[r_idx + 1]):
-                            next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
-                    if next_right_cell in [
-                        TableToken.OTSL_LCEL.value,
-                        TableToken.OTSL_XCEL.value,
-                    ]:
-                        # we have horisontal spanning cell or 2d spanning cell
-                        col_span += count_right(
-                            split_row_tokens,
-                            c_idx + 1,
-                            r_idx,
-                            [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
-                        )
-                    if next_bottom_cell in [
-                        TableToken.OTSL_UCEL.value,
-                        TableToken.OTSL_XCEL.value,
-                    ]:
-                        # we have a vertical spanning cell or 2d spanning cell
-                        row_span += count_down(
-                            split_row_tokens,
-                            c_idx,
-                            r_idx + 1,
-                            [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
-                        )
-                    table_cells.append(
-                        TableCell(
-                            text=cell_text.strip(),
-                            row_span=row_span,
-                            col_span=col_span,
-                            start_row_offset_idx=r_idx,
-                            end_row_offset_idx=r_idx + row_span,
-                            start_col_offset_idx=c_idx,
-                            end_col_offset_idx=c_idx + col_span,
-                        )
-                    )
-                if text in [
-                    TableToken.OTSL_FCEL.value,
-                    TableToken.OTSL_ECEL.value,
-                    TableToken.OTSL_CHED.value,
-                    TableToken.OTSL_RHED.value,
-                    TableToken.OTSL_SROW.value,
-                    TableToken.OTSL_LCEL.value,
-                    TableToken.OTSL_UCEL.value,
-                    TableToken.OTSL_XCEL.value,
-                ]:
-                    c_idx += 1
-                if text == TableToken.OTSL_NL.value:
-                    r_idx += 1
-                    c_idx = 0
-            return table_cells, split_row_tokens
-        def otsl_extract_tokens_and_text(s: str):
-            # Pattern to match anything enclosed by < > (including the angle brackets themselves)
-            pattern = r"(<[^>]+>)"
-            # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
-            tokens = re.findall(pattern, s)
-            # Remove any tokens that start with "<loc_"
-            tokens = [
-                token
-                for token in tokens
-                if not (
-                    token.startswith(rf"<{DocumentToken.LOC.value}")
-                    or token
-                    in [
-                        rf"<{DocumentToken.OTSL.value}>",
-                        rf"</{DocumentToken.OTSL.value}>",
-                    ]
-                )
-            ]
-            # Split the string by those tokens to get the in-between text
-            text_parts = re.split(pattern, s)
-            text_parts = [
-                token
-                for token in text_parts
-                if not (
-                    token.startswith(rf"<{DocumentToken.LOC.value}")
-                    or token
-                    in [
-                        rf"<{DocumentToken.OTSL.value}>",
-                        rf"</{DocumentToken.OTSL.value}>",
-                    ]
-                )
-            ]
-            # Remove any empty or purely whitespace strings from text_parts
-            text_parts = [part for part in text_parts if part.strip()]
-            return tokens, text_parts
-        def parse_table_content(otsl_content: str) -> TableData:
-            tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
-            table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
-            return TableData(
-                num_rows=len(split_row_tokens),
-                num_cols=(
-                    max(len(row) for row in split_row_tokens) if split_row_tokens else 0
-                ),
-                table_cells=table_cells,
-            )
-        doc = DoclingDocument(name="Document")
-        for pg_idx, page in enumerate(pages):
-            xml_content = ""
-            predicted_text = ""
-            if page.predictions.vlm_response:
-                predicted_text = page.predictions.vlm_response.text
-            image = page.image
-            page_no = pg_idx + 1
-            bounding_boxes = []
-            if page.size:
-                pg_width = page.size.width
-                pg_height = page.size.height
-                size = Size(width=pg_width, height=pg_height)
-                parent_page = doc.add_page(page_no=page_no, size=size)
-            """
-            1. Finds all <tag>...</tag> blocks in the entire string (multi-line friendly) in the order they appear.
-            2. For each chunk, extracts bounding box (if any) and inner text.
-            3. Adds the item to a DoclingDocument structure with the right label.
-            4. Tracks bounding boxes + color in a separate list for later visualization.
-            """
-            # Regex for all recognized tags
-            tag_pattern = (
-                rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
-                rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
-                rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
-                rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
-                rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
-                rf"{DocItemLabel.LIST_ITEM}|{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
-                rf"{DocItemLabel.SECTION_HEADER}_level_1|{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
-            )
-            # DocumentToken.OTSL
-            pattern = re.compile(tag_pattern, re.DOTALL)
-            # Go through each match in order
-            for match in pattern.finditer(predicted_text):
-                full_chunk = match.group(0)
-                tag_name = match.group("tag")
-                bbox = extract_bounding_box(full_chunk)
-                doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
-                color = tag_to_color.get(tag_name, "white")
-                # Store bounding box + color
-                if bbox:
-                    bounding_boxes.append((bbox, color))
-                if tag_name == DocumentToken.OTSL.value:
-                    table_data = parse_table_content(full_chunk)
-                    bbox = extract_bounding_box(full_chunk)
-                    if bbox:
-                        prov = ProvenanceItem(
-                            bbox=bbox.resize_by_scale(pg_width, pg_height),
-                            charspan=(0, 0),
-                            page_no=page_no,
-                        )
-                        doc.add_table(data=table_data, prov=prov)
-                    else:
-                        doc.add_table(data=table_data)
-                elif tag_name == DocItemLabel.PICTURE:
-                    text_caption_content = extract_inner_text(full_chunk)
-                    if image:
-                        if bbox:
-                            im_width, im_height = image.size
-                            crop_box = (
-                                int(bbox.l * im_width),
-                                int(bbox.t * im_height),
-                                int(bbox.r * im_width),
-                                int(bbox.b * im_height),
-                            )
-                            cropped_image = image.crop(crop_box)
-                            pic = doc.add_picture(
-                                parent=None,
-                                image=ImageRef.from_pil(image=cropped_image, dpi=72),
-                                prov=(
-                                    ProvenanceItem(
-                                        bbox=bbox.resize_by_scale(pg_width, pg_height),
-                                        charspan=(0, 0),
-                                        page_no=page_no,
-                                    )
-                                ),
-                            )
-                            # If there is a caption to an image, add it as well
-                            if len(text_caption_content) > 0:
-                                caption_item = doc.add_text(
-                                    label=DocItemLabel.CAPTION,
-                                    text=text_caption_content,
-                                    parent=None,
-                                )
-                                pic.captions.append(caption_item.get_ref())
-                    else:
-                        if bbox:
-                            # In case we don't have access to an binary of an image
-                            doc.add_picture(
-                                parent=None,
-                                prov=ProvenanceItem(
-                                    bbox=bbox, charspan=(0, 0), page_no=page_no
-                                ),
-                            )
-                            # If there is a caption to an image, add it as well
-                            if len(text_caption_content) > 0:
-                                caption_item = doc.add_text(
-                                    label=DocItemLabel.CAPTION,
-                                    text=text_caption_content,
-                                    parent=None,
-                                )
-                                pic.captions.append(caption_item.get_ref())
-                else:
-                    # For everything else, treat as text
-                    if self.force_backend_text:
-                        text_content = extract_text_from_backend(page, bbox)
-                    else:
-                        text_content = extract_inner_text(full_chunk)
-                    doc.add_text(
-                        label=doc_label,
-                        text=text_content,
-                        prov=(
-                            ProvenanceItem(
-                                bbox=bbox.resize_by_scale(pg_width, pg_height),
-                                charspan=(0, len(text_content)),
-                                page_no=page_no,
-                            )
-                            if bbox
-                            else None
-                        ),
-                    )
-        return doc
     @classmethod
     def get_default_options(cls) -> VlmPipelineOptions:
         return VlmPipelineOptions()

docling/utils/export.py CHANGED Viewed

@@ -2,9 +2,9 @@ import logging
 from typing import Any, Dict, Iterable, List, Tuple, Union
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import TextCell
 from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
-from docling.datamodel.base_models import OcrCell
 from docling.datamodel.document import ConversionResult, Page
 _log = logging.getLogger(__name__)
@@ -86,11 +86,13 @@ def generate_multimodal_pages(
         if page.size is None:
             return cells
         for cell in page.cells:
-            new_bbox = cell.bbox.to_top_left_origin(
-                page_height=page.size.height
-            ).normalized(page_size=page.size)
-            is_ocr = isinstance(cell, OcrCell)
-            ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0
+            new_bbox = (
+                cell.rect.to_bounding_box()
+                .to_top_left_origin(page_height=page.size.height)
+                .normalized(page_size=page.size)
+            )
+            is_ocr = cell.from_ocr
+            ocr_confidence = cell.confidence
             cells.append(
                 {
                     "text": cell.text,

docling 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl

docling 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl