PyPI - docling - Versions diffs - 2.69.0__py3-none-any.whl - Mend

docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling might be problematic. Click here for more details.

Files changed (138) hide show

docling/__init__.py +0 -0
docling/backend/__init__.py +0 -0
docling/backend/abstract_backend.py +84 -0
docling/backend/asciidoc_backend.py +443 -0
docling/backend/csv_backend.py +125 -0
docling/backend/docling_parse_backend.py +237 -0
docling/backend/docling_parse_v2_backend.py +276 -0
docling/backend/docling_parse_v4_backend.py +260 -0
docling/backend/docx/__init__.py +0 -0
docling/backend/docx/drawingml/utils.py +131 -0
docling/backend/docx/latex/__init__.py +0 -0
docling/backend/docx/latex/latex_dict.py +274 -0
docling/backend/docx/latex/omml.py +459 -0
docling/backend/html_backend.py +1502 -0
docling/backend/image_backend.py +188 -0
docling/backend/json/__init__.py +0 -0
docling/backend/json/docling_json_backend.py +58 -0
docling/backend/md_backend.py +618 -0
docling/backend/mets_gbs_backend.py +399 -0
docling/backend/msexcel_backend.py +686 -0
docling/backend/mspowerpoint_backend.py +398 -0
docling/backend/msword_backend.py +1663 -0
docling/backend/noop_backend.py +51 -0
docling/backend/pdf_backend.py +82 -0
docling/backend/pypdfium2_backend.py +417 -0
docling/backend/webvtt_backend.py +572 -0
docling/backend/xml/__init__.py +0 -0
docling/backend/xml/jats_backend.py +819 -0
docling/backend/xml/uspto_backend.py +1905 -0
docling/chunking/__init__.py +12 -0
docling/cli/__init__.py +0 -0
docling/cli/main.py +974 -0
docling/cli/models.py +196 -0
docling/cli/tools.py +17 -0
docling/datamodel/__init__.py +0 -0
docling/datamodel/accelerator_options.py +69 -0
docling/datamodel/asr_model_specs.py +494 -0
docling/datamodel/backend_options.py +102 -0
docling/datamodel/base_models.py +493 -0
docling/datamodel/document.py +699 -0
docling/datamodel/extraction.py +39 -0
docling/datamodel/layout_model_specs.py +91 -0
docling/datamodel/pipeline_options.py +457 -0
docling/datamodel/pipeline_options_asr_model.py +78 -0
docling/datamodel/pipeline_options_vlm_model.py +136 -0
docling/datamodel/settings.py +65 -0
docling/datamodel/vlm_model_specs.py +365 -0
docling/document_converter.py +559 -0
docling/document_extractor.py +327 -0
docling/exceptions.py +10 -0
docling/experimental/__init__.py +5 -0
docling/experimental/datamodel/__init__.py +1 -0
docling/experimental/datamodel/table_crops_layout_options.py +13 -0
docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
docling/experimental/models/__init__.py +3 -0
docling/experimental/models/table_crops_layout_model.py +114 -0
docling/experimental/pipeline/__init__.py +1 -0
docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
docling/models/__init__.py +0 -0
docling/models/base_layout_model.py +39 -0
docling/models/base_model.py +230 -0
docling/models/base_ocr_model.py +241 -0
docling/models/base_table_model.py +45 -0
docling/models/extraction/__init__.py +0 -0
docling/models/extraction/nuextract_transformers_model.py +305 -0
docling/models/factories/__init__.py +47 -0
docling/models/factories/base_factory.py +122 -0
docling/models/factories/layout_factory.py +7 -0
docling/models/factories/ocr_factory.py +11 -0
docling/models/factories/picture_description_factory.py +11 -0
docling/models/factories/table_factory.py +7 -0
docling/models/picture_description_base_model.py +149 -0
docling/models/plugins/__init__.py +0 -0
docling/models/plugins/defaults.py +60 -0
docling/models/stages/__init__.py +0 -0
docling/models/stages/code_formula/__init__.py +0 -0
docling/models/stages/code_formula/code_formula_model.py +342 -0
docling/models/stages/layout/__init__.py +0 -0
docling/models/stages/layout/layout_model.py +249 -0
docling/models/stages/ocr/__init__.py +0 -0
docling/models/stages/ocr/auto_ocr_model.py +132 -0
docling/models/stages/ocr/easyocr_model.py +200 -0
docling/models/stages/ocr/ocr_mac_model.py +145 -0
docling/models/stages/ocr/rapid_ocr_model.py +328 -0
docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
docling/models/stages/page_assemble/__init__.py +0 -0
docling/models/stages/page_assemble/page_assemble_model.py +156 -0
docling/models/stages/page_preprocessing/__init__.py +0 -0
docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
docling/models/stages/picture_classifier/__init__.py +0 -0
docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
docling/models/stages/picture_description/__init__.py +0 -0
docling/models/stages/picture_description/picture_description_api_model.py +66 -0
docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
docling/models/stages/reading_order/__init__.py +0 -0
docling/models/stages/reading_order/readingorder_model.py +431 -0
docling/models/stages/table_structure/__init__.py +0 -0
docling/models/stages/table_structure/table_structure_model.py +305 -0
docling/models/utils/__init__.py +0 -0
docling/models/utils/generation_utils.py +157 -0
docling/models/utils/hf_model_download.py +45 -0
docling/models/vlm_pipeline_models/__init__.py +1 -0
docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
docling/models/vlm_pipeline_models/mlx_model.py +325 -0
docling/models/vlm_pipeline_models/vllm_model.py +344 -0
docling/pipeline/__init__.py +0 -0
docling/pipeline/asr_pipeline.py +431 -0
docling/pipeline/base_extraction_pipeline.py +72 -0
docling/pipeline/base_pipeline.py +326 -0
docling/pipeline/extraction_vlm_pipeline.py +207 -0
docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
docling/pipeline/simple_pipeline.py +55 -0
docling/pipeline/standard_pdf_pipeline.py +859 -0
docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
docling/pipeline/vlm_pipeline.py +416 -0
docling/py.typed +1 -0
docling/utils/__init__.py +0 -0
docling/utils/accelerator_utils.py +97 -0
docling/utils/api_image_request.py +205 -0
docling/utils/deepseekocr_utils.py +388 -0
docling/utils/export.py +146 -0
docling/utils/glm_utils.py +361 -0
docling/utils/layout_postprocessor.py +683 -0
docling/utils/locks.py +3 -0
docling/utils/model_downloader.py +168 -0
docling/utils/ocr_utils.py +69 -0
docling/utils/orientation.py +65 -0
docling/utils/profiling.py +65 -0
docling/utils/utils.py +65 -0
docling/utils/visualization.py +85 -0
docling-2.69.0.dist-info/METADATA +237 -0
docling-2.69.0.dist-info/RECORD +138 -0
docling-2.69.0.dist-info/WHEEL +5 -0
docling-2.69.0.dist-info/entry_points.txt +6 -0
docling-2.69.0.dist-info/licenses/LICENSE +21 -0
docling-2.69.0.dist-info/top_level.txt +1 -0

docling/backend/mspowerpoint_backend.py ADDED Viewed

@@ -0,0 +1,398 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Union
+from docling_core.types.doc import (
+    BoundingBox,
+    CoordOrigin,
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    ImageRef,
+    ProvenanceItem,
+    Size,
+    TableCell,
+    TableData,
+)
+from docling_core.types.doc.document import ContentLayer
+from PIL import Image, UnidentifiedImageError
+from pptx import Presentation
+from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
+from pptx.oxml.text import CT_TextLineBreak
+from docling.backend.abstract_backend import (
+    DeclarativeDocumentBackend,
+    PaginatedDocumentBackend,
+)
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+_log = logging.getLogger(__name__)
+class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+        self.namespaces = {
+            "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
+            "c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
+            "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
+        }
+        # Powerpoint file:
+        self.path_or_stream = path_or_stream
+        self.pptx_obj = None
+        self.valid = False
+        try:
+            if isinstance(self.path_or_stream, BytesIO):
+                self.pptx_obj = Presentation(self.path_or_stream)
+            elif isinstance(self.path_or_stream, Path):
+                self.pptx_obj = Presentation(str(self.path_or_stream))
+            self.valid = True
+        except Exception as e:
+            raise RuntimeError(
+                f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
+            ) from e
+        return
+    def page_count(self) -> int:
+        if self.is_valid():
+            assert self.pptx_obj is not None
+            return len(self.pptx_obj.slides)
+        else:
+            return 0
+    def is_valid(self) -> bool:
+        return self.valid
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return True  # True? if so, how to handle pages...
+    def unload(self):
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+        self.path_or_stream = None
+    @classmethod
+    def supported_formats(cls) -> set[InputFormat]:
+        return {InputFormat.PPTX}
+    def convert(self) -> DoclingDocument:
+        # Parses the PPTX into a structured document model.
+        # origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
+        origin = DocumentOrigin(
+            filename=self.file.name or "file",
+            mimetype="application/vnd.ms-powerpoint",
+            binary_hash=self.document_hash,
+        )
+        doc = DoclingDocument(
+            name=self.file.stem or "file", origin=origin
+        )  # must add origin information
+        doc = self.walk_linear(self.pptx_obj, doc)
+        return doc
+    def generate_prov(
+        self, shape, slide_ind, text="", slide_size=Size(width=1, height=1)
+    ):
+        if shape.left:
+            left = shape.left
+            top = shape.top
+            width = shape.width
+            height = shape.height
+        else:
+            left = 0
+            top = 0
+            width = slide_size.width
+            height = slide_size.height
+        shape_bbox = [left, top, left + width, top + height]
+        shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
+        prov = ProvenanceItem(
+            page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
+        )
+        return prov
+    def handle_text_elements(
+        self, shape, parent_slide, slide_ind, doc: DoclingDocument, slide_size
+    ):
+        is_list_group_created = False
+        enum_list_item_value = 0
+        new_list = None
+        doc_label = DocItemLabel.LIST_ITEM
+        prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
+        def is_list_item(paragraph):
+            """Check if the paragraph is a list item."""
+            p = paragraph._element
+            if (
+                p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
+                is not None
+            ):
+                return (True, "Bullet")
+            elif (
+                p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
+                is not None
+            ):
+                return (True, "Numbered")
+            elif paragraph.level > 0:
+                # Most likely a sub-list
+                return (True, "None")
+            else:
+                return (False, "None")
+        # Iterate through paragraphs to build up text
+        for paragraph in shape.text_frame.paragraphs:
+            is_a_list, bullet_type = is_list_item(paragraph)
+            p = paragraph._element
+            # Convert line breaks to spaces and accumulate text
+            p_text = ""
+            for e in p.content_children:
+                if isinstance(e, CT_TextLineBreak):
+                    p_text += " "
+                else:
+                    p_text += e.text
+            if is_a_list:
+                enum_marker = ""
+                enumerated = bullet_type == "Numbered"
+                if not is_list_group_created:
+                    new_list = doc.add_list_group(
+                        name="list",
+                        parent=parent_slide,
+                    )
+                    is_list_group_created = True
+                    enum_list_item_value = 0
+                if enumerated:
+                    enum_list_item_value += 1
+                    enum_marker = str(enum_list_item_value) + "."
+                doc.add_list_item(
+                    marker=enum_marker,
+                    enumerated=enumerated,
+                    parent=new_list,
+                    text=p_text,
+                    prov=prov,
+                )
+            else:  # is paragraph not a list item
+                # Assign proper label to the text, depending if it's a Title or Section Header
+                # For other types of text, assign - PARAGRAPH
+                doc_label = DocItemLabel.PARAGRAPH
+                if shape.is_placeholder:
+                    placeholder_type = shape.placeholder_format.type
+                    if placeholder_type in [
+                        PP_PLACEHOLDER.CENTER_TITLE,
+                        PP_PLACEHOLDER.TITLE,
+                    ]:
+                        # It's a title
+                        doc_label = DocItemLabel.TITLE
+                    elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
+                        DocItemLabel.SECTION_HEADER
+                # output accumulated inline text:
+                doc.add_text(
+                    label=doc_label,
+                    parent=parent_slide,
+                    text=p_text,
+                    prov=prov,
+                )
+        return
+    def handle_title(self, shape, parent_slide, slide_ind, doc):
+        placeholder_type = shape.placeholder_format.type
+        txt = shape.text.strip()
+        prov = self.generate_prov(shape, slide_ind, txt)
+        if len(txt.strip()) > 0:
+            # title = slide.shapes.title.text if slide.shapes.title else "No title"
+            if placeholder_type in [PP_PLACEHOLDER.CENTER_TITLE, PP_PLACEHOLDER.TITLE]:
+                _log.info(f"Title found: {shape.text}")
+                doc.add_text(
+                    label=DocItemLabel.TITLE, parent=parent_slide, text=txt, prov=prov
+                )
+            elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
+                _log.info(f"Subtitle found: {shape.text}")
+                # Using DocItemLabel.FOOTNOTE, while SUBTITLE label is not avail.
+                doc.add_text(
+                    label=DocItemLabel.SECTION_HEADER,
+                    parent=parent_slide,
+                    text=txt,
+                    prov=prov,
+                )
+        return
+    def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size):
+        # Open it with PIL
+        try:
+            # Get the image bytes
+            image = shape.image
+            image_bytes = image.blob
+            im_dpi, _ = image.dpi
+            pil_image = Image.open(BytesIO(image_bytes))
+            # shape has picture
+            prov = self.generate_prov(shape, slide_ind, "", slide_size)
+            doc.add_picture(
+                parent=parent_slide,
+                image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
+                caption=None,
+                prov=prov,
+            )
+        except (UnidentifiedImageError, OSError) as e:
+            _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
+        return
+    def handle_tables(self, shape, parent_slide, slide_ind, doc, slide_size):
+        # Handling tables, images, charts
+        if shape.has_table:
+            table = shape.table
+            table_xml = shape._element
+            prov = self.generate_prov(shape, slide_ind, "", slide_size)
+            num_cols = 0
+            num_rows = len(table.rows)
+            tcells = []
+            # Access the XML element for the shape that contains the table
+            table_xml = shape._element
+            for row_idx, row in enumerate(table.rows):
+                if len(row.cells) > num_cols:
+                    num_cols = len(row.cells)
+                for col_idx, cell in enumerate(row.cells):
+                    # Access the XML of the cell (this is the 'tc' element in table XML)
+                    cell_xml = table_xml.xpath(
+                        f".//a:tbl/a:tr[{row_idx + 1}]/a:tc[{col_idx + 1}]"
+                    )
+                    if not cell_xml:
+                        continue  # If no cell XML is found, skip
+                    cell_xml = cell_xml[0]  # Get the first matching XML node
+                    row_span = cell_xml.get("rowSpan")  # Vertical span
+                    col_span = cell_xml.get("gridSpan")  # Horizontal span
+                    if row_span is None:
+                        row_span = 1
+                    else:
+                        row_span = int(row_span)
+                    if col_span is None:
+                        col_span = 1
+                    else:
+                        col_span = int(col_span)
+                    icell = TableCell(
+                        text=cell.text.strip(),
+                        row_span=row_span,
+                        col_span=col_span,
+                        start_row_offset_idx=row_idx,
+                        end_row_offset_idx=row_idx + row_span,
+                        start_col_offset_idx=col_idx,
+                        end_col_offset_idx=col_idx + col_span,
+                        column_header=row_idx == 0,
+                        row_header=False,
+                    )
+                    if len(cell.text.strip()) > 0:
+                        tcells.append(icell)
+            # Initialize Docling TableData
+            data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
+            # Populate
+            for tcell in tcells:
+                data.table_cells.append(tcell)
+            if len(tcells) > 0:
+                # If table is not fully empty...
+                # Create Docling table
+                doc.add_table(parent=parent_slide, data=data, prov=prov)
+        return
+    def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
+        # Units of size in PPTX by default are EMU units (English Metric Units)
+        slide_width = pptx_obj.slide_width
+        slide_height = pptx_obj.slide_height
+        max_levels = 10
+        parents = {}  # type: ignore
+        for i in range(max_levels):
+            parents[i] = None
+        # Loop through each slide
+        for slide_num, slide in enumerate(pptx_obj.slides):
+            slide_ind = pptx_obj.slides.index(slide)
+            parent_slide = doc.add_group(
+                name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0]
+            )
+            slide_size = Size(width=slide_width, height=slide_height)
+            doc.add_page(page_no=slide_ind + 1, size=slide_size)
+            def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
+                handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
+                if shape.has_table:
+                    # Handle Tables
+                    self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
+                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
+                    # Handle Pictures
+                    if hasattr(shape, "image"):
+                        self.handle_pictures(
+                            shape, parent_slide, slide_ind, doc, slide_size
+                        )
+                # If shape doesn't have any text, move on to the next shape
+                if not hasattr(shape, "text"):
+                    return
+                if shape.text is None:
+                    return
+                if len(shape.text.strip()) == 0:
+                    return
+                if not shape.has_text_frame:
+                    _log.warning("Warning: shape has text but not text_frame")
+                    return
+                # Handle other text elements, including lists (bullet lists, numbered lists)
+                self.handle_text_elements(
+                    shape, parent_slide, slide_ind, doc, slide_size
+                )
+                return
+            def handle_groups(shape, parent_slide, slide_ind, doc, slide_size):
+                if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
+                    for groupedshape in shape.shapes:
+                        handle_shapes(
+                            groupedshape, parent_slide, slide_ind, doc, slide_size
+                        )
+            # Loop through each shape in the slide
+            for shape in slide.shapes:
+                handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
+            # Handle notes slide
+            if slide.has_notes_slide:
+                notes_slide = slide.notes_slide
+                if notes_slide.notes_text_frame is not None:
+                    notes_text = notes_slide.notes_text_frame.text.strip()
+                    if notes_text:
+                        bbox = BoundingBox(l=0, t=0, r=0, b=0)
+                        prov = ProvenanceItem(
+                            page_no=slide_ind + 1,
+                            charspan=[0, len(notes_text)],
+                            bbox=bbox,
+                        )
+                        doc.add_text(
+                            label=DocItemLabel.TEXT,
+                            parent=parent_slide,
+                            text=notes_text,
+                            prov=prov,
+                            content_layer=ContentLayer.FURNITURE,
+                        )
+        return doc