PyPI - docling - Versions diffs - 2.8.3__tar.gz → 2.10.0__tar.gz - Mend

docling 2.8.3__tar.gz → 2.10.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

{docling-2.8.3 → docling-2.10.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.8.3
+Version: 2.10.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -25,10 +25,10 @@ Provides-Extra: rapidocr
 Provides-Extra: tesserocr
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
-Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
-Requires-Dist: docling-core (>=2.6.1,<3.0.0)
+Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
+Requires-Dist: docling-core[chunking] (>=2.9.0,<3.0.0)
 Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
-Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
+Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -39,7 +39,7 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
 Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
 Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
 Requires-Dist: pandas (>=2.1.4,<3.0.0)
-Requires-Dist: pydantic (>=2.0.0,<2.10)
+Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
 Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
 Requires-Dist: python-docx (>=1.1.2,<2.0.0)
@@ -59,7 +59,7 @@ Description-Content-Type: text/markdown
   </a>
 </p>
-# 🦆 Docling
+# Docling
 <p align="center">
   <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
@@ -81,7 +81,7 @@ Docling parses documents and exports them to the desired format with ease and sp
 ## Features
-* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
+* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
 * 📑 Advanced PDF document understanding including page layout, reading order & table structures
 * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
 * 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications

{docling-2.8.3 → docling-2.10.0}/README.md RENAMED Viewed

@@ -4,7 +4,7 @@
   </a>
 </p>
-# 🦆 Docling
+# Docling
 <p align="center">
   <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
@@ -26,7 +26,7 @@ Docling parses documents and exports them to the desired format with ease and sp
 ## Features
-* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
+* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
 * 📑 Advanced PDF document understanding including page layout, reading order & table structures
 * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
 * 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications

{docling-2.8.3 → docling-2.10.0}/docling/backend/docling_parse_backend.py RENAMED Viewed

@@ -6,7 +6,7 @@ from typing import Iterable, List, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_parse.docling_parse import pdf_parser_v1
+from docling_parse.pdf_parsers import pdf_parser_v1
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage

{docling-2.8.3 → docling-2.10.0}/docling/backend/docling_parse_v2_backend.py RENAMED Viewed

@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_parse.docling_parse import pdf_parser_v2
+from docling_parse.pdf_parsers import pdf_parser_v2
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
@@ -210,12 +210,14 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
         self.parser = pdf_parser_v2("fatal")
         success = False
-        if isinstance(path_or_stream, BytesIO):
+        if isinstance(self.path_or_stream, BytesIO):
             success = self.parser.load_document_from_bytesio(
-                self.document_hash, path_or_stream
+                self.document_hash, self.path_or_stream
+            )
+        elif isinstance(self.path_or_stream, Path):
+            success = self.parser.load_document(
+                self.document_hash, str(self.path_or_stream)
             )
-        elif isinstance(path_or_stream, Path):
-            success = self.parser.load_document(self.document_hash, str(path_or_stream))
         if not success:
             raise RuntimeError(

{docling-2.8.3 → docling-2.10.0}/docling/backend/msword_backend.py RENAMED Viewed

@@ -1,4 +1,5 @@
 import logging
+import re
 from io import BytesIO
 from pathlib import Path
 from typing import Set, Union
@@ -133,7 +134,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
         for element in body:
             tag_name = etree.QName(element).localname
             # Check for Inline Images (blip elements)
             namespaces = {
                 "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
@@ -153,6 +153,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 self.handle_pictures(element, docx_obj, drawing_blip, doc)
             # Check for Text
             elif tag_name in ["p"]:
+                # "tcPr", "sectPr"
                 self.handle_text_elements(element, docx_obj, doc)
             else:
                 _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
@@ -166,6 +167,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         except ValueError:
             return default
+    def split_text_and_number(self, input_string):
+        match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string)
+        if match:
+            parts = list(filter(None, match.groups()))
+            return parts
+        else:
+            return [input_string]
     def get_numId_and_ilvl(self, paragraph):
         # Access the XML element of the paragraph
         numPr = paragraph._element.find(
@@ -188,7 +197,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def get_label_and_level(self, paragraph):
         if paragraph.style is None:
             return "Normal", None
-        label = paragraph.style.name
+        label = paragraph.style.style_id
         if label is None:
             return "Normal", None
         if ":" in label:
@@ -197,7 +206,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             if len(parts) == 2:
                 return parts[0], int(parts[1])
-        parts = label.split(" ")
+        parts = self.split_text_and_number(label)
         if "Heading" in label and len(parts) == 2:
             parts.sort()
@@ -219,14 +228,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         if paragraph.text is None:
             return
         text = paragraph.text.strip()
-        # if len(text)==0 # keep empty paragraphs, they seperate adjacent lists!
         # Common styles for bullet and numbered lists.
         # "List Bullet", "List Number", "List Paragraph"
         # Identify wether list is a numbered list or not
         # is_numbered = "List Bullet" not in paragraph.style.name
         is_numbered = False
-        p_style_name, p_level = self.get_label_and_level(paragraph)
+        p_style_id, p_level = self.get_label_and_level(paragraph)
         numid, ilevel = self.get_numId_and_ilvl(paragraph)
         if numid == 0:
@@ -238,14 +246,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 element,
                 docx_obj,
                 doc,
-                p_style_name,
+                p_style_id,
                 p_level,
                 numid,
                 ilevel,
                 text,
                 is_numbered,
             )
-            self.update_history(p_style_name, p_level, numid, ilevel)
+            self.update_history(p_style_id, p_level, numid, ilevel)
             return
         elif numid is None and self.prev_numid() is not None:  # Close list
             for key, val in self.parents.items():
@@ -253,23 +261,23 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     self.parents[key] = None
             self.level = self.level_at_new_list - 1
             self.level_at_new_list = None
-        if p_style_name in ["Title"]:
+        if p_style_id in ["Title"]:
             for key, val in self.parents.items():
                 self.parents[key] = None
             self.parents[0] = doc.add_text(
                 parent=None, label=DocItemLabel.TITLE, text=text
             )
-        elif "Heading" in p_style_name:
-            self.add_header(element, docx_obj, doc, p_style_name, p_level, text)
+        elif "Heading" in p_style_id:
+            self.add_header(element, docx_obj, doc, p_style_id, p_level, text)
-        elif p_style_name in [
+        elif p_style_id in [
             "Paragraph",
             "Normal",
             "Subtitle",
             "Author",
-            "Default Text",
-            "List Paragraph",
-            "List Bullet",
+            "DefaultText",
+            "ListParagraph",
+            "ListBullet",
             "Quote",
         ]:
             level = self.get_level()
@@ -285,15 +293,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
             )
-        self.update_history(p_style_name, p_level, numid, ilevel)
+        self.update_history(p_style_id, p_level, numid, ilevel)
         return
     def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
         level = self.get_level()
         if isinstance(curr_level, int):
             if curr_level > level:
                 # add invisible group
                 for i in range(level, curr_level):
                     self.parents[i] = doc.add_group(
@@ -301,9 +307,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                         label=GroupLabel.SECTION,
                         name=f"header-{i}",
                     )
             elif curr_level < level:
                 # remove the tail
                 for key, val in self.parents.items():
                     if key >= curr_level:
@@ -314,7 +318,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 text=text,
                 level=curr_level,
             )
         else:
             self.parents[self.level] = doc.add_heading(
                 parent=self.parents[self.level - 1],
@@ -328,7 +331,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         element,
         docx_obj,
         doc,
-        p_style_name,
+        p_style_id,
         p_level,
         numid,
         ilevel,
@@ -346,7 +349,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
             )
-            # TODO: Set marker and enumerated arguments if this is an enumeration element.
+            # Set marker and enumerated arguments if this is an enumeration element.
             self.listIter += 1
             if is_numbered:
                 enum_marker = str(self.listIter) + "."
@@ -365,8 +368,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 self.level_at_new_list + self.prev_indent() + 1,
                 self.level_at_new_list + ilevel + 1,
             ):
-                # TODO: determine if this is an unordered list or an ordered list.
-                #  Set GroupLabel.ORDERED_LIST when it fits.
+                # Determine if this is an unordered list or an ordered list.
+                # Set GroupLabel.ORDERED_LIST when it fits.
                 self.listIter = 0
                 if is_numbered:
                     self.parents[i] = doc.add_group(
@@ -467,6 +470,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 row_span = get_rowspan(cell)
                 col_span = get_colspan(cell)
+                cell_text = cell.text
+                # In case cell doesn't return text via docx library:
+                if len(cell_text) == 0:
+                    cell_xml = cell._element
+                    texts = [""]
+                    for elem in cell_xml.iter():
+                        if elem.tag.endswith("t"):  # <w:t> tags that contain text
+                            if elem.text:
+                                texts.append(elem.text)
+                    # Join the collected text
+                    cell_text = " ".join(texts).strip()
                 # Find the next available column in the grid
                 while table_grid[row_idx][col_idx] is not None:
                     col_idx += 1
@@ -477,15 +493,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                         table_grid[row_idx + i][col_idx + j] = ""
                 cell = TableCell(
-                    text=cell.text,
+                    text=cell_text,
                     row_span=row_span,
                     col_span=col_span,
                     start_row_offset_idx=row_idx,
                     end_row_offset_idx=row_idx + row_span,
                     start_col_offset_idx=col_idx,
                     end_col_offset_idx=col_idx + col_span,
-                    col_header=False,  # col_header,
-                    row_header=False,  # ((not col_header) and html_cell.name=='th')
+                    col_header=False,
+                    row_header=False,
                 )
                 data.table_cells.append(cell)

docling-2.10.0/docling/chunking/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
+from docling_core.transforms.chunker.hierarchical_chunker import (
+    DocChunk,
+    DocMeta,
+    HierarchicalChunker,
+)
+from docling_core.transforms.chunker.hybrid_chunker import HybridChunker

{docling-2.8.3 → docling-2.10.0}/docling/cli/main.py RENAMED Viewed

@@ -10,7 +10,9 @@ from pathlib import Path
 from typing import Annotated, Dict, Iterable, List, Optional, Type
 import typer
+from docling_core.types.doc import ImageRefMode
 from docling_core.utils.file import resolve_source_to_path
+from pydantic import TypeAdapter, ValidationError
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -86,9 +88,11 @@ def export_documents(
     conv_results: Iterable[ConversionResult],
     output_dir: Path,
     export_json: bool,
+    export_html: bool,
     export_md: bool,
     export_txt: bool,
     export_doctags: bool,
+    image_export_mode: ImageRefMode,
 ):
     success_count = 0
@@ -99,33 +103,45 @@ def export_documents(
             success_count += 1
             doc_filename = conv_res.input.file.stem
-            # Export Deep Search document JSON format:
+            # Export JSON format:
             if export_json:
                 fname = output_dir / f"{doc_filename}.json"
-                with fname.open("w", encoding="utf8") as fp:
-                    _log.info(f"writing JSON output to {fname}")
-                    fp.write(json.dumps(conv_res.document.export_to_dict()))
+                _log.info(f"writing JSON output to {fname}")
+                conv_res.document.save_as_json(
+                    filename=fname, image_mode=image_export_mode
+                )
+            # Export HTML format:
+            if export_html:
+                fname = output_dir / f"{doc_filename}.html"
+                _log.info(f"writing HTML output to {fname}")
+                conv_res.document.save_as_html(
+                    filename=fname, image_mode=image_export_mode
+                )
             # Export Text format:
             if export_txt:
                 fname = output_dir / f"{doc_filename}.txt"
-                with fname.open("w", encoding="utf8") as fp:
-                    _log.info(f"writing Text output to {fname}")
-                    fp.write(conv_res.document.export_to_markdown(strict_text=True))
+                _log.info(f"writing TXT output to {fname}")
+                conv_res.document.save_as_markdown(
+                    filename=fname,
+                    strict_text=True,
+                    image_mode=ImageRefMode.PLACEHOLDER,
+                )
             # Export Markdown format:
             if export_md:
                 fname = output_dir / f"{doc_filename}.md"
-                with fname.open("w", encoding="utf8") as fp:
-                    _log.info(f"writing Markdown output to {fname}")
-                    fp.write(conv_res.document.export_to_markdown())
+                _log.info(f"writing Markdown output to {fname}")
+                conv_res.document.save_as_markdown(
+                    filename=fname, image_mode=image_export_mode
+                )
             # Export Document Tags format:
             if export_doctags:
                 fname = output_dir / f"{doc_filename}.doctags"
-                with fname.open("w", encoding="utf8") as fp:
-                    _log.info(f"writing Doc Tags output to {fname}")
-                    fp.write(conv_res.document.export_to_document_tokens())
+                _log.info(f"writing Doc Tags output to {fname}")
+                conv_res.document.save_as_document_tokens(filename=fname)
         else:
             _log.warning(f"Document {conv_res.input.file} failed to convert.")
@@ -160,6 +176,13 @@ def convert(
     to_formats: List[OutputFormat] = typer.Option(
         None, "--to", help="Specify output formats. Defaults to Markdown."
     ),
+    image_export_mode: Annotated[
+        ImageRefMode,
+        typer.Option(
+            ...,
+            help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
+        ),
+    ] = ImageRefMode.EMBEDDED,
     ocr: Annotated[
         bool,
         typer.Option(
@@ -185,7 +208,7 @@ def convert(
     ] = None,
     pdf_backend: Annotated[
         PdfBackend, typer.Option(..., help="The PDF backend to use.")
-    ] = PdfBackend.DLPARSE_V1,
+    ] = PdfBackend.DLPARSE_V2,
     table_mode: Annotated[
         TableFormerMode,
         typer.Option(..., help="The mode to use in the table structure model."),
@@ -260,24 +283,45 @@ def convert(
     with tempfile.TemporaryDirectory() as tempdir:
         input_doc_paths: List[Path] = []
         for src in input_sources:
-            source = resolve_source_to_path(source=src, workdir=Path(tempdir))
-            if not source.exists():
+            try:
+                # check if we can fetch some remote url
+                source = resolve_source_to_path(source=src, workdir=Path(tempdir))
+                input_doc_paths.append(source)
+            except FileNotFoundError:
                 err_console.print(
-                    f"[red]Error: The input file {source} does not exist.[/red]"
+                    f"[red]Error: The input file {src} does not exist.[/red]"
                 )
                 raise typer.Abort()
-            elif source.is_dir():
-                for fmt in from_formats:
-                    for ext in FormatToExtensions[fmt]:
-                        input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
-                        input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
-            else:
-                input_doc_paths.append(source)
+            except IsADirectoryError:
+                # if the input matches to a file or a folder
+                try:
+                    local_path = TypeAdapter(Path).validate_python(src)
+                    if local_path.exists() and local_path.is_dir():
+                        for fmt in from_formats:
+                            for ext in FormatToExtensions[fmt]:
+                                input_doc_paths.extend(
+                                    list(local_path.glob(f"**/*.{ext}"))
+                                )
+                                input_doc_paths.extend(
+                                    list(local_path.glob(f"**/*.{ext.upper()}"))
+                                )
+                    elif local_path.exists():
+                        input_doc_paths.append(local_path)
+                    else:
+                        err_console.print(
+                            f"[red]Error: The input file {src} does not exist.[/red]"
+                        )
+                        raise typer.Abort()
+                except Exception as err:
+                    err_console.print(f"[red]Error: Cannot read the input {src}.[/red]")
+                    _log.info(err)  # will print more details if verbose is activated
+                    raise typer.Abort()
         if to_formats is None:
             to_formats = [OutputFormat.MARKDOWN]
         export_json = OutputFormat.JSON in to_formats
+        export_html = OutputFormat.HTML in to_formats
         export_md = OutputFormat.MARKDOWN in to_formats
         export_txt = OutputFormat.TEXT in to_formats
         export_doctags = OutputFormat.DOCTAGS in to_formats
@@ -309,6 +353,13 @@ def convert(
         )
         pipeline_options.table_structure_options.mode = table_mode
+        if image_export_mode != ImageRefMode.PLACEHOLDER:
+            pipeline_options.generate_page_images = True
+            pipeline_options.generate_picture_images = (
+                True  # FIXME: to be deprecated in verson 3
+            )
+            pipeline_options.images_scale = 2
         if artifacts_path is not None:
             pipeline_options.artifacts_path = artifacts_path
@@ -321,11 +372,13 @@ def convert(
         else:
             raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
+        pdf_format_option = PdfFormatOption(
+            pipeline_options=pipeline_options,
+            backend=backend,  # pdf_backend
+        )
         format_options: Dict[InputFormat, FormatOption] = {
-            InputFormat.PDF: PdfFormatOption(
-                pipeline_options=pipeline_options,
-                backend=backend,  # pdf_backend
-            )
+            InputFormat.PDF: pdf_format_option,
+            InputFormat.IMAGE: pdf_format_option,
         }
         doc_converter = DocumentConverter(
             allowed_formats=from_formats,
@@ -343,9 +396,11 @@ def convert(
             conv_results,
             output_dir=output,
             export_json=export_json,
+            export_html=export_html,
             export_md=export_md,
             export_txt=export_txt,
             export_doctags=export_doctags,
+            image_export_mode=image_export_mode,
         )
         end_time = time.time() - start_time

{docling-2.8.3 → docling-2.10.0}/docling/datamodel/base_models.py RENAMED Viewed

@@ -41,6 +41,7 @@ class InputFormat(str, Enum):
 class OutputFormat(str, Enum):
     MARKDOWN = "md"
     JSON = "json"
+    HTML = "html"
     TEXT = "text"
     DOCTAGS = "doctags"

docling 2.8.3__tar.gz → 2.10.0__tar.gz

docling 2.8.3__tar.gz → 2.10.0__tar.gz