PyPI - docling - Versions diffs - 2.3.1__tar.gz → 2.4.1__tar.gz - Mend

docling 2.3.1.tar.gz → 2.4.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

{docling-2.3.1 → docling-2.4.1}/PKG-INFO RENAMED Viewed

@@ -1,10 +1,10 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.3.1
-Summary: Docling PDF conversion package
+Version: 2.4.1
+Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
-Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
+Keywords: docling,convert,document,pdf,docx,html,markdown,layout model,segmentation,table structure,table former
 Author: Christoph Auer
 Author-email: cau@zurich.ibm.com
 Requires-Python: >=3.10,<4.0
@@ -53,6 +53,10 @@ Description-Content-Type: text/markdown
 # Docling
+<p align="center">
+  <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+</p>
 [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
 [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
 [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
@@ -66,19 +70,22 @@ Description-Content-Type: text/markdown
 Docling parses documents and exports them to the desired format with ease and speed.
 ## Features
 * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
 * 📑 Advanced PDF document understanding including page layout, reading order & table structures
 * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
-* 📝 Metadata extraction, including title, authors, references & language
-* 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
+* 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
 * 🔍 OCR support for scanned PDFs
 * 💻 Simple and convenient CLI
 Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty examples and unlock the full power of Docling!
+### Coming soon
+* ♾️ Equation & code extraction
+* 📝 Metadata extraction, including title, authors, references & language
+* 🦜🔗 Native LangChain extension
 ## Installation
@@ -104,16 +111,13 @@ result = converter.convert(source)
 print(result.document.export_to_markdown())  # output: "## Docling Technical Report[...]"
 ```
 Check out [Getting started](https://ds4sd.github.io/docling/).
 You will find lots of tuning options to leverage all the advanced capabilities.
 ## Get help and support
 Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
 ## Technical report
 For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
@@ -122,7 +126,6 @@ For more details on Docling's inner workings, check out the [Docling Technical R
 Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
 ## References
 If you use Docling in your projects, please consider citing the following:
@@ -142,6 +145,10 @@ If you use Docling in your projects, please consider citing the following:
 ## License
-The Docling codebase is under MIT license.
+The Docling codebase is under MIT license.
 For individual model usage, please refer to the model licenses found in the original packages.
+## IBM ❤️ Open Source AI
+Docling has been brought to you by IBM.

{docling-2.3.1 → docling-2.4.1}/README.md RENAMED Viewed

@@ -6,6 +6,10 @@
 # Docling
+<p align="center">
+  <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+</p>
 [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
 [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
 [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
@@ -19,19 +23,22 @@
 Docling parses documents and exports them to the desired format with ease and speed.
 ## Features
 * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
 * 📑 Advanced PDF document understanding including page layout, reading order & table structures
 * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
-* 📝 Metadata extraction, including title, authors, references & language
-* 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
+* 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
 * 🔍 OCR support for scanned PDFs
 * 💻 Simple and convenient CLI
 Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty examples and unlock the full power of Docling!
+### Coming soon
+* ♾️ Equation & code extraction
+* 📝 Metadata extraction, including title, authors, references & language
+* 🦜🔗 Native LangChain extension
 ## Installation
@@ -57,16 +64,13 @@ result = converter.convert(source)
 print(result.document.export_to_markdown())  # output: "## Docling Technical Report[...]"
 ```
 Check out [Getting started](https://ds4sd.github.io/docling/).
 You will find lots of tuning options to leverage all the advanced capabilities.
 ## Get help and support
 Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
 ## Technical report
 For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
@@ -75,7 +79,6 @@ For more details on Docling's inner workings, check out the [Docling Technical R
 Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
 ## References
 If you use Docling in your projects, please consider citing the following:
@@ -95,5 +98,9 @@ If you use Docling in your projects, please consider citing the following:
 ## License
-The Docling codebase is under MIT license.
+The Docling codebase is under MIT license.
 For individual model usage, please refer to the model licenses found in the original packages.
+## IBM ❤️ Open Source AI
+Docling has been brought to you by IBM.

{docling-2.3.1 → docling-2.4.1}/docling/backend/docling_parse_backend.py RENAMED Viewed

@@ -29,7 +29,7 @@ class DoclingParsePageBackend(PdfPageBackend):
             self._dpage = parsed_page["pages"][0]
         else:
             _log.info(
-                f"An error occured when loading page {page_no} of document {document_hash}."
+                f"An error occurred when loading page {page_no} of document {document_hash}."
             )
     def is_valid(self) -> bool:

{docling-2.3.1 → docling-2.4.1}/docling/backend/docling_parse_v2_backend.py RENAMED Viewed

@@ -31,7 +31,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
             self._dpage = parsed_page["pages"][0]
         else:
             _log.info(
-                f"An error occured when loading page {page_no} of document {document_hash}."
+                f"An error occurred when loading page {page_no} of document {document_hash}."
             )
     def is_valid(self) -> bool:

{docling-2.3.1 → docling-2.4.1}/docling/backend/pypdfium2_backend.py RENAMED Viewed

@@ -29,7 +29,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
             self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
         except PdfiumError as e:
             _log.info(
-                f"An exception occured when loading page {page_no} of document {document_hash}.",
+                f"An exception occurred when loading page {page_no} of document {document_hash}.",
                 exc_info=True,
             )
             self.valid = False

{docling-2.3.1 → docling-2.4.1}/docling/cli/main.py RENAMED Viewed

@@ -5,12 +5,15 @@ import time
 import warnings
 from enum import Enum
 from pathlib import Path
-from typing import Annotated, Dict, Iterable, List, Optional
+from typing import Annotated, Dict, Iterable, List, Optional, Type
 import typer
 from docling_core.utils.file import resolve_file_source
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import (
     ConversionStatus,
     FormatToExtensions,
@@ -22,6 +25,7 @@ from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
     OcrOptions,
     PdfPipelineOptions,
+    TableFormerMode,
     TesseractCliOcrOptions,
     TesseractOcrOptions,
 )
@@ -58,9 +62,10 @@ def version_callback(value: bool):
 # Define an enum for the backend options
-class Backend(str, Enum):
+class PdfBackend(str, Enum):
     PYPDFIUM2 = "pypdfium2"
-    DOCLING = "docling"
+    DLPARSE_V1 = "dlparse_v1"
+    DLPARSE_V2 = "dlparse_v2"
 # Define an enum for the ocr engines
@@ -90,28 +95,28 @@ def export_documents(
             # Export Deep Search document JSON format:
             if export_json:
                 fname = output_dir / f"{doc_filename}.json"
-                with fname.open("w") as fp:
+                with fname.open("w", encoding="utf8") as fp:
                     _log.info(f"writing JSON output to {fname}")
                     fp.write(json.dumps(conv_res.document.export_to_dict()))
             # Export Text format:
             if export_txt:
                 fname = output_dir / f"{doc_filename}.txt"
-                with fname.open("w") as fp:
+                with fname.open("w", encoding="utf8") as fp:
                     _log.info(f"writing Text output to {fname}")
                     fp.write(conv_res.document.export_to_markdown(strict_text=True))
             # Export Markdown format:
             if export_md:
                 fname = output_dir / f"{doc_filename}.md"
-                with fname.open("w") as fp:
+                with fname.open("w", encoding="utf8") as fp:
                     _log.info(f"writing Markdown output to {fname}")
                     fp.write(conv_res.document.export_to_markdown())
             # Export Document Tags format:
             if export_doctags:
                 fname = output_dir / f"{doc_filename}.doctags"
-                with fname.open("w") as fp:
+                with fname.open("w", encoding="utf8") as fp:
                     _log.info(f"writing Doc Tags output to {fname}")
                     fp.write(conv_res.document.export_to_document_tokens())
@@ -151,6 +156,17 @@ def convert(
     ocr_engine: Annotated[
         OcrEngine, typer.Option(..., help="The OCR engine to use.")
     ] = OcrEngine.EASYOCR,
+    pdf_backend: Annotated[
+        PdfBackend, typer.Option(..., help="The PDF backend to use.")
+    ] = PdfBackend.DLPARSE_V1,
+    table_mode: Annotated[
+        TableFormerMode,
+        typer.Option(..., help="The mode to use in the table structure model."),
+    ] = TableFormerMode.FAST,
+    artifacts_path: Annotated[
+        Optional[Path],
+        typer.Option(..., help="If provided, the location of the model artifacts."),
+    ] = None,
     abort_on_error: Annotated[
         bool,
         typer.Option(
@@ -217,11 +233,25 @@ def convert(
         do_table_structure=True,
     )
     pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
+    pipeline_options.table_structure_options.mode = table_mode
+    if artifacts_path is not None:
+        pipeline_options.artifacts_path = artifacts_path
+    match pdf_backend:
+        case PdfBackend.DLPARSE_V1:
+            backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
+        case PdfBackend.DLPARSE_V2:
+            backend = DoclingParseV2DocumentBackend
+        case PdfBackend.PYPDFIUM2:
+            backend = PyPdfiumDocumentBackend
+        case _:
+            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
     format_options: Dict[InputFormat, FormatOption] = {
         InputFormat.PDF: PdfFormatOption(
             pipeline_options=pipeline_options,
-            backend=DoclingParseDocumentBackend,  # pdf_backend
+            backend=backend,  # pdf_backend
         )
     }
     doc_converter = DocumentConverter(

{docling-2.3.1 → docling-2.4.1}/docling/datamodel/pipeline_options.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from enum import Enum, auto
+from enum import Enum
 from pathlib import Path
 from typing import List, Literal, Optional, Union
@@ -6,8 +6,8 @@ from pydantic import BaseModel, ConfigDict, Field
 class TableFormerMode(str, Enum):
-    FAST = auto()
-    ACCURATE = auto()
+    FAST = "fast"
+    ACCURATE = "accurate"
 class TableStructureOptions(BaseModel):

{docling-2.3.1 → docling-2.4.1}/docling/models/tesseract_ocr_model.py RENAMED Viewed

@@ -22,25 +22,37 @@ class TesseractOcrModel(BaseOcrModel):
         self.reader = None
         if self.enabled:
-            setup_errmsg = (
+            install_errmsg = (
                 "tesserocr is not correctly installed. "
                 "Please install it via `pip install tesserocr` to use this OCR engine. "
-                "Note that tesserocr might have to be manually compiled for working with"
+                "Note that tesserocr might have to be manually compiled for working with "
                 "your Tesseract installation. The Docling documentation provides examples for it. "
-                "Alternatively, Docling has support for other OCR engines. See the documentation."
+                "Alternatively, Docling has support for other OCR engines. See the documentation: "
+                "https://ds4sd.github.io/docling/installation/"
             )
+            missing_langs_errmsg = (
+                "tesserocr is not correctly configured. No language models have been detected. "
+                "Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. "
+                "You can find more information how to setup other OCR engines in Docling "
+                "documentation: "
+                "https://ds4sd.github.io/docling/installation/"
+            )
             try:
                 import tesserocr
             except ImportError:
-                raise ImportError(setup_errmsg)
+                raise ImportError(install_errmsg)
             try:
                 tesseract_version = tesserocr.tesseract_version()
-                _log.debug("Initializing TesserOCR: %s", tesseract_version)
             except:
-                raise ImportError(setup_errmsg)
+                raise ImportError(install_errmsg)
+            _, tesserocr_languages = tesserocr.get_languages()
+            if not tesserocr_languages:
+                raise ImportError(missing_langs_errmsg)
             # Initialize the tesseractAPI
+            _log.debug("Initializing TesserOCR: %s", tesseract_version)
             lang = "+".join(self.options.lang)
             if self.options.path is not None:
                 self.reader = tesserocr.PyTessBaseAPI(

{docling-2.3.1 → docling-2.4.1}/pyproject.toml RENAMED Viewed

@@ -1,13 +1,13 @@
 [tool.poetry]
 name = "docling"
-version = "2.3.1"  # DO NOT EDIT, updated automatically
-description = "Docling PDF conversion package"
-authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
+version = "2.4.1"  # DO NOT EDIT, updated automatically
+description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
+authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
 readme = "README.md"
 repository = "https://github.com/DS4SD/docling"
 homepage = "https://github.com/DS4SD/docling"
-keywords= ["docling", "convert", "document", "pdf", "layout model", "segmentation", "table structure", "table former"]
+keywords= ["docling", "convert", "document", "pdf", "docx", "html", "markdown", "layout model", "segmentation", "table structure", "table former"]
  classifiers = [
      "License :: OSI Approved :: MIT License",
      "Operating System :: MacOS :: MacOS X",
@@ -73,12 +73,6 @@ mkdocs-jupyter = "^0.25.0"
 [tool.poetry.group.examples.dependencies]
 datasets = "^2.21.0"
 python-dotenv = "^1.0.1"
-# llama-index-readers-docling = { version = "^0.1.0", markers = 'python_version < "3.13"' }
-# llama-index-node-parser-docling = { version = "^0.1.0", markers = 'python_version < "3.13"' }
-# llama-index-readers-file = { version = "^0.2.2", markers = 'python_version < "3.13"' }
-# llama-index-embeddings-huggingface = { version = "^0.3.1", markers = 'python_version < "3.13"' }
-# llama-index-llms-huggingface-api = { version = "^0.2.0", markers = 'python_version < "3.13"' }
-# llama-index-vector-stores-milvus ={ version =  "^0.2.1", markers = 'python_version < "3.13"' }
 langchain-huggingface = "^0.0.3"
 langchain-milvus = "^0.1.4"
 langchain-text-splitters = "^0.2.4"