PyPI - docling - Versions diffs - 1.13.0__tar.gz → 1.14.0__tar.gz - Mend

docling 1.13.0.tar.gz → 1.14.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

{docling-1.13.0 → docling-1.14.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 1.13.0
+Version: 1.14.0
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -22,7 +22,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Provides-Extra: examples
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=0.21.1,<0.22.0)
-Requires-Dist: docling-core (>=1.4.0,<2.0.0)
+Requires-Dist: docling-core (>=1.5.0,<2.0.0)
 Requires-Dist: docling-ibm-models (>=1.2.0,<2.0.0)
 Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
@@ -74,8 +74,9 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
 * ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
 * 📑 Understands detailed page layout, reading order and recovers table structures
 * 📝 Extracts metadata from the document, such as title, authors, references and language
-* 🔍 Optionally applies OCR (use with scanned PDFs)
+* 🔍 Includes OCR support for scanned PDFs
 * 🤖 Integrates easily with LLM app / RAG frameworks like 🦙 LlamaIndex and 🦜🔗 LangChain
+* 💻 Provides a simple and convenient CLI
 ## Installation
@@ -87,31 +88,33 @@ pip install docling
 > [!NOTE]
 > Works on macOS and Linux environments. Windows platforms are currently not tested.
+<details>
+  <summary><b>Alternative PyTorch distributions</b></summary>
-### Use alternative PyTorch distributions
+  The Docling models depend on the [PyTorch](https://pytorch.org/) library.
+  Depending on your architecture, you might want to use a different distribution of `torch`.
+  For example, you might want support for a different accelerator or a CPU-only version.
+  All the different ways for installing `torch` are listed on their website <https://pytorch.org/>.
-The Docling models depend on the [PyTorch](https://pytorch.org/) library.
-Depending on your architecture, you might want to use a different distribution of `torch`.
-For example, you might want support for different accelerator or for a cpu-only version.
-All the different ways for installing `torch` are listed on their website <https://pytorch.org/>.
+  One common situation is the installation on Linux systems with cpu-only support.
+  In this case, we suggest installing Docling with the following options:
-One common situation is the installation on Linux systems with cpu-only support.
-In this case, we suggest the installation of Docling with the following options
+  ```bash
+  # Example for installing on the Linux cpu-only version
+  pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
+  ```
+</details>
-```bash
-# Example for installing on the Linux cpu-only version
-pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
-```
+<details>
+  <summary><b>Docling development setup</b></summary>
+  To develop for Docling (features, bugfixes etc.), install as follows from your local clone's root dir:
+  ```bash
+  poetry install --all-extras
+  ```
+</details>
-### Development setup
-To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
-```bash
-poetry install --all-extras
-```
-## Usage
+## Getting started
 ### Convert a single document
@@ -123,6 +126,7 @@ source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
 converter = DocumentConverter()
 result = converter.convert_single(source)
 print(result.render_as_markdown())  # output: "## Docling Technical Report[...]"
+print(result.render_as_doctags())  # output: "<document><title><page_1><loc_20>..."
 ```
 ### Convert a batch of documents
@@ -136,6 +140,51 @@ python examples/batch_convert.py
 ```
 The output of the above command will be written to `./scratch`.
+### CLI
+You can also use Docling directly from your command line to convert individual files — be they local or given by URL — or whole directories.
+A simple example would look like this:
+```console
+docling https://arxiv.org/pdf/2206.01062
+```
+To see all available options (export formats etc.) run `docling --help`.
+<details>
+  <summary><b>CLI reference</b></summary>
+  Here are the available options as of this writing (for an up-to-date listing, run `docling --help`):
+  ```console
+  $ docling --help
+  Usage: docling [OPTIONS] source
+  ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
+  │ *    input_sources      source  PDF files to convert. Can be local file / directory paths or URL. [default: None] [required] │
+  ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+  ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
+  │ --json       --no-json                            If enabled the document is exported as JSON. [default: no-json]            │
+  │ --md         --no-md                              If enabled the document is exported as Markdown. [default: md]             │
+  │ --txt        --no-txt                             If enabled the document is exported as Text. [default: no-txt]             │
+  │ --doctags    --no-doctags                         If enabled the document is exported as Doc Tags. [default: no-doctags]     │
+  │ --ocr        --no-ocr                             If enabled, the bitmap content will be processed using OCR. [default: ocr] │
+  │ --backend                    [pypdfium2|docling]  The PDF backend to use. [default: docling]                                 │
+  │ --output                     PATH                 Output directory where results are saved. [default: .]                     │
+  │ --version                                         Show version information.                                                  │
+  │ --help                                            Show this message and exit.                                                │
+  ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+  ```
+</details>
+### RAG
+Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
+- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
+- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
+## Advanced features
 ### Adjust pipeline features
 The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
@@ -194,11 +243,6 @@ results = doc_converter.convert(conv_input)
 You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
-### RAG
-Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
-- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
-- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
 ## Technical report
 For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).

{docling-1.13.0 → docling-1.14.0}/README.md RENAMED Viewed

@@ -22,8 +22,9 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
 * ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
 * 📑 Understands detailed page layout, reading order and recovers table structures
 * 📝 Extracts metadata from the document, such as title, authors, references and language
-* 🔍 Optionally applies OCR (use with scanned PDFs)
+* 🔍 Includes OCR support for scanned PDFs
 * 🤖 Integrates easily with LLM app / RAG frameworks like 🦙 LlamaIndex and 🦜🔗 LangChain
+* 💻 Provides a simple and convenient CLI
 ## Installation
@@ -35,31 +36,33 @@ pip install docling
 > [!NOTE]
 > Works on macOS and Linux environments. Windows platforms are currently not tested.
+<details>
+  <summary><b>Alternative PyTorch distributions</b></summary>
-### Use alternative PyTorch distributions
+  The Docling models depend on the [PyTorch](https://pytorch.org/) library.
+  Depending on your architecture, you might want to use a different distribution of `torch`.
+  For example, you might want support for a different accelerator or a CPU-only version.
+  All the different ways for installing `torch` are listed on their website <https://pytorch.org/>.
-The Docling models depend on the [PyTorch](https://pytorch.org/) library.
-Depending on your architecture, you might want to use a different distribution of `torch`.
-For example, you might want support for different accelerator or for a cpu-only version.
-All the different ways for installing `torch` are listed on their website <https://pytorch.org/>.
+  One common situation is the installation on Linux systems with cpu-only support.
+  In this case, we suggest installing Docling with the following options:
-One common situation is the installation on Linux systems with cpu-only support.
-In this case, we suggest the installation of Docling with the following options
+  ```bash
+  # Example for installing on the Linux cpu-only version
+  pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
+  ```
+</details>
-```bash
-# Example for installing on the Linux cpu-only version
-pip install docling --extra-index-url https://download.pytorch.org/whl/cpu
-```
+<details>
+  <summary><b>Docling development setup</b></summary>
+  To develop for Docling (features, bugfixes etc.), install as follows from your local clone's root dir:
+  ```bash
+  poetry install --all-extras
+  ```
+</details>
-### Development setup
-To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
-```bash
-poetry install --all-extras
-```
-## Usage
+## Getting started
 ### Convert a single document
@@ -71,6 +74,7 @@ source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
 converter = DocumentConverter()
 result = converter.convert_single(source)
 print(result.render_as_markdown())  # output: "## Docling Technical Report[...]"
+print(result.render_as_doctags())  # output: "<document><title><page_1><loc_20>..."
 ```
 ### Convert a batch of documents
@@ -84,6 +88,51 @@ python examples/batch_convert.py
 ```
 The output of the above command will be written to `./scratch`.
+### CLI
+You can also use Docling directly from your command line to convert individual files — be they local or given by URL — or whole directories.
+A simple example would look like this:
+```console
+docling https://arxiv.org/pdf/2206.01062
+```
+To see all available options (export formats etc.) run `docling --help`.
+<details>
+  <summary><b>CLI reference</b></summary>
+  Here are the available options as of this writing (for an up-to-date listing, run `docling --help`):
+  ```console
+  $ docling --help
+  Usage: docling [OPTIONS] source
+  ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
+  │ *    input_sources      source  PDF files to convert. Can be local file / directory paths or URL. [default: None] [required] │
+  ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+  ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
+  │ --json       --no-json                            If enabled the document is exported as JSON. [default: no-json]            │
+  │ --md         --no-md                              If enabled the document is exported as Markdown. [default: md]             │
+  │ --txt        --no-txt                             If enabled the document is exported as Text. [default: no-txt]             │
+  │ --doctags    --no-doctags                         If enabled the document is exported as Doc Tags. [default: no-doctags]     │
+  │ --ocr        --no-ocr                             If enabled, the bitmap content will be processed using OCR. [default: ocr] │
+  │ --backend                    [pypdfium2|docling]  The PDF backend to use. [default: docling]                                 │
+  │ --output                     PATH                 Output directory where results are saved. [default: .]                     │
+  │ --version                                         Show version information.                                                  │
+  │ --help                                            Show this message and exit.                                                │
+  ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+  ```
+</details>
+### RAG
+Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
+- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
+- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
+## Advanced features
 ### Adjust pipeline features
 The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
@@ -142,11 +191,6 @@ results = doc_converter.convert(conv_input)
 You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
-### RAG
-Check out the following examples showcasing RAG using Docling with standard LLM application frameworks:
-- [Basic RAG pipeline with 🦙 LlamaIndex](https://github.com/DS4SD/docling/tree/main/examples/rag_llamaindex.ipynb)
-- [Basic RAG pipeline with 🦜🔗 LangChain](https://github.com/DS4SD/docling/tree/main/examples/rag_langchain.ipynb)
 ## Technical report
 For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).

{docling-1.13.0 → docling-1.14.0}/docling/cli/main.py RENAMED Viewed

@@ -8,7 +8,7 @@ from pathlib import Path
 from typing import Annotated, Iterable, List, Optional
 import typer
-from pydantic import AnyUrl
+from docling_core.utils.file import resolve_file_source
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
@@ -109,11 +109,11 @@ def export_documents(
 @app.command(no_args_is_help=True)
 def convert(
     input_sources: Annotated[
-        List[Path],
+        List[str],
         typer.Argument(
             ...,
             metavar="source",
-            help="PDF files to convert. Directories are also accepted.",
+            help="PDF files to convert. Can be local file / directory paths or URL.",
         ),
     ],
     export_json: Annotated[
@@ -167,7 +167,8 @@ def convert(
     logging.basicConfig(level=logging.INFO)
     input_doc_paths: List[Path] = []
-    for source in input_sources:
+    for src in input_sources:
+        source = resolve_file_source(source=src)
         if not source.exists():
             err_console.print(
                 f"[red]Error: The input file {source} does not exist.[/red]"
@@ -179,58 +180,25 @@ def convert(
         else:
             input_doc_paths.append(source)
-    ###########################################################################
-    # The following sections contain a combination of PipelineOptions
-    # and PDF Backends for various configurations.
-    # Uncomment one section at the time to see the differences in the output.
-    doc_converter = None
-    if backend == Backend.PYPDFIUM2 and not ocr:  # PyPdfium without OCR
-        pipeline_options = PipelineOptions()
-        pipeline_options.do_ocr = False
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = False
-        doc_converter = DocumentConverter(
-            pipeline_options=pipeline_options,
-            pdf_backend=PyPdfiumDocumentBackend,
-        )
-    elif backend == Backend.PYPDFIUM2.value and ocr:  # PyPdfium with OCR
-        pipeline_options = PipelineOptions()
-        pipeline_options.do_ocr = False
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = True
-        doc_converter = DocumentConverter(
-            pipeline_options=pipeline_options,
-            pdf_backend=PyPdfiumDocumentBackend,
-        )
-    elif backend == Backend.DOCLING.value and not ocr:  # Docling Parse without OCR
-        pipeline_options = PipelineOptions()
-        pipeline_options.do_ocr = False
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = True
-        doc_converter = DocumentConverter(
-            pipeline_options=pipeline_options,
-            pdf_backend=DoclingParseDocumentBackend,
-        )
-    elif backend == Backend.DOCLING.value and ocr:  # Docling Parse with OCR
-        pipeline_options = PipelineOptions()
-        pipeline_options.do_ocr = True
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = True
-        doc_converter = DocumentConverter(
-            pipeline_options=pipeline_options,
-            pdf_backend=DoclingParseDocumentBackend,
-        )
-    ###########################################################################
+    match backend:
+        case Backend.PYPDFIUM2:
+            do_cell_matching = ocr  # only do cell matching when OCR enabled
+            pdf_backend = PyPdfiumDocumentBackend
+        case Backend.DOCLING:
+            do_cell_matching = True
+            pdf_backend = DoclingParseDocumentBackend
+        case _:
+            raise RuntimeError(f"Unexpected backend type {backend}")
+    pipeline_options = PipelineOptions(
+        do_ocr=ocr,
+        do_table_structure=True,
+    )
+    pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
+    doc_converter = DocumentConverter(
+        pipeline_options=pipeline_options,
+        pdf_backend=pdf_backend,
+    )
     # Define input files
     input = DocumentConversionInput.from_paths(input_doc_paths)

{docling-1.13.0 → docling-1.14.0}/docling/datamodel/document.py RENAMED Viewed

@@ -368,20 +368,30 @@ class ConvertedDocument(BaseModel):
             "table",
             "figure",
         ],
-        page_tagging: bool = True,
-        location_tagging: bool = True,
-        location_dimensions: Tuple[int, int] = (100, 100),
-        add_new_line: bool = True,
+        xsize: int = 100,
+        ysize: int = 100,
+        add_location: bool = True,
+        add_content: bool = True,
+        add_page_index: bool = True,
+        # table specific flags
+        add_table_cell_location: bool = False,
+        add_table_cell_label: bool = True,
+        add_table_cell_text: bool = True,
     ) -> str:
         return self.output.export_to_document_tokens(
             delim=delim,
             main_text_start=main_text_start,
             main_text_stop=main_text_stop,
             main_text_labels=main_text_labels,
-            page_tagging=page_tagging,
-            location_tagging=location_tagging,
-            location_dimensions=location_dimensions,
-            add_new_line=add_new_line,
+            xsize=xsize,
+            ysize=ysize,
+            add_location=add_location,
+            add_content=add_content,
+            add_page_index=add_page_index,
+            # table specific flags
+            add_table_cell_location=add_table_cell_location,
+            add_table_cell_label=add_table_cell_label,
+            add_table_cell_text=add_table_cell_text,
         )
     def render_element_images(

{docling-1.13.0 → docling-1.14.0}/docling/utils/export.py RENAMED Viewed

@@ -111,7 +111,7 @@ def generate_multimodal_pages(
         )
         # No page tagging since we only process one page at a time
         content_dt = doc.export_to_document_tokens(
-            main_text_start=start_ix, main_text_stop=end_ix, page_tagging=False
+            main_text_start=start_ix, main_text_stop=end_ix, add_page_index=False
         )
         return content_text, content_md, content_dt, page_cells, page_segments, page

{docling-1.13.0 → docling-1.14.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "1.13.0"  # DO NOT EDIT, updated automatically
+version = "1.14.0"  # DO NOT EDIT, updated automatically
 description = "Docling PDF conversion package"
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
@@ -23,7 +23,7 @@ packages = [{include = "docling"}]
 [tool.poetry.dependencies]
 python = "^3.10"
 pydantic = "^2.0.0"
-docling-core = "^1.4.0"
+docling-core = "^1.5.0"
 docling-ibm-models = "^1.2.0"
 deepsearch-glm = "^0.21.1"
 filetype = "^1.2.0"