PyPI - docling - Versions diffs - 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

docling 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

docling/backend/pypdfium2_backend.py CHANGED Viewed

@@ -201,13 +201,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
 class PyPdfiumDocumentBackend(PdfDocumentBackend):
     def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
         super().__init__(path_or_stream)
-        if isinstance(path_or_stream, Path):
-            self._pdoc = pdfium.PdfDocument(path_or_stream)
-        elif isinstance(path_or_stream, BytesIO):
-            self._pdoc = pdfium.PdfDocument(
-                path_or_stream
-            )  # TODO Fix me, won't accept bytes.
+        self._pdoc = pdfium.PdfDocument(path_or_stream)
     def page_count(self) -> int:
         return len(self._pdoc)

docling/document_converter.py CHANGED Viewed

@@ -1,11 +1,15 @@
 import functools
 import logging
+import tempfile
 import time
 import traceback
 from pathlib import Path
 from typing import Iterable, Optional, Type, Union
+import requests
+from docling_core.types import Document
 from PIL import ImageDraw
+from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
 from docling.backend.abstract_backend import PdfDocumentBackend
 from docling.datamodel.base_models import (
@@ -32,6 +36,7 @@ _log = logging.getLogger(__name__)
 class DocumentConverter:
     _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
     _table_model_path = "model_artifacts/tableformer"
+    _default_download_filename = "file.pdf"
     def __init__(
         self,
@@ -80,6 +85,57 @@ class DocumentConverter:
             # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
             yield from map(self.process_document, input_batch)
+    def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
+        """Convert a single document.
+        Args:
+            source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
+        Raises:
+            ValueError: If source is of unexpected type.
+            RuntimeError: If conversion fails.
+        Returns:
+            Document: The converted document object.
+        """
+        with tempfile.TemporaryDirectory() as temp_dir:
+            try:
+                http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
+                res = requests.get(http_url, stream=True)
+                res.raise_for_status()
+                fname = None
+                # try to get filename from response header
+                if cont_disp := res.headers.get("Content-Disposition"):
+                    for par in cont_disp.strip().split(";"):
+                        # currently only handling directive "filename" (not "*filename")
+                        if (split := par.split("=")) and split[0].strip() == "filename":
+                            fname = "=".join(split[1:]).strip().strip("'\"") or None
+                            break
+                # otherwise, use name from URL:
+                if fname is None:
+                    fname = Path(http_url.path).name or self._default_download_filename
+                local_path = Path(temp_dir) / fname
+                with open(local_path, "wb") as f:
+                    for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
+                        f.write(chunk)
+            except ValidationError:
+                try:
+                    local_path = TypeAdapter(Path).validate_python(source)
+                except ValidationError:
+                    raise ValueError(
+                        f"Unexpected file path type encountered: {type(source)}"
+                    )
+            conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
+            converted_docs_iter = self.convert(conv_inp)
+            converted_doc: ConvertedDocument = next(converted_docs_iter)
+        if converted_doc.status not in {
+            ConversionStatus.SUCCESS,
+            ConversionStatus.SUCCESS_WITH_ERRORS,
+        }:
+            raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
+        doc = converted_doc.to_ds_document()
+        return doc
     def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
         start_doc_time = time.time()
         converted_doc = ConvertedDocument(input=in_doc)

{docling-1.0.1.dist-info → docling-1.1.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 1.0.1
+Version: 1.1.0
 Summary: Docling PDF conversion package
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -24,11 +24,13 @@ Provides-Extra: ocr
 Requires-Dist: deepsearch-glm (>=0.19.0,<1)
 Requires-Dist: docling-core (>=1.1.0,<2.0.0)
 Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
+Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
 Requires-Dist: filetype (>=1.2.0,<2.0.0)
 Requires-Dist: huggingface_hub (>=0.23,<1)
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
 Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
+Requires-Dist: requests (>=2.32.3,<3.0.0)
 Project-URL: Repository, https://github.com/DS4SD/docling
 Description-Content-Type: text/markdown
@@ -64,19 +66,35 @@ To use Docling, simply install `docling` from your package manager, e.g. pip:
 pip install docling
 ```
-> [!NOTE]
+> [!NOTE]
 > Works on macOS and Linux environments. Windows platforms are currently not tested.
 ### Development setup
 To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
 ```bash
-poetry install
+poetry install --all-extras
 ```
 ## Usage
-For basic usage, see the [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py) example module. Run with:
+### Convert a single document
+To convert individual PDF documents, use `convert_single()`, for example:
+```python
+from docling.document_converter import DocumentConverter
+source = "https://arxiv.org/pdf/2206.01062"  # PDF path or URL
+converter = DocumentConverter()
+doc = converter.convert_single(source)
+print(doc.export_to_markdown())  # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
+```
+### Convert a batch of documents
+For an example of converting multiple documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
+From a local repo clone, you can run it with:
 ```
 python examples/convert.py
@@ -92,7 +110,7 @@ You can control if table structure recognition or OCR should be performed by arg
 doc_converter = DocumentConverter(
     artifacts_path=artifacts_path,
     pipeline_options=PipelineOptions(
-        do_table_structure=False,  # controls if table structure is recovered
+        do_table_structure=False,  # controls if table structure is recovered
         do_ocr=True,  # controls if OCR is applied (ignores programmatic content)
     ),
 )
@@ -124,7 +142,7 @@ conv_input = DocumentConversionInput.from_paths(
 )
 ```
-### Convert from binary PDF streams
+### Convert from binary PDF streams
 You can convert PDFs from a binary stream instead of from the filesystem as follows:
 ```python

{docling-1.0.1.dist-info → docling-1.1.0.dist-info}/RECORD RENAMED Viewed

@@ -1,12 +1,12 @@
 docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/abstract_backend.py,sha256=dINr8oTax9Fq31Y1AR0CGWNZtAHN5aqB_M7TAPkJNVQ,1122
-docling/backend/pypdfium2_backend.py,sha256=sJMoActFyc3qdKB6RFly3auHXuXM4noQAG0ypUlj26o,7647
+docling/backend/pypdfium2_backend.py,sha256=cIQGFkwzceN57PzmACt06CytRo0A_t-im6rW804RC3M,7421
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/datamodel/base_models.py,sha256=k7gLFPnq3ArEMAFz6qUcp5qemlYzVhOmR9qtBTkAiX4,6862
 docling/datamodel/document.py,sha256=7caefzaii6itMQgtXfA4SJhB1TAF32v1c8zRwbiU03s,12497
 docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
-docling/document_converter.py,sha256=MZw23oPlRmRi1ggzoD1PukUnqo-6boO3RZB06dZ5Xt0,7305
+docling/document_converter.py,sha256=I9vjTLCLahsMrcs9ozM3C5r_CtBN-9qHk7-ANma7fkc,9895
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
 docling/models/easyocr_model.py,sha256=NaHVs8IN0eW9KB076E2Kae1s-bq74_4IMWueze9QqtE,2290
@@ -19,7 +19,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFv
 docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
 docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
-docling-1.0.1.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
-docling-1.0.1.dist-info/METADATA,sha256=xnNAA9dPt73M-T4icbmxpudwuHFhnCd75aUEs2o4_U0,6113
-docling-1.0.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling-1.0.1.dist-info/RECORD,,
+docling-1.1.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
+docling-1.1.0.dist-info/METADATA,sha256=mUAryQOsHRejcJ3Qb4zFvRVWpcKX0e4aycnJM_OE0o0,6759
+docling-1.1.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-1.1.0.dist-info/RECORD,,

{docling-1.0.1.dist-info → docling-1.1.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling-1.0.1.dist-info → docling-1.1.0.dist-info}/WHEEL RENAMED Viewed

File without changes

docling 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

docling 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl