docling 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -201,13 +201,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
201
201
  class PyPdfiumDocumentBackend(PdfDocumentBackend):
202
202
  def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
203
203
  super().__init__(path_or_stream)
204
-
205
- if isinstance(path_or_stream, Path):
206
- self._pdoc = pdfium.PdfDocument(path_or_stream)
207
- elif isinstance(path_or_stream, BytesIO):
208
- self._pdoc = pdfium.PdfDocument(
209
- path_or_stream
210
- ) # TODO Fix me, won't accept bytes.
204
+ self._pdoc = pdfium.PdfDocument(path_or_stream)
211
205
 
212
206
  def page_count(self) -> int:
213
207
  return len(self._pdoc)
@@ -1,11 +1,15 @@
1
1
  import functools
2
2
  import logging
3
+ import tempfile
3
4
  import time
4
5
  import traceback
5
6
  from pathlib import Path
6
7
  from typing import Iterable, Optional, Type, Union
7
8
 
9
+ import requests
10
+ from docling_core.types import Document
8
11
  from PIL import ImageDraw
12
+ from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
9
13
 
10
14
  from docling.backend.abstract_backend import PdfDocumentBackend
11
15
  from docling.datamodel.base_models import (
@@ -32,6 +36,7 @@ _log = logging.getLogger(__name__)
32
36
  class DocumentConverter:
33
37
  _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
34
38
  _table_model_path = "model_artifacts/tableformer"
39
+ _default_download_filename = "file.pdf"
35
40
 
36
41
  def __init__(
37
42
  self,
@@ -80,6 +85,57 @@ class DocumentConverter:
80
85
  # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
81
86
  yield from map(self.process_document, input_batch)
82
87
 
88
+ def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
89
+ """Convert a single document.
90
+
91
+ Args:
92
+ source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
93
+
94
+ Raises:
95
+ ValueError: If source is of unexpected type.
96
+ RuntimeError: If conversion fails.
97
+
98
+ Returns:
99
+ Document: The converted document object.
100
+ """
101
+ with tempfile.TemporaryDirectory() as temp_dir:
102
+ try:
103
+ http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
104
+ res = requests.get(http_url, stream=True)
105
+ res.raise_for_status()
106
+ fname = None
107
+ # try to get filename from response header
108
+ if cont_disp := res.headers.get("Content-Disposition"):
109
+ for par in cont_disp.strip().split(";"):
110
+ # currently only handling directive "filename" (not "*filename")
111
+ if (split := par.split("=")) and split[0].strip() == "filename":
112
+ fname = "=".join(split[1:]).strip().strip("'\"") or None
113
+ break
114
+ # otherwise, use name from URL:
115
+ if fname is None:
116
+ fname = Path(http_url.path).name or self._default_download_filename
117
+ local_path = Path(temp_dir) / fname
118
+ with open(local_path, "wb") as f:
119
+ for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
120
+ f.write(chunk)
121
+ except ValidationError:
122
+ try:
123
+ local_path = TypeAdapter(Path).validate_python(source)
124
+ except ValidationError:
125
+ raise ValueError(
126
+ f"Unexpected file path type encountered: {type(source)}"
127
+ )
128
+ conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
129
+ converted_docs_iter = self.convert(conv_inp)
130
+ converted_doc: ConvertedDocument = next(converted_docs_iter)
131
+ if converted_doc.status not in {
132
+ ConversionStatus.SUCCESS,
133
+ ConversionStatus.SUCCESS_WITH_ERRORS,
134
+ }:
135
+ raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
136
+ doc = converted_doc.to_ds_document()
137
+ return doc
138
+
83
139
  def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
84
140
  start_doc_time = time.time()
85
141
  converted_doc = ConvertedDocument(input=in_doc)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.0.1
3
+ Version: 1.1.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -24,11 +24,13 @@ Provides-Extra: ocr
24
24
  Requires-Dist: deepsearch-glm (>=0.19.0,<1)
25
25
  Requires-Dist: docling-core (>=1.1.0,<2.0.0)
26
26
  Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
27
+ Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
27
28
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
28
29
  Requires-Dist: huggingface_hub (>=0.23,<1)
29
30
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
30
31
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
31
32
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
33
+ Requires-Dist: requests (>=2.32.3,<3.0.0)
32
34
  Project-URL: Repository, https://github.com/DS4SD/docling
33
35
  Description-Content-Type: text/markdown
34
36
 
@@ -64,19 +66,35 @@ To use Docling, simply install `docling` from your package manager, e.g. pip:
64
66
  pip install docling
65
67
  ```
66
68
 
67
- > [!NOTE]
69
+ > [!NOTE]
68
70
  > Works on macOS and Linux environments. Windows platforms are currently not tested.
69
71
 
70
72
  ### Development setup
71
73
 
72
74
  To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
73
75
  ```bash
74
- poetry install
76
+ poetry install --all-extras
75
77
  ```
76
78
 
77
79
  ## Usage
78
80
 
79
- For basic usage, see the [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py) example module. Run with:
81
+ ### Convert a single document
82
+
83
+ To convert individual PDF documents, use `convert_single()`, for example:
84
+ ```python
85
+ from docling.document_converter import DocumentConverter
86
+
87
+ source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL
88
+ converter = DocumentConverter()
89
+ doc = converter.convert_single(source)
90
+ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
91
+ ```
92
+
93
+ ### Convert a batch of documents
94
+
95
+ For an example of converting multiple documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
96
+
97
+ From a local repo clone, you can run it with:
80
98
 
81
99
  ```
82
100
  python examples/convert.py
@@ -92,7 +110,7 @@ You can control if table structure recognition or OCR should be performed by arg
92
110
  doc_converter = DocumentConverter(
93
111
  artifacts_path=artifacts_path,
94
112
  pipeline_options=PipelineOptions(
95
- do_table_structure=False, # controls if table structure is recovered
113
+ do_table_structure=False, # controls if table structure is recovered
96
114
  do_ocr=True, # controls if OCR is applied (ignores programmatic content)
97
115
  ),
98
116
  )
@@ -124,7 +142,7 @@ conv_input = DocumentConversionInput.from_paths(
124
142
  )
125
143
  ```
126
144
 
127
- ### Convert from binary PDF streams
145
+ ### Convert from binary PDF streams
128
146
 
129
147
  You can convert PDFs from a binary stream instead of from the filesystem as follows:
130
148
  ```python
@@ -1,12 +1,12 @@
1
1
  docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  docling/backend/abstract_backend.py,sha256=dINr8oTax9Fq31Y1AR0CGWNZtAHN5aqB_M7TAPkJNVQ,1122
4
- docling/backend/pypdfium2_backend.py,sha256=sJMoActFyc3qdKB6RFly3auHXuXM4noQAG0ypUlj26o,7647
4
+ docling/backend/pypdfium2_backend.py,sha256=cIQGFkwzceN57PzmACt06CytRo0A_t-im6rW804RC3M,7421
5
5
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  docling/datamodel/base_models.py,sha256=k7gLFPnq3ArEMAFz6qUcp5qemlYzVhOmR9qtBTkAiX4,6862
7
7
  docling/datamodel/document.py,sha256=7caefzaii6itMQgtXfA4SJhB1TAF32v1c8zRwbiU03s,12497
8
8
  docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
9
- docling/document_converter.py,sha256=MZw23oPlRmRi1ggzoD1PukUnqo-6boO3RZB06dZ5Xt0,7305
9
+ docling/document_converter.py,sha256=I9vjTLCLahsMrcs9ozM3C5r_CtBN-9qHk7-ANma7fkc,9895
10
10
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
12
12
  docling/models/easyocr_model.py,sha256=NaHVs8IN0eW9KB076E2Kae1s-bq74_4IMWueze9QqtE,2290
@@ -19,7 +19,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFv
19
19
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
21
21
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
22
- docling-1.0.1.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
23
- docling-1.0.1.dist-info/METADATA,sha256=xnNAA9dPt73M-T4icbmxpudwuHFhnCd75aUEs2o4_U0,6113
24
- docling-1.0.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
25
- docling-1.0.1.dist-info/RECORD,,
22
+ docling-1.1.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
23
+ docling-1.1.0.dist-info/METADATA,sha256=mUAryQOsHRejcJ3Qb4zFvRVWpcKX0e4aycnJM_OE0o0,6759
24
+ docling-1.1.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
25
+ docling-1.1.0.dist-info/RECORD,,