docling 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/pypdfium2_backend.py +1 -7
- docling/document_converter.py +56 -0
- {docling-1.0.1.dist-info → docling-1.1.0.dist-info}/METADATA +24 -6
- {docling-1.0.1.dist-info → docling-1.1.0.dist-info}/RECORD +6 -6
- {docling-1.0.1.dist-info → docling-1.1.0.dist-info}/LICENSE +0 -0
- {docling-1.0.1.dist-info → docling-1.1.0.dist-info}/WHEEL +0 -0
@@ -201,13 +201,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
201
201
|
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
202
202
|
def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
|
203
203
|
super().__init__(path_or_stream)
|
204
|
-
|
205
|
-
if isinstance(path_or_stream, Path):
|
206
|
-
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
207
|
-
elif isinstance(path_or_stream, BytesIO):
|
208
|
-
self._pdoc = pdfium.PdfDocument(
|
209
|
-
path_or_stream
|
210
|
-
) # TODO Fix me, won't accept bytes.
|
204
|
+
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
211
205
|
|
212
206
|
def page_count(self) -> int:
|
213
207
|
return len(self._pdoc)
|
docling/document_converter.py
CHANGED
@@ -1,11 +1,15 @@
|
|
1
1
|
import functools
|
2
2
|
import logging
|
3
|
+
import tempfile
|
3
4
|
import time
|
4
5
|
import traceback
|
5
6
|
from pathlib import Path
|
6
7
|
from typing import Iterable, Optional, Type, Union
|
7
8
|
|
9
|
+
import requests
|
10
|
+
from docling_core.types import Document
|
8
11
|
from PIL import ImageDraw
|
12
|
+
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
9
13
|
|
10
14
|
from docling.backend.abstract_backend import PdfDocumentBackend
|
11
15
|
from docling.datamodel.base_models import (
|
@@ -32,6 +36,7 @@ _log = logging.getLogger(__name__)
|
|
32
36
|
class DocumentConverter:
|
33
37
|
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
|
34
38
|
_table_model_path = "model_artifacts/tableformer"
|
39
|
+
_default_download_filename = "file.pdf"
|
35
40
|
|
36
41
|
def __init__(
|
37
42
|
self,
|
@@ -80,6 +85,57 @@ class DocumentConverter:
|
|
80
85
|
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
|
81
86
|
yield from map(self.process_document, input_batch)
|
82
87
|
|
88
|
+
def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
|
89
|
+
"""Convert a single document.
|
90
|
+
|
91
|
+
Args:
|
92
|
+
source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
|
93
|
+
|
94
|
+
Raises:
|
95
|
+
ValueError: If source is of unexpected type.
|
96
|
+
RuntimeError: If conversion fails.
|
97
|
+
|
98
|
+
Returns:
|
99
|
+
Document: The converted document object.
|
100
|
+
"""
|
101
|
+
with tempfile.TemporaryDirectory() as temp_dir:
|
102
|
+
try:
|
103
|
+
http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
|
104
|
+
res = requests.get(http_url, stream=True)
|
105
|
+
res.raise_for_status()
|
106
|
+
fname = None
|
107
|
+
# try to get filename from response header
|
108
|
+
if cont_disp := res.headers.get("Content-Disposition"):
|
109
|
+
for par in cont_disp.strip().split(";"):
|
110
|
+
# currently only handling directive "filename" (not "*filename")
|
111
|
+
if (split := par.split("=")) and split[0].strip() == "filename":
|
112
|
+
fname = "=".join(split[1:]).strip().strip("'\"") or None
|
113
|
+
break
|
114
|
+
# otherwise, use name from URL:
|
115
|
+
if fname is None:
|
116
|
+
fname = Path(http_url.path).name or self._default_download_filename
|
117
|
+
local_path = Path(temp_dir) / fname
|
118
|
+
with open(local_path, "wb") as f:
|
119
|
+
for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
|
120
|
+
f.write(chunk)
|
121
|
+
except ValidationError:
|
122
|
+
try:
|
123
|
+
local_path = TypeAdapter(Path).validate_python(source)
|
124
|
+
except ValidationError:
|
125
|
+
raise ValueError(
|
126
|
+
f"Unexpected file path type encountered: {type(source)}"
|
127
|
+
)
|
128
|
+
conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
|
129
|
+
converted_docs_iter = self.convert(conv_inp)
|
130
|
+
converted_doc: ConvertedDocument = next(converted_docs_iter)
|
131
|
+
if converted_doc.status not in {
|
132
|
+
ConversionStatus.SUCCESS,
|
133
|
+
ConversionStatus.SUCCESS_WITH_ERRORS,
|
134
|
+
}:
|
135
|
+
raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
|
136
|
+
doc = converted_doc.to_ds_document()
|
137
|
+
return doc
|
138
|
+
|
83
139
|
def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
|
84
140
|
start_doc_time = time.time()
|
85
141
|
converted_doc = ConvertedDocument(input=in_doc)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.0
|
3
|
+
Version: 1.1.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -24,11 +24,13 @@ Provides-Extra: ocr
|
|
24
24
|
Requires-Dist: deepsearch-glm (>=0.19.0,<1)
|
25
25
|
Requires-Dist: docling-core (>=1.1.0,<2.0.0)
|
26
26
|
Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
|
27
|
+
Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
|
27
28
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
28
29
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
29
30
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
30
31
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
31
32
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
33
|
+
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
32
34
|
Project-URL: Repository, https://github.com/DS4SD/docling
|
33
35
|
Description-Content-Type: text/markdown
|
34
36
|
|
@@ -64,19 +66,35 @@ To use Docling, simply install `docling` from your package manager, e.g. pip:
|
|
64
66
|
pip install docling
|
65
67
|
```
|
66
68
|
|
67
|
-
> [!NOTE]
|
69
|
+
> [!NOTE]
|
68
70
|
> Works on macOS and Linux environments. Windows platforms are currently not tested.
|
69
71
|
|
70
72
|
### Development setup
|
71
73
|
|
72
74
|
To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
|
73
75
|
```bash
|
74
|
-
poetry install
|
76
|
+
poetry install --all-extras
|
75
77
|
```
|
76
78
|
|
77
79
|
## Usage
|
78
80
|
|
79
|
-
|
81
|
+
### Convert a single document
|
82
|
+
|
83
|
+
To convert individual PDF documents, use `convert_single()`, for example:
|
84
|
+
```python
|
85
|
+
from docling.document_converter import DocumentConverter
|
86
|
+
|
87
|
+
source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL
|
88
|
+
converter = DocumentConverter()
|
89
|
+
doc = converter.convert_single(source)
|
90
|
+
print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
|
91
|
+
```
|
92
|
+
|
93
|
+
### Convert a batch of documents
|
94
|
+
|
95
|
+
For an example of converting multiple documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
|
96
|
+
|
97
|
+
From a local repo clone, you can run it with:
|
80
98
|
|
81
99
|
```
|
82
100
|
python examples/convert.py
|
@@ -92,7 +110,7 @@ You can control if table structure recognition or OCR should be performed by arg
|
|
92
110
|
doc_converter = DocumentConverter(
|
93
111
|
artifacts_path=artifacts_path,
|
94
112
|
pipeline_options=PipelineOptions(
|
95
|
-
do_table_structure=False, # controls if table structure is recovered
|
113
|
+
do_table_structure=False, # controls if table structure is recovered
|
96
114
|
do_ocr=True, # controls if OCR is applied (ignores programmatic content)
|
97
115
|
),
|
98
116
|
)
|
@@ -124,7 +142,7 @@ conv_input = DocumentConversionInput.from_paths(
|
|
124
142
|
)
|
125
143
|
```
|
126
144
|
|
127
|
-
### Convert from binary PDF streams
|
145
|
+
### Convert from binary PDF streams
|
128
146
|
|
129
147
|
You can convert PDFs from a binary stream instead of from the filesystem as follows:
|
130
148
|
```python
|
@@ -1,12 +1,12 @@
|
|
1
1
|
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
3
|
docling/backend/abstract_backend.py,sha256=dINr8oTax9Fq31Y1AR0CGWNZtAHN5aqB_M7TAPkJNVQ,1122
|
4
|
-
docling/backend/pypdfium2_backend.py,sha256=
|
4
|
+
docling/backend/pypdfium2_backend.py,sha256=cIQGFkwzceN57PzmACt06CytRo0A_t-im6rW804RC3M,7421
|
5
5
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
6
|
docling/datamodel/base_models.py,sha256=k7gLFPnq3ArEMAFz6qUcp5qemlYzVhOmR9qtBTkAiX4,6862
|
7
7
|
docling/datamodel/document.py,sha256=7caefzaii6itMQgtXfA4SJhB1TAF32v1c8zRwbiU03s,12497
|
8
8
|
docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
|
9
|
-
docling/document_converter.py,sha256=
|
9
|
+
docling/document_converter.py,sha256=I9vjTLCLahsMrcs9ozM3C5r_CtBN-9qHk7-ANma7fkc,9895
|
10
10
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
11
|
docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
|
12
12
|
docling/models/easyocr_model.py,sha256=NaHVs8IN0eW9KB076E2Kae1s-bq74_4IMWueze9QqtE,2290
|
@@ -19,7 +19,7 @@ docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFv
|
|
19
19
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
20
|
docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
|
21
21
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
22
|
-
docling-1.0.
|
23
|
-
docling-1.0.
|
24
|
-
docling-1.0.
|
25
|
-
docling-1.0.
|
22
|
+
docling-1.1.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
|
23
|
+
docling-1.1.0.dist-info/METADATA,sha256=mUAryQOsHRejcJ3Qb4zFvRVWpcKX0e4aycnJM_OE0o0,6759
|
24
|
+
docling-1.1.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
25
|
+
docling-1.1.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|