docling 1.0.2__tar.gz → 1.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-1.0.2 → docling-1.1.1}/PKG-INFO +23 -6
- {docling-1.0.2 → docling-1.1.1}/README.md +21 -5
- {docling-1.0.2 → docling-1.1.1}/docling/backend/pypdfium2_backend.py +1 -7
- {docling-1.0.2 → docling-1.1.1}/docling/document_converter.py +56 -0
- {docling-1.0.2 → docling-1.1.1}/docling/models/table_structure_model.py +5 -2
- {docling-1.0.2 → docling-1.1.1}/pyproject.toml +2 -1
- {docling-1.0.2 → docling-1.1.1}/LICENSE +0 -0
- {docling-1.0.2 → docling-1.1.1}/docling/__init__.py +0 -0
- {docling-1.0.2 → docling-1.1.1}/docling/backend/__init__.py +0 -0
- {docling-1.0.2 → docling-1.1.1}/docling/backend/abstract_backend.py +0 -0
- {docling-1.0.2 → docling-1.1.1}/docling/datamodel/__init__.py +0 -0
- {docling-1.0.2 → docling-1.1.1}/docling/datamodel/base_models.py +0 -0
- {docling-1.0.2 → docling-1.1.1}/docling/datamodel/document.py +0 -0
- {docling-1.0.2 → docling-1.1.1}/docling/datamodel/settings.py +0 -0
- {docling-1.0.2 → docling-1.1.1}/docling/models/__init__.py +0 -0
- {docling-1.0.2 → docling-1.1.1}/docling/models/ds_glm_model.py +0 -0
- {docling-1.0.2 → docling-1.1.1}/docling/models/easyocr_model.py +0 -0
- {docling-1.0.2 → docling-1.1.1}/docling/models/layout_model.py +0 -0
- {docling-1.0.2 → docling-1.1.1}/docling/models/page_assemble_model.py +0 -0
- {docling-1.0.2 → docling-1.1.1}/docling/pipeline/__init__.py +0 -0
- {docling-1.0.2 → docling-1.1.1}/docling/pipeline/base_model_pipeline.py +0 -0
- {docling-1.0.2 → docling-1.1.1}/docling/pipeline/standard_model_pipeline.py +0 -0
- {docling-1.0.2 → docling-1.1.1}/docling/utils/__init__.py +0 -0
- {docling-1.0.2 → docling-1.1.1}/docling/utils/layout_utils.py +0 -0
- {docling-1.0.2 → docling-1.1.1}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.0.2
|
3
|
+
Version: 1.1.1
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -30,6 +30,7 @@ Requires-Dist: huggingface_hub (>=0.23,<1)
|
|
30
30
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
31
31
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
32
32
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
33
|
+
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
33
34
|
Project-URL: Repository, https://github.com/DS4SD/docling
|
34
35
|
Description-Content-Type: text/markdown
|
35
36
|
|
@@ -65,19 +66,35 @@ To use Docling, simply install `docling` from your package manager, e.g. pip:
|
|
65
66
|
pip install docling
|
66
67
|
```
|
67
68
|
|
68
|
-
> [!NOTE]
|
69
|
+
> [!NOTE]
|
69
70
|
> Works on macOS and Linux environments. Windows platforms are currently not tested.
|
70
71
|
|
71
72
|
### Development setup
|
72
73
|
|
73
74
|
To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
|
74
75
|
```bash
|
75
|
-
poetry install
|
76
|
+
poetry install --all-extras
|
76
77
|
```
|
77
78
|
|
78
79
|
## Usage
|
79
80
|
|
80
|
-
|
81
|
+
### Convert a single document
|
82
|
+
|
83
|
+
To convert individual PDF documents, use `convert_single()`, for example:
|
84
|
+
```python
|
85
|
+
from docling.document_converter import DocumentConverter
|
86
|
+
|
87
|
+
source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL
|
88
|
+
converter = DocumentConverter()
|
89
|
+
doc = converter.convert_single(source)
|
90
|
+
print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
|
91
|
+
```
|
92
|
+
|
93
|
+
### Convert a batch of documents
|
94
|
+
|
95
|
+
For an example of converting multiple documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
|
96
|
+
|
97
|
+
From a local repo clone, you can run it with:
|
81
98
|
|
82
99
|
```
|
83
100
|
python examples/convert.py
|
@@ -93,7 +110,7 @@ You can control if table structure recognition or OCR should be performed by arg
|
|
93
110
|
doc_converter = DocumentConverter(
|
94
111
|
artifacts_path=artifacts_path,
|
95
112
|
pipeline_options=PipelineOptions(
|
96
|
-
do_table_structure=False, # controls if table structure is recovered
|
113
|
+
do_table_structure=False, # controls if table structure is recovered
|
97
114
|
do_ocr=True, # controls if OCR is applied (ignores programmatic content)
|
98
115
|
),
|
99
116
|
)
|
@@ -125,7 +142,7 @@ conv_input = DocumentConversionInput.from_paths(
|
|
125
142
|
)
|
126
143
|
```
|
127
144
|
|
128
|
-
### Convert from binary PDF streams
|
145
|
+
### Convert from binary PDF streams
|
129
146
|
|
130
147
|
You can convert PDFs from a binary stream instead of from the filesystem as follows:
|
131
148
|
```python
|
@@ -30,19 +30,35 @@ To use Docling, simply install `docling` from your package manager, e.g. pip:
|
|
30
30
|
pip install docling
|
31
31
|
```
|
32
32
|
|
33
|
-
> [!NOTE]
|
33
|
+
> [!NOTE]
|
34
34
|
> Works on macOS and Linux environments. Windows platforms are currently not tested.
|
35
35
|
|
36
36
|
### Development setup
|
37
37
|
|
38
38
|
To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
|
39
39
|
```bash
|
40
|
-
poetry install
|
40
|
+
poetry install --all-extras
|
41
41
|
```
|
42
42
|
|
43
43
|
## Usage
|
44
44
|
|
45
|
-
|
45
|
+
### Convert a single document
|
46
|
+
|
47
|
+
To convert individual PDF documents, use `convert_single()`, for example:
|
48
|
+
```python
|
49
|
+
from docling.document_converter import DocumentConverter
|
50
|
+
|
51
|
+
source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL
|
52
|
+
converter = DocumentConverter()
|
53
|
+
doc = converter.convert_single(source)
|
54
|
+
print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
|
55
|
+
```
|
56
|
+
|
57
|
+
### Convert a batch of documents
|
58
|
+
|
59
|
+
For an example of converting multiple documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
|
60
|
+
|
61
|
+
From a local repo clone, you can run it with:
|
46
62
|
|
47
63
|
```
|
48
64
|
python examples/convert.py
|
@@ -58,7 +74,7 @@ You can control if table structure recognition or OCR should be performed by arg
|
|
58
74
|
doc_converter = DocumentConverter(
|
59
75
|
artifacts_path=artifacts_path,
|
60
76
|
pipeline_options=PipelineOptions(
|
61
|
-
do_table_structure=False, # controls if table structure is recovered
|
77
|
+
do_table_structure=False, # controls if table structure is recovered
|
62
78
|
do_ocr=True, # controls if OCR is applied (ignores programmatic content)
|
63
79
|
),
|
64
80
|
)
|
@@ -90,7 +106,7 @@ conv_input = DocumentConversionInput.from_paths(
|
|
90
106
|
)
|
91
107
|
```
|
92
108
|
|
93
|
-
### Convert from binary PDF streams
|
109
|
+
### Convert from binary PDF streams
|
94
110
|
|
95
111
|
You can convert PDFs from a binary stream instead of from the filesystem as follows:
|
96
112
|
```python
|
@@ -201,13 +201,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
201
201
|
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
202
202
|
def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
|
203
203
|
super().__init__(path_or_stream)
|
204
|
-
|
205
|
-
if isinstance(path_or_stream, Path):
|
206
|
-
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
207
|
-
elif isinstance(path_or_stream, BytesIO):
|
208
|
-
self._pdoc = pdfium.PdfDocument(
|
209
|
-
path_or_stream
|
210
|
-
) # TODO Fix me, won't accept bytes.
|
204
|
+
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
211
205
|
|
212
206
|
def page_count(self) -> int:
|
213
207
|
return len(self._pdoc)
|
@@ -1,11 +1,15 @@
|
|
1
1
|
import functools
|
2
2
|
import logging
|
3
|
+
import tempfile
|
3
4
|
import time
|
4
5
|
import traceback
|
5
6
|
from pathlib import Path
|
6
7
|
from typing import Iterable, Optional, Type, Union
|
7
8
|
|
9
|
+
import requests
|
10
|
+
from docling_core.types import Document
|
8
11
|
from PIL import ImageDraw
|
12
|
+
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
9
13
|
|
10
14
|
from docling.backend.abstract_backend import PdfDocumentBackend
|
11
15
|
from docling.datamodel.base_models import (
|
@@ -32,6 +36,7 @@ _log = logging.getLogger(__name__)
|
|
32
36
|
class DocumentConverter:
|
33
37
|
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
|
34
38
|
_table_model_path = "model_artifacts/tableformer"
|
39
|
+
_default_download_filename = "file.pdf"
|
35
40
|
|
36
41
|
def __init__(
|
37
42
|
self,
|
@@ -80,6 +85,57 @@ class DocumentConverter:
|
|
80
85
|
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
|
81
86
|
yield from map(self.process_document, input_batch)
|
82
87
|
|
88
|
+
def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
|
89
|
+
"""Convert a single document.
|
90
|
+
|
91
|
+
Args:
|
92
|
+
source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
|
93
|
+
|
94
|
+
Raises:
|
95
|
+
ValueError: If source is of unexpected type.
|
96
|
+
RuntimeError: If conversion fails.
|
97
|
+
|
98
|
+
Returns:
|
99
|
+
Document: The converted document object.
|
100
|
+
"""
|
101
|
+
with tempfile.TemporaryDirectory() as temp_dir:
|
102
|
+
try:
|
103
|
+
http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
|
104
|
+
res = requests.get(http_url, stream=True)
|
105
|
+
res.raise_for_status()
|
106
|
+
fname = None
|
107
|
+
# try to get filename from response header
|
108
|
+
if cont_disp := res.headers.get("Content-Disposition"):
|
109
|
+
for par in cont_disp.strip().split(";"):
|
110
|
+
# currently only handling directive "filename" (not "*filename")
|
111
|
+
if (split := par.split("=")) and split[0].strip() == "filename":
|
112
|
+
fname = "=".join(split[1:]).strip().strip("'\"") or None
|
113
|
+
break
|
114
|
+
# otherwise, use name from URL:
|
115
|
+
if fname is None:
|
116
|
+
fname = Path(http_url.path).name or self._default_download_filename
|
117
|
+
local_path = Path(temp_dir) / fname
|
118
|
+
with open(local_path, "wb") as f:
|
119
|
+
for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
|
120
|
+
f.write(chunk)
|
121
|
+
except ValidationError:
|
122
|
+
try:
|
123
|
+
local_path = TypeAdapter(Path).validate_python(source)
|
124
|
+
except ValidationError:
|
125
|
+
raise ValueError(
|
126
|
+
f"Unexpected file path type encountered: {type(source)}"
|
127
|
+
)
|
128
|
+
conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
|
129
|
+
converted_docs_iter = self.convert(conv_inp)
|
130
|
+
converted_doc: ConvertedDocument = next(converted_docs_iter)
|
131
|
+
if converted_doc.status not in {
|
132
|
+
ConversionStatus.SUCCESS,
|
133
|
+
ConversionStatus.SUCCESS_WITH_ERRORS,
|
134
|
+
}:
|
135
|
+
raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
|
136
|
+
doc = converted_doc.to_ds_document()
|
137
|
+
return doc
|
138
|
+
|
83
139
|
def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
|
84
140
|
start_doc_time = time.time()
|
85
141
|
converted_doc = ConvertedDocument(input=in_doc)
|
@@ -114,12 +114,15 @@ class TableStructureModel:
|
|
114
114
|
for element in table_out["tf_responses"]:
|
115
115
|
|
116
116
|
if not self.do_cell_matching:
|
117
|
-
the_bbox = BoundingBox.model_validate(
|
117
|
+
the_bbox = BoundingBox.model_validate(
|
118
|
+
element["bbox"]
|
119
|
+
).scaled(1 / self.scale)
|
118
120
|
text_piece = page._backend.get_text_in_rect(the_bbox)
|
119
121
|
element["bbox"]["token"] = text_piece
|
120
122
|
|
121
123
|
tc = TableCell.model_validate(element)
|
122
|
-
|
124
|
+
if self.do_cell_matching:
|
125
|
+
tc.bbox = tc.bbox.scaled(1 / self.scale)
|
123
126
|
table_cells.append(tc)
|
124
127
|
|
125
128
|
# Retrieving cols/rows, after post processing:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "1.0.2"
|
3
|
+
version = "1.1.1" # DO NOT EDIT, updated automatically
|
4
4
|
description = "Docling PDF conversion package"
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -30,6 +30,7 @@ filetype = "^1.2.0"
|
|
30
30
|
pypdfium2 = "^4.30.0"
|
31
31
|
pydantic-settings = "^2.3.0"
|
32
32
|
huggingface_hub = ">=0.23,<1"
|
33
|
+
requests = "^2.32.3"
|
33
34
|
easyocr = { version = "^1.7", optional = true }
|
34
35
|
|
35
36
|
[tool.poetry.group.dev.dependencies]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|