docling 0.1.2__tar.gz → 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling-1.5.0/PKG-INFO +192 -0
- docling-1.5.0/README.md +153 -0
- {docling-0.1.2 → docling-1.5.0}/docling/backend/abstract_backend.py +1 -1
- docling-1.5.0/docling/backend/docling_parse_backend.py +187 -0
- {docling-0.1.2 → docling-1.5.0}/docling/backend/pypdfium2_backend.py +5 -9
- {docling-0.1.2 → docling-1.5.0}/docling/datamodel/base_models.py +68 -11
- {docling-0.1.2 → docling-1.5.0}/docling/datamodel/document.py +27 -15
- {docling-0.1.2 → docling-1.5.0}/docling/document_converter.py +77 -6
- {docling-0.1.2 → docling-1.5.0}/docling/models/easyocr_model.py +1 -1
- {docling-0.1.2 → docling-1.5.0}/docling/models/layout_model.py +11 -1
- {docling-0.1.2 → docling-1.5.0}/docling/models/page_assemble_model.py +0 -12
- {docling-0.1.2 → docling-1.5.0}/docling/models/table_structure_model.py +43 -12
- {docling-0.1.2 → docling-1.5.0}/docling/pipeline/standard_model_pipeline.py +1 -1
- {docling-0.1.2 → docling-1.5.0}/pyproject.toml +28 -11
- docling-0.1.2/PKG-INFO +0 -132
- docling-0.1.2/README.md +0 -99
- {docling-0.1.2 → docling-1.5.0}/LICENSE +0 -0
- {docling-0.1.2 → docling-1.5.0}/docling/__init__.py +0 -0
- {docling-0.1.2 → docling-1.5.0}/docling/backend/__init__.py +0 -0
- {docling-0.1.2 → docling-1.5.0}/docling/datamodel/__init__.py +0 -0
- {docling-0.1.2 → docling-1.5.0}/docling/datamodel/settings.py +0 -0
- {docling-0.1.2 → docling-1.5.0}/docling/models/__init__.py +0 -0
- {docling-0.1.2 → docling-1.5.0}/docling/models/ds_glm_model.py +0 -0
- {docling-0.1.2 → docling-1.5.0}/docling/pipeline/__init__.py +0 -0
- {docling-0.1.2 → docling-1.5.0}/docling/pipeline/base_model_pipeline.py +0 -0
- {docling-0.1.2 → docling-1.5.0}/docling/utils/__init__.py +0 -0
- {docling-0.1.2 → docling-1.5.0}/docling/utils/layout_utils.py +0 -0
- {docling-0.1.2 → docling-1.5.0}/docling/utils/utils.py +0 -0
docling-1.5.0/PKG-INFO
ADDED
@@ -0,0 +1,192 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: docling
|
3
|
+
Version: 1.5.0
|
4
|
+
Summary: Docling PDF conversion package
|
5
|
+
Home-page: https://github.com/DS4SD/docling
|
6
|
+
License: MIT
|
7
|
+
Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
|
8
|
+
Author: Christoph Auer
|
9
|
+
Author-email: cau@zurich.ibm.com
|
10
|
+
Requires-Python: >=3.10,<4.0
|
11
|
+
Classifier: Development Status :: 5 - Production/Stable
|
12
|
+
Classifier: Intended Audience :: Developers
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
15
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
16
|
+
Classifier: Operating System :: POSIX :: Linux
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
22
|
+
Provides-Extra: easyocr
|
23
|
+
Provides-Extra: ocr
|
24
|
+
Requires-Dist: certifi (>=2024.7.4)
|
25
|
+
Requires-Dist: deepsearch-glm (>=0.19.0,<1)
|
26
|
+
Requires-Dist: docling-core (>=1.1.2,<2.0.0)
|
27
|
+
Requires-Dist: docling-ibm-models (>=1.1.1,<2.0.0)
|
28
|
+
Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
|
29
|
+
Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
|
30
|
+
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
31
|
+
Requires-Dist: huggingface_hub (>=0.23,<1)
|
32
|
+
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
33
|
+
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
34
|
+
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
35
|
+
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
36
|
+
Project-URL: Repository, https://github.com/DS4SD/docling
|
37
|
+
Description-Content-Type: text/markdown
|
38
|
+
|
39
|
+
<p align="center">
|
40
|
+
<a href="https://github.com/ds4sd/docling">
|
41
|
+
<img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" />
|
42
|
+
</a>
|
43
|
+
</p>
|
44
|
+
|
45
|
+
# Docling
|
46
|
+
|
47
|
+
[](https://arxiv.org/abs/2408.09869)
|
48
|
+
[](https://pypi.org/project/docling/)
|
49
|
+

|
50
|
+
[](https://python-poetry.org/)
|
51
|
+
[](https://github.com/psf/black)
|
52
|
+
[](https://pycqa.github.io/isort/)
|
53
|
+
[](https://pydantic.dev)
|
54
|
+
[](https://github.com/pre-commit/pre-commit)
|
55
|
+
[](https://opensource.org/licenses/MIT)
|
56
|
+
|
57
|
+
Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.
|
58
|
+
|
59
|
+
## Features
|
60
|
+
* ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
|
61
|
+
* 📑 Understands detailed page layout, reading order and recovers table structures
|
62
|
+
* 📝 Extracts metadata from the document, such as title, authors, references and language
|
63
|
+
* 🔍 Optionally applies OCR (use with scanned PDFs)
|
64
|
+
|
65
|
+
## Installation
|
66
|
+
|
67
|
+
To use Docling, simply install `docling` from your package manager, e.g. pip:
|
68
|
+
```bash
|
69
|
+
pip install docling
|
70
|
+
```
|
71
|
+
|
72
|
+
> [!NOTE]
|
73
|
+
> Works on macOS and Linux environments. Windows platforms are currently not tested.
|
74
|
+
|
75
|
+
### Development setup
|
76
|
+
|
77
|
+
To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
|
78
|
+
```bash
|
79
|
+
poetry install --all-extras
|
80
|
+
```
|
81
|
+
|
82
|
+
## Usage
|
83
|
+
|
84
|
+
### Convert a single document
|
85
|
+
|
86
|
+
To convert individual PDF documents, use `convert_single()`, for example:
|
87
|
+
```python
|
88
|
+
from docling.document_converter import DocumentConverter
|
89
|
+
|
90
|
+
source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL
|
91
|
+
converter = DocumentConverter()
|
92
|
+
doc = converter.convert_single(source)
|
93
|
+
print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
|
94
|
+
```
|
95
|
+
|
96
|
+
### Convert a batch of documents
|
97
|
+
|
98
|
+
For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
|
99
|
+
|
100
|
+
From a local repo clone, you can run it with:
|
101
|
+
|
102
|
+
```
|
103
|
+
python examples/batch_convert.py
|
104
|
+
```
|
105
|
+
The output of the above command will be written to `./scratch`.
|
106
|
+
|
107
|
+
### Adjust pipeline features
|
108
|
+
|
109
|
+
The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
|
110
|
+
one can adjust the conversion pipeline and features.
|
111
|
+
|
112
|
+
|
113
|
+
#### Control pipeline options
|
114
|
+
|
115
|
+
You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
|
116
|
+
```python
|
117
|
+
doc_converter = DocumentConverter(
|
118
|
+
artifacts_path=artifacts_path,
|
119
|
+
pipeline_options=PipelineOptions(
|
120
|
+
do_table_structure=False, # controls if table structure is recovered
|
121
|
+
do_ocr=True, # controls if OCR is applied (ignores programmatic content)
|
122
|
+
),
|
123
|
+
)
|
124
|
+
```
|
125
|
+
|
126
|
+
#### Control table extraction options
|
127
|
+
|
128
|
+
You can control if table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself.
|
129
|
+
This can improve output quality if you find that multiple columns in extracted tables are erroneously merged into one.
|
130
|
+
|
131
|
+
|
132
|
+
```python
|
133
|
+
pipeline_options = PipelineOptions(do_table_structure=True)
|
134
|
+
pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
|
135
|
+
|
136
|
+
doc_converter = DocumentConverter(
|
137
|
+
artifacts_path=artifacts_path,
|
138
|
+
pipeline_options=pipeline_options,
|
139
|
+
)
|
140
|
+
```
|
141
|
+
|
142
|
+
### Impose limits on the document size
|
143
|
+
|
144
|
+
You can limit the file size and the number of pages that are allowed to be processed per document:
|
145
|
+
```python
|
146
|
+
conv_input = DocumentConversionInput.from_paths(
|
147
|
+
paths=[Path("./test/data/2206.01062.pdf")],
|
148
|
+
limits=DocumentLimits(max_num_pages=100, max_file_size=20971520)
|
149
|
+
)
|
150
|
+
```
|
151
|
+
|
152
|
+
### Convert from binary PDF streams
|
153
|
+
|
154
|
+
You can convert PDFs from a binary stream instead of from the filesystem as follows:
|
155
|
+
```python
|
156
|
+
buf = BytesIO(your_binary_stream)
|
157
|
+
docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
|
158
|
+
conv_input = DocumentConversionInput.from_streams(docs)
|
159
|
+
converted_docs = doc_converter.convert(conv_input)
|
160
|
+
```
|
161
|
+
### Limit resource usage
|
162
|
+
|
163
|
+
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. By default, 4 CPU threads are used.
|
164
|
+
|
165
|
+
|
166
|
+
## Contributing
|
167
|
+
|
168
|
+
Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
|
169
|
+
|
170
|
+
|
171
|
+
## References
|
172
|
+
|
173
|
+
If you use Docling in your projects, please consider citing the following:
|
174
|
+
|
175
|
+
```bib
|
176
|
+
@techreport{Docling,
|
177
|
+
author = {Deep Search Team},
|
178
|
+
month = {8},
|
179
|
+
title = {{Docling Technical Report}},
|
180
|
+
url={https://arxiv.org/abs/2408.09869},
|
181
|
+
eprint={2408.09869},
|
182
|
+
doi = "10.48550/arXiv.2408.09869",
|
183
|
+
version = {1.0.0},
|
184
|
+
year = {2024}
|
185
|
+
}
|
186
|
+
```
|
187
|
+
|
188
|
+
## License
|
189
|
+
|
190
|
+
The Docling codebase is under MIT license.
|
191
|
+
For individual model usage, please refer to the model licenses found in the original packages.
|
192
|
+
|
docling-1.5.0/README.md
ADDED
@@ -0,0 +1,153 @@
|
|
1
|
+
<p align="center">
|
2
|
+
<a href="https://github.com/ds4sd/docling">
|
3
|
+
<img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" />
|
4
|
+
</a>
|
5
|
+
</p>
|
6
|
+
|
7
|
+
# Docling
|
8
|
+
|
9
|
+
[](https://arxiv.org/abs/2408.09869)
|
10
|
+
[](https://pypi.org/project/docling/)
|
11
|
+

|
12
|
+
[](https://python-poetry.org/)
|
13
|
+
[](https://github.com/psf/black)
|
14
|
+
[](https://pycqa.github.io/isort/)
|
15
|
+
[](https://pydantic.dev)
|
16
|
+
[](https://github.com/pre-commit/pre-commit)
|
17
|
+
[](https://opensource.org/licenses/MIT)
|
18
|
+
|
19
|
+
Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.
|
20
|
+
|
21
|
+
## Features
|
22
|
+
* ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
|
23
|
+
* 📑 Understands detailed page layout, reading order and recovers table structures
|
24
|
+
* 📝 Extracts metadata from the document, such as title, authors, references and language
|
25
|
+
* 🔍 Optionally applies OCR (use with scanned PDFs)
|
26
|
+
|
27
|
+
## Installation
|
28
|
+
|
29
|
+
To use Docling, simply install `docling` from your package manager, e.g. pip:
|
30
|
+
```bash
|
31
|
+
pip install docling
|
32
|
+
```
|
33
|
+
|
34
|
+
> [!NOTE]
|
35
|
+
> Works on macOS and Linux environments. Windows platforms are currently not tested.
|
36
|
+
|
37
|
+
### Development setup
|
38
|
+
|
39
|
+
To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
|
40
|
+
```bash
|
41
|
+
poetry install --all-extras
|
42
|
+
```
|
43
|
+
|
44
|
+
## Usage
|
45
|
+
|
46
|
+
### Convert a single document
|
47
|
+
|
48
|
+
To convert individual PDF documents, use `convert_single()`, for example:
|
49
|
+
```python
|
50
|
+
from docling.document_converter import DocumentConverter
|
51
|
+
|
52
|
+
source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL
|
53
|
+
converter = DocumentConverter()
|
54
|
+
doc = converter.convert_single(source)
|
55
|
+
print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
|
56
|
+
```
|
57
|
+
|
58
|
+
### Convert a batch of documents
|
59
|
+
|
60
|
+
For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
|
61
|
+
|
62
|
+
From a local repo clone, you can run it with:
|
63
|
+
|
64
|
+
```
|
65
|
+
python examples/batch_convert.py
|
66
|
+
```
|
67
|
+
The output of the above command will be written to `./scratch`.
|
68
|
+
|
69
|
+
### Adjust pipeline features
|
70
|
+
|
71
|
+
The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
|
72
|
+
one can adjust the conversion pipeline and features.
|
73
|
+
|
74
|
+
|
75
|
+
#### Control pipeline options
|
76
|
+
|
77
|
+
You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
|
78
|
+
```python
|
79
|
+
doc_converter = DocumentConverter(
|
80
|
+
artifacts_path=artifacts_path,
|
81
|
+
pipeline_options=PipelineOptions(
|
82
|
+
do_table_structure=False, # controls if table structure is recovered
|
83
|
+
do_ocr=True, # controls if OCR is applied (ignores programmatic content)
|
84
|
+
),
|
85
|
+
)
|
86
|
+
```
|
87
|
+
|
88
|
+
#### Control table extraction options
|
89
|
+
|
90
|
+
You can control if table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself.
|
91
|
+
This can improve output quality if you find that multiple columns in extracted tables are erroneously merged into one.
|
92
|
+
|
93
|
+
|
94
|
+
```python
|
95
|
+
pipeline_options = PipelineOptions(do_table_structure=True)
|
96
|
+
pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
|
97
|
+
|
98
|
+
doc_converter = DocumentConverter(
|
99
|
+
artifacts_path=artifacts_path,
|
100
|
+
pipeline_options=pipeline_options,
|
101
|
+
)
|
102
|
+
```
|
103
|
+
|
104
|
+
### Impose limits on the document size
|
105
|
+
|
106
|
+
You can limit the file size and the number of pages that are allowed to be processed per document:
|
107
|
+
```python
|
108
|
+
conv_input = DocumentConversionInput.from_paths(
|
109
|
+
paths=[Path("./test/data/2206.01062.pdf")],
|
110
|
+
limits=DocumentLimits(max_num_pages=100, max_file_size=20971520)
|
111
|
+
)
|
112
|
+
```
|
113
|
+
|
114
|
+
### Convert from binary PDF streams
|
115
|
+
|
116
|
+
You can convert PDFs from a binary stream instead of from the filesystem as follows:
|
117
|
+
```python
|
118
|
+
buf = BytesIO(your_binary_stream)
|
119
|
+
docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
|
120
|
+
conv_input = DocumentConversionInput.from_streams(docs)
|
121
|
+
converted_docs = doc_converter.convert(conv_input)
|
122
|
+
```
|
123
|
+
### Limit resource usage
|
124
|
+
|
125
|
+
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. By default, 4 CPU threads are used.
|
126
|
+
|
127
|
+
|
128
|
+
## Contributing
|
129
|
+
|
130
|
+
Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
|
131
|
+
|
132
|
+
|
133
|
+
## References
|
134
|
+
|
135
|
+
If you use Docling in your projects, please consider citing the following:
|
136
|
+
|
137
|
+
```bib
|
138
|
+
@techreport{Docling,
|
139
|
+
author = {Deep Search Team},
|
140
|
+
month = {8},
|
141
|
+
title = {{Docling Technical Report}},
|
142
|
+
url={https://arxiv.org/abs/2408.09869},
|
143
|
+
eprint={2408.09869},
|
144
|
+
doi = "10.48550/arXiv.2408.09869",
|
145
|
+
version = {1.0.0},
|
146
|
+
year = {2024}
|
147
|
+
}
|
148
|
+
```
|
149
|
+
|
150
|
+
## License
|
151
|
+
|
152
|
+
The Docling codebase is under MIT license.
|
153
|
+
For individual model usage, please refer to the model licenses found in the original packages.
|
@@ -0,0 +1,187 @@
|
|
1
|
+
import logging
|
2
|
+
import random
|
3
|
+
import time
|
4
|
+
from io import BytesIO
|
5
|
+
from pathlib import Path
|
6
|
+
from typing import Iterable, List, Optional, Union
|
7
|
+
|
8
|
+
import pypdfium2 as pdfium
|
9
|
+
from docling_parse.docling_parse import pdf_parser
|
10
|
+
from PIL import Image, ImageDraw
|
11
|
+
from pypdfium2 import PdfPage
|
12
|
+
|
13
|
+
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
14
|
+
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
|
15
|
+
|
16
|
+
_log = logging.getLogger(__name__)
|
17
|
+
|
18
|
+
|
19
|
+
class DoclingParsePageBackend(PdfPageBackend):
    """Page backend combining docling-parse text cells with pypdfium2 rendering.

    Text cells come from the docling-parse page record; the page size and the
    rendered page image come from the pypdfium2 page object.
    """

    def __init__(self, page_obj: PdfPage, docling_page_obj):
        """Keep references to both page representations.

        Args:
            page_obj: pypdfium2 page, used for the page size and rendering.
            docling_page_obj: docling-parse page record. This class reads its
                ``"width"``, ``"height"`` and ``"cells"`` entries; each cell
                is read via ``["box"]["device"]`` and
                ``["content"]["rnormalized"]``.
        """
        super().__init__(page_obj)
        self._ppage = page_obj
        self._dpage = docling_page_obj
        self.text_page = None

    def _to_page_bbox(self, rect, page_size: PageSize, scale=1) -> BoundingBox:
        """Convert a docling-parse cell rectangle into a top-left-origin box.

        The rectangle is rescaled from the parser's page dimensions to the
        pypdfium2 page dimensions, treated as bottom-left-origin coordinates,
        and then flipped to a top-left origin.

        Args:
            rect: Four values unpacked as ``x0, y0, x1, y1``.
            page_size: Size of the page as reported by ``get_size()``.
            scale: Extra multiplier applied to all coordinates.
        """
        x0, y0, x1, y1 = rect
        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]
        return BoundingBox(
            l=x0 * scale * page_size.width / parser_width,
            b=y0 * scale * page_size.height / parser_height,
            r=x1 * scale * page_size.width / parser_width,
            t=y1 * scale * page_size.height / parser_height,
            coord_origin=CoordOrigin.BOTTOMLEFT,
        ).to_top_left_origin(page_size.height * scale)

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the text of all cells that lie mostly inside ``bbox``.

        A cell is included when more than half of its own area intersects
        ``bbox``. Matching cell texts are joined with single spaces in the
        order the parser lists them.

        Args:
            bbox: Query rectangle; it is compared against top-left-origin
                cell boxes.

        Returns:
            The concatenated text, or an empty string if no cell matches.
        """
        text_piece = ""
        page_size = self.get_size()

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        for cell in self._dpage["cells"]:
            cell_bbox = self._to_page_bbox(cell["box"]["device"], page_size, scale)

            # A degenerate (zero-area) cell cannot overlap anything and would
            # otherwise raise ZeroDivisionError below.
            cell_area = cell_bbox.area()
            if cell_area == 0:
                continue

            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_area

            if overlap_frac > 0.5:
                if text_piece:
                    text_piece += " "
                text_piece += cell["content"]["rnormalized"]

        return text_piece

    def get_text_cells(self) -> Iterable[Cell]:
        """Return every parser cell of the page as a ``Cell``.

        Cell ids are the zero-based positions in the parser's cell list, and
        bounding boxes are expressed in page coordinates with a top-left
        origin.
        """
        page_size = self.get_size()

        return [
            Cell(
                id=cell_id,
                text=cell["content"]["rnormalized"],
                bbox=self._to_page_bbox(cell["box"]["device"], page_size),
            )
            for cell_id, cell in enumerate(self._dpage["cells"])
        ]

    def get_page_image(
        self, scale: int = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page, or a cropped region of it, as a PIL image.

        Args:
            scale: Output scale relative to the page size.
            cropbox: Region to render; the whole page when omitted.

        Returns:
            An image of ``round(cropbox.width * scale)`` by
            ``round(cropbox.height * scale)`` pixels.
        """
        page_size = self.get_size()

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # padbox holds the distance of the crop region from each page edge.
            # NOTE(review): if to_bottom_left_origin() can return the same
            # object for a box that is already bottom-left, the two assignments
            # below would also modify the caller's cropbox - confirm.
            padbox = cropbox.to_bottom_left_origin(page_size.height)
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        image = (
            self._ppage.render(
                scale=scale * 1.5,
                rotation=0,  # no additional rotation
                crop=padbox.as_tuple(),
            )
            .to_pil()
            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
        )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> PageSize:
        """Return the page width and height reported by pypdfium2."""
        return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Drop the references to both page objects and the text page."""
        self._ppage = None
        self._dpage = None
        self.text_page = None
|
152
|
+
|
153
|
+
|
154
|
+
class DoclingParseDocumentBackend(PdfDocumentBackend):
    """Document backend that parses text cells with docling-parse.

    The document is opened twice: once with pypdfium2 (for page objects) and
    once with the docling-parse ``pdf_parser`` (for text cells).
    """

    def __init__(self, path_or_stream: Union[BytesIO, Path]):
        """Open the document and run docling-parse over it.

        Args:
            path_or_stream: Filesystem path or in-memory PDF stream.
        """
        super().__init__(path_or_stream)
        self._pdoc = pdfium.PdfDocument(path_or_stream)
        # Parsing cells with docling_parser call
        parser = pdf_parser()

        start_pb_time = time.time()

        if isinstance(path_or_stream, BytesIO):
            self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
        else:
            self._parser_doc = parser.find_cells(str(path_or_stream))

        end_pb_time = time.time() - start_pb_time

        # BytesIO objects have no ``name`` attribute, so fall back to a
        # placeholder instead of raising AttributeError for stream inputs.
        source_name = getattr(path_or_stream, "name", "<stream>")
        _log.info(
            f"Time to parse {source_name} with docling-parse: time={end_pb_time:.3f}"
        )

    def page_count(self) -> int:
        """Return the number of pages found by docling-parse."""
        return len(self._parser_doc["pages"])

    def load_page(self, page_no: int) -> DoclingParsePageBackend:
        """Return a page backend for the page at index ``page_no``.

        The same index is used for the pypdfium2 document and for the
        docling-parse page list.
        """
        return DoclingParsePageBackend(
            self._pdoc[page_no], self._parser_doc["pages"][page_no]
        )

    def is_valid(self) -> bool:
        """Return True when the parsed document has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Close the pypdfium2 document and drop the parsed data.

        Safe to call more than once.
        """
        if self._pdoc is not None:
            self._pdoc.close()
        self._pdoc = None
        self._parser_doc = None
|
@@ -134,7 +134,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
134
134
|
return merged_cells
|
135
135
|
|
136
136
|
def draw_clusters_and_cells():
|
137
|
-
image =
|
137
|
+
image = (
|
138
|
+
self.get_page_image()
|
139
|
+
) # make new image to avoid drawing on the saved ones
|
138
140
|
draw = ImageDraw.Draw(image)
|
139
141
|
for c in cells:
|
140
142
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
@@ -199,15 +201,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
199
201
|
|
200
202
|
|
201
203
|
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
202
|
-
def __init__(self, path_or_stream:
|
204
|
+
def __init__(self, path_or_stream: Union[BytesIO, Path]):
|
203
205
|
super().__init__(path_or_stream)
|
204
|
-
|
205
|
-
if isinstance(path_or_stream, Path):
|
206
|
-
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
207
|
-
elif isinstance(path_or_stream, BytesIO):
|
208
|
-
self._pdoc = pdfium.PdfDocument(
|
209
|
-
path_or_stream
|
210
|
-
) # TODO Fix me, won't accept bytes.
|
206
|
+
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
211
207
|
|
212
208
|
def page_count(self) -> int:
|
213
209
|
return len(self._pdoc)
|
@@ -1,9 +1,12 @@
|
|
1
|
+
import copy
|
2
|
+
import warnings
|
1
3
|
from enum import Enum, auto
|
2
4
|
from io import BytesIO
|
3
|
-
from typing import Any, Dict, List, Optional, Tuple, Union
|
5
|
+
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
|
4
6
|
|
5
7
|
from PIL.Image import Image
|
6
|
-
from pydantic import BaseModel, ConfigDict, model_validator
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
9
|
+
from typing_extensions import Self
|
7
10
|
|
8
11
|
from docling.backend.abstract_backend import PdfPageBackend
|
9
12
|
|
@@ -47,6 +50,15 @@ class BoundingBox(BaseModel):
|
|
47
50
|
def height(self):
|
48
51
|
return abs(self.t - self.b)
|
49
52
|
|
53
|
+
def scaled(self, scale: float) -> "BoundingBox":
    """Return a deep copy of this box with l, r, t and b multiplied by ``scale``.

    The box the method is called on is left unchanged.
    """
    result = copy.deepcopy(self)
    for edge in ("l", "r", "t", "b"):
        setattr(result, edge, getattr(result, edge) * scale)
    return result
|
61
|
+
|
50
62
|
def as_tuple(self):
|
51
63
|
if self.coord_origin == CoordOrigin.TOPLEFT:
|
52
64
|
return (self.l, self.t, self.r, self.b)
|
@@ -180,8 +192,7 @@ class TableStructurePrediction(BaseModel):
|
|
180
192
|
table_map: Dict[int, TableElement] = {}
|
181
193
|
|
182
194
|
|
183
|
-
class TextElement(BasePageElement):
|
184
|
-
...
|
195
|
+
class TextElement(BasePageElement): ...
|
185
196
|
|
186
197
|
|
187
198
|
class FigureData(BaseModel):
|
@@ -225,14 +236,30 @@ class Page(BaseModel):
|
|
225
236
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
226
237
|
|
227
238
|
page_no: int
|
228
|
-
page_hash: str = None
|
229
|
-
size: PageSize = None
|
230
|
-
image: Image = None
|
239
|
+
page_hash: Optional[str] = None
|
240
|
+
size: Optional[PageSize] = None
|
231
241
|
cells: List[Cell] = None
|
232
242
|
predictions: PagePredictions = PagePredictions()
|
233
|
-
assembled: AssembledUnit = None
|
243
|
+
assembled: Optional[AssembledUnit] = None
|
244
|
+
|
245
|
+
_backend: Optional[PdfPageBackend] = (
|
246
|
+
None # Internal PDF backend. By default it is cleared during assembling.
|
247
|
+
)
|
248
|
+
_default_image_scale: float = 1.0 # Default image scale for external usage.
|
249
|
+
_image_cache: Dict[float, Image] = (
|
250
|
+
{}
|
251
|
+
) # Cache of images in different scales. By default it is cleared during assembling.
|
252
|
+
|
253
|
+
def get_image(self, scale: float = 1.0) -> Optional[Image]:
    """Return the page image at ``scale``, rendering and caching it on demand.

    When a backend is attached and no image is cached for ``scale``, the
    image is rendered through the backend and stored in the cache. Without a
    backend only the cache is consulted, so the result is None for a scale
    that was never rendered.
    """
    cache = self._image_cache
    if self._backend is not None and scale not in cache:
        cache[scale] = self._backend.get_page_image(scale=scale)
    return cache.get(scale)
|
234
259
|
|
235
|
-
|
260
|
+
@property
def image(self) -> Optional[Image]:
    """Return ``get_image()`` evaluated at the page's default image scale."""
    default_scale = self._default_image_scale
    return self.get_image(scale=default_scale)
|
236
263
|
|
237
264
|
|
238
265
|
class DocumentStream(BaseModel):
|
@@ -242,6 +269,36 @@ class DocumentStream(BaseModel):
|
|
242
269
|
stream: BytesIO
|
243
270
|
|
244
271
|
|
272
|
+
class TableStructureOptions(BaseModel):
    """Options for table structure recognition."""

    # True: Matches predictions back to PDF cells. Can break table output if PDF cells
    #       are merged across table columns.
    # False: Let table structure model define the text cells, ignore PDF cells.
    do_cell_matching: bool = True
|
279
|
+
|
280
|
+
|
245
281
|
class PipelineOptions(BaseModel):
|
246
|
-
do_table_structure: bool = True
|
247
|
-
do_ocr: bool = False
|
282
|
+
do_table_structure: bool = True # True: perform table structure extraction
|
283
|
+
do_ocr: bool = False # True: perform OCR, replace programmatic PDF text
|
284
|
+
|
285
|
+
table_structure_options: TableStructureOptions = TableStructureOptions()
|
286
|
+
|
287
|
+
|
288
|
+
class AssembleOptions(BaseModel):
    """Options for the assemble step regarding page images."""

    # Deprecated flag kept for backward compatibility; see the validator below.
    keep_page_images: Annotated[
        bool,
        Field(
            deprecated="`keep_page_images` is deprecated, set the value of `images_scale` instead"
        ),
    ] = False  # False: page images are removed in the assemble step
    images_scale: Optional[float] = None  # if set, the scale for generated images

    @model_validator(mode="after")
    def set_page_images_from_deprecated(self) -> Self:
        """Translate the deprecated ``keep_page_images`` flag into ``images_scale``.

        If ``keep_page_images`` is true and no ``images_scale`` was given,
        ``images_scale`` is set to 1.0. An explicitly provided
        ``images_scale`` is never overwritten.
        """
        # Reading the deprecated field here is intentional, so the
        # DeprecationWarning is silenced for the duration of this check.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            default_scale = 1.0
            if self.keep_page_images and self.images_scale is None:
                self.images_scale = default_scale
        return self
|