docling 1.1.2__tar.gz → 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-1.1.2 → docling-1.5.0}/PKG-INFO +20 -11
- {docling-1.1.2 → docling-1.5.0}/README.md +16 -9
- {docling-1.1.2 → docling-1.5.0}/docling/backend/abstract_backend.py +1 -1
- docling-1.5.0/docling/backend/docling_parse_backend.py +187 -0
- {docling-1.1.2 → docling-1.5.0}/docling/backend/pypdfium2_backend.py +4 -2
- {docling-1.1.2 → docling-1.5.0}/docling/datamodel/base_models.py +44 -7
- {docling-1.1.2 → docling-1.5.0}/docling/datamodel/document.py +19 -4
- {docling-1.1.2 → docling-1.5.0}/docling/document_converter.py +21 -6
- {docling-1.1.2 → docling-1.5.0}/docling/models/easyocr_model.py +1 -1
- {docling-1.1.2 → docling-1.5.0}/docling/models/layout_model.py +11 -1
- {docling-1.1.2 → docling-1.5.0}/docling/models/table_structure_model.py +4 -8
- {docling-1.1.2 → docling-1.5.0}/pyproject.toml +4 -2
- {docling-1.1.2 → docling-1.5.0}/LICENSE +0 -0
- {docling-1.1.2 → docling-1.5.0}/docling/__init__.py +0 -0
- {docling-1.1.2 → docling-1.5.0}/docling/backend/__init__.py +0 -0
- {docling-1.1.2 → docling-1.5.0}/docling/datamodel/__init__.py +0 -0
- {docling-1.1.2 → docling-1.5.0}/docling/datamodel/settings.py +0 -0
- {docling-1.1.2 → docling-1.5.0}/docling/models/__init__.py +0 -0
- {docling-1.1.2 → docling-1.5.0}/docling/models/ds_glm_model.py +0 -0
- {docling-1.1.2 → docling-1.5.0}/docling/models/page_assemble_model.py +0 -0
- {docling-1.1.2 → docling-1.5.0}/docling/pipeline/__init__.py +0 -0
- {docling-1.1.2 → docling-1.5.0}/docling/pipeline/base_model_pipeline.py +0 -0
- {docling-1.1.2 → docling-1.5.0}/docling/pipeline/standard_model_pipeline.py +0 -0
- {docling-1.1.2 → docling-1.5.0}/docling/utils/__init__.py +0 -0
- {docling-1.1.2 → docling-1.5.0}/docling/utils/layout_utils.py +0 -0
- {docling-1.1.2 → docling-1.5.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.1.2
|
3
|
+
Version: 1.5.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -21,9 +21,11 @@ Classifier: Programming Language :: Python :: 3.12
|
|
21
21
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
22
22
|
Provides-Extra: easyocr
|
23
23
|
Provides-Extra: ocr
|
24
|
+
Requires-Dist: certifi (>=2024.7.4)
|
24
25
|
Requires-Dist: deepsearch-glm (>=0.19.0,<1)
|
25
26
|
Requires-Dist: docling-core (>=1.1.2,<2.0.0)
|
26
|
-
Requires-Dist: docling-ibm-models (>=1.1.
|
27
|
+
Requires-Dist: docling-ibm-models (>=1.1.1,<2.0.0)
|
28
|
+
Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
|
27
29
|
Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
|
28
30
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
29
31
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
@@ -42,6 +44,7 @@ Description-Content-Type: text/markdown
|
|
42
44
|
|
43
45
|
# Docling
|
44
46
|
|
47
|
+
[](https://arxiv.org/abs/2408.09869)
|
45
48
|
[](https://pypi.org/project/docling/)
|
46
49
|

|
47
50
|
[](https://python-poetry.org/)
|
@@ -92,17 +95,21 @@ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotate
|
|
92
95
|
|
93
96
|
### Convert a batch of documents
|
94
97
|
|
95
|
-
For an example of batch-converting documents, see [
|
98
|
+
For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
|
96
99
|
|
97
100
|
From a local repo clone, you can run it with:
|
98
101
|
|
99
102
|
```
|
100
|
-
python examples/
|
103
|
+
python examples/batch_convert.py
|
101
104
|
```
|
102
105
|
The output of the above command will be written to `./scratch`.
|
103
106
|
|
104
107
|
### Adjust pipeline features
|
105
108
|
|
109
|
+
The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
|
110
|
+
one can adjust the conversion pipeline and features.
|
111
|
+
|
112
|
+
|
106
113
|
#### Control pipeline options
|
107
114
|
|
108
115
|
You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
|
@@ -166,13 +173,15 @@ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main
|
|
166
173
|
If you use Docling in your projects, please consider citing the following:
|
167
174
|
|
168
175
|
```bib
|
169
|
-
@
|
170
|
-
author = {Deep Search Team},
|
171
|
-
month = {
|
172
|
-
title = {{Docling}},
|
173
|
-
url
|
174
|
-
|
175
|
-
|
176
|
+
@techreport{Docling,
|
177
|
+
author = {Deep Search Team},
|
178
|
+
month = {8},
|
179
|
+
title = {{Docling Technical Report}},
|
180
|
+
url={https://arxiv.org/abs/2408.09869},
|
181
|
+
eprint={2408.09869},
|
182
|
+
doi = "10.48550/arXiv.2408.09869",
|
183
|
+
version = {1.0.0},
|
184
|
+
year = {2024}
|
176
185
|
}
|
177
186
|
```
|
178
187
|
|
@@ -6,6 +6,7 @@
|
|
6
6
|
|
7
7
|
# Docling
|
8
8
|
|
9
|
+
[](https://arxiv.org/abs/2408.09869)
|
9
10
|
[](https://pypi.org/project/docling/)
|
10
11
|

|
11
12
|
[](https://python-poetry.org/)
|
@@ -56,17 +57,21 @@ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotate
|
|
56
57
|
|
57
58
|
### Convert a batch of documents
|
58
59
|
|
59
|
-
For an example of batch-converting documents, see [
|
60
|
+
For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
|
60
61
|
|
61
62
|
From a local repo clone, you can run it with:
|
62
63
|
|
63
64
|
```
|
64
|
-
python examples/
|
65
|
+
python examples/batch_convert.py
|
65
66
|
```
|
66
67
|
The output of the above command will be written to `./scratch`.
|
67
68
|
|
68
69
|
### Adjust pipeline features
|
69
70
|
|
71
|
+
The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
|
72
|
+
one can adjust the conversion pipeline and features.
|
73
|
+
|
74
|
+
|
70
75
|
#### Control pipeline options
|
71
76
|
|
72
77
|
You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
|
@@ -130,13 +135,15 @@ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main
|
|
130
135
|
If you use Docling in your projects, please consider citing the following:
|
131
136
|
|
132
137
|
```bib
|
133
|
-
@
|
134
|
-
author = {Deep Search Team},
|
135
|
-
month = {
|
136
|
-
title = {{Docling}},
|
137
|
-
url
|
138
|
-
|
139
|
-
|
138
|
+
@techreport{Docling,
|
139
|
+
author = {Deep Search Team},
|
140
|
+
month = {8},
|
141
|
+
title = {{Docling Technical Report}},
|
142
|
+
url={https://arxiv.org/abs/2408.09869},
|
143
|
+
eprint={2408.09869},
|
144
|
+
doi = "10.48550/arXiv.2408.09869",
|
145
|
+
version = {1.0.0},
|
146
|
+
year = {2024}
|
140
147
|
}
|
141
148
|
```
|
142
149
|
|
@@ -0,0 +1,187 @@
|
|
1
|
+
import logging
|
2
|
+
import random
|
3
|
+
import time
|
4
|
+
from io import BytesIO
|
5
|
+
from pathlib import Path
|
6
|
+
from typing import Iterable, List, Optional, Union
|
7
|
+
|
8
|
+
import pypdfium2 as pdfium
|
9
|
+
from docling_parse.docling_parse import pdf_parser
|
10
|
+
from PIL import Image, ImageDraw
|
11
|
+
from pypdfium2 import PdfPage
|
12
|
+
|
13
|
+
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
14
|
+
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
|
15
|
+
|
16
|
+
_log = logging.getLogger(__name__)
|
17
|
+
|
18
|
+
|
19
|
+
class DoclingParsePageBackend(PdfPageBackend):
|
20
|
+
def __init__(self, page_obj: PdfPage, docling_page_obj):
|
21
|
+
super().__init__(page_obj)
|
22
|
+
self._ppage = page_obj
|
23
|
+
self._dpage = docling_page_obj
|
24
|
+
self.text_page = None
|
25
|
+
|
26
|
+
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
27
|
+
# Find intersecting cells on the page
|
28
|
+
text_piece = ""
|
29
|
+
page_size = self.get_size()
|
30
|
+
parser_width = self._dpage["width"]
|
31
|
+
parser_height = self._dpage["height"]
|
32
|
+
|
33
|
+
scale = (
|
34
|
+
1 # FIX - Replace with param in get_text_in_rect across backends (optional)
|
35
|
+
)
|
36
|
+
|
37
|
+
for i in range(len(self._dpage["cells"])):
|
38
|
+
rect = self._dpage["cells"][i]["box"]["device"]
|
39
|
+
x0, y0, x1, y1 = rect
|
40
|
+
cell_bbox = BoundingBox(
|
41
|
+
l=x0 * scale * page_size.width / parser_width,
|
42
|
+
b=y0 * scale * page_size.height / parser_height,
|
43
|
+
r=x1 * scale * page_size.width / parser_width,
|
44
|
+
t=y1 * scale * page_size.height / parser_height,
|
45
|
+
coord_origin=CoordOrigin.BOTTOMLEFT,
|
46
|
+
).to_top_left_origin(page_size.height * scale)
|
47
|
+
|
48
|
+
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
|
49
|
+
|
50
|
+
if overlap_frac > 0.5:
|
51
|
+
if len(text_piece) > 0:
|
52
|
+
text_piece += " "
|
53
|
+
text_piece += self._dpage["cells"][i]["content"]["rnormalized"]
|
54
|
+
|
55
|
+
return text_piece
|
56
|
+
|
57
|
+
def get_text_cells(self) -> Iterable[Cell]:
|
58
|
+
cells = []
|
59
|
+
cell_counter = 0
|
60
|
+
|
61
|
+
page_size = self.get_size()
|
62
|
+
|
63
|
+
parser_width = self._dpage["width"]
|
64
|
+
parser_height = self._dpage["height"]
|
65
|
+
|
66
|
+
for i in range(len(self._dpage["cells"])):
|
67
|
+
rect = self._dpage["cells"][i]["box"]["device"]
|
68
|
+
x0, y0, x1, y1 = rect
|
69
|
+
text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
|
70
|
+
cells.append(
|
71
|
+
Cell(
|
72
|
+
id=cell_counter,
|
73
|
+
text=text_piece,
|
74
|
+
bbox=BoundingBox(
|
75
|
+
# l=x0, b=y0, r=x1, t=y1,
|
76
|
+
l=x0 * page_size.width / parser_width,
|
77
|
+
b=y0 * page_size.height / parser_height,
|
78
|
+
r=x1 * page_size.width / parser_width,
|
79
|
+
t=y1 * page_size.height / parser_height,
|
80
|
+
coord_origin=CoordOrigin.BOTTOMLEFT,
|
81
|
+
).to_top_left_origin(page_size.height),
|
82
|
+
)
|
83
|
+
)
|
84
|
+
cell_counter += 1
|
85
|
+
|
86
|
+
def draw_clusters_and_cells():
|
87
|
+
image = (
|
88
|
+
self.get_page_image()
|
89
|
+
) # make new image to avoid drawing on the saved ones
|
90
|
+
draw = ImageDraw.Draw(image)
|
91
|
+
for c in cells:
|
92
|
+
x0, y0, x1, y1 = c.bbox.as_tuple()
|
93
|
+
cell_color = (
|
94
|
+
random.randint(30, 140),
|
95
|
+
random.randint(30, 140),
|
96
|
+
random.randint(30, 140),
|
97
|
+
)
|
98
|
+
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
99
|
+
image.show()
|
100
|
+
|
101
|
+
# before merge:
|
102
|
+
# draw_clusters_and_cells()
|
103
|
+
|
104
|
+
# cells = merge_horizontal_cells(cells)
|
105
|
+
|
106
|
+
# after merge:
|
107
|
+
# draw_clusters_and_cells()
|
108
|
+
|
109
|
+
return cells
|
110
|
+
|
111
|
+
def get_page_image(
|
112
|
+
self, scale: int = 1, cropbox: Optional[BoundingBox] = None
|
113
|
+
) -> Image.Image:
|
114
|
+
|
115
|
+
page_size = self.get_size()
|
116
|
+
|
117
|
+
if not cropbox:
|
118
|
+
cropbox = BoundingBox(
|
119
|
+
l=0,
|
120
|
+
r=page_size.width,
|
121
|
+
t=0,
|
122
|
+
b=page_size.height,
|
123
|
+
coord_origin=CoordOrigin.TOPLEFT,
|
124
|
+
)
|
125
|
+
padbox = BoundingBox(
|
126
|
+
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
|
127
|
+
)
|
128
|
+
else:
|
129
|
+
padbox = cropbox.to_bottom_left_origin(page_size.height)
|
130
|
+
padbox.r = page_size.width - padbox.r
|
131
|
+
padbox.t = page_size.height - padbox.t
|
132
|
+
|
133
|
+
image = (
|
134
|
+
self._ppage.render(
|
135
|
+
scale=scale * 1.5,
|
136
|
+
rotation=0, # no additional rotation
|
137
|
+
crop=padbox.as_tuple(),
|
138
|
+
)
|
139
|
+
.to_pil()
|
140
|
+
.resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
|
141
|
+
) # We resize the image from 1.5x the given scale to make it sharper.
|
142
|
+
|
143
|
+
return image
|
144
|
+
|
145
|
+
def get_size(self) -> PageSize:
|
146
|
+
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
|
147
|
+
|
148
|
+
def unload(self):
|
149
|
+
self._ppage = None
|
150
|
+
self._dpage = None
|
151
|
+
self.text_page = None
|
152
|
+
|
153
|
+
|
154
|
+
class DoclingParseDocumentBackend(PdfDocumentBackend):
|
155
|
+
def __init__(self, path_or_stream: Union[BytesIO, Path]):
|
156
|
+
super().__init__(path_or_stream)
|
157
|
+
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
158
|
+
# Parsing cells with docling_parser call
|
159
|
+
parser = pdf_parser()
|
160
|
+
|
161
|
+
start_pb_time = time.time()
|
162
|
+
|
163
|
+
if isinstance(path_or_stream, BytesIO):
|
164
|
+
self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
|
165
|
+
else:
|
166
|
+
self._parser_doc = parser.find_cells(str(path_or_stream))
|
167
|
+
|
168
|
+
end_pb_time = time.time() - start_pb_time
|
169
|
+
_log.info(
|
170
|
+
f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
|
171
|
+
)
|
172
|
+
|
173
|
+
def page_count(self) -> int:
|
174
|
+
return len(self._parser_doc["pages"])
|
175
|
+
|
176
|
+
def load_page(self, page_no: int) -> PdfPage:
|
177
|
+
return DoclingParsePageBackend(
|
178
|
+
self._pdoc[page_no], self._parser_doc["pages"][page_no]
|
179
|
+
)
|
180
|
+
|
181
|
+
def is_valid(self) -> bool:
|
182
|
+
return self.page_count() > 0
|
183
|
+
|
184
|
+
def unload(self):
|
185
|
+
self._pdoc.close()
|
186
|
+
self._pdoc = None
|
187
|
+
self._parser_doc = None
|
@@ -134,7 +134,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
134
134
|
return merged_cells
|
135
135
|
|
136
136
|
def draw_clusters_and_cells():
|
137
|
-
image =
|
137
|
+
image = (
|
138
|
+
self.get_page_image()
|
139
|
+
) # make new image to avoid drawing on the saved ones
|
138
140
|
draw = ImageDraw.Draw(image)
|
139
141
|
for c in cells:
|
140
142
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
@@ -199,7 +201,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
199
201
|
|
200
202
|
|
201
203
|
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
202
|
-
def __init__(self, path_or_stream:
|
204
|
+
def __init__(self, path_or_stream: Union[BytesIO, Path]):
|
203
205
|
super().__init__(path_or_stream)
|
204
206
|
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
205
207
|
|
@@ -1,10 +1,12 @@
|
|
1
1
|
import copy
|
2
|
+
import warnings
|
2
3
|
from enum import Enum, auto
|
3
4
|
from io import BytesIO
|
4
|
-
from typing import Any, Dict, List, Optional, Tuple, Union
|
5
|
+
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
|
5
6
|
|
6
7
|
from PIL.Image import Image
|
7
|
-
from pydantic import BaseModel, ConfigDict, model_validator
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
9
|
+
from typing_extensions import Self
|
8
10
|
|
9
11
|
from docling.backend.abstract_backend import PdfPageBackend
|
10
12
|
|
@@ -234,14 +236,30 @@ class Page(BaseModel):
|
|
234
236
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
235
237
|
|
236
238
|
page_no: int
|
237
|
-
page_hash: str = None
|
238
|
-
size: PageSize = None
|
239
|
-
image: Image = None
|
239
|
+
page_hash: Optional[str] = None
|
240
|
+
size: Optional[PageSize] = None
|
240
241
|
cells: List[Cell] = None
|
241
242
|
predictions: PagePredictions = PagePredictions()
|
242
|
-
assembled: AssembledUnit = None
|
243
|
+
assembled: Optional[AssembledUnit] = None
|
243
244
|
|
244
|
-
_backend: PdfPageBackend =
|
245
|
+
_backend: Optional[PdfPageBackend] = (
|
246
|
+
None # Internal PDF backend. By default it is cleared during assembling.
|
247
|
+
)
|
248
|
+
_default_image_scale: float = 1.0 # Default image scale for external usage.
|
249
|
+
_image_cache: Dict[float, Image] = (
|
250
|
+
{}
|
251
|
+
) # Cache of images in different scales. By default it is cleared during assembling.
|
252
|
+
|
253
|
+
def get_image(self, scale: float = 1.0) -> Optional[Image]:
|
254
|
+
if self._backend is None:
|
255
|
+
return self._image_cache.get(scale, None)
|
256
|
+
if not scale in self._image_cache:
|
257
|
+
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
|
258
|
+
return self._image_cache[scale]
|
259
|
+
|
260
|
+
@property
|
261
|
+
def image(self) -> Optional[Image]:
|
262
|
+
return self.get_image(scale=self._default_image_scale)
|
245
263
|
|
246
264
|
|
247
265
|
class DocumentStream(BaseModel):
|
@@ -265,3 +283,22 @@ class PipelineOptions(BaseModel):
|
|
265
283
|
do_ocr: bool = False # True: perform OCR, replace programmatic PDF text
|
266
284
|
|
267
285
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
286
|
+
|
287
|
+
|
288
|
+
class AssembleOptions(BaseModel):
|
289
|
+
keep_page_images: Annotated[
|
290
|
+
bool,
|
291
|
+
Field(
|
292
|
+
deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
|
293
|
+
),
|
294
|
+
] = False # False: page images are removed in the assemble step
|
295
|
+
images_scale: Optional[float] = None # if set, the scale for generated images
|
296
|
+
|
297
|
+
@model_validator(mode="after")
|
298
|
+
def set_page_images_from_deprecated(self) -> Self:
|
299
|
+
with warnings.catch_warnings():
|
300
|
+
warnings.simplefilter("ignore", DeprecationWarning)
|
301
|
+
default_scale = 1.0
|
302
|
+
if self.keep_page_images and self.images_scale is None:
|
303
|
+
self.images_scale = default_scale
|
304
|
+
return self
|
@@ -1,7 +1,7 @@
|
|
1
1
|
import logging
|
2
2
|
from io import BytesIO
|
3
3
|
from pathlib import Path, PurePath
|
4
|
-
from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
|
4
|
+
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
|
5
5
|
|
6
6
|
from docling_core.types import BaseCell, BaseText
|
7
7
|
from docling_core.types import BoundingBox as DsBoundingBox
|
@@ -14,13 +14,14 @@ from docling_core.types import TableCell
|
|
14
14
|
from pydantic import BaseModel
|
15
15
|
|
16
16
|
from docling.backend.abstract_backend import PdfDocumentBackend
|
17
|
-
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
17
|
+
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
18
18
|
from docling.datamodel.base_models import (
|
19
19
|
AssembledUnit,
|
20
20
|
ConversionStatus,
|
21
21
|
DocumentStream,
|
22
22
|
FigureElement,
|
23
23
|
Page,
|
24
|
+
PageElement,
|
24
25
|
TableElement,
|
25
26
|
TextElement,
|
26
27
|
)
|
@@ -64,7 +65,7 @@ class InputDocument(BaseModel):
|
|
64
65
|
path_or_stream: Union[BytesIO, Path],
|
65
66
|
filename: Optional[str] = None,
|
66
67
|
limits: Optional[DocumentLimits] = None,
|
67
|
-
pdf_backend=PyPdfiumDocumentBackend,
|
68
|
+
pdf_backend=DoclingParseDocumentBackend,
|
68
69
|
):
|
69
70
|
super().__init__()
|
70
71
|
|
@@ -302,13 +303,27 @@ class ConvertedDocument(BaseModel):
|
|
302
303
|
else:
|
303
304
|
return ""
|
304
305
|
|
306
|
+
def render_element_images(
|
307
|
+
self, element_types: Tuple[PageElement] = (FigureElement,)
|
308
|
+
):
|
309
|
+
for element in self.assembled.elements:
|
310
|
+
if isinstance(element, element_types):
|
311
|
+
page_ix = element.page_no
|
312
|
+
scale = self.pages[page_ix]._default_image_scale
|
313
|
+
crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
|
314
|
+
page_height=self.pages[page_ix].size.height * scale
|
315
|
+
)
|
316
|
+
|
317
|
+
cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
|
318
|
+
yield element, cropped_im
|
319
|
+
|
305
320
|
|
306
321
|
class DocumentConversionInput(BaseModel):
|
307
322
|
|
308
323
|
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
|
309
324
|
limits: Optional[DocumentLimits] = DocumentLimits()
|
310
325
|
|
311
|
-
DEFAULT_BACKEND: ClassVar = PyPdfiumDocumentBackend
|
326
|
+
DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
|
312
327
|
|
313
328
|
def docs(
|
314
329
|
self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
|
@@ -14,6 +14,7 @@ from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
|
14
14
|
from docling.backend.abstract_backend import PdfDocumentBackend
|
15
15
|
from docling.datamodel.base_models import (
|
16
16
|
AssembledUnit,
|
17
|
+
AssembleOptions,
|
17
18
|
ConversionStatus,
|
18
19
|
Page,
|
19
20
|
PipelineOptions,
|
@@ -44,6 +45,7 @@ class DocumentConverter:
|
|
44
45
|
pipeline_options: PipelineOptions = PipelineOptions(),
|
45
46
|
pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
|
46
47
|
pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
|
48
|
+
assemble_options: AssembleOptions = AssembleOptions(),
|
47
49
|
):
|
48
50
|
if not artifacts_path:
|
49
51
|
artifacts_path = self.download_models_hf()
|
@@ -57,6 +59,7 @@ class DocumentConverter:
|
|
57
59
|
self.page_assemble_model = PageAssembleModel(config={})
|
58
60
|
self.glm_model = GlmModel(config={})
|
59
61
|
self.pdf_backend = pdf_backend
|
62
|
+
self.assemble_options = assemble_options
|
60
63
|
|
61
64
|
@staticmethod
|
62
65
|
def download_models_hf(
|
@@ -174,17 +177,21 @@ class DocumentConverter:
|
|
174
177
|
pages_with_images,
|
175
178
|
)
|
176
179
|
|
180
|
+
# 4. Run pipeline stages
|
177
181
|
pipeline_pages = self.model_pipeline.apply(pages_with_cells)
|
178
182
|
|
179
|
-
#
|
183
|
+
# 5. Assemble page elements (per page)
|
180
184
|
assembled_pages = self.page_assemble_model(pipeline_pages)
|
181
185
|
|
182
186
|
# exhaust assembled_pages
|
183
187
|
for assembled_page in assembled_pages:
|
184
188
|
# Free up mem resources before moving on with next batch
|
185
|
-
|
186
|
-
|
187
|
-
|
189
|
+
|
190
|
+
# Remove page images (can be disabled)
|
191
|
+
if self.assemble_options.images_scale is None:
|
192
|
+
assembled_page._image_cache = {}
|
193
|
+
|
194
|
+
# Unload backend
|
188
195
|
assembled_page._backend.unload()
|
189
196
|
|
190
197
|
all_assembled_pages.append(assembled_page)
|
@@ -222,7 +229,15 @@ class DocumentConverter:
|
|
222
229
|
|
223
230
|
# Generate the page image and store it in the page object
|
224
231
|
def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
|
225
|
-
|
232
|
+
# default scale
|
233
|
+
page.get_image(scale=1.0)
|
234
|
+
|
235
|
+
# user requested scales
|
236
|
+
if self.assemble_options.images_scale is not None:
|
237
|
+
page._default_image_scale = self.assemble_options.images_scale
|
238
|
+
page.get_image(
|
239
|
+
scale=self.assemble_options.images_scale
|
240
|
+
) # this will trigger storing the image in the internal cache
|
226
241
|
|
227
242
|
return page
|
228
243
|
|
@@ -238,7 +253,7 @@ class DocumentConverter:
|
|
238
253
|
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
239
254
|
image.show()
|
240
255
|
|
241
|
-
# draw_text_boxes(page.
|
256
|
+
# draw_text_boxes(page.get_image(scale=1.0), cells)
|
242
257
|
|
243
258
|
return page
|
244
259
|
|
@@ -30,7 +30,7 @@ class EasyOcrModel:
|
|
30
30
|
|
31
31
|
for page in page_batch:
|
32
32
|
# rects = page._fpage.
|
33
|
-
high_res_image = page.
|
33
|
+
high_res_image = page.get_image(scale=self.scale)
|
34
34
|
im = numpy.array(high_res_image)
|
35
35
|
result = self.reader.readtext(im)
|
36
36
|
|
@@ -69,6 +69,10 @@ class LayoutModel:
|
|
69
69
|
"Key-Value Region": 0.45,
|
70
70
|
}
|
71
71
|
|
72
|
+
CLASS_REMAPPINGS = {
|
73
|
+
"Document Index": "Table",
|
74
|
+
}
|
75
|
+
|
72
76
|
_log.debug("================= Start postprocess function ====================")
|
73
77
|
start_time = time.time()
|
74
78
|
# Apply Confidence Threshold to cluster predictions
|
@@ -79,6 +83,10 @@ class LayoutModel:
|
|
79
83
|
confidence = CLASS_THRESHOLDS[cluster.label]
|
80
84
|
if cluster.confidence >= confidence:
|
81
85
|
# annotation["created_by"] = "high_conf_pred"
|
86
|
+
|
87
|
+
# Remap class labels where needed.
|
88
|
+
if cluster.label in CLASS_REMAPPINGS.keys():
|
89
|
+
cluster.label = CLASS_REMAPPINGS[cluster.label]
|
82
90
|
clusters_out.append(cluster)
|
83
91
|
|
84
92
|
# map to dictionary clusters and cells, with bottom left origin
|
@@ -259,7 +267,9 @@ class LayoutModel:
|
|
259
267
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
260
268
|
for page in page_batch:
|
261
269
|
clusters = []
|
262
|
-
for ix, pred_item in enumerate(
|
270
|
+
for ix, pred_item in enumerate(
|
271
|
+
self.layout_predictor.predict(page.get_image(scale=1.0))
|
272
|
+
):
|
263
273
|
cluster = Cluster(
|
264
274
|
id=ix,
|
265
275
|
label=pred_item["label"],
|
@@ -34,7 +34,9 @@ class TableStructureModel:
|
|
34
34
|
self.scale = 2.0 # Scale up table input images to 144 dpi
|
35
35
|
|
36
36
|
def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
|
37
|
-
image =
|
37
|
+
image = (
|
38
|
+
page._backend.get_page_image()
|
39
|
+
) # make new image to avoid drawing on the saved ones
|
38
40
|
draw = ImageDraw.Draw(image)
|
39
41
|
|
40
42
|
for table_element in tbl_list:
|
@@ -94,13 +96,7 @@ class TableStructureModel:
|
|
94
96
|
"width": page.size.width * self.scale,
|
95
97
|
"height": page.size.height * self.scale,
|
96
98
|
}
|
97
|
-
|
98
|
-
if self.scale == 1.0:
|
99
|
-
page_input["image"] = numpy.asarray(page.image)
|
100
|
-
else: # render new page image on the fly at desired scale
|
101
|
-
page_input["image"] = numpy.asarray(
|
102
|
-
page._backend.get_page_image(scale=self.scale)
|
103
|
-
)
|
99
|
+
page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
|
104
100
|
|
105
101
|
table_clusters, table_bboxes = zip(*in_tables)
|
106
102
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "1.1.2"
|
3
|
+
version = "1.5.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "Docling PDF conversion package"
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -24,7 +24,7 @@ packages = [{include = "docling"}]
|
|
24
24
|
python = "^3.10"
|
25
25
|
pydantic = "^2.0.0"
|
26
26
|
docling-core = "^1.1.2"
|
27
|
-
docling-ibm-models = "^1.1.
|
27
|
+
docling-ibm-models = "^1.1.1"
|
28
28
|
deepsearch-glm = ">=0.19.0,<1"
|
29
29
|
filetype = "^1.2.0"
|
30
30
|
pypdfium2 = "^4.30.0"
|
@@ -32,6 +32,8 @@ pydantic-settings = "^2.3.0"
|
|
32
32
|
huggingface_hub = ">=0.23,<1"
|
33
33
|
requests = "^2.32.3"
|
34
34
|
easyocr = { version = "^1.7", optional = true }
|
35
|
+
docling-parse = "^0.2.0"
|
36
|
+
certifi = ">=2024.7.4"
|
35
37
|
|
36
38
|
[tool.poetry.group.dev.dependencies]
|
37
39
|
black = {extras = ["jupyter"], version = "^24.4.2"}
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|