docling 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_backend.py +15 -1
- docling/backend/pypdfium2_backend.py +3 -1
- docling/datamodel/base_models.py +41 -10
- docling/datamodel/document.py +16 -1
- docling/document_converter.py +12 -6
- docling/models/easyocr_model.py +1 -1
- docling/models/layout_model.py +3 -1
- docling/models/table_structure_model.py +4 -8
- {docling-1.4.0.dist-info → docling-1.5.0.dist-info}/METADATA +11 -8
- {docling-1.4.0.dist-info → docling-1.5.0.dist-info}/RECORD +12 -12
- {docling-1.4.0.dist-info → docling-1.5.0.dist-info}/LICENSE +0 -0
- {docling-1.4.0.dist-info → docling-1.5.0.dist-info}/WHEEL +0 -0
docling/backend/docling_parse_backend.py
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
+
import logging
|
1
2
|
import random
|
3
|
+
import time
|
2
4
|
from io import BytesIO
|
3
5
|
from pathlib import Path
|
4
6
|
from typing import Iterable, List, Optional, Union
|
@@ -11,6 +13,8 @@ from pypdfium2 import PdfPage
|
|
11
13
|
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
12
14
|
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
|
13
15
|
|
16
|
+
_log = logging.getLogger(__name__)
|
17
|
+
|
14
18
|
|
15
19
|
class DoclingParsePageBackend(PdfPageBackend):
|
16
20
|
def __init__(self, page_obj: PdfPage, docling_page_obj):
|
@@ -80,7 +84,9 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
80
84
|
cell_counter += 1
|
81
85
|
|
82
86
|
def draw_clusters_and_cells():
|
83
|
-
image =
|
87
|
+
image = (
|
88
|
+
self.get_page_image()
|
89
|
+
) # make new image to avoid drawing on the saved ones
|
84
90
|
draw = ImageDraw.Draw(image)
|
85
91
|
for c in cells:
|
86
92
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
@@ -151,11 +157,19 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
|
|
151
157
|
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
152
158
|
# Parsing cells with docling_parser call
|
153
159
|
parser = pdf_parser()
|
160
|
+
|
161
|
+
start_pb_time = time.time()
|
162
|
+
|
154
163
|
if isinstance(path_or_stream, BytesIO):
|
155
164
|
self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
|
156
165
|
else:
|
157
166
|
self._parser_doc = parser.find_cells(str(path_or_stream))
|
158
167
|
|
168
|
+
end_pb_time = time.time() - start_pb_time
|
169
|
+
_log.info(
|
170
|
+
f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
|
171
|
+
)
|
172
|
+
|
159
173
|
def page_count(self) -> int:
|
160
174
|
return len(self._parser_doc["pages"])
|
161
175
|
|
docling/backend/pypdfium2_backend.py
CHANGED
@@ -134,7 +134,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
134
134
|
return merged_cells
|
135
135
|
|
136
136
|
def draw_clusters_and_cells():
|
137
|
-
image =
|
137
|
+
image = (
|
138
|
+
self.get_page_image()
|
139
|
+
) # make new image to avoid drawing on the saved ones
|
138
140
|
draw = ImageDraw.Draw(image)
|
139
141
|
for c in cells:
|
140
142
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
docling/datamodel/base_models.py
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
import copy
|
2
|
+
import warnings
|
2
3
|
from enum import Enum, auto
|
3
4
|
from io import BytesIO
|
4
|
-
from typing import Any, Dict, List, Optional, Tuple, Union
|
5
|
+
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
|
5
6
|
|
6
7
|
from PIL.Image import Image
|
7
|
-
from pydantic import BaseModel, ConfigDict, model_validator
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
9
|
+
from typing_extensions import Self
|
8
10
|
|
9
11
|
from docling.backend.abstract_backend import PdfPageBackend
|
10
12
|
|
@@ -234,14 +236,30 @@ class Page(BaseModel):
|
|
234
236
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
235
237
|
|
236
238
|
page_no: int
|
237
|
-
page_hash: str = None
|
238
|
-
size: PageSize = None
|
239
|
-
image: Image = None
|
239
|
+
page_hash: Optional[str] = None
|
240
|
+
size: Optional[PageSize] = None
|
240
241
|
cells: List[Cell] = None
|
241
242
|
predictions: PagePredictions = PagePredictions()
|
242
|
-
assembled: AssembledUnit = None
|
243
|
+
assembled: Optional[AssembledUnit] = None
|
243
244
|
|
244
|
-
_backend: PdfPageBackend =
|
245
|
+
_backend: Optional[PdfPageBackend] = (
|
246
|
+
None # Internal PDF backend. By default it is cleared during assembling.
|
247
|
+
)
|
248
|
+
_default_image_scale: float = 1.0 # Default image scale for external usage.
|
249
|
+
_image_cache: Dict[float, Image] = (
|
250
|
+
{}
|
251
|
+
) # Cache of images in different scales. By default it is cleared during assembling.
|
252
|
+
|
253
|
+
def get_image(self, scale: float = 1.0) -> Optional[Image]:
|
254
|
+
if self._backend is None:
|
255
|
+
return self._image_cache.get(scale, None)
|
256
|
+
if not scale in self._image_cache:
|
257
|
+
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
|
258
|
+
return self._image_cache[scale]
|
259
|
+
|
260
|
+
@property
|
261
|
+
def image(self) -> Optional[Image]:
|
262
|
+
return self.get_image(scale=self._default_image_scale)
|
245
263
|
|
246
264
|
|
247
265
|
class DocumentStream(BaseModel):
|
@@ -268,6 +286,19 @@ class PipelineOptions(BaseModel):
|
|
268
286
|
|
269
287
|
|
270
288
|
class AssembleOptions(BaseModel):
|
271
|
-
keep_page_images:
|
272
|
-
|
273
|
-
|
289
|
+
keep_page_images: Annotated[
|
290
|
+
bool,
|
291
|
+
Field(
|
292
|
+
deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
|
293
|
+
),
|
294
|
+
] = False # False: page images are removed in the assemble step
|
295
|
+
images_scale: Optional[float] = None # if set, the scale for generated images
|
296
|
+
|
297
|
+
@model_validator(mode="after")
|
298
|
+
def set_page_images_from_deprecated(self) -> Self:
|
299
|
+
with warnings.catch_warnings():
|
300
|
+
warnings.simplefilter("ignore", DeprecationWarning)
|
301
|
+
default_scale = 1.0
|
302
|
+
if self.keep_page_images and self.images_scale is None:
|
303
|
+
self.images_scale = default_scale
|
304
|
+
return self
|
docling/datamodel/document.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
import logging
|
2
2
|
from io import BytesIO
|
3
3
|
from pathlib import Path, PurePath
|
4
|
-
from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
|
4
|
+
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
|
5
5
|
|
6
6
|
from docling_core.types import BaseCell, BaseText
|
7
7
|
from docling_core.types import BoundingBox as DsBoundingBox
|
@@ -21,6 +21,7 @@ from docling.datamodel.base_models import (
|
|
21
21
|
DocumentStream,
|
22
22
|
FigureElement,
|
23
23
|
Page,
|
24
|
+
PageElement,
|
24
25
|
TableElement,
|
25
26
|
TextElement,
|
26
27
|
)
|
@@ -302,6 +303,20 @@ class ConvertedDocument(BaseModel):
|
|
302
303
|
else:
|
303
304
|
return ""
|
304
305
|
|
306
|
+
def render_element_images(
|
307
|
+
self, element_types: Tuple[PageElement] = (FigureElement,)
|
308
|
+
):
|
309
|
+
for element in self.assembled.elements:
|
310
|
+
if isinstance(element, element_types):
|
311
|
+
page_ix = element.page_no
|
312
|
+
scale = self.pages[page_ix]._default_image_scale
|
313
|
+
crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
|
314
|
+
page_height=self.pages[page_ix].size.height * scale
|
315
|
+
)
|
316
|
+
|
317
|
+
cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
|
318
|
+
yield element, cropped_im
|
319
|
+
|
305
320
|
|
306
321
|
class DocumentConversionInput(BaseModel):
|
307
322
|
|
docling/document_converter.py
CHANGED
@@ -188,10 +188,8 @@ class DocumentConverter:
|
|
188
188
|
# Free up mem resources before moving on with next batch
|
189
189
|
|
190
190
|
# Remove page images (can be disabled)
|
191
|
-
if
|
192
|
-
assembled_page.
|
193
|
-
None # Comment this if you want to visualize page images
|
194
|
-
)
|
191
|
+
if self.assemble_options.images_scale is None:
|
192
|
+
assembled_page._image_cache = {}
|
195
193
|
|
196
194
|
# Unload backend
|
197
195
|
assembled_page._backend.unload()
|
@@ -231,7 +229,15 @@ class DocumentConverter:
|
|
231
229
|
|
232
230
|
# Generate the page image and store it in the page object
|
233
231
|
def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
|
234
|
-
|
232
|
+
# default scale
|
233
|
+
page.get_image(scale=1.0)
|
234
|
+
|
235
|
+
# user requested scales
|
236
|
+
if self.assemble_options.images_scale is not None:
|
237
|
+
page._default_image_scale = self.assemble_options.images_scale
|
238
|
+
page.get_image(
|
239
|
+
scale=self.assemble_options.images_scale
|
240
|
+
) # this will trigger storing the image in the internal cache
|
235
241
|
|
236
242
|
return page
|
237
243
|
|
@@ -247,7 +253,7 @@ class DocumentConverter:
|
|
247
253
|
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
248
254
|
image.show()
|
249
255
|
|
250
|
-
# draw_text_boxes(page.
|
256
|
+
# draw_text_boxes(page.get_image(scale=1.0), cells)
|
251
257
|
|
252
258
|
return page
|
253
259
|
|
docling/models/easyocr_model.py
CHANGED
@@ -30,7 +30,7 @@ class EasyOcrModel:
|
|
30
30
|
|
31
31
|
for page in page_batch:
|
32
32
|
# rects = page._fpage.
|
33
|
-
high_res_image = page.
|
33
|
+
high_res_image = page.get_image(scale=self.scale)
|
34
34
|
im = numpy.array(high_res_image)
|
35
35
|
result = self.reader.readtext(im)
|
36
36
|
|
docling/models/layout_model.py
CHANGED
@@ -267,7 +267,9 @@ class LayoutModel:
|
|
267
267
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
268
268
|
for page in page_batch:
|
269
269
|
clusters = []
|
270
|
-
for ix, pred_item in enumerate(
|
270
|
+
for ix, pred_item in enumerate(
|
271
|
+
self.layout_predictor.predict(page.get_image(scale=1.0))
|
272
|
+
):
|
271
273
|
cluster = Cluster(
|
272
274
|
id=ix,
|
273
275
|
label=pred_item["label"],
|
docling/models/table_structure_model.py
CHANGED
@@ -34,7 +34,9 @@ class TableStructureModel:
|
|
34
34
|
self.scale = 2.0 # Scale up table input images to 144 dpi
|
35
35
|
|
36
36
|
def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
|
37
|
-
image =
|
37
|
+
image = (
|
38
|
+
page._backend.get_page_image()
|
39
|
+
) # make new image to avoid drawing on the saved ones
|
38
40
|
draw = ImageDraw.Draw(image)
|
39
41
|
|
40
42
|
for table_element in tbl_list:
|
@@ -94,13 +96,7 @@ class TableStructureModel:
|
|
94
96
|
"width": page.size.width * self.scale,
|
95
97
|
"height": page.size.height * self.scale,
|
96
98
|
}
|
97
|
-
|
98
|
-
if self.scale == 1.0:
|
99
|
-
page_input["image"] = numpy.asarray(page.image)
|
100
|
-
else: # render new page image on the fly at desired scale
|
101
|
-
page_input["image"] = numpy.asarray(
|
102
|
-
page._backend.get_page_image(scale=self.scale)
|
103
|
-
)
|
99
|
+
page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
|
104
100
|
|
105
101
|
table_clusters, table_bboxes = zip(*in_tables)
|
106
102
|
|
{docling-1.4.0.dist-info → docling-1.5.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.4.0
|
3
|
+
Version: 1.5.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -44,6 +44,7 @@ Description-Content-Type: text/markdown
|
|
44
44
|
|
45
45
|
# Docling
|
46
46
|
|
47
|
+
[](https://arxiv.org/abs/2408.09869)
|
47
48
|
[](https://pypi.org/project/docling/)
|
48
49
|

|
49
50
|
[](https://python-poetry.org/)
|
@@ -172,13 +173,15 @@ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main
|
|
172
173
|
If you use Docling in your projects, please consider citing the following:
|
173
174
|
|
174
175
|
```bib
|
175
|
-
@
|
176
|
-
author = {Deep Search Team},
|
177
|
-
month = {
|
178
|
-
title = {{Docling}},
|
179
|
-
url
|
180
|
-
|
181
|
-
|
176
|
+
@techreport{Docling,
|
177
|
+
author = {Deep Search Team},
|
178
|
+
month = {8},
|
179
|
+
title = {{Docling Technical Report}},
|
180
|
+
url={https://arxiv.org/abs/2408.09869},
|
181
|
+
eprint={2408.09869},
|
182
|
+
doi = "10.48550/arXiv.2408.09869",
|
183
|
+
version = {1.0.0},
|
184
|
+
year = {2024}
|
182
185
|
}
|
183
186
|
```
|
184
187
|
|
{docling-1.4.0.dist-info → docling-1.5.0.dist-info}/RECORD
CHANGED
@@ -1,26 +1,26 @@
|
|
1
1
|
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
3
|
docling/backend/abstract_backend.py,sha256=swwmXzNueZSHqEOvw4j-IFhP2OUJhBeB--gV7NtzKgo,1112
|
4
|
-
docling/backend/docling_parse_backend.py,sha256
|
5
|
-
docling/backend/pypdfium2_backend.py,sha256=
|
4
|
+
docling/backend/docling_parse_backend.py,sha256=-bIjYJ-80R2SArAEw_lAyzgW5_BFEoX83n1oBMmUGF4,6284
|
5
|
+
docling/backend/pypdfium2_backend.py,sha256=3Qeeal8z6DunUe4S10Z2TXrdeucanCpa8evt6SQtpKQ,7496
|
6
6
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
docling/datamodel/base_models.py,sha256=
|
8
|
-
docling/datamodel/document.py,sha256=
|
7
|
+
docling/datamodel/base_models.py,sha256=uOq0zjUS60aIkROREiypp3Jn1yqQTlWEf34jXTT43ls,8391
|
8
|
+
docling/datamodel/document.py,sha256=Dgi9pSwXCgIoR26MKiRDiVMyMaFKdvGSKq2Fm5Lef9M,13173
|
9
9
|
docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
|
10
|
-
docling/document_converter.py,sha256=
|
10
|
+
docling/document_converter.py,sha256=r9z48VjL_hkq-rbAgyZ135njzUGBJ5AnhEH6-1zfyCA,10490
|
11
11
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
12
|
docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
|
13
|
-
docling/models/easyocr_model.py,sha256=
|
14
|
-
docling/models/layout_model.py,sha256=
|
13
|
+
docling/models/easyocr_model.py,sha256=Y-RWolIFE3By6gk8dnb2qFy7Cr9qcHs6eo65fWPT0Nc,2276
|
14
|
+
docling/models/layout_model.py,sha256=ZFmaLXlRWUfsT1pJCiYVxhQFrBBsiz6Aw0m9GM3UvVM,11249
|
15
15
|
docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
|
16
|
-
docling/models/table_structure_model.py,sha256=
|
16
|
+
docling/models/table_structure_model.py,sha256=lKsodvfZaGwxOHp-CbRW5nzCKZYMwf770h0Ka6Bdbgw,5451
|
17
17
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
18
|
docling/pipeline/base_model_pipeline.py,sha256=ozHdJak0yQAxQf7pQN_C480vI35A2e5KL5Qq1xSkq5c,560
|
19
19
|
docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFvpdUvyVw-gNmBlM,1541
|
20
20
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
21
|
docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
|
22
22
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
23
|
-
docling-1.
|
24
|
-
docling-1.
|
25
|
-
docling-1.
|
26
|
-
docling-1.
|
23
|
+
docling-1.5.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
|
24
|
+
docling-1.5.0.dist-info/METADATA,sha256=jWcjsrdfYcpeYFCRQ1h5C1b8MyaKsJWyUhGheXQEGvY,7235
|
25
|
+
docling-1.5.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
26
|
+
docling-1.5.0.dist-info/RECORD,,
|
{docling-1.4.0.dist-info → docling-1.5.0.dist-info}/LICENSE: File without changes
|
{docling-1.4.0.dist-info → docling-1.5.0.dist-info}/WHEEL: File without changes
|