docling 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_backend.py +19 -4
- docling/backend/pypdfium2_backend.py +3 -1
- docling/datamodel/base_models.py +41 -10
- docling/datamodel/document.py +19 -4
- docling/document_converter.py +12 -6
- docling/models/easyocr_model.py +1 -1
- docling/models/layout_model.py +11 -1
- docling/models/table_structure_model.py +4 -8
- {docling-1.3.0.dist-info → docling-1.5.0.dist-info}/METADATA +13 -10
- {docling-1.3.0.dist-info → docling-1.5.0.dist-info}/RECORD +12 -12
- {docling-1.3.0.dist-info → docling-1.5.0.dist-info}/LICENSE +0 -0
- {docling-1.3.0.dist-info → docling-1.5.0.dist-info}/WHEEL +0 -0
@@ -1,4 +1,6 @@
|
|
1
|
+
import logging
|
1
2
|
import random
|
3
|
+
import time
|
2
4
|
from io import BytesIO
|
3
5
|
from pathlib import Path
|
4
6
|
from typing import Iterable, List, Optional, Union
|
@@ -11,6 +13,8 @@ from pypdfium2 import PdfPage
|
|
11
13
|
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
12
14
|
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
|
13
15
|
|
16
|
+
_log = logging.getLogger(__name__)
|
17
|
+
|
14
18
|
|
15
19
|
class DoclingParsePageBackend(PdfPageBackend):
|
16
20
|
def __init__(self, page_obj: PdfPage, docling_page_obj):
|
@@ -80,7 +84,9 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|
80
84
|
cell_counter += 1
|
81
85
|
|
82
86
|
def draw_clusters_and_cells():
|
83
|
-
image =
|
87
|
+
image = (
|
88
|
+
self.get_page_image()
|
89
|
+
) # make new image to avoid drawing on the saved ones
|
84
90
|
draw = ImageDraw.Draw(image)
|
85
91
|
for c in cells:
|
86
92
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
@@ -150,10 +156,19 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
|
|
150
156
|
super().__init__(path_or_stream)
|
151
157
|
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
152
158
|
# Parsing cells with docling_parser call
|
153
|
-
if isinstance(path_or_stream, BytesIO):
|
154
|
-
raise NotImplemented("This backend does not support byte streams yet.")
|
155
159
|
parser = pdf_parser()
|
156
|
-
|
160
|
+
|
161
|
+
start_pb_time = time.time()
|
162
|
+
|
163
|
+
if isinstance(path_or_stream, BytesIO):
|
164
|
+
self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
|
165
|
+
else:
|
166
|
+
self._parser_doc = parser.find_cells(str(path_or_stream))
|
167
|
+
|
168
|
+
end_pb_time = time.time() - start_pb_time
|
169
|
+
_log.info(
|
170
|
+
f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
|
171
|
+
)
|
157
172
|
|
158
173
|
def page_count(self) -> int:
|
159
174
|
return len(self._parser_doc["pages"])
|
@@ -134,7 +134,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
134
134
|
return merged_cells
|
135
135
|
|
136
136
|
def draw_clusters_and_cells():
|
137
|
-
image =
|
137
|
+
image = (
|
138
|
+
self.get_page_image()
|
139
|
+
) # make new image to avoid drawing on the saved ones
|
138
140
|
draw = ImageDraw.Draw(image)
|
139
141
|
for c in cells:
|
140
142
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
docling/datamodel/base_models.py
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
import copy
|
2
|
+
import warnings
|
2
3
|
from enum import Enum, auto
|
3
4
|
from io import BytesIO
|
4
|
-
from typing import Any, Dict, List, Optional, Tuple, Union
|
5
|
+
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
|
5
6
|
|
6
7
|
from PIL.Image import Image
|
7
|
-
from pydantic import BaseModel, ConfigDict, model_validator
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
9
|
+
from typing_extensions import Self
|
8
10
|
|
9
11
|
from docling.backend.abstract_backend import PdfPageBackend
|
10
12
|
|
@@ -234,14 +236,30 @@ class Page(BaseModel):
|
|
234
236
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
235
237
|
|
236
238
|
page_no: int
|
237
|
-
page_hash: str = None
|
238
|
-
size: PageSize = None
|
239
|
-
image: Image = None
|
239
|
+
page_hash: Optional[str] = None
|
240
|
+
size: Optional[PageSize] = None
|
240
241
|
cells: List[Cell] = None
|
241
242
|
predictions: PagePredictions = PagePredictions()
|
242
|
-
assembled: AssembledUnit = None
|
243
|
+
assembled: Optional[AssembledUnit] = None
|
243
244
|
|
244
|
-
_backend: PdfPageBackend =
|
245
|
+
_backend: Optional[PdfPageBackend] = (
|
246
|
+
None # Internal PDF backend. By default it is cleared during assembling.
|
247
|
+
)
|
248
|
+
_default_image_scale: float = 1.0 # Default image scale for external usage.
|
249
|
+
_image_cache: Dict[float, Image] = (
|
250
|
+
{}
|
251
|
+
) # Cache of images in different scales. By default it is cleared during assembling.
|
252
|
+
|
253
|
+
def get_image(self, scale: float = 1.0) -> Optional[Image]:
|
254
|
+
if self._backend is None:
|
255
|
+
return self._image_cache.get(scale, None)
|
256
|
+
if not scale in self._image_cache:
|
257
|
+
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
|
258
|
+
return self._image_cache[scale]
|
259
|
+
|
260
|
+
@property
|
261
|
+
def image(self) -> Optional[Image]:
|
262
|
+
return self.get_image(scale=self._default_image_scale)
|
245
263
|
|
246
264
|
|
247
265
|
class DocumentStream(BaseModel):
|
@@ -268,6 +286,19 @@ class PipelineOptions(BaseModel):
|
|
268
286
|
|
269
287
|
|
270
288
|
class AssembleOptions(BaseModel):
|
271
|
-
keep_page_images:
|
272
|
-
|
273
|
-
|
289
|
+
keep_page_images: Annotated[
|
290
|
+
bool,
|
291
|
+
Field(
|
292
|
+
deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
|
293
|
+
),
|
294
|
+
] = False # False: page images are removed in the assemble step
|
295
|
+
images_scale: Optional[float] = None # if set, the scale for generated images
|
296
|
+
|
297
|
+
@model_validator(mode="after")
|
298
|
+
def set_page_images_from_deprecated(self) -> Self:
|
299
|
+
with warnings.catch_warnings():
|
300
|
+
warnings.simplefilter("ignore", DeprecationWarning)
|
301
|
+
default_scale = 1.0
|
302
|
+
if self.keep_page_images and self.images_scale is None:
|
303
|
+
self.images_scale = default_scale
|
304
|
+
return self
|
docling/datamodel/document.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
import logging
|
2
2
|
from io import BytesIO
|
3
3
|
from pathlib import Path, PurePath
|
4
|
-
from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
|
4
|
+
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
|
5
5
|
|
6
6
|
from docling_core.types import BaseCell, BaseText
|
7
7
|
from docling_core.types import BoundingBox as DsBoundingBox
|
@@ -14,13 +14,14 @@ from docling_core.types import TableCell
|
|
14
14
|
from pydantic import BaseModel
|
15
15
|
|
16
16
|
from docling.backend.abstract_backend import PdfDocumentBackend
|
17
|
-
from docling.backend.
|
17
|
+
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
18
18
|
from docling.datamodel.base_models import (
|
19
19
|
AssembledUnit,
|
20
20
|
ConversionStatus,
|
21
21
|
DocumentStream,
|
22
22
|
FigureElement,
|
23
23
|
Page,
|
24
|
+
PageElement,
|
24
25
|
TableElement,
|
25
26
|
TextElement,
|
26
27
|
)
|
@@ -64,7 +65,7 @@ class InputDocument(BaseModel):
|
|
64
65
|
path_or_stream: Union[BytesIO, Path],
|
65
66
|
filename: Optional[str] = None,
|
66
67
|
limits: Optional[DocumentLimits] = None,
|
67
|
-
pdf_backend=
|
68
|
+
pdf_backend=DoclingParseDocumentBackend,
|
68
69
|
):
|
69
70
|
super().__init__()
|
70
71
|
|
@@ -302,13 +303,27 @@ class ConvertedDocument(BaseModel):
|
|
302
303
|
else:
|
303
304
|
return ""
|
304
305
|
|
306
|
+
def render_element_images(
|
307
|
+
self, element_types: Tuple[PageElement] = (FigureElement,)
|
308
|
+
):
|
309
|
+
for element in self.assembled.elements:
|
310
|
+
if isinstance(element, element_types):
|
311
|
+
page_ix = element.page_no
|
312
|
+
scale = self.pages[page_ix]._default_image_scale
|
313
|
+
crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
|
314
|
+
page_height=self.pages[page_ix].size.height * scale
|
315
|
+
)
|
316
|
+
|
317
|
+
cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple())
|
318
|
+
yield element, cropped_im
|
319
|
+
|
305
320
|
|
306
321
|
class DocumentConversionInput(BaseModel):
|
307
322
|
|
308
323
|
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
|
309
324
|
limits: Optional[DocumentLimits] = DocumentLimits()
|
310
325
|
|
311
|
-
DEFAULT_BACKEND: ClassVar =
|
326
|
+
DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
|
312
327
|
|
313
328
|
def docs(
|
314
329
|
self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
|
docling/document_converter.py
CHANGED
@@ -188,10 +188,8 @@ class DocumentConverter:
|
|
188
188
|
# Free up mem resources before moving on with next batch
|
189
189
|
|
190
190
|
# Remove page images (can be disabled)
|
191
|
-
if
|
192
|
-
assembled_page.
|
193
|
-
None # Comment this if you want to visualize page images
|
194
|
-
)
|
191
|
+
if self.assemble_options.images_scale is None:
|
192
|
+
assembled_page._image_cache = {}
|
195
193
|
|
196
194
|
# Unload backend
|
197
195
|
assembled_page._backend.unload()
|
@@ -231,7 +229,15 @@ class DocumentConverter:
|
|
231
229
|
|
232
230
|
# Generate the page image and store it in the page object
|
233
231
|
def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
|
234
|
-
|
232
|
+
# default scale
|
233
|
+
page.get_image(scale=1.0)
|
234
|
+
|
235
|
+
# user requested scales
|
236
|
+
if self.assemble_options.images_scale is not None:
|
237
|
+
page._default_image_scale = self.assemble_options.images_scale
|
238
|
+
page.get_image(
|
239
|
+
scale=self.assemble_options.images_scale
|
240
|
+
) # this will trigger storing the image in the internal cache
|
235
241
|
|
236
242
|
return page
|
237
243
|
|
@@ -247,7 +253,7 @@ class DocumentConverter:
|
|
247
253
|
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
248
254
|
image.show()
|
249
255
|
|
250
|
-
# draw_text_boxes(page.
|
256
|
+
# draw_text_boxes(page.get_image(scale=1.0), cells)
|
251
257
|
|
252
258
|
return page
|
253
259
|
|
docling/models/easyocr_model.py
CHANGED
@@ -30,7 +30,7 @@ class EasyOcrModel:
|
|
30
30
|
|
31
31
|
for page in page_batch:
|
32
32
|
# rects = page._fpage.
|
33
|
-
high_res_image = page.
|
33
|
+
high_res_image = page.get_image(scale=self.scale)
|
34
34
|
im = numpy.array(high_res_image)
|
35
35
|
result = self.reader.readtext(im)
|
36
36
|
|
docling/models/layout_model.py
CHANGED
@@ -69,6 +69,10 @@ class LayoutModel:
|
|
69
69
|
"Key-Value Region": 0.45,
|
70
70
|
}
|
71
71
|
|
72
|
+
CLASS_REMAPPINGS = {
|
73
|
+
"Document Index": "Table",
|
74
|
+
}
|
75
|
+
|
72
76
|
_log.debug("================= Start postprocess function ====================")
|
73
77
|
start_time = time.time()
|
74
78
|
# Apply Confidence Threshold to cluster predictions
|
@@ -79,6 +83,10 @@ class LayoutModel:
|
|
79
83
|
confidence = CLASS_THRESHOLDS[cluster.label]
|
80
84
|
if cluster.confidence >= confidence:
|
81
85
|
# annotation["created_by"] = "high_conf_pred"
|
86
|
+
|
87
|
+
# Remap class labels where needed.
|
88
|
+
if cluster.label in CLASS_REMAPPINGS.keys():
|
89
|
+
cluster.label = CLASS_REMAPPINGS[cluster.label]
|
82
90
|
clusters_out.append(cluster)
|
83
91
|
|
84
92
|
# map to dictionary clusters and cells, with bottom left origin
|
@@ -259,7 +267,9 @@ class LayoutModel:
|
|
259
267
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
260
268
|
for page in page_batch:
|
261
269
|
clusters = []
|
262
|
-
for ix, pred_item in enumerate(
|
270
|
+
for ix, pred_item in enumerate(
|
271
|
+
self.layout_predictor.predict(page.get_image(scale=1.0))
|
272
|
+
):
|
263
273
|
cluster = Cluster(
|
264
274
|
id=ix,
|
265
275
|
label=pred_item["label"],
|
@@ -34,7 +34,9 @@ class TableStructureModel:
|
|
34
34
|
self.scale = 2.0 # Scale up table input images to 144 dpi
|
35
35
|
|
36
36
|
def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
|
37
|
-
image =
|
37
|
+
image = (
|
38
|
+
page._backend.get_page_image()
|
39
|
+
) # make new image to avoid drawing on the saved ones
|
38
40
|
draw = ImageDraw.Draw(image)
|
39
41
|
|
40
42
|
for table_element in tbl_list:
|
@@ -94,13 +96,7 @@ class TableStructureModel:
|
|
94
96
|
"width": page.size.width * self.scale,
|
95
97
|
"height": page.size.height * self.scale,
|
96
98
|
}
|
97
|
-
|
98
|
-
if self.scale == 1.0:
|
99
|
-
page_input["image"] = numpy.asarray(page.image)
|
100
|
-
else: # render new page image on the fly at desired scale
|
101
|
-
page_input["image"] = numpy.asarray(
|
102
|
-
page._backend.get_page_image(scale=self.scale)
|
103
|
-
)
|
99
|
+
page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
|
104
100
|
|
105
101
|
table_clusters, table_bboxes = zip(*in_tables)
|
106
102
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.3.0
|
3
|
+
Version: 1.5.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -24,8 +24,8 @@ Provides-Extra: ocr
|
|
24
24
|
Requires-Dist: certifi (>=2024.7.4)
|
25
25
|
Requires-Dist: deepsearch-glm (>=0.19.0,<1)
|
26
26
|
Requires-Dist: docling-core (>=1.1.2,<2.0.0)
|
27
|
-
Requires-Dist: docling-ibm-models (>=1.1.
|
28
|
-
Requires-Dist: docling-parse (>=0.0
|
27
|
+
Requires-Dist: docling-ibm-models (>=1.1.1,<2.0.0)
|
28
|
+
Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
|
29
29
|
Requires-Dist: easyocr (>=1.7,<2.0) ; extra == "easyocr" or extra == "ocr"
|
30
30
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
31
31
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
@@ -44,6 +44,7 @@ Description-Content-Type: text/markdown
|
|
44
44
|
|
45
45
|
# Docling
|
46
46
|
|
47
|
+
[arXiv:2408.09869](https://arxiv.org/abs/2408.09869)
|
47
48
|
[PyPI: docling](https://pypi.org/project/docling/)
|
48
49
|

|
49
50
|
[Poetry](https://python-poetry.org/)
|
@@ -172,13 +173,15 @@ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main
|
|
172
173
|
If you use Docling in your projects, please consider citing the following:
|
173
174
|
|
174
175
|
```bib
|
175
|
-
@
|
176
|
-
author = {Deep Search Team},
|
177
|
-
month = {
|
178
|
-
title = {{Docling}},
|
179
|
-
url
|
180
|
-
|
181
|
-
|
176
|
+
@techreport{Docling,
|
177
|
+
author = {Deep Search Team},
|
178
|
+
month = {8},
|
179
|
+
title = {{Docling Technical Report}},
|
180
|
+
url={https://arxiv.org/abs/2408.09869},
|
181
|
+
eprint={2408.09869},
|
182
|
+
doi = "10.48550/arXiv.2408.09869",
|
183
|
+
version = {1.0.0},
|
184
|
+
year = {2024}
|
182
185
|
}
|
183
186
|
```
|
184
187
|
|
@@ -1,26 +1,26 @@
|
|
1
1
|
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
3
|
docling/backend/abstract_backend.py,sha256=swwmXzNueZSHqEOvw4j-IFhP2OUJhBeB--gV7NtzKgo,1112
|
4
|
-
docling/backend/docling_parse_backend.py,sha256
|
5
|
-
docling/backend/pypdfium2_backend.py,sha256=
|
4
|
+
docling/backend/docling_parse_backend.py,sha256=-bIjYJ-80R2SArAEw_lAyzgW5_BFEoX83n1oBMmUGF4,6284
|
5
|
+
docling/backend/pypdfium2_backend.py,sha256=3Qeeal8z6DunUe4S10Z2TXrdeucanCpa8evt6SQtpKQ,7496
|
6
6
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
docling/datamodel/base_models.py,sha256=
|
8
|
-
docling/datamodel/document.py,sha256=
|
7
|
+
docling/datamodel/base_models.py,sha256=uOq0zjUS60aIkROREiypp3Jn1yqQTlWEf34jXTT43ls,8391
|
8
|
+
docling/datamodel/document.py,sha256=Dgi9pSwXCgIoR26MKiRDiVMyMaFKdvGSKq2Fm5Lef9M,13173
|
9
9
|
docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
|
10
|
-
docling/document_converter.py,sha256=
|
10
|
+
docling/document_converter.py,sha256=r9z48VjL_hkq-rbAgyZ135njzUGBJ5AnhEH6-1zfyCA,10490
|
11
11
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
12
|
docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
|
13
|
-
docling/models/easyocr_model.py,sha256=
|
14
|
-
docling/models/layout_model.py,sha256=
|
13
|
+
docling/models/easyocr_model.py,sha256=Y-RWolIFE3By6gk8dnb2qFy7Cr9qcHs6eo65fWPT0Nc,2276
|
14
|
+
docling/models/layout_model.py,sha256=ZFmaLXlRWUfsT1pJCiYVxhQFrBBsiz6Aw0m9GM3UvVM,11249
|
15
15
|
docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
|
16
|
-
docling/models/table_structure_model.py,sha256=
|
16
|
+
docling/models/table_structure_model.py,sha256=lKsodvfZaGwxOHp-CbRW5nzCKZYMwf770h0Ka6Bdbgw,5451
|
17
17
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
18
|
docling/pipeline/base_model_pipeline.py,sha256=ozHdJak0yQAxQf7pQN_C480vI35A2e5KL5Qq1xSkq5c,560
|
19
19
|
docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFvpdUvyVw-gNmBlM,1541
|
20
20
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
21
|
docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
|
22
22
|
docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
|
23
|
-
docling-1.
|
24
|
-
docling-1.
|
25
|
-
docling-1.
|
26
|
-
docling-1.
|
23
|
+
docling-1.5.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
|
24
|
+
docling-1.5.0.dist-info/METADATA,sha256=jWcjsrdfYcpeYFCRQ1h5C1b8MyaKsJWyUhGheXQEGvY,7235
|
25
|
+
docling-1.5.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
26
|
+
docling-1.5.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|