docling 2.24.0__py3-none-any.whl → 2.25.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_v2_backend.py +38 -30
- docling/backend/html_backend.py +122 -21
- docling/backend/pypdfium2_backend.py +57 -41
- docling/cli/models.py +28 -4
- docling/datamodel/base_models.py +5 -0
- docling/datamodel/pipeline_options.py +62 -1
- docling/models/hf_vlm_model.py +180 -0
- docling/models/picture_description_vlm_model.py +2 -2
- docling/pipeline/vlm_pipeline.py +534 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +15 -2
- docling/utils/visualization.py +5 -0
- {docling-2.24.0.dist-info → docling-2.25.1.dist-info}/METADATA +2 -1
- {docling-2.24.0.dist-info → docling-2.25.1.dist-info}/RECORD +17 -14
- {docling-2.24.0.dist-info → docling-2.25.1.dist-info}/LICENSE +0 -0
- {docling-2.24.0.dist-info → docling-2.25.1.dist-info}/WHEEL +0 -0
- {docling-2.24.0.dist-info → docling-2.25.1.dist-info}/entry_points.txt +0 -0
@@ -12,6 +12,7 @@ from pypdfium2 import PdfPage
|
|
12
12
|
|
13
13
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
14
14
|
from docling.datamodel.base_models import Cell, Size
|
15
|
+
from docling.utils.locks import pypdfium2_lock
|
15
16
|
|
16
17
|
if TYPE_CHECKING:
|
17
18
|
from docling.datamodel.document import InputDocument
|
@@ -182,20 +183,24 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
182
183
|
padbox.r = page_size.width - padbox.r
|
183
184
|
padbox.t = page_size.height - padbox.t
|
184
185
|
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
186
|
+
with pypdfium2_lock:
|
187
|
+
image = (
|
188
|
+
self._ppage.render(
|
189
|
+
scale=scale * 1.5,
|
190
|
+
rotation=0, # no additional rotation
|
191
|
+
crop=padbox.as_tuple(),
|
192
|
+
)
|
193
|
+
.to_pil()
|
194
|
+
.resize(
|
195
|
+
size=(round(cropbox.width * scale), round(cropbox.height * scale))
|
196
|
+
)
|
197
|
+
) # We resize the image from 1.5x the given scale to make it sharper.
|
194
198
|
|
195
199
|
return image
|
196
200
|
|
197
201
|
def get_size(self) -> Size:
|
198
|
-
|
202
|
+
with pypdfium2_lock:
|
203
|
+
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
|
199
204
|
|
200
205
|
def unload(self):
|
201
206
|
self._ppage = None
|
@@ -206,23 +211,24 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
|
|
206
211
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
207
212
|
super().__init__(in_doc, path_or_stream)
|
208
213
|
|
209
|
-
|
210
|
-
|
214
|
+
with pypdfium2_lock:
|
215
|
+
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
216
|
+
self.parser = pdf_parser_v2("fatal")
|
211
217
|
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
218
|
+
success = False
|
219
|
+
if isinstance(self.path_or_stream, BytesIO):
|
220
|
+
success = self.parser.load_document_from_bytesio(
|
221
|
+
self.document_hash, self.path_or_stream
|
222
|
+
)
|
223
|
+
elif isinstance(self.path_or_stream, Path):
|
224
|
+
success = self.parser.load_document(
|
225
|
+
self.document_hash, str(self.path_or_stream)
|
226
|
+
)
|
221
227
|
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
228
|
+
if not success:
|
229
|
+
raise RuntimeError(
|
230
|
+
f"docling-parse v2 could not load document {self.document_hash}."
|
231
|
+
)
|
226
232
|
|
227
233
|
def page_count(self) -> int:
|
228
234
|
# return len(self._pdoc) # To be replaced with docling-parse API
|
@@ -236,9 +242,10 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
|
|
236
242
|
return len_2
|
237
243
|
|
238
244
|
def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
|
239
|
-
|
240
|
-
|
241
|
-
|
245
|
+
with pypdfium2_lock:
|
246
|
+
return DoclingParseV2PageBackend(
|
247
|
+
self.parser, self.document_hash, page_no, self._pdoc[page_no]
|
248
|
+
)
|
242
249
|
|
243
250
|
def is_valid(self) -> bool:
|
244
251
|
return self.page_count() > 0
|
@@ -246,5 +253,6 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
|
|
246
253
|
def unload(self):
|
247
254
|
super().unload()
|
248
255
|
self.parser.unload_document(self.document_hash)
|
249
|
-
|
250
|
-
|
256
|
+
with pypdfium2_lock:
|
257
|
+
self._pdoc.close()
|
258
|
+
self._pdoc = None
|
docling/backend/html_backend.py
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
import logging
|
2
2
|
from io import BytesIO
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import Optional, Union, cast
|
4
|
+
from typing import Final, Optional, Union, cast
|
5
5
|
|
6
6
|
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
|
7
|
+
from bs4.element import PreformattedString
|
7
8
|
from docling_core.types.doc import (
|
8
9
|
DocItem,
|
9
10
|
DocItemLabel,
|
@@ -14,6 +15,7 @@ from docling_core.types.doc import (
|
|
14
15
|
TableCell,
|
15
16
|
TableData,
|
16
17
|
)
|
18
|
+
from docling_core.types.doc.document import ContentLayer
|
17
19
|
from typing_extensions import override
|
18
20
|
|
19
21
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
@@ -22,12 +24,29 @@ from docling.datamodel.document import InputDocument
|
|
22
24
|
|
23
25
|
_log = logging.getLogger(__name__)
|
24
26
|
|
27
|
+
# tags that generate NodeItem elements
|
28
|
+
TAGS_FOR_NODE_ITEMS: Final = [
|
29
|
+
"h1",
|
30
|
+
"h2",
|
31
|
+
"h3",
|
32
|
+
"h4",
|
33
|
+
"h5",
|
34
|
+
"h6",
|
35
|
+
"p",
|
36
|
+
"pre",
|
37
|
+
"ul",
|
38
|
+
"ol",
|
39
|
+
"li",
|
40
|
+
"table",
|
41
|
+
"figure",
|
42
|
+
"img",
|
43
|
+
]
|
44
|
+
|
25
45
|
|
26
46
|
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
27
47
|
@override
|
28
48
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
29
49
|
super().__init__(in_doc, path_or_stream)
|
30
|
-
_log.debug("About to init HTML backend...")
|
31
50
|
self.soup: Optional[Tag] = None
|
32
51
|
# HTML file:
|
33
52
|
self.path_or_stream = path_or_stream
|
@@ -48,7 +67,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
48
67
|
self.soup = BeautifulSoup(html_content, "html.parser")
|
49
68
|
except Exception as e:
|
50
69
|
raise RuntimeError(
|
51
|
-
|
70
|
+
"Could not initialize HTML backend for file with "
|
71
|
+
f"hash {self.document_hash}."
|
52
72
|
) from e
|
53
73
|
|
54
74
|
@override
|
@@ -88,17 +108,26 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
88
108
|
assert self.soup is not None
|
89
109
|
content = self.soup.body or self.soup
|
90
110
|
# Replace <br> tags with newline characters
|
111
|
+
# TODO: remove style to avoid losing text from tags like i, b, span, ...
|
91
112
|
for br in content("br"):
|
92
113
|
br.replace_with(NavigableString("\n"))
|
114
|
+
|
115
|
+
headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
|
116
|
+
self.content_layer = (
|
117
|
+
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
|
118
|
+
)
|
93
119
|
self.walk(content, doc)
|
94
120
|
else:
|
95
121
|
raise RuntimeError(
|
96
|
-
f"Cannot convert doc with {self.document_hash} because the backend
|
122
|
+
f"Cannot convert doc with {self.document_hash} because the backend "
|
123
|
+
"failed to init."
|
97
124
|
)
|
98
125
|
return doc
|
99
126
|
|
100
127
|
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
|
128
|
+
|
101
129
|
# Iterate over elements in the body of the document
|
130
|
+
text: str = ""
|
102
131
|
for element in tag.children:
|
103
132
|
if isinstance(element, Tag):
|
104
133
|
try:
|
@@ -108,6 +137,26 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
108
137
|
f"Error processing child from tag{tag.name}: {exc_child}"
|
109
138
|
)
|
110
139
|
raise exc_child
|
140
|
+
elif isinstance(element, NavigableString) and not isinstance(
|
141
|
+
element, PreformattedString
|
142
|
+
):
|
143
|
+
# Floating text outside paragraphs or analyzed tags
|
144
|
+
text += element
|
145
|
+
siblings: list[Tag] = [
|
146
|
+
item for item in element.next_siblings if isinstance(item, Tag)
|
147
|
+
]
|
148
|
+
if element.next_sibling is None or any(
|
149
|
+
[item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
|
150
|
+
):
|
151
|
+
text = text.strip()
|
152
|
+
if text and tag.name in ["div"]:
|
153
|
+
doc.add_text(
|
154
|
+
parent=self.parents[self.level],
|
155
|
+
label=DocItemLabel.TEXT,
|
156
|
+
text=text,
|
157
|
+
content_layer=self.content_layer,
|
158
|
+
)
|
159
|
+
text = ""
|
111
160
|
|
112
161
|
return
|
113
162
|
|
@@ -127,7 +176,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
127
176
|
elif tag.name == "figure":
|
128
177
|
self.handle_figure(tag, doc)
|
129
178
|
elif tag.name == "img":
|
130
|
-
self.handle_image(doc)
|
179
|
+
self.handle_image(tag, doc)
|
131
180
|
else:
|
132
181
|
self.walk(tag, doc)
|
133
182
|
|
@@ -158,12 +207,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
158
207
|
text = element.text.strip()
|
159
208
|
|
160
209
|
if hlevel == 1:
|
161
|
-
|
210
|
+
self.content_layer = ContentLayer.BODY
|
211
|
+
|
212
|
+
for key in self.parents.keys():
|
162
213
|
self.parents[key] = None
|
163
214
|
|
164
215
|
self.level = 1
|
165
216
|
self.parents[self.level] = doc.add_text(
|
166
|
-
parent=self.parents[0],
|
217
|
+
parent=self.parents[0],
|
218
|
+
label=DocItemLabel.TITLE,
|
219
|
+
text=text,
|
220
|
+
content_layer=self.content_layer,
|
167
221
|
)
|
168
222
|
else:
|
169
223
|
if hlevel > self.level:
|
@@ -174,6 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
174
228
|
name=f"header-{i}",
|
175
229
|
label=GroupLabel.SECTION,
|
176
230
|
parent=self.parents[i - 1],
|
231
|
+
content_layer=self.content_layer,
|
177
232
|
)
|
178
233
|
self.level = hlevel
|
179
234
|
|
@@ -189,6 +244,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
189
244
|
parent=self.parents[hlevel - 1],
|
190
245
|
text=text,
|
191
246
|
level=hlevel,
|
247
|
+
content_layer=self.content_layer,
|
192
248
|
)
|
193
249
|
|
194
250
|
def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
|
@@ -197,16 +253,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
197
253
|
return
|
198
254
|
text = element.text.strip()
|
199
255
|
if text:
|
200
|
-
doc.add_code(
|
256
|
+
doc.add_code(
|
257
|
+
parent=self.parents[self.level],
|
258
|
+
text=text,
|
259
|
+
content_layer=self.content_layer,
|
260
|
+
)
|
201
261
|
|
202
262
|
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
|
203
263
|
"""Handles paragraph tags (p)."""
|
204
264
|
if element.text is None:
|
205
265
|
return
|
206
266
|
text = element.text.strip()
|
207
|
-
label = DocItemLabel.PARAGRAPH
|
208
267
|
if text:
|
209
|
-
doc.add_text(
|
268
|
+
doc.add_text(
|
269
|
+
parent=self.parents[self.level],
|
270
|
+
label=DocItemLabel.TEXT,
|
271
|
+
text=text,
|
272
|
+
content_layer=self.content_layer,
|
273
|
+
)
|
210
274
|
|
211
275
|
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
|
212
276
|
"""Handles list tags (ul, ol) and their list items."""
|
@@ -214,14 +278,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
214
278
|
if element.name == "ul":
|
215
279
|
# create a list group
|
216
280
|
self.parents[self.level + 1] = doc.add_group(
|
217
|
-
parent=self.parents[self.level],
|
281
|
+
parent=self.parents[self.level],
|
282
|
+
name="list",
|
283
|
+
label=GroupLabel.LIST,
|
284
|
+
content_layer=self.content_layer,
|
218
285
|
)
|
219
286
|
elif element.name == "ol":
|
287
|
+
start_attr = element.get("start")
|
288
|
+
start: int = (
|
289
|
+
int(start_attr)
|
290
|
+
if isinstance(start_attr, str) and start_attr.isnumeric()
|
291
|
+
else 1
|
292
|
+
)
|
220
293
|
# create a list group
|
221
294
|
self.parents[self.level + 1] = doc.add_group(
|
222
295
|
parent=self.parents[self.level],
|
223
|
-
name="ordered list",
|
296
|
+
name="ordered list" + (f" start {start}" if start != 1 else ""),
|
224
297
|
label=GroupLabel.ORDERED_LIST,
|
298
|
+
content_layer=self.content_layer,
|
225
299
|
)
|
226
300
|
self.level += 1
|
227
301
|
|
@@ -231,15 +305,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
231
305
|
self.level -= 1
|
232
306
|
|
233
307
|
def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
|
234
|
-
"""Handles
|
308
|
+
"""Handles list item tags (li)."""
|
235
309
|
nested_list = element.find(["ul", "ol"])
|
236
310
|
|
237
311
|
parent = self.parents[self.level]
|
238
312
|
if parent is None:
|
239
|
-
_log.
|
313
|
+
_log.debug(f"list-item has no parent in DoclingDocument: {element}")
|
240
314
|
return
|
241
315
|
parent_label: str = parent.label
|
242
316
|
index_in_list = len(parent.children) + 1
|
317
|
+
if (
|
318
|
+
parent_label == GroupLabel.ORDERED_LIST
|
319
|
+
and isinstance(parent, GroupItem)
|
320
|
+
and parent.name
|
321
|
+
):
|
322
|
+
start_in_list: str = parent.name.split(" ")[-1]
|
323
|
+
start: int = int(start_in_list) if start_in_list.isnumeric() else 1
|
324
|
+
index_in_list += start - 1
|
243
325
|
|
244
326
|
if nested_list:
|
245
327
|
# Text in list item can be hidden within hierarchy, hence
|
@@ -262,6 +344,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
262
344
|
enumerated=enumerated,
|
263
345
|
marker=marker,
|
264
346
|
parent=parent,
|
347
|
+
content_layer=self.content_layer,
|
265
348
|
)
|
266
349
|
self.level += 1
|
267
350
|
|
@@ -283,15 +366,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
283
366
|
enumerated=enumerated,
|
284
367
|
marker=marker,
|
285
368
|
parent=parent,
|
369
|
+
content_layer=self.content_layer,
|
286
370
|
)
|
287
371
|
else:
|
288
|
-
_log.
|
372
|
+
_log.debug(f"list-item has no text: {element}")
|
289
373
|
|
290
374
|
@staticmethod
|
291
375
|
def parse_table_data(element: Tag) -> Optional[TableData]:
|
292
376
|
nested_tables = element.find("table")
|
293
377
|
if nested_tables is not None:
|
294
|
-
_log.
|
378
|
+
_log.debug("Skipping nested table.")
|
295
379
|
return None
|
296
380
|
|
297
381
|
# Count the number of rows (number of <tr> elements)
|
@@ -386,7 +470,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
386
470
|
table_data = HTMLDocumentBackend.parse_table_data(element)
|
387
471
|
|
388
472
|
if table_data is not None:
|
389
|
-
doc.add_table(
|
473
|
+
doc.add_table(
|
474
|
+
data=table_data,
|
475
|
+
parent=self.parents[self.level],
|
476
|
+
content_layer=self.content_layer,
|
477
|
+
)
|
390
478
|
|
391
479
|
def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
|
392
480
|
"""Recursively extract text from <ul> or <ol> with proper indentation."""
|
@@ -426,20 +514,33 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
426
514
|
|
427
515
|
contains_captions = element.find(["figcaption"])
|
428
516
|
if not isinstance(contains_captions, Tag):
|
429
|
-
doc.add_picture(
|
517
|
+
doc.add_picture(
|
518
|
+
parent=self.parents[self.level],
|
519
|
+
caption=None,
|
520
|
+
content_layer=self.content_layer,
|
521
|
+
)
|
430
522
|
else:
|
431
523
|
texts = []
|
432
524
|
for item in contains_captions:
|
433
525
|
texts.append(item.text)
|
434
526
|
|
435
527
|
fig_caption = doc.add_text(
|
436
|
-
label=DocItemLabel.CAPTION,
|
528
|
+
label=DocItemLabel.CAPTION,
|
529
|
+
text=("".join(texts)).strip(),
|
530
|
+
content_layer=self.content_layer,
|
437
531
|
)
|
438
532
|
doc.add_picture(
|
439
533
|
parent=self.parents[self.level],
|
440
534
|
caption=fig_caption,
|
535
|
+
content_layer=self.content_layer,
|
441
536
|
)
|
442
537
|
|
443
|
-
def handle_image(self, doc: DoclingDocument) -> None:
|
538
|
+
def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
|
444
539
|
"""Handles image tags (img)."""
|
445
|
-
|
540
|
+
_log.debug(f"ignoring <img> tags at the moment: {element}")
|
541
|
+
|
542
|
+
doc.add_picture(
|
543
|
+
parent=self.parents[self.level],
|
544
|
+
caption=None,
|
545
|
+
content_layer=self.content_layer,
|
546
|
+
)
|
@@ -13,6 +13,7 @@ from pypdfium2._helpers.misc import PdfiumError
|
|
13
13
|
|
14
14
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
15
15
|
from docling.datamodel.base_models import Cell
|
16
|
+
from docling.utils.locks import pypdfium2_lock
|
16
17
|
|
17
18
|
if TYPE_CHECKING:
|
18
19
|
from docling.datamodel.document import InputDocument
|
@@ -24,6 +25,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
24
25
|
def __init__(
|
25
26
|
self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
|
26
27
|
):
|
28
|
+
# Note: lock applied by the caller
|
27
29
|
self.valid = True # No better way to tell from pypdfium.
|
28
30
|
try:
|
29
31
|
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
|
@@ -40,51 +42,57 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
40
42
|
|
41
43
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
42
44
|
AREA_THRESHOLD = 0 # 32 * 32
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
pos
|
47
|
-
|
45
|
+
page_size = self.get_size()
|
46
|
+
with pypdfium2_lock:
|
47
|
+
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
48
|
+
pos = obj.get_pos()
|
49
|
+
cropbox = BoundingBox.from_tuple(
|
50
|
+
pos, origin=CoordOrigin.BOTTOMLEFT
|
51
|
+
).to_top_left_origin(page_height=page_size.height)
|
48
52
|
|
49
|
-
|
50
|
-
|
53
|
+
if cropbox.area() > AREA_THRESHOLD:
|
54
|
+
cropbox = cropbox.scaled(scale=scale)
|
51
55
|
|
52
|
-
|
56
|
+
yield cropbox
|
53
57
|
|
54
58
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
55
|
-
|
56
|
-
|
59
|
+
with pypdfium2_lock:
|
60
|
+
if not self.text_page:
|
61
|
+
self.text_page = self._ppage.get_textpage()
|
57
62
|
|
58
63
|
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
|
59
64
|
bbox = bbox.to_bottom_left_origin(self.get_size().height)
|
60
65
|
|
61
|
-
|
66
|
+
with pypdfium2_lock:
|
67
|
+
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
|
62
68
|
|
63
69
|
return text_piece
|
64
70
|
|
65
71
|
def get_text_cells(self) -> Iterable[Cell]:
|
66
|
-
|
67
|
-
|
72
|
+
with pypdfium2_lock:
|
73
|
+
if not self.text_page:
|
74
|
+
self.text_page = self._ppage.get_textpage()
|
68
75
|
|
69
76
|
cells = []
|
70
77
|
cell_counter = 0
|
71
78
|
|
72
79
|
page_size = self.get_size()
|
73
80
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
81
|
+
with pypdfium2_lock:
|
82
|
+
for i in range(self.text_page.count_rects()):
|
83
|
+
rect = self.text_page.get_rect(i)
|
84
|
+
text_piece = self.text_page.get_text_bounded(*rect)
|
85
|
+
x0, y0, x1, y1 = rect
|
86
|
+
cells.append(
|
87
|
+
Cell(
|
88
|
+
id=cell_counter,
|
89
|
+
text=text_piece,
|
90
|
+
bbox=BoundingBox(
|
91
|
+
l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
|
92
|
+
).to_top_left_origin(page_size.height),
|
93
|
+
)
|
85
94
|
)
|
86
|
-
|
87
|
-
cell_counter += 1
|
95
|
+
cell_counter += 1
|
88
96
|
|
89
97
|
# PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
|
90
98
|
# The cell merging code below is to clean this up.
|
@@ -214,20 +222,24 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
214
222
|
padbox.r = page_size.width - padbox.r
|
215
223
|
padbox.t = page_size.height - padbox.t
|
216
224
|
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
225
|
+
with pypdfium2_lock:
|
226
|
+
image = (
|
227
|
+
self._ppage.render(
|
228
|
+
scale=scale * 1.5,
|
229
|
+
rotation=0, # no additional rotation
|
230
|
+
crop=padbox.as_tuple(),
|
231
|
+
)
|
232
|
+
.to_pil()
|
233
|
+
.resize(
|
234
|
+
size=(round(cropbox.width * scale), round(cropbox.height * scale))
|
235
|
+
)
|
236
|
+
) # We resize the image from 1.5x the given scale to make it sharper.
|
226
237
|
|
227
238
|
return image
|
228
239
|
|
229
240
|
def get_size(self) -> Size:
|
230
|
-
|
241
|
+
with pypdfium2_lock:
|
242
|
+
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
|
231
243
|
|
232
244
|
def unload(self):
|
233
245
|
self._ppage = None
|
@@ -239,22 +251,26 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
|
239
251
|
super().__init__(in_doc, path_or_stream)
|
240
252
|
|
241
253
|
try:
|
242
|
-
|
254
|
+
with pypdfium2_lock:
|
255
|
+
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
243
256
|
except PdfiumError as e:
|
244
257
|
raise RuntimeError(
|
245
258
|
f"pypdfium could not load document with hash {self.document_hash}"
|
246
259
|
) from e
|
247
260
|
|
248
261
|
def page_count(self) -> int:
|
249
|
-
|
262
|
+
with pypdfium2_lock:
|
263
|
+
return len(self._pdoc)
|
250
264
|
|
251
265
|
def load_page(self, page_no: int) -> PyPdfiumPageBackend:
|
252
|
-
|
266
|
+
with pypdfium2_lock:
|
267
|
+
return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
|
253
268
|
|
254
269
|
def is_valid(self) -> bool:
|
255
270
|
return self.page_count() > 0
|
256
271
|
|
257
272
|
def unload(self):
|
258
273
|
super().unload()
|
259
|
-
|
260
|
-
|
274
|
+
with pypdfium2_lock:
|
275
|
+
self._pdoc.close()
|
276
|
+
self._pdoc = None
|
docling/cli/models.py
CHANGED
@@ -32,9 +32,19 @@ class _AvailableModels(str, Enum):
|
|
32
32
|
CODE_FORMULA = "code_formula"
|
33
33
|
PICTURE_CLASSIFIER = "picture_classifier"
|
34
34
|
SMOLVLM = "smolvlm"
|
35
|
+
GRANITE_VISION = "granite_vision"
|
35
36
|
EASYOCR = "easyocr"
|
36
37
|
|
37
38
|
|
39
|
+
_default_models = [
|
40
|
+
_AvailableModels.LAYOUT,
|
41
|
+
_AvailableModels.TABLEFORMER,
|
42
|
+
_AvailableModels.CODE_FORMULA,
|
43
|
+
_AvailableModels.PICTURE_CLASSIFIER,
|
44
|
+
_AvailableModels.EASYOCR,
|
45
|
+
]
|
46
|
+
|
47
|
+
|
38
48
|
@app.command("download")
|
39
49
|
def download(
|
40
50
|
output_dir: Annotated[
|
@@ -43,18 +53,27 @@ def download(
|
|
43
53
|
...,
|
44
54
|
"-o",
|
45
55
|
"--output-dir",
|
46
|
-
help="The directory where
|
56
|
+
help="The directory where to download the models.",
|
47
57
|
),
|
48
58
|
] = (settings.cache_dir / "models"),
|
49
59
|
force: Annotated[
|
50
|
-
bool, typer.Option(..., help="If true, the download will be forced")
|
60
|
+
bool, typer.Option(..., help="If true, the download will be forced.")
|
51
61
|
] = False,
|
52
62
|
models: Annotated[
|
53
63
|
Optional[list[_AvailableModels]],
|
54
64
|
typer.Argument(
|
55
|
-
help=f"Models to download (default behavior:
|
65
|
+
help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
|
56
66
|
),
|
57
67
|
] = None,
|
68
|
+
all: Annotated[
|
69
|
+
bool,
|
70
|
+
typer.Option(
|
71
|
+
...,
|
72
|
+
"--all",
|
73
|
+
help="If true, all available models will be downloaded (mutually exclusive with passing specific models).",
|
74
|
+
show_default=True,
|
75
|
+
),
|
76
|
+
] = False,
|
58
77
|
quiet: Annotated[
|
59
78
|
bool,
|
60
79
|
typer.Option(
|
@@ -65,6 +84,10 @@ def download(
|
|
65
84
|
),
|
66
85
|
] = False,
|
67
86
|
):
|
87
|
+
if models and all:
|
88
|
+
raise typer.BadParameter(
|
89
|
+
"Cannot simultaneously set 'all' parameter and specify models to download."
|
90
|
+
)
|
68
91
|
if not quiet:
|
69
92
|
FORMAT = "%(message)s"
|
70
93
|
logging.basicConfig(
|
@@ -73,7 +96,7 @@ def download(
|
|
73
96
|
datefmt="[%X]",
|
74
97
|
handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
|
75
98
|
)
|
76
|
-
to_download = models or [m for m in _AvailableModels]
|
99
|
+
to_download = models or ([m for m in _AvailableModels] if all else _default_models)
|
77
100
|
output_dir = download_models(
|
78
101
|
output_dir=output_dir,
|
79
102
|
force=force,
|
@@ -83,6 +106,7 @@ def download(
|
|
83
106
|
with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
|
84
107
|
with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
|
85
108
|
with_smolvlm=_AvailableModels.SMOLVLM in to_download,
|
109
|
+
with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
|
86
110
|
with_easyocr=_AvailableModels.EASYOCR in to_download,
|
87
111
|
)
|
88
112
|
|