docling 2.25.0__py3-none-any.whl → 2.25.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_v2_backend.py +38 -30
- docling/backend/html_backend.py +81 -19
- docling/backend/pypdfium2_backend.py +57 -41
- docling/utils/layout_postprocessor.py +2 -1
- docling/utils/locks.py +3 -0
- {docling-2.25.0.dist-info → docling-2.25.2.dist-info}/METADATA +2 -2
- {docling-2.25.0.dist-info → docling-2.25.2.dist-info}/RECORD +10 -9
- {docling-2.25.0.dist-info → docling-2.25.2.dist-info}/LICENSE +0 -0
- {docling-2.25.0.dist-info → docling-2.25.2.dist-info}/WHEEL +0 -0
- {docling-2.25.0.dist-info → docling-2.25.2.dist-info}/entry_points.txt +0 -0
@@ -12,6 +12,7 @@ from pypdfium2 import PdfPage
|
|
12
12
|
|
13
13
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
14
14
|
from docling.datamodel.base_models import Cell, Size
|
15
|
+
from docling.utils.locks import pypdfium2_lock
|
15
16
|
|
16
17
|
if TYPE_CHECKING:
|
17
18
|
from docling.datamodel.document import InputDocument
|
@@ -182,20 +183,24 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|
182
183
|
padbox.r = page_size.width - padbox.r
|
183
184
|
padbox.t = page_size.height - padbox.t
|
184
185
|
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
186
|
+
with pypdfium2_lock:
|
187
|
+
image = (
|
188
|
+
self._ppage.render(
|
189
|
+
scale=scale * 1.5,
|
190
|
+
rotation=0, # no additional rotation
|
191
|
+
crop=padbox.as_tuple(),
|
192
|
+
)
|
193
|
+
.to_pil()
|
194
|
+
.resize(
|
195
|
+
size=(round(cropbox.width * scale), round(cropbox.height * scale))
|
196
|
+
)
|
197
|
+
) # We resize the image from 1.5x the given scale to make it sharper.
|
194
198
|
|
195
199
|
return image
|
196
200
|
|
197
201
|
def get_size(self) -> Size:
|
198
|
-
|
202
|
+
with pypdfium2_lock:
|
203
|
+
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
|
199
204
|
|
200
205
|
def unload(self):
|
201
206
|
self._ppage = None
|
@@ -206,23 +211,24 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
|
|
206
211
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
207
212
|
super().__init__(in_doc, path_or_stream)
|
208
213
|
|
209
|
-
|
210
|
-
|
214
|
+
with pypdfium2_lock:
|
215
|
+
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
216
|
+
self.parser = pdf_parser_v2("fatal")
|
211
217
|
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
218
|
+
success = False
|
219
|
+
if isinstance(self.path_or_stream, BytesIO):
|
220
|
+
success = self.parser.load_document_from_bytesio(
|
221
|
+
self.document_hash, self.path_or_stream
|
222
|
+
)
|
223
|
+
elif isinstance(self.path_or_stream, Path):
|
224
|
+
success = self.parser.load_document(
|
225
|
+
self.document_hash, str(self.path_or_stream)
|
226
|
+
)
|
221
227
|
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
228
|
+
if not success:
|
229
|
+
raise RuntimeError(
|
230
|
+
f"docling-parse v2 could not load document {self.document_hash}."
|
231
|
+
)
|
226
232
|
|
227
233
|
def page_count(self) -> int:
|
228
234
|
# return len(self._pdoc) # To be replaced with docling-parse API
|
@@ -236,9 +242,10 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
|
|
236
242
|
return len_2
|
237
243
|
|
238
244
|
def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
|
239
|
-
|
240
|
-
|
241
|
-
|
245
|
+
with pypdfium2_lock:
|
246
|
+
return DoclingParseV2PageBackend(
|
247
|
+
self.parser, self.document_hash, page_no, self._pdoc[page_no]
|
248
|
+
)
|
242
249
|
|
243
250
|
def is_valid(self) -> bool:
|
244
251
|
return self.page_count() > 0
|
@@ -246,5 +253,6 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
|
|
246
253
|
def unload(self):
|
247
254
|
super().unload()
|
248
255
|
self.parser.unload_document(self.document_hash)
|
249
|
-
|
250
|
-
|
256
|
+
with pypdfium2_lock:
|
257
|
+
self._pdoc.close()
|
258
|
+
self._pdoc = None
|
docling/backend/html_backend.py
CHANGED
@@ -15,6 +15,7 @@ from docling_core.types.doc import (
|
|
15
15
|
TableCell,
|
16
16
|
TableData,
|
17
17
|
)
|
18
|
+
from docling_core.types.doc.document import ContentLayer
|
18
19
|
from typing_extensions import override
|
19
20
|
|
20
21
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
@@ -66,7 +67,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
66
67
|
self.soup = BeautifulSoup(html_content, "html.parser")
|
67
68
|
except Exception as e:
|
68
69
|
raise RuntimeError(
|
69
|
-
|
70
|
+
"Could not initialize HTML backend for file with "
|
71
|
+
f"hash {self.document_hash}."
|
70
72
|
) from e
|
71
73
|
|
72
74
|
@override
|
@@ -109,14 +111,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
109
111
|
# TODO: remove style to avoid losing text from tags like i, b, span, ...
|
110
112
|
for br in content("br"):
|
111
113
|
br.replace_with(NavigableString("\n"))
|
114
|
+
|
115
|
+
headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
|
116
|
+
self.content_layer = (
|
117
|
+
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
|
118
|
+
)
|
112
119
|
self.walk(content, doc)
|
113
120
|
else:
|
114
121
|
raise RuntimeError(
|
115
|
-
f"Cannot convert doc with {self.document_hash} because the backend
|
122
|
+
f"Cannot convert doc with {self.document_hash} because the backend "
|
123
|
+
"failed to init."
|
116
124
|
)
|
117
125
|
return doc
|
118
126
|
|
119
127
|
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
|
128
|
+
|
120
129
|
# Iterate over elements in the body of the document
|
121
130
|
text: str = ""
|
122
131
|
for element in tag.children:
|
@@ -143,8 +152,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
143
152
|
if text and tag.name in ["div"]:
|
144
153
|
doc.add_text(
|
145
154
|
parent=self.parents[self.level],
|
146
|
-
label=DocItemLabel.
|
155
|
+
label=DocItemLabel.TEXT,
|
147
156
|
text=text,
|
157
|
+
content_layer=self.content_layer,
|
148
158
|
)
|
149
159
|
text = ""
|
150
160
|
|
@@ -166,7 +176,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
166
176
|
elif tag.name == "figure":
|
167
177
|
self.handle_figure(tag, doc)
|
168
178
|
elif tag.name == "img":
|
169
|
-
self.handle_image(doc)
|
179
|
+
self.handle_image(tag, doc)
|
170
180
|
else:
|
171
181
|
self.walk(tag, doc)
|
172
182
|
|
@@ -197,12 +207,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
197
207
|
text = element.text.strip()
|
198
208
|
|
199
209
|
if hlevel == 1:
|
210
|
+
self.content_layer = ContentLayer.BODY
|
211
|
+
|
200
212
|
for key in self.parents.keys():
|
201
213
|
self.parents[key] = None
|
202
214
|
|
203
215
|
self.level = 1
|
204
216
|
self.parents[self.level] = doc.add_text(
|
205
|
-
parent=self.parents[0],
|
217
|
+
parent=self.parents[0],
|
218
|
+
label=DocItemLabel.TITLE,
|
219
|
+
text=text,
|
220
|
+
content_layer=self.content_layer,
|
206
221
|
)
|
207
222
|
else:
|
208
223
|
if hlevel > self.level:
|
@@ -213,6 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
213
228
|
name=f"header-{i}",
|
214
229
|
label=GroupLabel.SECTION,
|
215
230
|
parent=self.parents[i - 1],
|
231
|
+
content_layer=self.content_layer,
|
216
232
|
)
|
217
233
|
self.level = hlevel
|
218
234
|
|
@@ -228,6 +244,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
228
244
|
parent=self.parents[hlevel - 1],
|
229
245
|
text=text,
|
230
246
|
level=hlevel,
|
247
|
+
content_layer=self.content_layer,
|
231
248
|
)
|
232
249
|
|
233
250
|
def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
|
@@ -236,16 +253,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
236
253
|
return
|
237
254
|
text = element.text.strip()
|
238
255
|
if text:
|
239
|
-
doc.add_code(
|
256
|
+
doc.add_code(
|
257
|
+
parent=self.parents[self.level],
|
258
|
+
text=text,
|
259
|
+
content_layer=self.content_layer,
|
260
|
+
)
|
240
261
|
|
241
262
|
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
|
242
263
|
"""Handles paragraph tags (p)."""
|
243
264
|
if element.text is None:
|
244
265
|
return
|
245
266
|
text = element.text.strip()
|
246
|
-
label = DocItemLabel.PARAGRAPH
|
247
267
|
if text:
|
248
|
-
doc.add_text(
|
268
|
+
doc.add_text(
|
269
|
+
parent=self.parents[self.level],
|
270
|
+
label=DocItemLabel.TEXT,
|
271
|
+
text=text,
|
272
|
+
content_layer=self.content_layer,
|
273
|
+
)
|
249
274
|
|
250
275
|
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
|
251
276
|
"""Handles list tags (ul, ol) and their list items."""
|
@@ -253,14 +278,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
253
278
|
if element.name == "ul":
|
254
279
|
# create a list group
|
255
280
|
self.parents[self.level + 1] = doc.add_group(
|
256
|
-
parent=self.parents[self.level],
|
281
|
+
parent=self.parents[self.level],
|
282
|
+
name="list",
|
283
|
+
label=GroupLabel.LIST,
|
284
|
+
content_layer=self.content_layer,
|
257
285
|
)
|
258
286
|
elif element.name == "ol":
|
287
|
+
start_attr = element.get("start")
|
288
|
+
start: int = (
|
289
|
+
int(start_attr)
|
290
|
+
if isinstance(start_attr, str) and start_attr.isnumeric()
|
291
|
+
else 1
|
292
|
+
)
|
259
293
|
# create a list group
|
260
294
|
self.parents[self.level + 1] = doc.add_group(
|
261
295
|
parent=self.parents[self.level],
|
262
|
-
name="ordered list",
|
296
|
+
name="ordered list" + (f" start {start}" if start != 1 else ""),
|
263
297
|
label=GroupLabel.ORDERED_LIST,
|
298
|
+
content_layer=self.content_layer,
|
264
299
|
)
|
265
300
|
self.level += 1
|
266
301
|
|
@@ -270,15 +305,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
270
305
|
self.level -= 1
|
271
306
|
|
272
307
|
def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
|
273
|
-
"""Handles
|
308
|
+
"""Handles list item tags (li)."""
|
274
309
|
nested_list = element.find(["ul", "ol"])
|
275
310
|
|
276
311
|
parent = self.parents[self.level]
|
277
312
|
if parent is None:
|
278
|
-
_log.
|
313
|
+
_log.debug(f"list-item has no parent in DoclingDocument: {element}")
|
279
314
|
return
|
280
315
|
parent_label: str = parent.label
|
281
316
|
index_in_list = len(parent.children) + 1
|
317
|
+
if (
|
318
|
+
parent_label == GroupLabel.ORDERED_LIST
|
319
|
+
and isinstance(parent, GroupItem)
|
320
|
+
and parent.name
|
321
|
+
):
|
322
|
+
start_in_list: str = parent.name.split(" ")[-1]
|
323
|
+
start: int = int(start_in_list) if start_in_list.isnumeric() else 1
|
324
|
+
index_in_list += start - 1
|
282
325
|
|
283
326
|
if nested_list:
|
284
327
|
# Text in list item can be hidden within hierarchy, hence
|
@@ -301,6 +344,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
301
344
|
enumerated=enumerated,
|
302
345
|
marker=marker,
|
303
346
|
parent=parent,
|
347
|
+
content_layer=self.content_layer,
|
304
348
|
)
|
305
349
|
self.level += 1
|
306
350
|
|
@@ -322,15 +366,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
322
366
|
enumerated=enumerated,
|
323
367
|
marker=marker,
|
324
368
|
parent=parent,
|
369
|
+
content_layer=self.content_layer,
|
325
370
|
)
|
326
371
|
else:
|
327
|
-
_log.
|
372
|
+
_log.debug(f"list-item has no text: {element}")
|
328
373
|
|
329
374
|
@staticmethod
|
330
375
|
def parse_table_data(element: Tag) -> Optional[TableData]:
|
331
376
|
nested_tables = element.find("table")
|
332
377
|
if nested_tables is not None:
|
333
|
-
_log.
|
378
|
+
_log.debug("Skipping nested table.")
|
334
379
|
return None
|
335
380
|
|
336
381
|
# Count the number of rows (number of <tr> elements)
|
@@ -425,7 +470,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
425
470
|
table_data = HTMLDocumentBackend.parse_table_data(element)
|
426
471
|
|
427
472
|
if table_data is not None:
|
428
|
-
doc.add_table(
|
473
|
+
doc.add_table(
|
474
|
+
data=table_data,
|
475
|
+
parent=self.parents[self.level],
|
476
|
+
content_layer=self.content_layer,
|
477
|
+
)
|
429
478
|
|
430
479
|
def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
|
431
480
|
"""Recursively extract text from <ul> or <ol> with proper indentation."""
|
@@ -465,20 +514,33 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
465
514
|
|
466
515
|
contains_captions = element.find(["figcaption"])
|
467
516
|
if not isinstance(contains_captions, Tag):
|
468
|
-
doc.add_picture(
|
517
|
+
doc.add_picture(
|
518
|
+
parent=self.parents[self.level],
|
519
|
+
caption=None,
|
520
|
+
content_layer=self.content_layer,
|
521
|
+
)
|
469
522
|
else:
|
470
523
|
texts = []
|
471
524
|
for item in contains_captions:
|
472
525
|
texts.append(item.text)
|
473
526
|
|
474
527
|
fig_caption = doc.add_text(
|
475
|
-
label=DocItemLabel.CAPTION,
|
528
|
+
label=DocItemLabel.CAPTION,
|
529
|
+
text=("".join(texts)).strip(),
|
530
|
+
content_layer=self.content_layer,
|
476
531
|
)
|
477
532
|
doc.add_picture(
|
478
533
|
parent=self.parents[self.level],
|
479
534
|
caption=fig_caption,
|
535
|
+
content_layer=self.content_layer,
|
480
536
|
)
|
481
537
|
|
482
|
-
def handle_image(self, doc: DoclingDocument) -> None:
|
538
|
+
def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
|
483
539
|
"""Handles image tags (img)."""
|
484
|
-
|
540
|
+
_log.debug(f"ignoring <img> tags at the moment: {element}")
|
541
|
+
|
542
|
+
doc.add_picture(
|
543
|
+
parent=self.parents[self.level],
|
544
|
+
caption=None,
|
545
|
+
content_layer=self.content_layer,
|
546
|
+
)
|
@@ -13,6 +13,7 @@ from pypdfium2._helpers.misc import PdfiumError
|
|
13
13
|
|
14
14
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
15
15
|
from docling.datamodel.base_models import Cell
|
16
|
+
from docling.utils.locks import pypdfium2_lock
|
16
17
|
|
17
18
|
if TYPE_CHECKING:
|
18
19
|
from docling.datamodel.document import InputDocument
|
@@ -24,6 +25,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
24
25
|
def __init__(
|
25
26
|
self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
|
26
27
|
):
|
28
|
+
# Note: lock applied by the caller
|
27
29
|
self.valid = True # No better way to tell from pypdfium.
|
28
30
|
try:
|
29
31
|
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
|
@@ -40,51 +42,57 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
40
42
|
|
41
43
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
42
44
|
AREA_THRESHOLD = 0 # 32 * 32
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
pos
|
47
|
-
|
45
|
+
page_size = self.get_size()
|
46
|
+
with pypdfium2_lock:
|
47
|
+
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
48
|
+
pos = obj.get_pos()
|
49
|
+
cropbox = BoundingBox.from_tuple(
|
50
|
+
pos, origin=CoordOrigin.BOTTOMLEFT
|
51
|
+
).to_top_left_origin(page_height=page_size.height)
|
48
52
|
|
49
|
-
|
50
|
-
|
53
|
+
if cropbox.area() > AREA_THRESHOLD:
|
54
|
+
cropbox = cropbox.scaled(scale=scale)
|
51
55
|
|
52
|
-
|
56
|
+
yield cropbox
|
53
57
|
|
54
58
|
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
55
|
-
|
56
|
-
|
59
|
+
with pypdfium2_lock:
|
60
|
+
if not self.text_page:
|
61
|
+
self.text_page = self._ppage.get_textpage()
|
57
62
|
|
58
63
|
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
|
59
64
|
bbox = bbox.to_bottom_left_origin(self.get_size().height)
|
60
65
|
|
61
|
-
|
66
|
+
with pypdfium2_lock:
|
67
|
+
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
|
62
68
|
|
63
69
|
return text_piece
|
64
70
|
|
65
71
|
def get_text_cells(self) -> Iterable[Cell]:
|
66
|
-
|
67
|
-
|
72
|
+
with pypdfium2_lock:
|
73
|
+
if not self.text_page:
|
74
|
+
self.text_page = self._ppage.get_textpage()
|
68
75
|
|
69
76
|
cells = []
|
70
77
|
cell_counter = 0
|
71
78
|
|
72
79
|
page_size = self.get_size()
|
73
80
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
81
|
+
with pypdfium2_lock:
|
82
|
+
for i in range(self.text_page.count_rects()):
|
83
|
+
rect = self.text_page.get_rect(i)
|
84
|
+
text_piece = self.text_page.get_text_bounded(*rect)
|
85
|
+
x0, y0, x1, y1 = rect
|
86
|
+
cells.append(
|
87
|
+
Cell(
|
88
|
+
id=cell_counter,
|
89
|
+
text=text_piece,
|
90
|
+
bbox=BoundingBox(
|
91
|
+
l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
|
92
|
+
).to_top_left_origin(page_size.height),
|
93
|
+
)
|
85
94
|
)
|
86
|
-
|
87
|
-
cell_counter += 1
|
95
|
+
cell_counter += 1
|
88
96
|
|
89
97
|
# PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
|
90
98
|
# The cell merging code below is to clean this up.
|
@@ -214,20 +222,24 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
214
222
|
padbox.r = page_size.width - padbox.r
|
215
223
|
padbox.t = page_size.height - padbox.t
|
216
224
|
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
225
|
+
with pypdfium2_lock:
|
226
|
+
image = (
|
227
|
+
self._ppage.render(
|
228
|
+
scale=scale * 1.5,
|
229
|
+
rotation=0, # no additional rotation
|
230
|
+
crop=padbox.as_tuple(),
|
231
|
+
)
|
232
|
+
.to_pil()
|
233
|
+
.resize(
|
234
|
+
size=(round(cropbox.width * scale), round(cropbox.height * scale))
|
235
|
+
)
|
236
|
+
) # We resize the image from 1.5x the given scale to make it sharper.
|
226
237
|
|
227
238
|
return image
|
228
239
|
|
229
240
|
def get_size(self) -> Size:
|
230
|
-
|
241
|
+
with pypdfium2_lock:
|
242
|
+
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
|
231
243
|
|
232
244
|
def unload(self):
|
233
245
|
self._ppage = None
|
@@ -239,22 +251,26 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
|
239
251
|
super().__init__(in_doc, path_or_stream)
|
240
252
|
|
241
253
|
try:
|
242
|
-
|
254
|
+
with pypdfium2_lock:
|
255
|
+
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
243
256
|
except PdfiumError as e:
|
244
257
|
raise RuntimeError(
|
245
258
|
f"pypdfium could not load document with hash {self.document_hash}"
|
246
259
|
) from e
|
247
260
|
|
248
261
|
def page_count(self) -> int:
|
249
|
-
|
262
|
+
with pypdfium2_lock:
|
263
|
+
return len(self._pdoc)
|
250
264
|
|
251
265
|
def load_page(self, page_no: int) -> PyPdfiumPageBackend:
|
252
|
-
|
266
|
+
with pypdfium2_lock:
|
267
|
+
return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
|
253
268
|
|
254
269
|
def is_valid(self) -> bool:
|
255
270
|
return self.page_count() > 0
|
256
271
|
|
257
272
|
def unload(self):
|
258
273
|
super().unload()
|
259
|
-
|
260
|
-
|
274
|
+
with pypdfium2_lock:
|
275
|
+
self._pdoc.close()
|
276
|
+
self._pdoc = None
|
@@ -203,6 +203,7 @@ class LayoutPostprocessor:
|
|
203
203
|
"""Initialize processor with cells and spatial indices."""
|
204
204
|
self.cells = cells
|
205
205
|
self.page_size = page_size
|
206
|
+
self.all_clusters = clusters
|
206
207
|
self.regular_clusters = [
|
207
208
|
c for c in clusters if c.label not in self.SPECIAL_TYPES
|
208
209
|
]
|
@@ -267,7 +268,7 @@ class LayoutPostprocessor:
|
|
267
268
|
# Handle orphaned cells
|
268
269
|
unassigned = self._find_unassigned_cells(clusters)
|
269
270
|
if unassigned:
|
270
|
-
next_id = max((c.id for c in
|
271
|
+
next_id = max((c.id for c in self.all_clusters), default=0) + 1
|
271
272
|
orphan_clusters = []
|
272
273
|
for i, cell in enumerate(unassigned):
|
273
274
|
conf = 1.0
|
docling/utils/locks.py
ADDED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.25.
|
3
|
+
Version: 2.25.2
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -185,7 +185,7 @@ For individual model usage, please refer to the model licenses found in the orig
|
|
185
185
|
|
186
186
|
Docling has been brought to you by IBM.
|
187
187
|
|
188
|
-
[supported_formats]: https://ds4sd.github.io/docling/supported_formats/
|
188
|
+
[supported_formats]: https://ds4sd.github.io/docling/usage/supported_formats/
|
189
189
|
[docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/
|
190
190
|
[integrations]: https://ds4sd.github.io/docling/integrations/
|
191
191
|
|
@@ -4,8 +4,8 @@ docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxA
|
|
4
4
|
docling/backend/asciidoc_backend.py,sha256=zyHxlG_BvlLwvpdNca3P6aopxOJZw8wbDFkJQQknNXk,14050
|
5
5
|
docling/backend/csv_backend.py,sha256=xuId4JGEXjoyPgO9Fy9hQ5C-ezXvJwv0TGB8fyFHgWM,4533
|
6
6
|
docling/backend/docling_parse_backend.py,sha256=hEEJibI1oJS0LAnFoIs6gMshS3bCqGtVxHnDNvBGZuA,7649
|
7
|
-
docling/backend/docling_parse_v2_backend.py,sha256=
|
8
|
-
docling/backend/html_backend.py,sha256=
|
7
|
+
docling/backend/docling_parse_v2_backend.py,sha256=oF8W-zuvEfpmyXp7Itt6-ot_feeMneMmSG7CpKclMhc,9005
|
8
|
+
docling/backend/html_backend.py,sha256=qLzNpMpfmllwpp-5uARrmaVyN5D1YOpmsbS3-RyL2p0,19370
|
9
9
|
docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
10
|
docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
|
11
11
|
docling/backend/md_backend.py,sha256=NaVfcnEH-5bwVovjn76EobF6B6Wm8AhaTZ4E8k0TUPo,16826
|
@@ -13,7 +13,7 @@ docling/backend/msexcel_backend.py,sha256=lyJc4ShJGAN2ZfNTTuhdYTF-44cZsGyn_8Djst
|
|
13
13
|
docling/backend/mspowerpoint_backend.py,sha256=esAyaaQe17BQFweGAGJHvImKETefY0BpvfpUSECC49w,16424
|
14
14
|
docling/backend/msword_backend.py,sha256=V4miLIcOH8DDlSCm25F_DALBW60Uf9JoSS0TB4yrQBw,20591
|
15
15
|
docling/backend/pdf_backend.py,sha256=17Pr8dWsD1C4FYUprrwMM9trDGW-JYLjrcScx1Ul4io,2048
|
16
|
-
docling/backend/pypdfium2_backend.py,sha256=
|
16
|
+
docling/backend/pypdfium2_backend.py,sha256=l6YfoiIibw-Z4wrRwQTPP96IGOMAf1SIT_TPVBIuZRs,9663
|
17
17
|
docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
18
|
docling/backend/xml/jats_backend.py,sha256=HXailrDjiwu4swwFnXy3lNfRtLZmkBBp4yqafCvdr7s,24945
|
19
19
|
docling/backend/xml/uspto_backend.py,sha256=IGUNeF2xpLeaVrX6nKb-jXgtSYD2ozULsrDPcrI1IbQ,71040
|
@@ -58,14 +58,15 @@ docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
58
|
docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExTOY5f0,2292
|
59
59
|
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
60
60
|
docling/utils/glm_utils.py,sha256=W4JRoP0xQ6SJmhhIoAfcKxm5dr1CFvLHp8pqI1kdhxs,12250
|
61
|
-
docling/utils/layout_postprocessor.py,sha256=
|
61
|
+
docling/utils/layout_postprocessor.py,sha256=kdIk5TpAEXvsQUvkdALBDnAbjc4I_j8s8w6GEvbu4f0,24304
|
62
|
+
docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
|
62
63
|
docling/utils/model_downloader.py,sha256=sxAQvjiIu9m2Ur5Ot5C5SATmgWJAHi0xSjzxj8QXYJk,3213
|
63
64
|
docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
|
64
65
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
65
66
|
docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
|
66
67
|
docling/utils/visualization.py,sha256=cmbIroPQXPmJdFrNIfpC26WpijBwx05qmpu3QhiG1EI,2850
|
67
|
-
docling-2.25.
|
68
|
-
docling-2.25.
|
69
|
-
docling-2.25.
|
70
|
-
docling-2.25.
|
71
|
-
docling-2.25.
|
68
|
+
docling-2.25.2.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
69
|
+
docling-2.25.2.dist-info/METADATA,sha256=NsR1pyqk-Q5G5pHrpaLf6TCQEE-r-hGrEB9Hpqdgykk,8803
|
70
|
+
docling-2.25.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
71
|
+
docling-2.25.2.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
|
72
|
+
docling-2.25.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|