docling 2.25.0__tar.gz → 2.25.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. {docling-2.25.0 → docling-2.25.2}/PKG-INFO +2 -2
  2. {docling-2.25.0 → docling-2.25.2}/README.md +1 -1
  3. {docling-2.25.0 → docling-2.25.2}/docling/backend/docling_parse_v2_backend.py +38 -30
  4. {docling-2.25.0 → docling-2.25.2}/docling/backend/html_backend.py +81 -19
  5. {docling-2.25.0 → docling-2.25.2}/docling/backend/pypdfium2_backend.py +57 -41
  6. {docling-2.25.0 → docling-2.25.2}/docling/utils/layout_postprocessor.py +2 -1
  7. docling-2.25.2/docling/utils/locks.py +3 -0
  8. {docling-2.25.0 → docling-2.25.2}/pyproject.toml +1 -1
  9. {docling-2.25.0 → docling-2.25.2}/LICENSE +0 -0
  10. {docling-2.25.0 → docling-2.25.2}/docling/__init__.py +0 -0
  11. {docling-2.25.0 → docling-2.25.2}/docling/backend/__init__.py +0 -0
  12. {docling-2.25.0 → docling-2.25.2}/docling/backend/abstract_backend.py +0 -0
  13. {docling-2.25.0 → docling-2.25.2}/docling/backend/asciidoc_backend.py +0 -0
  14. {docling-2.25.0 → docling-2.25.2}/docling/backend/csv_backend.py +0 -0
  15. {docling-2.25.0 → docling-2.25.2}/docling/backend/docling_parse_backend.py +0 -0
  16. {docling-2.25.0 → docling-2.25.2}/docling/backend/json/__init__.py +0 -0
  17. {docling-2.25.0 → docling-2.25.2}/docling/backend/json/docling_json_backend.py +0 -0
  18. {docling-2.25.0 → docling-2.25.2}/docling/backend/md_backend.py +0 -0
  19. {docling-2.25.0 → docling-2.25.2}/docling/backend/msexcel_backend.py +0 -0
  20. {docling-2.25.0 → docling-2.25.2}/docling/backend/mspowerpoint_backend.py +0 -0
  21. {docling-2.25.0 → docling-2.25.2}/docling/backend/msword_backend.py +0 -0
  22. {docling-2.25.0 → docling-2.25.2}/docling/backend/pdf_backend.py +0 -0
  23. {docling-2.25.0 → docling-2.25.2}/docling/backend/xml/__init__.py +0 -0
  24. {docling-2.25.0 → docling-2.25.2}/docling/backend/xml/jats_backend.py +0 -0
  25. {docling-2.25.0 → docling-2.25.2}/docling/backend/xml/uspto_backend.py +0 -0
  26. {docling-2.25.0 → docling-2.25.2}/docling/chunking/__init__.py +0 -0
  27. {docling-2.25.0 → docling-2.25.2}/docling/cli/__init__.py +0 -0
  28. {docling-2.25.0 → docling-2.25.2}/docling/cli/main.py +0 -0
  29. {docling-2.25.0 → docling-2.25.2}/docling/cli/models.py +0 -0
  30. {docling-2.25.0 → docling-2.25.2}/docling/cli/tools.py +0 -0
  31. {docling-2.25.0 → docling-2.25.2}/docling/datamodel/__init__.py +0 -0
  32. {docling-2.25.0 → docling-2.25.2}/docling/datamodel/base_models.py +0 -0
  33. {docling-2.25.0 → docling-2.25.2}/docling/datamodel/document.py +0 -0
  34. {docling-2.25.0 → docling-2.25.2}/docling/datamodel/pipeline_options.py +0 -0
  35. {docling-2.25.0 → docling-2.25.2}/docling/datamodel/settings.py +0 -0
  36. {docling-2.25.0 → docling-2.25.2}/docling/document_converter.py +0 -0
  37. {docling-2.25.0 → docling-2.25.2}/docling/exceptions.py +0 -0
  38. {docling-2.25.0 → docling-2.25.2}/docling/models/__init__.py +0 -0
  39. {docling-2.25.0 → docling-2.25.2}/docling/models/base_model.py +0 -0
  40. {docling-2.25.0 → docling-2.25.2}/docling/models/base_ocr_model.py +0 -0
  41. {docling-2.25.0 → docling-2.25.2}/docling/models/code_formula_model.py +0 -0
  42. {docling-2.25.0 → docling-2.25.2}/docling/models/document_picture_classifier.py +0 -0
  43. {docling-2.25.0 → docling-2.25.2}/docling/models/easyocr_model.py +0 -0
  44. {docling-2.25.0 → docling-2.25.2}/docling/models/hf_vlm_model.py +0 -0
  45. {docling-2.25.0 → docling-2.25.2}/docling/models/layout_model.py +0 -0
  46. {docling-2.25.0 → docling-2.25.2}/docling/models/ocr_mac_model.py +0 -0
  47. {docling-2.25.0 → docling-2.25.2}/docling/models/page_assemble_model.py +0 -0
  48. {docling-2.25.0 → docling-2.25.2}/docling/models/page_preprocessing_model.py +0 -0
  49. {docling-2.25.0 → docling-2.25.2}/docling/models/picture_description_api_model.py +0 -0
  50. {docling-2.25.0 → docling-2.25.2}/docling/models/picture_description_base_model.py +0 -0
  51. {docling-2.25.0 → docling-2.25.2}/docling/models/picture_description_vlm_model.py +0 -0
  52. {docling-2.25.0 → docling-2.25.2}/docling/models/rapid_ocr_model.py +0 -0
  53. {docling-2.25.0 → docling-2.25.2}/docling/models/readingorder_model.py +0 -0
  54. {docling-2.25.0 → docling-2.25.2}/docling/models/table_structure_model.py +0 -0
  55. {docling-2.25.0 → docling-2.25.2}/docling/models/tesseract_ocr_cli_model.py +0 -0
  56. {docling-2.25.0 → docling-2.25.2}/docling/models/tesseract_ocr_model.py +0 -0
  57. {docling-2.25.0 → docling-2.25.2}/docling/pipeline/__init__.py +0 -0
  58. {docling-2.25.0 → docling-2.25.2}/docling/pipeline/base_pipeline.py +0 -0
  59. {docling-2.25.0 → docling-2.25.2}/docling/pipeline/simple_pipeline.py +0 -0
  60. {docling-2.25.0 → docling-2.25.2}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  61. {docling-2.25.0 → docling-2.25.2}/docling/pipeline/vlm_pipeline.py +0 -0
  62. {docling-2.25.0 → docling-2.25.2}/docling/py.typed +0 -0
  63. {docling-2.25.0 → docling-2.25.2}/docling/utils/__init__.py +0 -0
  64. {docling-2.25.0 → docling-2.25.2}/docling/utils/accelerator_utils.py +0 -0
  65. {docling-2.25.0 → docling-2.25.2}/docling/utils/export.py +0 -0
  66. {docling-2.25.0 → docling-2.25.2}/docling/utils/glm_utils.py +0 -0
  67. {docling-2.25.0 → docling-2.25.2}/docling/utils/model_downloader.py +0 -0
  68. {docling-2.25.0 → docling-2.25.2}/docling/utils/ocr_utils.py +0 -0
  69. {docling-2.25.0 → docling-2.25.2}/docling/utils/profiling.py +0 -0
  70. {docling-2.25.0 → docling-2.25.2}/docling/utils/utils.py +0 -0
  71. {docling-2.25.0 → docling-2.25.2}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.25.0
3
+ Version: 2.25.2
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -185,7 +185,7 @@ For individual model usage, please refer to the model licenses found in the orig
185
185
 
186
186
  Docling has been brought to you by IBM.
187
187
 
188
- [supported_formats]: https://ds4sd.github.io/docling/supported_formats/
188
+ [supported_formats]: https://ds4sd.github.io/docling/usage/supported_formats/
189
189
  [docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/
190
190
  [integrations]: https://ds4sd.github.io/docling/integrations/
191
191
 
@@ -123,6 +123,6 @@ For individual model usage, please refer to the model licenses found in the orig
123
123
 
124
124
  Docling has been brought to you by IBM.
125
125
 
126
- [supported_formats]: https://ds4sd.github.io/docling/supported_formats/
126
+ [supported_formats]: https://ds4sd.github.io/docling/usage/supported_formats/
127
127
  [docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/
128
128
  [integrations]: https://ds4sd.github.io/docling/integrations/
@@ -12,6 +12,7 @@ from pypdfium2 import PdfPage
12
12
 
13
13
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
14
  from docling.datamodel.base_models import Cell, Size
15
+ from docling.utils.locks import pypdfium2_lock
15
16
 
16
17
  if TYPE_CHECKING:
17
18
  from docling.datamodel.document import InputDocument
@@ -182,20 +183,24 @@ class DoclingParseV2PageBackend(PdfPageBackend):
182
183
  padbox.r = page_size.width - padbox.r
183
184
  padbox.t = page_size.height - padbox.t
184
185
 
185
- image = (
186
- self._ppage.render(
187
- scale=scale * 1.5,
188
- rotation=0, # no additional rotation
189
- crop=padbox.as_tuple(),
190
- )
191
- .to_pil()
192
- .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
193
- ) # We resize the image from 1.5x the given scale to make it sharper.
186
+ with pypdfium2_lock:
187
+ image = (
188
+ self._ppage.render(
189
+ scale=scale * 1.5,
190
+ rotation=0, # no additional rotation
191
+ crop=padbox.as_tuple(),
192
+ )
193
+ .to_pil()
194
+ .resize(
195
+ size=(round(cropbox.width * scale), round(cropbox.height * scale))
196
+ )
197
+ ) # We resize the image from 1.5x the given scale to make it sharper.
194
198
 
195
199
  return image
196
200
 
197
201
  def get_size(self) -> Size:
198
- return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
202
+ with pypdfium2_lock:
203
+ return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
199
204
 
200
205
  def unload(self):
201
206
  self._ppage = None
@@ -206,23 +211,24 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
206
211
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
207
212
  super().__init__(in_doc, path_or_stream)
208
213
 
209
- self._pdoc = pdfium.PdfDocument(self.path_or_stream)
210
- self.parser = pdf_parser_v2("fatal")
214
+ with pypdfium2_lock:
215
+ self._pdoc = pdfium.PdfDocument(self.path_or_stream)
216
+ self.parser = pdf_parser_v2("fatal")
211
217
 
212
- success = False
213
- if isinstance(self.path_or_stream, BytesIO):
214
- success = self.parser.load_document_from_bytesio(
215
- self.document_hash, self.path_or_stream
216
- )
217
- elif isinstance(self.path_or_stream, Path):
218
- success = self.parser.load_document(
219
- self.document_hash, str(self.path_or_stream)
220
- )
218
+ success = False
219
+ if isinstance(self.path_or_stream, BytesIO):
220
+ success = self.parser.load_document_from_bytesio(
221
+ self.document_hash, self.path_or_stream
222
+ )
223
+ elif isinstance(self.path_or_stream, Path):
224
+ success = self.parser.load_document(
225
+ self.document_hash, str(self.path_or_stream)
226
+ )
221
227
 
222
- if not success:
223
- raise RuntimeError(
224
- f"docling-parse v2 could not load document {self.document_hash}."
225
- )
228
+ if not success:
229
+ raise RuntimeError(
230
+ f"docling-parse v2 could not load document {self.document_hash}."
231
+ )
226
232
 
227
233
  def page_count(self) -> int:
228
234
  # return len(self._pdoc) # To be replaced with docling-parse API
@@ -236,9 +242,10 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
236
242
  return len_2
237
243
 
238
244
  def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
239
- return DoclingParseV2PageBackend(
240
- self.parser, self.document_hash, page_no, self._pdoc[page_no]
241
- )
245
+ with pypdfium2_lock:
246
+ return DoclingParseV2PageBackend(
247
+ self.parser, self.document_hash, page_no, self._pdoc[page_no]
248
+ )
242
249
 
243
250
  def is_valid(self) -> bool:
244
251
  return self.page_count() > 0
@@ -246,5 +253,6 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
246
253
  def unload(self):
247
254
  super().unload()
248
255
  self.parser.unload_document(self.document_hash)
249
- self._pdoc.close()
250
- self._pdoc = None
256
+ with pypdfium2_lock:
257
+ self._pdoc.close()
258
+ self._pdoc = None
@@ -15,6 +15,7 @@ from docling_core.types.doc import (
15
15
  TableCell,
16
16
  TableData,
17
17
  )
18
+ from docling_core.types.doc.document import ContentLayer
18
19
  from typing_extensions import override
19
20
 
20
21
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -66,7 +67,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
66
67
  self.soup = BeautifulSoup(html_content, "html.parser")
67
68
  except Exception as e:
68
69
  raise RuntimeError(
69
- f"Could not initialize HTML backend for file with hash {self.document_hash}."
70
+ "Could not initialize HTML backend for file with "
71
+ f"hash {self.document_hash}."
70
72
  ) from e
71
73
 
72
74
  @override
@@ -109,14 +111,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
109
111
  # TODO: remove style to avoid losing text from tags like i, b, span, ...
110
112
  for br in content("br"):
111
113
  br.replace_with(NavigableString("\n"))
114
+
115
+ headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
116
+ self.content_layer = (
117
+ ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
118
+ )
112
119
  self.walk(content, doc)
113
120
  else:
114
121
  raise RuntimeError(
115
- f"Cannot convert doc with {self.document_hash} because the backend failed to init."
122
+ f"Cannot convert doc with {self.document_hash} because the backend "
123
+ "failed to init."
116
124
  )
117
125
  return doc
118
126
 
119
127
  def walk(self, tag: Tag, doc: DoclingDocument) -> None:
128
+
120
129
  # Iterate over elements in the body of the document
121
130
  text: str = ""
122
131
  for element in tag.children:
@@ -143,8 +152,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
143
152
  if text and tag.name in ["div"]:
144
153
  doc.add_text(
145
154
  parent=self.parents[self.level],
146
- label=DocItemLabel.PARAGRAPH,
155
+ label=DocItemLabel.TEXT,
147
156
  text=text,
157
+ content_layer=self.content_layer,
148
158
  )
149
159
  text = ""
150
160
 
@@ -166,7 +176,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
166
176
  elif tag.name == "figure":
167
177
  self.handle_figure(tag, doc)
168
178
  elif tag.name == "img":
169
- self.handle_image(doc)
179
+ self.handle_image(tag, doc)
170
180
  else:
171
181
  self.walk(tag, doc)
172
182
 
@@ -197,12 +207,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
197
207
  text = element.text.strip()
198
208
 
199
209
  if hlevel == 1:
210
+ self.content_layer = ContentLayer.BODY
211
+
200
212
  for key in self.parents.keys():
201
213
  self.parents[key] = None
202
214
 
203
215
  self.level = 1
204
216
  self.parents[self.level] = doc.add_text(
205
- parent=self.parents[0], label=DocItemLabel.TITLE, text=text
217
+ parent=self.parents[0],
218
+ label=DocItemLabel.TITLE,
219
+ text=text,
220
+ content_layer=self.content_layer,
206
221
  )
207
222
  else:
208
223
  if hlevel > self.level:
@@ -213,6 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
213
228
  name=f"header-{i}",
214
229
  label=GroupLabel.SECTION,
215
230
  parent=self.parents[i - 1],
231
+ content_layer=self.content_layer,
216
232
  )
217
233
  self.level = hlevel
218
234
 
@@ -228,6 +244,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
228
244
  parent=self.parents[hlevel - 1],
229
245
  text=text,
230
246
  level=hlevel,
247
+ content_layer=self.content_layer,
231
248
  )
232
249
 
233
250
  def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
@@ -236,16 +253,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
236
253
  return
237
254
  text = element.text.strip()
238
255
  if text:
239
- doc.add_code(parent=self.parents[self.level], text=text)
256
+ doc.add_code(
257
+ parent=self.parents[self.level],
258
+ text=text,
259
+ content_layer=self.content_layer,
260
+ )
240
261
 
241
262
  def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
242
263
  """Handles paragraph tags (p)."""
243
264
  if element.text is None:
244
265
  return
245
266
  text = element.text.strip()
246
- label = DocItemLabel.PARAGRAPH
247
267
  if text:
248
- doc.add_text(parent=self.parents[self.level], label=label, text=text)
268
+ doc.add_text(
269
+ parent=self.parents[self.level],
270
+ label=DocItemLabel.TEXT,
271
+ text=text,
272
+ content_layer=self.content_layer,
273
+ )
249
274
 
250
275
  def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
251
276
  """Handles list tags (ul, ol) and their list items."""
@@ -253,14 +278,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
253
278
  if element.name == "ul":
254
279
  # create a list group
255
280
  self.parents[self.level + 1] = doc.add_group(
256
- parent=self.parents[self.level], name="list", label=GroupLabel.LIST
281
+ parent=self.parents[self.level],
282
+ name="list",
283
+ label=GroupLabel.LIST,
284
+ content_layer=self.content_layer,
257
285
  )
258
286
  elif element.name == "ol":
287
+ start_attr = element.get("start")
288
+ start: int = (
289
+ int(start_attr)
290
+ if isinstance(start_attr, str) and start_attr.isnumeric()
291
+ else 1
292
+ )
259
293
  # create a list group
260
294
  self.parents[self.level + 1] = doc.add_group(
261
295
  parent=self.parents[self.level],
262
- name="ordered list",
296
+ name="ordered list" + (f" start {start}" if start != 1 else ""),
263
297
  label=GroupLabel.ORDERED_LIST,
298
+ content_layer=self.content_layer,
264
299
  )
265
300
  self.level += 1
266
301
 
@@ -270,15 +305,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
270
305
  self.level -= 1
271
306
 
272
307
  def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
273
- """Handles listitem tags (li)."""
308
+ """Handles list item tags (li)."""
274
309
  nested_list = element.find(["ul", "ol"])
275
310
 
276
311
  parent = self.parents[self.level]
277
312
  if parent is None:
278
- _log.warning(f"list-item has no parent in DoclingDocument: {element}")
313
+ _log.debug(f"list-item has no parent in DoclingDocument: {element}")
279
314
  return
280
315
  parent_label: str = parent.label
281
316
  index_in_list = len(parent.children) + 1
317
+ if (
318
+ parent_label == GroupLabel.ORDERED_LIST
319
+ and isinstance(parent, GroupItem)
320
+ and parent.name
321
+ ):
322
+ start_in_list: str = parent.name.split(" ")[-1]
323
+ start: int = int(start_in_list) if start_in_list.isnumeric() else 1
324
+ index_in_list += start - 1
282
325
 
283
326
  if nested_list:
284
327
  # Text in list item can be hidden within hierarchy, hence
@@ -301,6 +344,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
301
344
  enumerated=enumerated,
302
345
  marker=marker,
303
346
  parent=parent,
347
+ content_layer=self.content_layer,
304
348
  )
305
349
  self.level += 1
306
350
 
@@ -322,15 +366,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
322
366
  enumerated=enumerated,
323
367
  marker=marker,
324
368
  parent=parent,
369
+ content_layer=self.content_layer,
325
370
  )
326
371
  else:
327
- _log.warning(f"list-item has no text: {element}")
372
+ _log.debug(f"list-item has no text: {element}")
328
373
 
329
374
  @staticmethod
330
375
  def parse_table_data(element: Tag) -> Optional[TableData]:
331
376
  nested_tables = element.find("table")
332
377
  if nested_tables is not None:
333
- _log.warning("Skipping nested table.")
378
+ _log.debug("Skipping nested table.")
334
379
  return None
335
380
 
336
381
  # Count the number of rows (number of <tr> elements)
@@ -425,7 +470,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
425
470
  table_data = HTMLDocumentBackend.parse_table_data(element)
426
471
 
427
472
  if table_data is not None:
428
- doc.add_table(data=table_data, parent=self.parents[self.level])
473
+ doc.add_table(
474
+ data=table_data,
475
+ parent=self.parents[self.level],
476
+ content_layer=self.content_layer,
477
+ )
429
478
 
430
479
  def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
431
480
  """Recursively extract text from <ul> or <ol> with proper indentation."""
@@ -465,20 +514,33 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
465
514
 
466
515
  contains_captions = element.find(["figcaption"])
467
516
  if not isinstance(contains_captions, Tag):
468
- doc.add_picture(parent=self.parents[self.level], caption=None)
517
+ doc.add_picture(
518
+ parent=self.parents[self.level],
519
+ caption=None,
520
+ content_layer=self.content_layer,
521
+ )
469
522
  else:
470
523
  texts = []
471
524
  for item in contains_captions:
472
525
  texts.append(item.text)
473
526
 
474
527
  fig_caption = doc.add_text(
475
- label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
528
+ label=DocItemLabel.CAPTION,
529
+ text=("".join(texts)).strip(),
530
+ content_layer=self.content_layer,
476
531
  )
477
532
  doc.add_picture(
478
533
  parent=self.parents[self.level],
479
534
  caption=fig_caption,
535
+ content_layer=self.content_layer,
480
536
  )
481
537
 
482
- def handle_image(self, doc: DoclingDocument) -> None:
538
+ def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
483
539
  """Handles image tags (img)."""
484
- doc.add_picture(parent=self.parents[self.level], caption=None)
540
+ _log.debug(f"ignoring <img> tags at the moment: {element}")
541
+
542
+ doc.add_picture(
543
+ parent=self.parents[self.level],
544
+ caption=None,
545
+ content_layer=self.content_layer,
546
+ )
@@ -13,6 +13,7 @@ from pypdfium2._helpers.misc import PdfiumError
13
13
 
14
14
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
15
15
  from docling.datamodel.base_models import Cell
16
+ from docling.utils.locks import pypdfium2_lock
16
17
 
17
18
  if TYPE_CHECKING:
18
19
  from docling.datamodel.document import InputDocument
@@ -24,6 +25,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
24
25
  def __init__(
25
26
  self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
26
27
  ):
28
+ # Note: lock applied by the caller
27
29
  self.valid = True # No better way to tell from pypdfium.
28
30
  try:
29
31
  self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
@@ -40,51 +42,57 @@ class PyPdfiumPageBackend(PdfPageBackend):
40
42
 
41
43
  def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
42
44
  AREA_THRESHOLD = 0 # 32 * 32
43
- for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
44
- pos = obj.get_pos()
45
- cropbox = BoundingBox.from_tuple(
46
- pos, origin=CoordOrigin.BOTTOMLEFT
47
- ).to_top_left_origin(page_height=self.get_size().height)
45
+ page_size = self.get_size()
46
+ with pypdfium2_lock:
47
+ for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
48
+ pos = obj.get_pos()
49
+ cropbox = BoundingBox.from_tuple(
50
+ pos, origin=CoordOrigin.BOTTOMLEFT
51
+ ).to_top_left_origin(page_height=page_size.height)
48
52
 
49
- if cropbox.area() > AREA_THRESHOLD:
50
- cropbox = cropbox.scaled(scale=scale)
53
+ if cropbox.area() > AREA_THRESHOLD:
54
+ cropbox = cropbox.scaled(scale=scale)
51
55
 
52
- yield cropbox
56
+ yield cropbox
53
57
 
54
58
  def get_text_in_rect(self, bbox: BoundingBox) -> str:
55
- if not self.text_page:
56
- self.text_page = self._ppage.get_textpage()
59
+ with pypdfium2_lock:
60
+ if not self.text_page:
61
+ self.text_page = self._ppage.get_textpage()
57
62
 
58
63
  if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
59
64
  bbox = bbox.to_bottom_left_origin(self.get_size().height)
60
65
 
61
- text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
66
+ with pypdfium2_lock:
67
+ text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
62
68
 
63
69
  return text_piece
64
70
 
65
71
  def get_text_cells(self) -> Iterable[Cell]:
66
- if not self.text_page:
67
- self.text_page = self._ppage.get_textpage()
72
+ with pypdfium2_lock:
73
+ if not self.text_page:
74
+ self.text_page = self._ppage.get_textpage()
68
75
 
69
76
  cells = []
70
77
  cell_counter = 0
71
78
 
72
79
  page_size = self.get_size()
73
80
 
74
- for i in range(self.text_page.count_rects()):
75
- rect = self.text_page.get_rect(i)
76
- text_piece = self.text_page.get_text_bounded(*rect)
77
- x0, y0, x1, y1 = rect
78
- cells.append(
79
- Cell(
80
- id=cell_counter,
81
- text=text_piece,
82
- bbox=BoundingBox(
83
- l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
84
- ).to_top_left_origin(page_size.height),
81
+ with pypdfium2_lock:
82
+ for i in range(self.text_page.count_rects()):
83
+ rect = self.text_page.get_rect(i)
84
+ text_piece = self.text_page.get_text_bounded(*rect)
85
+ x0, y0, x1, y1 = rect
86
+ cells.append(
87
+ Cell(
88
+ id=cell_counter,
89
+ text=text_piece,
90
+ bbox=BoundingBox(
91
+ l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
92
+ ).to_top_left_origin(page_size.height),
93
+ )
85
94
  )
86
- )
87
- cell_counter += 1
95
+ cell_counter += 1
88
96
 
89
97
  # PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
90
98
  # The cell merging code below is to clean this up.
@@ -214,20 +222,24 @@ class PyPdfiumPageBackend(PdfPageBackend):
214
222
  padbox.r = page_size.width - padbox.r
215
223
  padbox.t = page_size.height - padbox.t
216
224
 
217
- image = (
218
- self._ppage.render(
219
- scale=scale * 1.5,
220
- rotation=0, # no additional rotation
221
- crop=padbox.as_tuple(),
222
- )
223
- .to_pil()
224
- .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
225
- ) # We resize the image from 1.5x the given scale to make it sharper.
225
+ with pypdfium2_lock:
226
+ image = (
227
+ self._ppage.render(
228
+ scale=scale * 1.5,
229
+ rotation=0, # no additional rotation
230
+ crop=padbox.as_tuple(),
231
+ )
232
+ .to_pil()
233
+ .resize(
234
+ size=(round(cropbox.width * scale), round(cropbox.height * scale))
235
+ )
236
+ ) # We resize the image from 1.5x the given scale to make it sharper.
226
237
 
227
238
  return image
228
239
 
229
240
  def get_size(self) -> Size:
230
- return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
241
+ with pypdfium2_lock:
242
+ return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
231
243
 
232
244
  def unload(self):
233
245
  self._ppage = None
@@ -239,22 +251,26 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
239
251
  super().__init__(in_doc, path_or_stream)
240
252
 
241
253
  try:
242
- self._pdoc = pdfium.PdfDocument(self.path_or_stream)
254
+ with pypdfium2_lock:
255
+ self._pdoc = pdfium.PdfDocument(self.path_or_stream)
243
256
  except PdfiumError as e:
244
257
  raise RuntimeError(
245
258
  f"pypdfium could not load document with hash {self.document_hash}"
246
259
  ) from e
247
260
 
248
261
  def page_count(self) -> int:
249
- return len(self._pdoc)
262
+ with pypdfium2_lock:
263
+ return len(self._pdoc)
250
264
 
251
265
  def load_page(self, page_no: int) -> PyPdfiumPageBackend:
252
- return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
266
+ with pypdfium2_lock:
267
+ return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
253
268
 
254
269
  def is_valid(self) -> bool:
255
270
  return self.page_count() > 0
256
271
 
257
272
  def unload(self):
258
273
  super().unload()
259
- self._pdoc.close()
260
- self._pdoc = None
274
+ with pypdfium2_lock:
275
+ self._pdoc.close()
276
+ self._pdoc = None
@@ -203,6 +203,7 @@ class LayoutPostprocessor:
203
203
  """Initialize processor with cells and spatial indices."""
204
204
  self.cells = cells
205
205
  self.page_size = page_size
206
+ self.all_clusters = clusters
206
207
  self.regular_clusters = [
207
208
  c for c in clusters if c.label not in self.SPECIAL_TYPES
208
209
  ]
@@ -267,7 +268,7 @@ class LayoutPostprocessor:
267
268
  # Handle orphaned cells
268
269
  unassigned = self._find_unassigned_cells(clusters)
269
270
  if unassigned:
270
- next_id = max((c.id for c in clusters), default=0) + 1
271
+ next_id = max((c.id for c in self.all_clusters), default=0) + 1
271
272
  orphan_clusters = []
272
273
  for i, cell in enumerate(unassigned):
273
274
  conf = 1.0
@@ -0,0 +1,3 @@
1
+ import threading
2
+
3
+ pypdfium2_lock = threading.Lock()
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.25.0" # DO NOT EDIT, updated automatically
3
+ version = "2.25.2" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes