docling 2.24.0__tar.gz → 2.25.1__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. {docling-2.24.0 → docling-2.25.1}/PKG-INFO +2 -1
  2. {docling-2.24.0 → docling-2.25.1}/docling/backend/docling_parse_v2_backend.py +38 -30
  3. {docling-2.24.0 → docling-2.25.1}/docling/backend/html_backend.py +122 -21
  4. {docling-2.24.0 → docling-2.25.1}/docling/backend/pypdfium2_backend.py +57 -41
  5. {docling-2.24.0 → docling-2.25.1}/docling/cli/models.py +28 -4
  6. {docling-2.24.0 → docling-2.25.1}/docling/datamodel/base_models.py +5 -0
  7. {docling-2.24.0 → docling-2.25.1}/docling/datamodel/pipeline_options.py +62 -1
  8. docling-2.25.1/docling/models/hf_vlm_model.py +180 -0
  9. {docling-2.24.0 → docling-2.25.1}/docling/models/picture_description_vlm_model.py +2 -2
  10. docling-2.25.1/docling/pipeline/vlm_pipeline.py +534 -0
  11. docling-2.25.1/docling/utils/locks.py +3 -0
  12. {docling-2.24.0 → docling-2.25.1}/docling/utils/model_downloader.py +15 -2
  13. {docling-2.24.0 → docling-2.25.1}/docling/utils/visualization.py +5 -0
  14. {docling-2.24.0 → docling-2.25.1}/pyproject.toml +6 -2
  15. {docling-2.24.0 → docling-2.25.1}/LICENSE +0 -0
  16. {docling-2.24.0 → docling-2.25.1}/README.md +0 -0
  17. {docling-2.24.0 → docling-2.25.1}/docling/__init__.py +0 -0
  18. {docling-2.24.0 → docling-2.25.1}/docling/backend/__init__.py +0 -0
  19. {docling-2.24.0 → docling-2.25.1}/docling/backend/abstract_backend.py +0 -0
  20. {docling-2.24.0 → docling-2.25.1}/docling/backend/asciidoc_backend.py +0 -0
  21. {docling-2.24.0 → docling-2.25.1}/docling/backend/csv_backend.py +0 -0
  22. {docling-2.24.0 → docling-2.25.1}/docling/backend/docling_parse_backend.py +0 -0
  23. {docling-2.24.0 → docling-2.25.1}/docling/backend/json/__init__.py +0 -0
  24. {docling-2.24.0 → docling-2.25.1}/docling/backend/json/docling_json_backend.py +0 -0
  25. {docling-2.24.0 → docling-2.25.1}/docling/backend/md_backend.py +0 -0
  26. {docling-2.24.0 → docling-2.25.1}/docling/backend/msexcel_backend.py +0 -0
  27. {docling-2.24.0 → docling-2.25.1}/docling/backend/mspowerpoint_backend.py +0 -0
  28. {docling-2.24.0 → docling-2.25.1}/docling/backend/msword_backend.py +0 -0
  29. {docling-2.24.0 → docling-2.25.1}/docling/backend/pdf_backend.py +0 -0
  30. {docling-2.24.0 → docling-2.25.1}/docling/backend/xml/__init__.py +0 -0
  31. {docling-2.24.0 → docling-2.25.1}/docling/backend/xml/jats_backend.py +0 -0
  32. {docling-2.24.0 → docling-2.25.1}/docling/backend/xml/uspto_backend.py +0 -0
  33. {docling-2.24.0 → docling-2.25.1}/docling/chunking/__init__.py +0 -0
  34. {docling-2.24.0 → docling-2.25.1}/docling/cli/__init__.py +0 -0
  35. {docling-2.24.0 → docling-2.25.1}/docling/cli/main.py +0 -0
  36. {docling-2.24.0 → docling-2.25.1}/docling/cli/tools.py +0 -0
  37. {docling-2.24.0 → docling-2.25.1}/docling/datamodel/__init__.py +0 -0
  38. {docling-2.24.0 → docling-2.25.1}/docling/datamodel/document.py +0 -0
  39. {docling-2.24.0 → docling-2.25.1}/docling/datamodel/settings.py +0 -0
  40. {docling-2.24.0 → docling-2.25.1}/docling/document_converter.py +0 -0
  41. {docling-2.24.0 → docling-2.25.1}/docling/exceptions.py +0 -0
  42. {docling-2.24.0 → docling-2.25.1}/docling/models/__init__.py +0 -0
  43. {docling-2.24.0 → docling-2.25.1}/docling/models/base_model.py +0 -0
  44. {docling-2.24.0 → docling-2.25.1}/docling/models/base_ocr_model.py +0 -0
  45. {docling-2.24.0 → docling-2.25.1}/docling/models/code_formula_model.py +0 -0
  46. {docling-2.24.0 → docling-2.25.1}/docling/models/document_picture_classifier.py +0 -0
  47. {docling-2.24.0 → docling-2.25.1}/docling/models/easyocr_model.py +0 -0
  48. {docling-2.24.0 → docling-2.25.1}/docling/models/layout_model.py +0 -0
  49. {docling-2.24.0 → docling-2.25.1}/docling/models/ocr_mac_model.py +0 -0
  50. {docling-2.24.0 → docling-2.25.1}/docling/models/page_assemble_model.py +0 -0
  51. {docling-2.24.0 → docling-2.25.1}/docling/models/page_preprocessing_model.py +0 -0
  52. {docling-2.24.0 → docling-2.25.1}/docling/models/picture_description_api_model.py +0 -0
  53. {docling-2.24.0 → docling-2.25.1}/docling/models/picture_description_base_model.py +0 -0
  54. {docling-2.24.0 → docling-2.25.1}/docling/models/rapid_ocr_model.py +0 -0
  55. {docling-2.24.0 → docling-2.25.1}/docling/models/readingorder_model.py +0 -0
  56. {docling-2.24.0 → docling-2.25.1}/docling/models/table_structure_model.py +0 -0
  57. {docling-2.24.0 → docling-2.25.1}/docling/models/tesseract_ocr_cli_model.py +0 -0
  58. {docling-2.24.0 → docling-2.25.1}/docling/models/tesseract_ocr_model.py +0 -0
  59. {docling-2.24.0 → docling-2.25.1}/docling/pipeline/__init__.py +0 -0
  60. {docling-2.24.0 → docling-2.25.1}/docling/pipeline/base_pipeline.py +0 -0
  61. {docling-2.24.0 → docling-2.25.1}/docling/pipeline/simple_pipeline.py +0 -0
  62. {docling-2.24.0 → docling-2.25.1}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  63. {docling-2.24.0 → docling-2.25.1}/docling/py.typed +0 -0
  64. {docling-2.24.0 → docling-2.25.1}/docling/utils/__init__.py +0 -0
  65. {docling-2.24.0 → docling-2.25.1}/docling/utils/accelerator_utils.py +0 -0
  66. {docling-2.24.0 → docling-2.25.1}/docling/utils/export.py +0 -0
  67. {docling-2.24.0 → docling-2.25.1}/docling/utils/glm_utils.py +0 -0
  68. {docling-2.24.0 → docling-2.25.1}/docling/utils/layout_postprocessor.py +0 -0
  69. {docling-2.24.0 → docling-2.25.1}/docling/utils/ocr_utils.py +0 -0
  70. {docling-2.24.0 → docling-2.25.1}/docling/utils/profiling.py +0 -0
  71. {docling-2.24.0 → docling-2.25.1}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.24.0
3
+ Version: 2.25.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -25,6 +25,7 @@ Provides-Extra: ocrmac
25
25
  Provides-Extra: rapidocr
26
26
  Provides-Extra: tesserocr
27
27
  Provides-Extra: vlm
28
+ Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
28
29
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
29
30
  Requires-Dist: certifi (>=2024.7.4)
30
31
  Requires-Dist: docling-core[chunking] (>=2.19.0,<3.0.0)
@@ -12,6 +12,7 @@ from pypdfium2 import PdfPage
12
12
 
13
13
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
14
  from docling.datamodel.base_models import Cell, Size
15
+ from docling.utils.locks import pypdfium2_lock
15
16
 
16
17
  if TYPE_CHECKING:
17
18
  from docling.datamodel.document import InputDocument
@@ -182,20 +183,24 @@ class DoclingParseV2PageBackend(PdfPageBackend):
182
183
  padbox.r = page_size.width - padbox.r
183
184
  padbox.t = page_size.height - padbox.t
184
185
 
185
- image = (
186
- self._ppage.render(
187
- scale=scale * 1.5,
188
- rotation=0, # no additional rotation
189
- crop=padbox.as_tuple(),
190
- )
191
- .to_pil()
192
- .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
193
- ) # We resize the image from 1.5x the given scale to make it sharper.
186
+ with pypdfium2_lock:
187
+ image = (
188
+ self._ppage.render(
189
+ scale=scale * 1.5,
190
+ rotation=0, # no additional rotation
191
+ crop=padbox.as_tuple(),
192
+ )
193
+ .to_pil()
194
+ .resize(
195
+ size=(round(cropbox.width * scale), round(cropbox.height * scale))
196
+ )
197
+ ) # We resize the image from 1.5x the given scale to make it sharper.
194
198
 
195
199
  return image
196
200
 
197
201
  def get_size(self) -> Size:
198
- return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
202
+ with pypdfium2_lock:
203
+ return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
199
204
 
200
205
  def unload(self):
201
206
  self._ppage = None
@@ -206,23 +211,24 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
206
211
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
207
212
  super().__init__(in_doc, path_or_stream)
208
213
 
209
- self._pdoc = pdfium.PdfDocument(self.path_or_stream)
210
- self.parser = pdf_parser_v2("fatal")
214
+ with pypdfium2_lock:
215
+ self._pdoc = pdfium.PdfDocument(self.path_or_stream)
216
+ self.parser = pdf_parser_v2("fatal")
211
217
 
212
- success = False
213
- if isinstance(self.path_or_stream, BytesIO):
214
- success = self.parser.load_document_from_bytesio(
215
- self.document_hash, self.path_or_stream
216
- )
217
- elif isinstance(self.path_or_stream, Path):
218
- success = self.parser.load_document(
219
- self.document_hash, str(self.path_or_stream)
220
- )
218
+ success = False
219
+ if isinstance(self.path_or_stream, BytesIO):
220
+ success = self.parser.load_document_from_bytesio(
221
+ self.document_hash, self.path_or_stream
222
+ )
223
+ elif isinstance(self.path_or_stream, Path):
224
+ success = self.parser.load_document(
225
+ self.document_hash, str(self.path_or_stream)
226
+ )
221
227
 
222
- if not success:
223
- raise RuntimeError(
224
- f"docling-parse v2 could not load document {self.document_hash}."
225
- )
228
+ if not success:
229
+ raise RuntimeError(
230
+ f"docling-parse v2 could not load document {self.document_hash}."
231
+ )
226
232
 
227
233
  def page_count(self) -> int:
228
234
  # return len(self._pdoc) # To be replaced with docling-parse API
@@ -236,9 +242,10 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
236
242
  return len_2
237
243
 
238
244
  def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
239
- return DoclingParseV2PageBackend(
240
- self.parser, self.document_hash, page_no, self._pdoc[page_no]
241
- )
245
+ with pypdfium2_lock:
246
+ return DoclingParseV2PageBackend(
247
+ self.parser, self.document_hash, page_no, self._pdoc[page_no]
248
+ )
242
249
 
243
250
  def is_valid(self) -> bool:
244
251
  return self.page_count() > 0
@@ -246,5 +253,6 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
246
253
  def unload(self):
247
254
  super().unload()
248
255
  self.parser.unload_document(self.document_hash)
249
- self._pdoc.close()
250
- self._pdoc = None
256
+ with pypdfium2_lock:
257
+ self._pdoc.close()
258
+ self._pdoc = None
@@ -1,9 +1,10 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Optional, Union, cast
4
+ from typing import Final, Optional, Union, cast
5
5
 
6
6
  from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
7
+ from bs4.element import PreformattedString
7
8
  from docling_core.types.doc import (
8
9
  DocItem,
9
10
  DocItemLabel,
@@ -14,6 +15,7 @@ from docling_core.types.doc import (
14
15
  TableCell,
15
16
  TableData,
16
17
  )
18
+ from docling_core.types.doc.document import ContentLayer
17
19
  from typing_extensions import override
18
20
 
19
21
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -22,12 +24,29 @@ from docling.datamodel.document import InputDocument
22
24
 
23
25
  _log = logging.getLogger(__name__)
24
26
 
27
+ # tags that generate NodeItem elements
28
+ TAGS_FOR_NODE_ITEMS: Final = [
29
+ "h1",
30
+ "h2",
31
+ "h3",
32
+ "h4",
33
+ "h5",
34
+ "h6",
35
+ "p",
36
+ "pre",
37
+ "ul",
38
+ "ol",
39
+ "li",
40
+ "table",
41
+ "figure",
42
+ "img",
43
+ ]
44
+
25
45
 
26
46
  class HTMLDocumentBackend(DeclarativeDocumentBackend):
27
47
  @override
28
48
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
29
49
  super().__init__(in_doc, path_or_stream)
30
- _log.debug("About to init HTML backend...")
31
50
  self.soup: Optional[Tag] = None
32
51
  # HTML file:
33
52
  self.path_or_stream = path_or_stream
@@ -48,7 +67,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
48
67
  self.soup = BeautifulSoup(html_content, "html.parser")
49
68
  except Exception as e:
50
69
  raise RuntimeError(
51
- f"Could not initialize HTML backend for file with hash {self.document_hash}."
70
+ "Could not initialize HTML backend for file with "
71
+ f"hash {self.document_hash}."
52
72
  ) from e
53
73
 
54
74
  @override
@@ -88,17 +108,26 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
88
108
  assert self.soup is not None
89
109
  content = self.soup.body or self.soup
90
110
  # Replace <br> tags with newline characters
111
+ # TODO: remove style to avoid losing text from tags like i, b, span, ...
91
112
  for br in content("br"):
92
113
  br.replace_with(NavigableString("\n"))
114
+
115
+ headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
116
+ self.content_layer = (
117
+ ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
118
+ )
93
119
  self.walk(content, doc)
94
120
  else:
95
121
  raise RuntimeError(
96
- f"Cannot convert doc with {self.document_hash} because the backend failed to init."
122
+ f"Cannot convert doc with {self.document_hash} because the backend "
123
+ "failed to init."
97
124
  )
98
125
  return doc
99
126
 
100
127
  def walk(self, tag: Tag, doc: DoclingDocument) -> None:
128
+
101
129
  # Iterate over elements in the body of the document
130
+ text: str = ""
102
131
  for element in tag.children:
103
132
  if isinstance(element, Tag):
104
133
  try:
@@ -108,6 +137,26 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
108
137
  f"Error processing child from tag{tag.name}: {exc_child}"
109
138
  )
110
139
  raise exc_child
140
+ elif isinstance(element, NavigableString) and not isinstance(
141
+ element, PreformattedString
142
+ ):
143
+ # Floating text outside paragraphs or analyzed tags
144
+ text += element
145
+ siblings: list[Tag] = [
146
+ item for item in element.next_siblings if isinstance(item, Tag)
147
+ ]
148
+ if element.next_sibling is None or any(
149
+ [item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
150
+ ):
151
+ text = text.strip()
152
+ if text and tag.name in ["div"]:
153
+ doc.add_text(
154
+ parent=self.parents[self.level],
155
+ label=DocItemLabel.TEXT,
156
+ text=text,
157
+ content_layer=self.content_layer,
158
+ )
159
+ text = ""
111
160
 
112
161
  return
113
162
 
@@ -127,7 +176,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
127
176
  elif tag.name == "figure":
128
177
  self.handle_figure(tag, doc)
129
178
  elif tag.name == "img":
130
- self.handle_image(doc)
179
+ self.handle_image(tag, doc)
131
180
  else:
132
181
  self.walk(tag, doc)
133
182
 
@@ -158,12 +207,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
158
207
  text = element.text.strip()
159
208
 
160
209
  if hlevel == 1:
161
- for key, val in self.parents.items():
210
+ self.content_layer = ContentLayer.BODY
211
+
212
+ for key in self.parents.keys():
162
213
  self.parents[key] = None
163
214
 
164
215
  self.level = 1
165
216
  self.parents[self.level] = doc.add_text(
166
- parent=self.parents[0], label=DocItemLabel.TITLE, text=text
217
+ parent=self.parents[0],
218
+ label=DocItemLabel.TITLE,
219
+ text=text,
220
+ content_layer=self.content_layer,
167
221
  )
168
222
  else:
169
223
  if hlevel > self.level:
@@ -174,6 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
174
228
  name=f"header-{i}",
175
229
  label=GroupLabel.SECTION,
176
230
  parent=self.parents[i - 1],
231
+ content_layer=self.content_layer,
177
232
  )
178
233
  self.level = hlevel
179
234
 
@@ -189,6 +244,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
189
244
  parent=self.parents[hlevel - 1],
190
245
  text=text,
191
246
  level=hlevel,
247
+ content_layer=self.content_layer,
192
248
  )
193
249
 
194
250
  def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
@@ -197,16 +253,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
197
253
  return
198
254
  text = element.text.strip()
199
255
  if text:
200
- doc.add_code(parent=self.parents[self.level], text=text)
256
+ doc.add_code(
257
+ parent=self.parents[self.level],
258
+ text=text,
259
+ content_layer=self.content_layer,
260
+ )
201
261
 
202
262
  def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
203
263
  """Handles paragraph tags (p)."""
204
264
  if element.text is None:
205
265
  return
206
266
  text = element.text.strip()
207
- label = DocItemLabel.PARAGRAPH
208
267
  if text:
209
- doc.add_text(parent=self.parents[self.level], label=label, text=text)
268
+ doc.add_text(
269
+ parent=self.parents[self.level],
270
+ label=DocItemLabel.TEXT,
271
+ text=text,
272
+ content_layer=self.content_layer,
273
+ )
210
274
 
211
275
  def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
212
276
  """Handles list tags (ul, ol) and their list items."""
@@ -214,14 +278,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
214
278
  if element.name == "ul":
215
279
  # create a list group
216
280
  self.parents[self.level + 1] = doc.add_group(
217
- parent=self.parents[self.level], name="list", label=GroupLabel.LIST
281
+ parent=self.parents[self.level],
282
+ name="list",
283
+ label=GroupLabel.LIST,
284
+ content_layer=self.content_layer,
218
285
  )
219
286
  elif element.name == "ol":
287
+ start_attr = element.get("start")
288
+ start: int = (
289
+ int(start_attr)
290
+ if isinstance(start_attr, str) and start_attr.isnumeric()
291
+ else 1
292
+ )
220
293
  # create a list group
221
294
  self.parents[self.level + 1] = doc.add_group(
222
295
  parent=self.parents[self.level],
223
- name="ordered list",
296
+ name="ordered list" + (f" start {start}" if start != 1 else ""),
224
297
  label=GroupLabel.ORDERED_LIST,
298
+ content_layer=self.content_layer,
225
299
  )
226
300
  self.level += 1
227
301
 
@@ -231,15 +305,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
231
305
  self.level -= 1
232
306
 
233
307
  def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
234
- """Handles listitem tags (li)."""
308
+ """Handles list item tags (li)."""
235
309
  nested_list = element.find(["ul", "ol"])
236
310
 
237
311
  parent = self.parents[self.level]
238
312
  if parent is None:
239
- _log.warning(f"list-item has no parent in DoclingDocument: {element}")
313
+ _log.debug(f"list-item has no parent in DoclingDocument: {element}")
240
314
  return
241
315
  parent_label: str = parent.label
242
316
  index_in_list = len(parent.children) + 1
317
+ if (
318
+ parent_label == GroupLabel.ORDERED_LIST
319
+ and isinstance(parent, GroupItem)
320
+ and parent.name
321
+ ):
322
+ start_in_list: str = parent.name.split(" ")[-1]
323
+ start: int = int(start_in_list) if start_in_list.isnumeric() else 1
324
+ index_in_list += start - 1
243
325
 
244
326
  if nested_list:
245
327
  # Text in list item can be hidden within hierarchy, hence
@@ -262,6 +344,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
262
344
  enumerated=enumerated,
263
345
  marker=marker,
264
346
  parent=parent,
347
+ content_layer=self.content_layer,
265
348
  )
266
349
  self.level += 1
267
350
 
@@ -283,15 +366,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
283
366
  enumerated=enumerated,
284
367
  marker=marker,
285
368
  parent=parent,
369
+ content_layer=self.content_layer,
286
370
  )
287
371
  else:
288
- _log.warning(f"list-item has no text: {element}")
372
+ _log.debug(f"list-item has no text: {element}")
289
373
 
290
374
  @staticmethod
291
375
  def parse_table_data(element: Tag) -> Optional[TableData]:
292
376
  nested_tables = element.find("table")
293
377
  if nested_tables is not None:
294
- _log.warning("Skipping nested table.")
378
+ _log.debug("Skipping nested table.")
295
379
  return None
296
380
 
297
381
  # Count the number of rows (number of <tr> elements)
@@ -386,7 +470,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
386
470
  table_data = HTMLDocumentBackend.parse_table_data(element)
387
471
 
388
472
  if table_data is not None:
389
- doc.add_table(data=table_data, parent=self.parents[self.level])
473
+ doc.add_table(
474
+ data=table_data,
475
+ parent=self.parents[self.level],
476
+ content_layer=self.content_layer,
477
+ )
390
478
 
391
479
  def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
392
480
  """Recursively extract text from <ul> or <ol> with proper indentation."""
@@ -426,20 +514,33 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
426
514
 
427
515
  contains_captions = element.find(["figcaption"])
428
516
  if not isinstance(contains_captions, Tag):
429
- doc.add_picture(parent=self.parents[self.level], caption=None)
517
+ doc.add_picture(
518
+ parent=self.parents[self.level],
519
+ caption=None,
520
+ content_layer=self.content_layer,
521
+ )
430
522
  else:
431
523
  texts = []
432
524
  for item in contains_captions:
433
525
  texts.append(item.text)
434
526
 
435
527
  fig_caption = doc.add_text(
436
- label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
528
+ label=DocItemLabel.CAPTION,
529
+ text=("".join(texts)).strip(),
530
+ content_layer=self.content_layer,
437
531
  )
438
532
  doc.add_picture(
439
533
  parent=self.parents[self.level],
440
534
  caption=fig_caption,
535
+ content_layer=self.content_layer,
441
536
  )
442
537
 
443
- def handle_image(self, doc: DoclingDocument) -> None:
538
+ def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
444
539
  """Handles image tags (img)."""
445
- doc.add_picture(parent=self.parents[self.level], caption=None)
540
+ _log.debug(f"ignoring <img> tags at the moment: {element}")
541
+
542
+ doc.add_picture(
543
+ parent=self.parents[self.level],
544
+ caption=None,
545
+ content_layer=self.content_layer,
546
+ )
@@ -13,6 +13,7 @@ from pypdfium2._helpers.misc import PdfiumError
13
13
 
14
14
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
15
15
  from docling.datamodel.base_models import Cell
16
+ from docling.utils.locks import pypdfium2_lock
16
17
 
17
18
  if TYPE_CHECKING:
18
19
  from docling.datamodel.document import InputDocument
@@ -24,6 +25,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
24
25
  def __init__(
25
26
  self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
26
27
  ):
28
+ # Note: lock applied by the caller
27
29
  self.valid = True # No better way to tell from pypdfium.
28
30
  try:
29
31
  self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
@@ -40,51 +42,57 @@ class PyPdfiumPageBackend(PdfPageBackend):
40
42
 
41
43
  def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
42
44
  AREA_THRESHOLD = 0 # 32 * 32
43
- for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
44
- pos = obj.get_pos()
45
- cropbox = BoundingBox.from_tuple(
46
- pos, origin=CoordOrigin.BOTTOMLEFT
47
- ).to_top_left_origin(page_height=self.get_size().height)
45
+ page_size = self.get_size()
46
+ with pypdfium2_lock:
47
+ for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
48
+ pos = obj.get_pos()
49
+ cropbox = BoundingBox.from_tuple(
50
+ pos, origin=CoordOrigin.BOTTOMLEFT
51
+ ).to_top_left_origin(page_height=page_size.height)
48
52
 
49
- if cropbox.area() > AREA_THRESHOLD:
50
- cropbox = cropbox.scaled(scale=scale)
53
+ if cropbox.area() > AREA_THRESHOLD:
54
+ cropbox = cropbox.scaled(scale=scale)
51
55
 
52
- yield cropbox
56
+ yield cropbox
53
57
 
54
58
  def get_text_in_rect(self, bbox: BoundingBox) -> str:
55
- if not self.text_page:
56
- self.text_page = self._ppage.get_textpage()
59
+ with pypdfium2_lock:
60
+ if not self.text_page:
61
+ self.text_page = self._ppage.get_textpage()
57
62
 
58
63
  if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
59
64
  bbox = bbox.to_bottom_left_origin(self.get_size().height)
60
65
 
61
- text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
66
+ with pypdfium2_lock:
67
+ text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
62
68
 
63
69
  return text_piece
64
70
 
65
71
  def get_text_cells(self) -> Iterable[Cell]:
66
- if not self.text_page:
67
- self.text_page = self._ppage.get_textpage()
72
+ with pypdfium2_lock:
73
+ if not self.text_page:
74
+ self.text_page = self._ppage.get_textpage()
68
75
 
69
76
  cells = []
70
77
  cell_counter = 0
71
78
 
72
79
  page_size = self.get_size()
73
80
 
74
- for i in range(self.text_page.count_rects()):
75
- rect = self.text_page.get_rect(i)
76
- text_piece = self.text_page.get_text_bounded(*rect)
77
- x0, y0, x1, y1 = rect
78
- cells.append(
79
- Cell(
80
- id=cell_counter,
81
- text=text_piece,
82
- bbox=BoundingBox(
83
- l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
84
- ).to_top_left_origin(page_size.height),
81
+ with pypdfium2_lock:
82
+ for i in range(self.text_page.count_rects()):
83
+ rect = self.text_page.get_rect(i)
84
+ text_piece = self.text_page.get_text_bounded(*rect)
85
+ x0, y0, x1, y1 = rect
86
+ cells.append(
87
+ Cell(
88
+ id=cell_counter,
89
+ text=text_piece,
90
+ bbox=BoundingBox(
91
+ l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
92
+ ).to_top_left_origin(page_size.height),
93
+ )
85
94
  )
86
- )
87
- cell_counter += 1
95
+ cell_counter += 1
88
96
 
89
97
  # PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
90
98
  # The cell merging code below is to clean this up.
@@ -214,20 +222,24 @@ class PyPdfiumPageBackend(PdfPageBackend):
214
222
  padbox.r = page_size.width - padbox.r
215
223
  padbox.t = page_size.height - padbox.t
216
224
 
217
- image = (
218
- self._ppage.render(
219
- scale=scale * 1.5,
220
- rotation=0, # no additional rotation
221
- crop=padbox.as_tuple(),
222
- )
223
- .to_pil()
224
- .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
225
- ) # We resize the image from 1.5x the given scale to make it sharper.
225
+ with pypdfium2_lock:
226
+ image = (
227
+ self._ppage.render(
228
+ scale=scale * 1.5,
229
+ rotation=0, # no additional rotation
230
+ crop=padbox.as_tuple(),
231
+ )
232
+ .to_pil()
233
+ .resize(
234
+ size=(round(cropbox.width * scale), round(cropbox.height * scale))
235
+ )
236
+ ) # We resize the image from 1.5x the given scale to make it sharper.
226
237
 
227
238
  return image
228
239
 
229
240
  def get_size(self) -> Size:
230
- return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
241
+ with pypdfium2_lock:
242
+ return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
231
243
 
232
244
  def unload(self):
233
245
  self._ppage = None
@@ -239,22 +251,26 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
239
251
  super().__init__(in_doc, path_or_stream)
240
252
 
241
253
  try:
242
- self._pdoc = pdfium.PdfDocument(self.path_or_stream)
254
+ with pypdfium2_lock:
255
+ self._pdoc = pdfium.PdfDocument(self.path_or_stream)
243
256
  except PdfiumError as e:
244
257
  raise RuntimeError(
245
258
  f"pypdfium could not load document with hash {self.document_hash}"
246
259
  ) from e
247
260
 
248
261
  def page_count(self) -> int:
249
- return len(self._pdoc)
262
+ with pypdfium2_lock:
263
+ return len(self._pdoc)
250
264
 
251
265
  def load_page(self, page_no: int) -> PyPdfiumPageBackend:
252
- return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
266
+ with pypdfium2_lock:
267
+ return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
253
268
 
254
269
  def is_valid(self) -> bool:
255
270
  return self.page_count() > 0
256
271
 
257
272
  def unload(self):
258
273
  super().unload()
259
- self._pdoc.close()
260
- self._pdoc = None
274
+ with pypdfium2_lock:
275
+ self._pdoc.close()
276
+ self._pdoc = None
@@ -32,9 +32,19 @@ class _AvailableModels(str, Enum):
32
32
  CODE_FORMULA = "code_formula"
33
33
  PICTURE_CLASSIFIER = "picture_classifier"
34
34
  SMOLVLM = "smolvlm"
35
+ GRANITE_VISION = "granite_vision"
35
36
  EASYOCR = "easyocr"
36
37
 
37
38
 
39
+ _default_models = [
40
+ _AvailableModels.LAYOUT,
41
+ _AvailableModels.TABLEFORMER,
42
+ _AvailableModels.CODE_FORMULA,
43
+ _AvailableModels.PICTURE_CLASSIFIER,
44
+ _AvailableModels.EASYOCR,
45
+ ]
46
+
47
+
38
48
  @app.command("download")
39
49
  def download(
40
50
  output_dir: Annotated[
@@ -43,18 +53,27 @@ def download(
43
53
  ...,
44
54
  "-o",
45
55
  "--output-dir",
46
- help="The directory where all the models are downloaded.",
56
+ help="The directory where to download the models.",
47
57
  ),
48
58
  ] = (settings.cache_dir / "models"),
49
59
  force: Annotated[
50
- bool, typer.Option(..., help="If true, the download will be forced")
60
+ bool, typer.Option(..., help="If true, the download will be forced.")
51
61
  ] = False,
52
62
  models: Annotated[
53
63
  Optional[list[_AvailableModels]],
54
64
  typer.Argument(
55
- help=f"Models to download (default behavior: all will be downloaded)",
65
+ help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
56
66
  ),
57
67
  ] = None,
68
+ all: Annotated[
69
+ bool,
70
+ typer.Option(
71
+ ...,
72
+ "--all",
73
+ help="If true, all available models will be downloaded (mutually exclusive with passing specific models).",
74
+ show_default=True,
75
+ ),
76
+ ] = False,
58
77
  quiet: Annotated[
59
78
  bool,
60
79
  typer.Option(
@@ -65,6 +84,10 @@ def download(
65
84
  ),
66
85
  ] = False,
67
86
  ):
87
+ if models and all:
88
+ raise typer.BadParameter(
89
+ "Cannot simultaneously set 'all' parameter and specify models to download."
90
+ )
68
91
  if not quiet:
69
92
  FORMAT = "%(message)s"
70
93
  logging.basicConfig(
@@ -73,7 +96,7 @@ def download(
73
96
  datefmt="[%X]",
74
97
  handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
75
98
  )
76
- to_download = models or [m for m in _AvailableModels]
99
+ to_download = models or ([m for m in _AvailableModels] if all else _default_models)
77
100
  output_dir = download_models(
78
101
  output_dir=output_dir,
79
102
  force=force,
@@ -83,6 +106,7 @@ def download(
83
106
  with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
84
107
  with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
85
108
  with_smolvlm=_AvailableModels.SMOLVLM in to_download,
109
+ with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
86
110
  with_easyocr=_AvailableModels.EASYOCR in to_download,
87
111
  )
88
112