docling 2.56.1__py3-none-any.whl → 2.58.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Any, Optional, Union, cast
4
+ from typing import Annotated, Any, Optional, Union, cast
5
5
 
6
6
  from docling_core.types.doc import (
7
7
  BoundingBox,
@@ -23,7 +23,8 @@ from openpyxl.drawing.image import Image
23
23
  from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
24
24
  from openpyxl.worksheet.worksheet import Worksheet
25
25
  from PIL import Image as PILImage
26
- from pydantic import BaseModel, NonNegativeInt, PositiveInt
26
+ from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt
27
+ from pydantic.dataclasses import dataclass
27
28
  from typing_extensions import override
28
29
 
29
30
  from docling.backend.abstract_backend import (
@@ -36,6 +37,32 @@ from docling.datamodel.document import InputDocument
36
37
  _log = logging.getLogger(__name__)
37
38
 
38
39
 
40
@dataclass
class DataRegion:
    """Rectangular span of worksheet cells, given by its outermost rows and columns.

    All four indices are 1-based, as stated in the field descriptions, and
    both edges are counted when measuring the region (see ``width`` and
    ``height``).
    """

    min_row: Annotated[
        PositiveInt, Field(description="Smallest row index (1-based index).")
    ]
    max_row: Annotated[
        PositiveInt, Field(description="Largest row index (1-based index).")
    ]
    min_col: Annotated[
        PositiveInt, Field(description="Smallest column index (1-based index).")
    ]
    max_col: Annotated[
        PositiveInt, Field(description="Largest column index (1-based index).")
    ]

    def width(self) -> PositiveInt:
        """Return the column count of the region, counting both edge columns."""
        first_col, last_col = self.min_col, self.max_col
        return last_col - first_col + 1

    def height(self) -> PositiveInt:
        """Return the row count of the region, counting both edge rows."""
        first_row, last_row = self.min_row, self.max_row
        return last_row - first_row + 1
64
+
65
+
39
66
  class ExcelCell(BaseModel):
40
67
  """Represents an Excel cell.
41
68
 
@@ -294,6 +321,48 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
294
321
 
295
322
  return doc
296
323
 
324
def _find_true_data_bounds(self, sheet: Worksheet) -> DataRegion:
    """Find the smallest rectangle that contains all data in a worksheet.

    Every cell with a non-``None`` value and every merged cell range
    contributes to the rectangle, so the result bounds the area that
    actually holds content.

    Args:
        sheet: The worksheet to analyze.

    Returns:
        A ``DataRegion`` holding the minimal and maximal row/column indices
        of the occupied area. If the sheet has neither values nor merged
        ranges, the region collapses to the single cell at row 1, column 1.
    """

    def occupied_boxes():
        # Yield (top, left, bottom, right) for every occupied area so that
        # single cells and merged ranges share one accumulation loop.
        # NOTE(review): ``_cells`` is a private worksheet attribute;
        # presumably it holds only cells that were materialized - confirm
        # against the openpyxl version in use.
        for cell in sheet._cells.values():
            if cell.value is not None:
                yield cell.row, cell.column, cell.row, cell.column
        for merged in sheet.merged_cells.ranges:
            yield merged.min_row, merged.min_col, merged.max_row, merged.max_col

    region = None
    for top, left, bottom, right in occupied_boxes():
        if region is None:
            region = (top, left, bottom, right)
        else:
            region = (
                min(region[0], top),
                min(region[1], left),
                max(region[2], bottom),
                max(region[3], right),
            )

    # Nothing occupied: fall back to the single top-left cell.
    if region is None:
        region = (1, 1, 1, 1)

    min_row, min_col, max_row, max_col = region
    # Keyword arguments keep the row/column order from being mixed up.
    return DataRegion(
        min_row=min_row,
        max_row=max_row,
        min_col=min_col,
        max_col=max_col,
    )
365
+
297
366
  def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
298
367
  """Find all compact rectangular data tables in an Excel worksheet.
299
368
 
@@ -303,18 +372,31 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
303
372
  Returns:
304
373
  A list of ExcelTable objects representing the data tables.
305
374
  """
375
+ bounds: DataRegion = self._find_true_data_bounds(
376
+ sheet
377
+ ) # The true data boundaries
306
378
  tables: list[ExcelTable] = [] # List to store found tables
307
379
  visited: set[tuple[int, int]] = set() # Track already visited cells
308
380
 
309
- # Iterate over all cells in the sheet
310
- for ri, row in enumerate(sheet.iter_rows(values_only=False)):
311
- for rj, cell in enumerate(row):
312
- # Skip empty or already visited cells
381
+ # Limit scan to actual data bounds
382
+ for ri, row in enumerate(
383
+ sheet.iter_rows(
384
+ min_row=bounds.min_row,
385
+ max_row=bounds.max_row,
386
+ min_col=bounds.min_col,
387
+ max_col=bounds.max_col,
388
+ values_only=False,
389
+ ),
390
+ start=bounds.min_row - 1,
391
+ ):
392
+ for rj, cell in enumerate(row, start=bounds.min_col - 1):
313
393
  if cell.value is None or (ri, rj) in visited:
314
394
  continue
315
395
 
316
396
  # If the cell starts a new table, find its bounds
317
- table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj)
397
+ table_bounds, visited_cells = self._find_table_bounds(
398
+ sheet, ri, rj, bounds.max_row, bounds.max_col
399
+ )
318
400
 
319
401
  visited.update(visited_cells) # Mark these cells as visited
320
402
  tables.append(table_bounds)
@@ -326,6 +408,8 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
326
408
  sheet: Worksheet,
327
409
  start_row: int,
328
410
  start_col: int,
411
+ max_row: int,
412
+ max_col: int,
329
413
  ) -> tuple[ExcelTable, set[tuple[int, int]]]:
330
414
  """Determine the bounds of a compact rectangular table.
331
415
 
@@ -333,14 +417,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
333
417
  sheet: The Excel worksheet to be parsed.
334
418
  start_row: The row number of the starting cell.
335
419
  start_col: The column number of the starting cell.
420
+ max_row: Maximum row boundary from true data bounds.
421
+ max_col: Maximum column boundary from true data bounds.
336
422
 
337
423
  Returns:
338
424
  A tuple with an Excel table and a set of cell coordinates.
339
425
  """
340
426
  _log.debug("find_table_bounds")
341
427
 
342
- max_row = self._find_table_bottom(sheet, start_row, start_col)
343
- max_col = self._find_table_right(sheet, start_row, start_col)
428
+ table_max_row = self._find_table_bottom(sheet, start_row, start_col, max_row)
429
+ table_max_col = self._find_table_right(sheet, start_row, start_col, max_col)
344
430
 
345
431
  # Collect the data within the bounds
346
432
  data = []
@@ -348,9 +434,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
348
434
  for ri, row in enumerate(
349
435
  sheet.iter_rows(
350
436
  min_row=start_row + 1, # start_row is 0-based but iter_rows is 1-based
351
- max_row=max_row + 1,
437
+ max_row=table_max_row + 1,
352
438
  min_col=start_col + 1,
353
- max_col=max_col + 1,
439
+ max_col=table_max_col + 1,
354
440
  values_only=False,
355
441
  ),
356
442
  start_row,
@@ -390,15 +476,15 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
390
476
  return (
391
477
  ExcelTable(
392
478
  anchor=(start_col, start_row),
393
- num_rows=max_row + 1 - start_row,
394
- num_cols=max_col + 1 - start_col,
479
+ num_rows=table_max_row + 1 - start_row,
480
+ num_cols=table_max_col + 1 - start_col,
395
481
  data=data,
396
482
  ),
397
483
  visited_cells,
398
484
  )
399
485
 
400
486
  def _find_table_bottom(
401
- self, sheet: Worksheet, start_row: int, start_col: int
487
+ self, sheet: Worksheet, start_row: int, start_col: int, max_row: int
402
488
  ) -> int:
403
489
  """Find the bottom boundary of a table.
404
490
 
@@ -406,16 +492,17 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
406
492
  sheet: The Excel worksheet to be parsed.
407
493
  start_row: The starting row of the table.
408
494
  start_col: The starting column of the table.
495
+ max_row: Maximum row boundary from true data bounds.
409
496
 
410
497
  Returns:
411
498
  The row index representing the bottom boundary of the table.
412
499
  """
413
- max_row: int = start_row
500
+ table_max_row: int = start_row
414
501
 
415
502
  for ri, (cell,) in enumerate(
416
503
  sheet.iter_rows(
417
504
  min_row=start_row + 2,
418
- max_row=sheet.max_row,
505
+ max_row=max_row,
419
506
  min_col=start_col + 1,
420
507
  max_col=start_col + 1,
421
508
  values_only=False,
@@ -431,16 +518,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
431
518
  if cell.value is None and not merged_range:
432
519
  break # Stop if the cell is empty and not merged
433
520
 
434
- # Expand max_row to include the merged range if applicable
521
+ # Expand table_max_row to include the merged range if applicable
435
522
  if merged_range:
436
- max_row = max(max_row, merged_range.max_row - 1)
523
+ table_max_row = max(table_max_row, merged_range.max_row - 1)
437
524
  else:
438
- max_row = ri
525
+ table_max_row = ri
439
526
 
440
- return max_row
527
+ return table_max_row
441
528
 
442
529
  def _find_table_right(
443
- self, sheet: Worksheet, start_row: int, start_col: int
530
+ self, sheet: Worksheet, start_row: int, start_col: int, max_col: int
444
531
  ) -> int:
445
532
  """Find the right boundary of a table.
446
533
 
@@ -448,18 +535,19 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
448
535
  sheet: The Excel worksheet to be parsed.
449
536
  start_row: The starting row of the table.
450
537
  start_col: The starting column of the table.
538
+ max_col: Maximum column boundary from true data bounds.
451
539
 
452
540
  Returns:
453
541
  The column index representing the right boundary of the table.
454
542
  """
455
- max_col: int = start_col
543
+ table_max_col: int = start_col
456
544
 
457
545
  for rj, (cell,) in enumerate(
458
546
  sheet.iter_cols(
459
547
  min_row=start_row + 1,
460
548
  max_row=start_row + 1,
461
549
  min_col=start_col + 2,
462
- max_col=sheet.max_column,
550
+ max_col=max_col,
463
551
  values_only=False,
464
552
  ),
465
553
  start_col + 1,
@@ -473,13 +561,13 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
473
561
  if cell.value is None and not merged_range:
474
562
  break # Stop if the cell is empty and not merged
475
563
 
476
- # Expand max_col to include the merged range if applicable
564
+ # Expand table_max_col to include the merged range if applicable
477
565
  if merged_range:
478
- max_col = max(max_col, merged_range.max_col - 1)
566
+ table_max_col = max(table_max_col, merged_range.max_col - 1)
479
567
  else:
480
- max_col = rj
568
+ table_max_col = rj
481
569
 
482
- return max_col
570
+ return table_max_col
483
571
 
484
572
  def _find_images_in_sheet(
485
573
  self, doc: DoclingDocument, sheet: Worksheet
@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Set, Union
4
+ from typing import Union
5
5
 
6
6
  from docling_core.types.doc import (
7
7
  BoundingBox,
@@ -80,7 +80,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
80
80
  self.path_or_stream = None
81
81
 
82
82
  @classmethod
83
- def supported_formats(cls) -> Set[InputFormat]:
83
+ def supported_formats(cls) -> set[InputFormat]:
84
84
  return {InputFormat.PPTX}
85
85
 
86
86
  def convert(self) -> DoclingDocument:
@@ -1,8 +1,9 @@
1
1
  import logging
2
2
  import re
3
+ from copy import deepcopy
3
4
  from io import BytesIO
4
5
  from pathlib import Path
5
- from typing import Any, List, Optional, Union
6
+ from typing import Any, Callable, Optional, Union
6
7
 
7
8
  from docling_core.types.doc import (
8
9
  DocItemLabel,
@@ -33,6 +34,11 @@ from pydantic import AnyUrl
33
34
  from typing_extensions import override
34
35
 
35
36
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
37
+ from docling.backend.docx.drawingml.utils import (
38
+ get_docx_to_pdf_converter,
39
+ get_libreoffice_cmd,
40
+ get_pil_from_dml_docx,
41
+ )
36
42
  from docling.backend.docx.latex.omml import oMath2Latex
37
43
  from docling.datamodel.base_models import InputFormat
38
44
  from docling.datamodel.document import InputDocument
@@ -63,7 +69,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
63
69
  self.numbered_headers: dict[int, int] = {}
64
70
  self.equation_bookends: str = "<eq>{EQ}</eq>"
65
71
  # Track processed textbox elements to avoid duplication
66
- self.processed_textbox_elements: List[int] = []
72
+ self.processed_textbox_elements: list[int] = []
73
+ self.docx_to_pdf_converter: Optional[Callable] = None
74
+ self.docx_to_pdf_converter_init = False
75
+ self.display_drawingml_warning = True
67
76
 
68
77
  for i in range(-1, self.max_levels):
69
78
  self.parents[i] = None
@@ -80,18 +89,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
80
89
  "indents": [None],
81
90
  }
82
91
 
83
- self.docx_obj = None
84
- try:
85
- if isinstance(self.path_or_stream, BytesIO):
86
- self.docx_obj = Document(self.path_or_stream)
87
- elif isinstance(self.path_or_stream, Path):
88
- self.docx_obj = Document(str(self.path_or_stream))
89
-
92
+ self.docx_obj = self.load_msword_file(
93
+ path_or_stream=self.path_or_stream, document_hash=self.document_hash
94
+ )
95
+ if self.docx_obj:
90
96
  self.valid = True
91
- except Exception as e:
92
- raise RuntimeError(
93
- f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
94
- ) from e
95
97
 
96
98
  @override
97
99
  def is_valid(self) -> bool:
@@ -139,6 +141,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
139
141
  f"Cannot convert doc with {self.document_hash} because the backend failed to init."
140
142
  )
141
143
 
144
+ @staticmethod
145
+ def load_msword_file(
146
+ path_or_stream: Union[BytesIO, Path], document_hash: str
147
+ ) -> DocxDocument:
148
+ try:
149
+ if isinstance(path_or_stream, BytesIO):
150
+ return Document(path_or_stream)
151
+ elif isinstance(path_or_stream, Path):
152
+ return Document(str(path_or_stream))
153
+ else:
154
+ return None
155
+ except Exception as e:
156
+ raise RuntimeError(
157
+ f"MsWordDocumentBackend could not load document with hash {document_hash}"
158
+ ) from e
159
+
142
160
  def _update_history(
143
161
  self,
144
162
  name: str,
@@ -195,6 +213,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
195
213
  }
196
214
  xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
197
215
  drawing_blip = xpath_expr(element)
216
+ drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)
198
217
 
199
218
  # Check for textbox content - check multiple textbox formats
200
219
  # Only process if the element hasn't been processed before
@@ -274,6 +293,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
274
293
  ):
275
294
  te1 = self._handle_text_elements(element, docx_obj, doc)
276
295
  added_elements.extend(te1)
296
+ # Check for DrawingML elements
297
+ elif drawingml_els:
298
+ if (
299
+ self.docx_to_pdf_converter is None
300
+ and self.docx_to_pdf_converter_init is False
301
+ ):
302
+ self.docx_to_pdf_converter = get_docx_to_pdf_converter()
303
+ self.docx_to_pdf_converter_init = True
304
+
305
+ if self.docx_to_pdf_converter is None:
306
+ if self.display_drawingml_warning:
307
+ if self.docx_to_pdf_converter is None:
308
+ _log.warning(
309
+ "Found DrawingML elements in document, but no DOCX to PDF converters. "
310
+ "If you want these exported, make sure you have "
311
+ "LibreOffice binary in PATH or specify its path with DOCLING_LIBREOFFICE_CMD."
312
+ )
313
+ self.display_drawingml_warning = False
314
+ else:
315
+ self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
277
316
  # Check for the sdt containers, like table of contents
278
317
  elif tag_name in ["sdt"]:
279
318
  sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -687,8 +726,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
687
726
  textbox_elements: list,
688
727
  docx_obj: DocxDocument,
689
728
  doc: DoclingDocument,
690
- ) -> List[RefItem]:
691
- elem_ref: List[RefItem] = []
729
+ ) -> list[RefItem]:
730
+ elem_ref: list[RefItem] = []
692
731
  """Process textbox content and add it to the document structure."""
693
732
  level = self._get_level()
694
733
  # Create a textbox group to contain all text from the textbox
@@ -817,8 +856,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
817
856
  element: BaseOxmlElement,
818
857
  docx_obj: DocxDocument,
819
858
  doc: DoclingDocument,
820
- ) -> List[RefItem]:
821
- elem_ref: List[RefItem] = []
859
+ ) -> list[RefItem]:
860
+ elem_ref: list[RefItem] = []
822
861
  paragraph = Paragraph(element, docx_obj)
823
862
  paragraph_elements = self._get_paragraph_elements(paragraph)
824
863
  text, equations = self._handle_equations_in_text(
@@ -993,8 +1032,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
993
1032
  curr_level: Optional[int],
994
1033
  text: str,
995
1034
  is_numbered_style: bool = False,
996
- ) -> List[RefItem]:
997
- elem_ref: List[RefItem] = []
1035
+ ) -> list[RefItem]:
1036
+ elem_ref: list[RefItem] = []
998
1037
  level = self._get_level()
999
1038
  if isinstance(curr_level, int):
1000
1039
  if curr_level > level:
@@ -1063,8 +1102,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1063
1102
  marker: str,
1064
1103
  enumerated: bool,
1065
1104
  level: int,
1066
- ) -> List[RefItem]:
1067
- elem_ref: List[RefItem] = []
1105
+ ) -> list[RefItem]:
1106
+ elem_ref: list[RefItem] = []
1068
1107
  # This should not happen by construction
1069
1108
  if not isinstance(self.parents[level], ListGroup):
1070
1109
  return elem_ref
@@ -1109,8 +1148,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1109
1148
  ilevel: int,
1110
1149
  elements: list,
1111
1150
  is_numbered: bool = False,
1112
- ) -> List[RefItem]:
1113
- elem_ref: List[RefItem] = []
1151
+ ) -> list[RefItem]:
1152
+ elem_ref: list[RefItem] = []
1114
1153
  # this method is always called with is_numbered. Numbered lists should be properly addressed.
1115
1154
  if not elements:
1116
1155
  return elem_ref
@@ -1205,8 +1244,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1205
1244
  element: BaseOxmlElement,
1206
1245
  docx_obj: DocxDocument,
1207
1246
  doc: DoclingDocument,
1208
- ) -> List[RefItem]:
1209
- elem_ref: List[RefItem] = []
1247
+ ) -> list[RefItem]:
1248
+ elem_ref: list[RefItem] = []
1210
1249
  table: Table = Table(element, docx_obj)
1211
1250
  num_rows = len(table.rows)
1212
1251
  num_cols = len(table.columns)
@@ -1260,13 +1299,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1260
1299
  else:
1261
1300
  text = text.replace("<eq>", "$").replace("</eq>", "$")
1262
1301
 
1263
- provs_in_cell: List[RefItem] = []
1302
+ provs_in_cell: list[RefItem] = []
1264
1303
  _, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
1265
1304
  ref_for_rich_cell = provs_in_cell[0]
1266
1305
  rich_table_cell = False
1267
1306
 
1268
1307
  def group_cell_elements(
1269
- group_name: str, doc: DoclingDocument, provs_in_cell: List[RefItem]
1308
+ group_name: str, doc: DoclingDocument, provs_in_cell: list[RefItem]
1270
1309
  ) -> RefItem:
1271
1310
  group_element = doc.add_group(
1272
1311
  label=GroupLabel.UNSPECIFIED,
@@ -1340,7 +1379,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1340
1379
 
1341
1380
  def _handle_pictures(
1342
1381
  self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
1343
- ) -> List[RefItem]:
1382
+ ) -> list[RefItem]:
1344
1383
  def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
1345
1384
  image_data: Optional[bytes] = None
1346
1385
  rId = drawing_blip[0].get(
@@ -1352,7 +1391,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1352
1391
  image_data = image_part.blob # Get the binary image data
1353
1392
  return image_data
1354
1393
 
1355
- elem_ref: List[RefItem] = []
1394
+ elem_ref: list[RefItem] = []
1356
1395
  level = self._get_level()
1357
1396
  # Open the BytesIO object with PIL to create an Image
1358
1397
  image_data: Optional[bytes] = get_docx_image(drawing_blip)
@@ -1381,3 +1420,39 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1381
1420
  )
1382
1421
  elem_ref.append(p3.get_ref())
1383
1422
  return elem_ref
1423
+
1424
+ def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any):
1425
+ # 1) Make an empty copy of the original document
1426
+ dml_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
1427
+ body = dml_doc._element.body
1428
+ for child in list(body):
1429
+ body.remove(child)
1430
+
1431
+ # 2) Add DrawingML to empty document
1432
+ new_para = dml_doc.add_paragraph()
1433
+ new_r = new_para.add_run()
1434
+ for dml in drawingml_els:
1435
+ new_r._r.append(deepcopy(dml))
1436
+
1437
+ # 3) Export DOCX->PDF->PNG and save it in DoclingDocument
1438
+ level = self._get_level()
1439
+ try:
1440
+ pil_image = get_pil_from_dml_docx(
1441
+ dml_doc, converter=self.docx_to_pdf_converter
1442
+ )
1443
+ if pil_image is None:
1444
+ raise UnidentifiedImageError
1445
+
1446
+ doc.add_picture(
1447
+ parent=self.parents[level - 1],
1448
+ image=ImageRef.from_pil(image=pil_image, dpi=72),
1449
+ caption=None,
1450
+ )
1451
+ except (UnidentifiedImageError, OSError):
1452
+ _log.warning("Warning: DrawingML image cannot be loaded by Pillow")
1453
+ doc.add_picture(
1454
+ parent=self.parents[level - 1],
1455
+ caption=None,
1456
+ )
1457
+
1458
+ return
@@ -9,6 +9,7 @@ from docling_core.types.doc.page import SegmentedPdfPage, TextCell
9
9
  from PIL import Image
10
10
 
11
11
  from docling.backend.abstract_backend import PaginatedDocumentBackend
12
+ from docling.datamodel.backend_options import PdfBackendOptions
12
13
  from docling.datamodel.base_models import InputFormat
13
14
  from docling.datamodel.document import InputDocument
14
15
 
@@ -50,8 +51,14 @@ class PdfPageBackend(ABC):
50
51
 
51
52
 
52
53
  class PdfDocumentBackend(PaginatedDocumentBackend):
53
- def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
54
- super().__init__(in_doc, path_or_stream)
54
+ def __init__(
55
+ self,
56
+ in_doc: InputDocument,
57
+ path_or_stream: Union[BytesIO, Path],
58
+ options: PdfBackendOptions = PdfBackendOptions(),
59
+ ):
60
+ super().__init__(in_doc, path_or_stream, options)
61
+ self.options: PdfBackendOptions
55
62
 
56
63
  if self.input_format is not InputFormat.PDF:
57
64
  if self.input_format is InputFormat.IMAGE:
@@ -20,6 +20,7 @@ from pypdfium2 import PdfTextPage
20
20
  from pypdfium2._helpers.misc import PdfiumError
21
21
 
22
22
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
23
+ from docling.datamodel.backend_options import PdfBackendOptions
23
24
  from docling.utils.locks import pypdfium2_lock
24
25
 
25
26
 
@@ -370,12 +371,20 @@ class PyPdfiumPageBackend(PdfPageBackend):
370
371
 
371
372
 
372
373
  class PyPdfiumDocumentBackend(PdfDocumentBackend):
373
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
374
- super().__init__(in_doc, path_or_stream)
374
+ def __init__(
375
+ self,
376
+ in_doc: "InputDocument",
377
+ path_or_stream: Union[BytesIO, Path],
378
+ options: PdfBackendOptions = PdfBackendOptions(),
379
+ ):
380
+ super().__init__(in_doc, path_or_stream, options)
375
381
 
382
+ password = (
383
+ self.options.password.get_secret_value() if self.options.password else None
384
+ )
376
385
  try:
377
386
  with pypdfium2_lock:
378
- self._pdoc = pdfium.PdfDocument(self.path_or_stream)
387
+ self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
379
388
  except PdfiumError as e:
380
389
  raise RuntimeError(
381
390
  f"pypdfium could not load document with hash {self.document_hash}"