docling-2.57.0-py3-none-any.whl → docling-2.59.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (35)
  1. docling/backend/abstract_backend.py +24 -3
  2. docling/backend/asciidoc_backend.py +3 -3
  3. docling/backend/docling_parse_v4_backend.py +15 -4
  4. docling/backend/html_backend.py +130 -20
  5. docling/backend/md_backend.py +27 -5
  6. docling/backend/msexcel_backend.py +121 -29
  7. docling/backend/mspowerpoint_backend.py +2 -2
  8. docling/backend/msword_backend.py +18 -18
  9. docling/backend/pdf_backend.py +9 -2
  10. docling/backend/pypdfium2_backend.py +12 -3
  11. docling/cli/main.py +104 -38
  12. docling/datamodel/asr_model_specs.py +408 -6
  13. docling/datamodel/backend_options.py +82 -0
  14. docling/datamodel/base_models.py +19 -2
  15. docling/datamodel/document.py +81 -48
  16. docling/datamodel/pipeline_options_asr_model.py +21 -1
  17. docling/datamodel/pipeline_options_vlm_model.py +1 -0
  18. docling/document_converter.py +37 -45
  19. docling/document_extractor.py +12 -11
  20. docling/models/api_vlm_model.py +5 -3
  21. docling/models/picture_description_vlm_model.py +5 -1
  22. docling/models/readingorder_model.py +6 -7
  23. docling/models/vlm_models_inline/hf_transformers_model.py +13 -3
  24. docling/models/vlm_models_inline/mlx_model.py +9 -3
  25. docling/models/vlm_models_inline/nuextract_transformers_model.py +13 -3
  26. docling/models/vlm_models_inline/vllm_model.py +42 -8
  27. docling/pipeline/asr_pipeline.py +149 -6
  28. docling/utils/api_image_request.py +20 -9
  29. docling/utils/layout_postprocessor.py +23 -24
  30. {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/METADATA +11 -8
  31. {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/RECORD +35 -34
  32. {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/WHEEL +0 -0
  33. {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/entry_points.txt +0 -0
  34. {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/licenses/LICENSE +0 -0
  35. {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Any, Optional, Union, cast
4
+ from typing import Annotated, Any, Optional, Union, cast
5
5
 
6
6
  from docling_core.types.doc import (
7
7
  BoundingBox,
@@ -23,7 +23,8 @@ from openpyxl.drawing.image import Image
23
23
  from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
24
24
  from openpyxl.worksheet.worksheet import Worksheet
25
25
  from PIL import Image as PILImage
26
- from pydantic import BaseModel, NonNegativeInt, PositiveInt
26
+ from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt
27
+ from pydantic.dataclasses import dataclass
27
28
  from typing_extensions import override
28
29
 
29
30
  from docling.backend.abstract_backend import (
@@ -36,6 +37,32 @@ from docling.datamodel.document import InputDocument
36
37
  _log = logging.getLogger(__name__)
37
38
 
38
39
 
40
+ @dataclass
41
+ class DataRegion:
42
+ """Represents the bounding rectangle of non-empty cells in a worksheet."""
43
+
44
+ min_row: Annotated[
45
+ PositiveInt, Field(description="Smallest row index (1-based index).")
46
+ ]
47
+ max_row: Annotated[
48
+ PositiveInt, Field(description="Largest row index (1-based index).")
49
+ ]
50
+ min_col: Annotated[
51
+ PositiveInt, Field(description="Smallest column index (1-based index).")
52
+ ]
53
+ max_col: Annotated[
54
+ PositiveInt, Field(description="Largest column index (1-based index).")
55
+ ]
56
+
57
+ def width(self) -> PositiveInt:
58
+ """Number of columns in the data region."""
59
+ return self.max_col - self.min_col + 1
60
+
61
+ def height(self) -> PositiveInt:
62
+ """Number of rows in the data region."""
63
+ return self.max_row - self.min_row + 1
64
+
65
+
39
66
  class ExcelCell(BaseModel):
40
67
  """Represents an Excel cell.
41
68
 
@@ -112,10 +139,14 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
112
139
  self.workbook = None
113
140
  try:
114
141
  if isinstance(self.path_or_stream, BytesIO):
115
- self.workbook = load_workbook(filename=self.path_or_stream)
142
+ self.workbook = load_workbook(
143
+ filename=self.path_or_stream, data_only=True
144
+ )
116
145
 
117
146
  elif isinstance(self.path_or_stream, Path):
118
- self.workbook = load_workbook(filename=str(self.path_or_stream))
147
+ self.workbook = load_workbook(
148
+ filename=str(self.path_or_stream), data_only=True
149
+ )
119
150
 
120
151
  self.valid = self.workbook is not None
121
152
  except Exception as e:
@@ -294,6 +325,48 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
294
325
 
295
326
  return doc
296
327
 
328
+ def _find_true_data_bounds(self, sheet: Worksheet) -> DataRegion:
329
+ """Find the true data boundaries (min/max rows and columns) in a worksheet.
330
+
331
+ This function scans all cells to find the smallest rectangular region that contains
332
+ all non-empty cells or merged cell ranges. It returns the minimal and maximal
333
+ row/column indices that bound the actual data region.
334
+
335
+ Args:
336
+ sheet: The worksheet to analyze.
337
+
338
+ Returns:
339
+ A data region representing the smallest rectangle that covers all data and merged cells.
340
+ If the sheet is empty, returns (1, 1, 1, 1) by default.
341
+ """
342
+ min_row, min_col = None, None
343
+ max_row, max_col = 0, 0
344
+
345
+ for cell in sheet._cells.values():
346
+ if cell.value is not None:
347
+ r, c = cell.row, cell.column
348
+ min_row = r if min_row is None else min(min_row, r)
349
+ min_col = c if min_col is None else min(min_col, c)
350
+ max_row = max(max_row, r)
351
+ max_col = max(max_col, c)
352
+
353
+ # Expand bounds to include merged cells
354
+ for merged in sheet.merged_cells.ranges:
355
+ min_row = (
356
+ merged.min_row if min_row is None else min(min_row, merged.min_row)
357
+ )
358
+ min_col = (
359
+ merged.min_col if min_col is None else min(min_col, merged.min_col)
360
+ )
361
+ max_row = max(max_row, merged.max_row)
362
+ max_col = max(max_col, merged.max_col)
363
+
364
+ # If no data found, default to (1, 1, 1, 1)
365
+ if min_row is None or min_col is None:
366
+ min_row = min_col = max_row = max_col = 1
367
+
368
+ return DataRegion(min_row, max_row, min_col, max_col)
369
+
297
370
  def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
298
371
  """Find all compact rectangular data tables in an Excel worksheet.
299
372
 
@@ -303,18 +376,31 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
303
376
  Returns:
304
377
  A list of ExcelTable objects representing the data tables.
305
378
  """
379
+ bounds: DataRegion = self._find_true_data_bounds(
380
+ sheet
381
+ ) # The true data boundaries
306
382
  tables: list[ExcelTable] = [] # List to store found tables
307
383
  visited: set[tuple[int, int]] = set() # Track already visited cells
308
384
 
309
- # Iterate over all cells in the sheet
310
- for ri, row in enumerate(sheet.iter_rows(values_only=False)):
311
- for rj, cell in enumerate(row):
312
- # Skip empty or already visited cells
385
+ # Limit scan to actual data bounds
386
+ for ri, row in enumerate(
387
+ sheet.iter_rows(
388
+ min_row=bounds.min_row,
389
+ max_row=bounds.max_row,
390
+ min_col=bounds.min_col,
391
+ max_col=bounds.max_col,
392
+ values_only=False,
393
+ ),
394
+ start=bounds.min_row - 1,
395
+ ):
396
+ for rj, cell in enumerate(row, start=bounds.min_col - 1):
313
397
  if cell.value is None or (ri, rj) in visited:
314
398
  continue
315
399
 
316
400
  # If the cell starts a new table, find its bounds
317
- table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj)
401
+ table_bounds, visited_cells = self._find_table_bounds(
402
+ sheet, ri, rj, bounds.max_row, bounds.max_col
403
+ )
318
404
 
319
405
  visited.update(visited_cells) # Mark these cells as visited
320
406
  tables.append(table_bounds)
@@ -326,6 +412,8 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
326
412
  sheet: Worksheet,
327
413
  start_row: int,
328
414
  start_col: int,
415
+ max_row: int,
416
+ max_col: int,
329
417
  ) -> tuple[ExcelTable, set[tuple[int, int]]]:
330
418
  """Determine the bounds of a compact rectangular table.
331
419
 
@@ -333,14 +421,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
333
421
  sheet: The Excel worksheet to be parsed.
334
422
  start_row: The row number of the starting cell.
335
423
  start_col: The column number of the starting cell.
424
+ max_row: Maximum row boundary from true data bounds.
425
+ max_col: Maximum column boundary from true data bounds.
336
426
 
337
427
  Returns:
338
428
  A tuple with an Excel table and a set of cell coordinates.
339
429
  """
340
430
  _log.debug("find_table_bounds")
341
431
 
342
- max_row = self._find_table_bottom(sheet, start_row, start_col)
343
- max_col = self._find_table_right(sheet, start_row, start_col)
432
+ table_max_row = self._find_table_bottom(sheet, start_row, start_col, max_row)
433
+ table_max_col = self._find_table_right(sheet, start_row, start_col, max_col)
344
434
 
345
435
  # Collect the data within the bounds
346
436
  data = []
@@ -348,9 +438,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
348
438
  for ri, row in enumerate(
349
439
  sheet.iter_rows(
350
440
  min_row=start_row + 1, # start_row is 0-based but iter_rows is 1-based
351
- max_row=max_row + 1,
441
+ max_row=table_max_row + 1,
352
442
  min_col=start_col + 1,
353
- max_col=max_col + 1,
443
+ max_col=table_max_col + 1,
354
444
  values_only=False,
355
445
  ),
356
446
  start_row,
@@ -390,15 +480,15 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
390
480
  return (
391
481
  ExcelTable(
392
482
  anchor=(start_col, start_row),
393
- num_rows=max_row + 1 - start_row,
394
- num_cols=max_col + 1 - start_col,
483
+ num_rows=table_max_row + 1 - start_row,
484
+ num_cols=table_max_col + 1 - start_col,
395
485
  data=data,
396
486
  ),
397
487
  visited_cells,
398
488
  )
399
489
 
400
490
  def _find_table_bottom(
401
- self, sheet: Worksheet, start_row: int, start_col: int
491
+ self, sheet: Worksheet, start_row: int, start_col: int, max_row: int
402
492
  ) -> int:
403
493
  """Find the bottom boundary of a table.
404
494
 
@@ -406,16 +496,17 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
406
496
  sheet: The Excel worksheet to be parsed.
407
497
  start_row: The starting row of the table.
408
498
  start_col: The starting column of the table.
499
+ max_row: Maximum row boundary from true data bounds.
409
500
 
410
501
  Returns:
411
502
  The row index representing the bottom boundary of the table.
412
503
  """
413
- max_row: int = start_row
504
+ table_max_row: int = start_row
414
505
 
415
506
  for ri, (cell,) in enumerate(
416
507
  sheet.iter_rows(
417
508
  min_row=start_row + 2,
418
- max_row=sheet.max_row,
509
+ max_row=max_row,
419
510
  min_col=start_col + 1,
420
511
  max_col=start_col + 1,
421
512
  values_only=False,
@@ -431,16 +522,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
431
522
  if cell.value is None and not merged_range:
432
523
  break # Stop if the cell is empty and not merged
433
524
 
434
- # Expand max_row to include the merged range if applicable
525
+ # Expand table_max_row to include the merged range if applicable
435
526
  if merged_range:
436
- max_row = max(max_row, merged_range.max_row - 1)
527
+ table_max_row = max(table_max_row, merged_range.max_row - 1)
437
528
  else:
438
- max_row = ri
529
+ table_max_row = ri
439
530
 
440
- return max_row
531
+ return table_max_row
441
532
 
442
533
  def _find_table_right(
443
- self, sheet: Worksheet, start_row: int, start_col: int
534
+ self, sheet: Worksheet, start_row: int, start_col: int, max_col: int
444
535
  ) -> int:
445
536
  """Find the right boundary of a table.
446
537
 
@@ -448,18 +539,19 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
448
539
  sheet: The Excel worksheet to be parsed.
449
540
  start_row: The starting row of the table.
450
541
  start_col: The starting column of the table.
542
+ max_col: The actual max column of the table.
451
543
 
452
544
  Returns:
453
545
  The column index representing the right boundary of the table."
454
546
  """
455
- max_col: int = start_col
547
+ table_max_col: int = start_col
456
548
 
457
549
  for rj, (cell,) in enumerate(
458
550
  sheet.iter_cols(
459
551
  min_row=start_row + 1,
460
552
  max_row=start_row + 1,
461
553
  min_col=start_col + 2,
462
- max_col=sheet.max_column,
554
+ max_col=max_col,
463
555
  values_only=False,
464
556
  ),
465
557
  start_col + 1,
@@ -473,13 +565,13 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
473
565
  if cell.value is None and not merged_range:
474
566
  break # Stop if the cell is empty and not merged
475
567
 
476
- # Expand max_col to include the merged range if applicable
568
+ # Expand table_max_col to include the merged range if applicable
477
569
  if merged_range:
478
- max_col = max(max_col, merged_range.max_col - 1)
570
+ table_max_col = max(table_max_col, merged_range.max_col - 1)
479
571
  else:
480
- max_col = rj
572
+ table_max_col = rj
481
573
 
482
- return max_col
574
+ return table_max_col
483
575
 
484
576
  def _find_images_in_sheet(
485
577
  self, doc: DoclingDocument, sheet: Worksheet
@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Set, Union
4
+ from typing import Union
5
5
 
6
6
  from docling_core.types.doc import (
7
7
  BoundingBox,
@@ -80,7 +80,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
80
80
  self.path_or_stream = None
81
81
 
82
82
  @classmethod
83
- def supported_formats(cls) -> Set[InputFormat]:
83
+ def supported_formats(cls) -> set[InputFormat]:
84
84
  return {InputFormat.PPTX}
85
85
 
86
86
  def convert(self) -> DoclingDocument:
@@ -3,7 +3,7 @@ import re
3
3
  from copy import deepcopy
4
4
  from io import BytesIO
5
5
  from pathlib import Path
6
- from typing import Any, Callable, List, Optional, Union
6
+ from typing import Any, Callable, Optional, Union
7
7
 
8
8
  from docling_core.types.doc import (
9
9
  DocItemLabel,
@@ -69,7 +69,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
69
69
  self.numbered_headers: dict[int, int] = {}
70
70
  self.equation_bookends: str = "<eq>{EQ}</eq>"
71
71
  # Track processed textbox elements to avoid duplication
72
- self.processed_textbox_elements: List[int] = []
72
+ self.processed_textbox_elements: list[int] = []
73
73
  self.docx_to_pdf_converter: Optional[Callable] = None
74
74
  self.docx_to_pdf_converter_init = False
75
75
  self.display_drawingml_warning = True
@@ -726,8 +726,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
726
726
  textbox_elements: list,
727
727
  docx_obj: DocxDocument,
728
728
  doc: DoclingDocument,
729
- ) -> List[RefItem]:
730
- elem_ref: List[RefItem] = []
729
+ ) -> list[RefItem]:
730
+ elem_ref: list[RefItem] = []
731
731
  """Process textbox content and add it to the document structure."""
732
732
  level = self._get_level()
733
733
  # Create a textbox group to contain all text from the textbox
@@ -856,8 +856,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
856
856
  element: BaseOxmlElement,
857
857
  docx_obj: DocxDocument,
858
858
  doc: DoclingDocument,
859
- ) -> List[RefItem]:
860
- elem_ref: List[RefItem] = []
859
+ ) -> list[RefItem]:
860
+ elem_ref: list[RefItem] = []
861
861
  paragraph = Paragraph(element, docx_obj)
862
862
  paragraph_elements = self._get_paragraph_elements(paragraph)
863
863
  text, equations = self._handle_equations_in_text(
@@ -1032,8 +1032,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1032
1032
  curr_level: Optional[int],
1033
1033
  text: str,
1034
1034
  is_numbered_style: bool = False,
1035
- ) -> List[RefItem]:
1036
- elem_ref: List[RefItem] = []
1035
+ ) -> list[RefItem]:
1036
+ elem_ref: list[RefItem] = []
1037
1037
  level = self._get_level()
1038
1038
  if isinstance(curr_level, int):
1039
1039
  if curr_level > level:
@@ -1102,8 +1102,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1102
1102
  marker: str,
1103
1103
  enumerated: bool,
1104
1104
  level: int,
1105
- ) -> List[RefItem]:
1106
- elem_ref: List[RefItem] = []
1105
+ ) -> list[RefItem]:
1106
+ elem_ref: list[RefItem] = []
1107
1107
  # This should not happen by construction
1108
1108
  if not isinstance(self.parents[level], ListGroup):
1109
1109
  return elem_ref
@@ -1148,8 +1148,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1148
1148
  ilevel: int,
1149
1149
  elements: list,
1150
1150
  is_numbered: bool = False,
1151
- ) -> List[RefItem]:
1152
- elem_ref: List[RefItem] = []
1151
+ ) -> list[RefItem]:
1152
+ elem_ref: list[RefItem] = []
1153
1153
  # this method is always called with is_numbered. Numbered lists should be properly addressed.
1154
1154
  if not elements:
1155
1155
  return elem_ref
@@ -1244,8 +1244,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1244
1244
  element: BaseOxmlElement,
1245
1245
  docx_obj: DocxDocument,
1246
1246
  doc: DoclingDocument,
1247
- ) -> List[RefItem]:
1248
- elem_ref: List[RefItem] = []
1247
+ ) -> list[RefItem]:
1248
+ elem_ref: list[RefItem] = []
1249
1249
  table: Table = Table(element, docx_obj)
1250
1250
  num_rows = len(table.rows)
1251
1251
  num_cols = len(table.columns)
@@ -1299,13 +1299,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1299
1299
  else:
1300
1300
  text = text.replace("<eq>", "$").replace("</eq>", "$")
1301
1301
 
1302
- provs_in_cell: List[RefItem] = []
1302
+ provs_in_cell: list[RefItem] = []
1303
1303
  _, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
1304
1304
  ref_for_rich_cell = provs_in_cell[0]
1305
1305
  rich_table_cell = False
1306
1306
 
1307
1307
  def group_cell_elements(
1308
- group_name: str, doc: DoclingDocument, provs_in_cell: List[RefItem]
1308
+ group_name: str, doc: DoclingDocument, provs_in_cell: list[RefItem]
1309
1309
  ) -> RefItem:
1310
1310
  group_element = doc.add_group(
1311
1311
  label=GroupLabel.UNSPECIFIED,
@@ -1379,7 +1379,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1379
1379
 
1380
1380
  def _handle_pictures(
1381
1381
  self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
1382
- ) -> List[RefItem]:
1382
+ ) -> list[RefItem]:
1383
1383
  def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
1384
1384
  image_data: Optional[bytes] = None
1385
1385
  rId = drawing_blip[0].get(
@@ -1391,7 +1391,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1391
1391
  image_data = image_part.blob # Get the binary image data
1392
1392
  return image_data
1393
1393
 
1394
- elem_ref: List[RefItem] = []
1394
+ elem_ref: list[RefItem] = []
1395
1395
  level = self._get_level()
1396
1396
  # Open the BytesIO object with PIL to create an Image
1397
1397
  image_data: Optional[bytes] = get_docx_image(drawing_blip)
@@ -9,6 +9,7 @@ from docling_core.types.doc.page import SegmentedPdfPage, TextCell
9
9
  from PIL import Image
10
10
 
11
11
  from docling.backend.abstract_backend import PaginatedDocumentBackend
12
+ from docling.datamodel.backend_options import PdfBackendOptions
12
13
  from docling.datamodel.base_models import InputFormat
13
14
  from docling.datamodel.document import InputDocument
14
15
 
@@ -50,8 +51,14 @@ class PdfPageBackend(ABC):
50
51
 
51
52
 
52
53
  class PdfDocumentBackend(PaginatedDocumentBackend):
53
- def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
54
- super().__init__(in_doc, path_or_stream)
54
+ def __init__(
55
+ self,
56
+ in_doc: InputDocument,
57
+ path_or_stream: Union[BytesIO, Path],
58
+ options: PdfBackendOptions = PdfBackendOptions(),
59
+ ):
60
+ super().__init__(in_doc, path_or_stream, options)
61
+ self.options: PdfBackendOptions
55
62
 
56
63
  if self.input_format is not InputFormat.PDF:
57
64
  if self.input_format is InputFormat.IMAGE:
@@ -20,6 +20,7 @@ from pypdfium2 import PdfTextPage
20
20
  from pypdfium2._helpers.misc import PdfiumError
21
21
 
22
22
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
23
+ from docling.datamodel.backend_options import PdfBackendOptions
23
24
  from docling.utils.locks import pypdfium2_lock
24
25
 
25
26
 
@@ -370,12 +371,20 @@ class PyPdfiumPageBackend(PdfPageBackend):
370
371
 
371
372
 
372
373
  class PyPdfiumDocumentBackend(PdfDocumentBackend):
373
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
374
- super().__init__(in_doc, path_or_stream)
374
+ def __init__(
375
+ self,
376
+ in_doc: "InputDocument",
377
+ path_or_stream: Union[BytesIO, Path],
378
+ options: PdfBackendOptions = PdfBackendOptions(),
379
+ ):
380
+ super().__init__(in_doc, path_or_stream, options)
375
381
 
382
+ password = (
383
+ self.options.password.get_secret_value() if self.options.password else None
384
+ )
376
385
  try:
377
386
  with pypdfium2_lock:
378
- self._pdoc = pdfium.PdfDocument(self.path_or_stream)
387
+ self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
379
388
  except PdfiumError as e:
380
389
  raise RuntimeError(
381
390
  f"pypdfium could not load document with hash {self.document_hash}"