docling: docling-2.57.0-py3-none-any.whl → docling-2.58.0-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Any, Optional, Union, cast
4
+ from typing import Annotated, Any, Optional, Union, cast
5
5
 
6
6
  from docling_core.types.doc import (
7
7
  BoundingBox,
@@ -23,7 +23,8 @@ from openpyxl.drawing.image import Image
23
23
  from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
24
24
  from openpyxl.worksheet.worksheet import Worksheet
25
25
  from PIL import Image as PILImage
26
- from pydantic import BaseModel, NonNegativeInt, PositiveInt
26
+ from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt
27
+ from pydantic.dataclasses import dataclass
27
28
  from typing_extensions import override
28
29
 
29
30
  from docling.backend.abstract_backend import (
@@ -36,6 +37,32 @@ from docling.datamodel.document import InputDocument
36
37
  _log = logging.getLogger(__name__)
37
38
 
38
39
 
40
+ @dataclass
41
+ class DataRegion:
42
+ """Represents the bounding rectangle of non-empty cells in a worksheet."""
43
+
44
+ min_row: Annotated[
45
+ PositiveInt, Field(description="Smallest row index (1-based index).")
46
+ ]
47
+ max_row: Annotated[
48
+ PositiveInt, Field(description="Largest row index (1-based index).")
49
+ ]
50
+ min_col: Annotated[
51
+ PositiveInt, Field(description="Smallest column index (1-based index).")
52
+ ]
53
+ max_col: Annotated[
54
+ PositiveInt, Field(description="Largest column index (1-based index).")
55
+ ]
56
+
57
+ def width(self) -> PositiveInt:
58
+ """Number of columns in the data region."""
59
+ return self.max_col - self.min_col + 1
60
+
61
+ def height(self) -> PositiveInt:
62
+ """Number of rows in the data region."""
63
+ return self.max_row - self.min_row + 1
64
+
65
+
39
66
  class ExcelCell(BaseModel):
40
67
  """Represents an Excel cell.
41
68
 
@@ -294,6 +321,48 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
294
321
 
295
322
  return doc
296
323
 
324
+ def _find_true_data_bounds(self, sheet: Worksheet) -> DataRegion:
325
+ """Find the true data boundaries (min/max rows and columns) in a worksheet.
326
+
327
+ This function scans all cells to find the smallest rectangular region that contains
328
+ all non-empty cells or merged cell ranges. It returns the minimal and maximal
329
+ row/column indices that bound the actual data region.
330
+
331
+ Args:
332
+ sheet: The worksheet to analyze.
333
+
334
+ Returns:
335
+ A data region representing the smallest rectangle that covers all data and merged cells.
336
+ If the sheet is empty, returns (1, 1, 1, 1) by default.
337
+ """
338
+ min_row, min_col = None, None
339
+ max_row, max_col = 0, 0
340
+
341
+ for cell in sheet._cells.values():
342
+ if cell.value is not None:
343
+ r, c = cell.row, cell.column
344
+ min_row = r if min_row is None else min(min_row, r)
345
+ min_col = c if min_col is None else min(min_col, c)
346
+ max_row = max(max_row, r)
347
+ max_col = max(max_col, c)
348
+
349
+ # Expand bounds to include merged cells
350
+ for merged in sheet.merged_cells.ranges:
351
+ min_row = (
352
+ merged.min_row if min_row is None else min(min_row, merged.min_row)
353
+ )
354
+ min_col = (
355
+ merged.min_col if min_col is None else min(min_col, merged.min_col)
356
+ )
357
+ max_row = max(max_row, merged.max_row)
358
+ max_col = max(max_col, merged.max_col)
359
+
360
+ # If no data found, default to (1, 1, 1, 1)
361
+ if min_row is None or min_col is None:
362
+ min_row = min_col = max_row = max_col = 1
363
+
364
+ return DataRegion(min_row, max_row, min_col, max_col)
365
+
297
366
  def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
298
367
  """Find all compact rectangular data tables in an Excel worksheet.
299
368
 
@@ -303,18 +372,31 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
303
372
  Returns:
304
373
  A list of ExcelTable objects representing the data tables.
305
374
  """
375
+ bounds: DataRegion = self._find_true_data_bounds(
376
+ sheet
377
+ ) # The true data boundaries
306
378
  tables: list[ExcelTable] = [] # List to store found tables
307
379
  visited: set[tuple[int, int]] = set() # Track already visited cells
308
380
 
309
- # Iterate over all cells in the sheet
310
- for ri, row in enumerate(sheet.iter_rows(values_only=False)):
311
- for rj, cell in enumerate(row):
312
- # Skip empty or already visited cells
381
+ # Limit scan to actual data bounds
382
+ for ri, row in enumerate(
383
+ sheet.iter_rows(
384
+ min_row=bounds.min_row,
385
+ max_row=bounds.max_row,
386
+ min_col=bounds.min_col,
387
+ max_col=bounds.max_col,
388
+ values_only=False,
389
+ ),
390
+ start=bounds.min_row - 1,
391
+ ):
392
+ for rj, cell in enumerate(row, start=bounds.min_col - 1):
313
393
  if cell.value is None or (ri, rj) in visited:
314
394
  continue
315
395
 
316
396
  # If the cell starts a new table, find its bounds
317
- table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj)
397
+ table_bounds, visited_cells = self._find_table_bounds(
398
+ sheet, ri, rj, bounds.max_row, bounds.max_col
399
+ )
318
400
 
319
401
  visited.update(visited_cells) # Mark these cells as visited
320
402
  tables.append(table_bounds)
@@ -326,6 +408,8 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
326
408
  sheet: Worksheet,
327
409
  start_row: int,
328
410
  start_col: int,
411
+ max_row: int,
412
+ max_col: int,
329
413
  ) -> tuple[ExcelTable, set[tuple[int, int]]]:
330
414
  """Determine the bounds of a compact rectangular table.
331
415
 
@@ -333,14 +417,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
333
417
  sheet: The Excel worksheet to be parsed.
334
418
  start_row: The row number of the starting cell.
335
419
  start_col: The column number of the starting cell.
420
+ max_row: Maximum row boundary from true data bounds.
421
+ max_col: Maximum column boundary from true data bounds.
336
422
 
337
423
  Returns:
338
424
  A tuple with an Excel table and a set of cell coordinates.
339
425
  """
340
426
  _log.debug("find_table_bounds")
341
427
 
342
- max_row = self._find_table_bottom(sheet, start_row, start_col)
343
- max_col = self._find_table_right(sheet, start_row, start_col)
428
+ table_max_row = self._find_table_bottom(sheet, start_row, start_col, max_row)
429
+ table_max_col = self._find_table_right(sheet, start_row, start_col, max_col)
344
430
 
345
431
  # Collect the data within the bounds
346
432
  data = []
@@ -348,9 +434,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
348
434
  for ri, row in enumerate(
349
435
  sheet.iter_rows(
350
436
  min_row=start_row + 1, # start_row is 0-based but iter_rows is 1-based
351
- max_row=max_row + 1,
437
+ max_row=table_max_row + 1,
352
438
  min_col=start_col + 1,
353
- max_col=max_col + 1,
439
+ max_col=table_max_col + 1,
354
440
  values_only=False,
355
441
  ),
356
442
  start_row,
@@ -390,15 +476,15 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
390
476
  return (
391
477
  ExcelTable(
392
478
  anchor=(start_col, start_row),
393
- num_rows=max_row + 1 - start_row,
394
- num_cols=max_col + 1 - start_col,
479
+ num_rows=table_max_row + 1 - start_row,
480
+ num_cols=table_max_col + 1 - start_col,
395
481
  data=data,
396
482
  ),
397
483
  visited_cells,
398
484
  )
399
485
 
400
486
  def _find_table_bottom(
401
- self, sheet: Worksheet, start_row: int, start_col: int
487
+ self, sheet: Worksheet, start_row: int, start_col: int, max_row: int
402
488
  ) -> int:
403
489
  """Find the bottom boundary of a table.
404
490
 
@@ -406,16 +492,17 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
406
492
  sheet: The Excel worksheet to be parsed.
407
493
  start_row: The starting row of the table.
408
494
  start_col: The starting column of the table.
495
+ max_row: Maximum row boundary from true data bounds.
409
496
 
410
497
  Returns:
411
498
  The row index representing the bottom boundary of the table.
412
499
  """
413
- max_row: int = start_row
500
+ table_max_row: int = start_row
414
501
 
415
502
  for ri, (cell,) in enumerate(
416
503
  sheet.iter_rows(
417
504
  min_row=start_row + 2,
418
- max_row=sheet.max_row,
505
+ max_row=max_row,
419
506
  min_col=start_col + 1,
420
507
  max_col=start_col + 1,
421
508
  values_only=False,
@@ -431,16 +518,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
431
518
  if cell.value is None and not merged_range:
432
519
  break # Stop if the cell is empty and not merged
433
520
 
434
- # Expand max_row to include the merged range if applicable
521
+ # Expand table_max_row to include the merged range if applicable
435
522
  if merged_range:
436
- max_row = max(max_row, merged_range.max_row - 1)
523
+ table_max_row = max(table_max_row, merged_range.max_row - 1)
437
524
  else:
438
- max_row = ri
525
+ table_max_row = ri
439
526
 
440
- return max_row
527
+ return table_max_row
441
528
 
442
529
  def _find_table_right(
443
- self, sheet: Worksheet, start_row: int, start_col: int
530
+ self, sheet: Worksheet, start_row: int, start_col: int, max_col: int
444
531
  ) -> int:
445
532
  """Find the right boundary of a table.
446
533
 
@@ -448,18 +535,19 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
448
535
  sheet: The Excel worksheet to be parsed.
449
536
  start_row: The starting row of the table.
450
537
  start_col: The starting column of the table.
538
+ max_col: The actual max column of the table.
451
539
 
452
540
  Returns:
453
541
  The column index representing the right boundary of the table."
454
542
  """
455
- max_col: int = start_col
543
+ table_max_col: int = start_col
456
544
 
457
545
  for rj, (cell,) in enumerate(
458
546
  sheet.iter_cols(
459
547
  min_row=start_row + 1,
460
548
  max_row=start_row + 1,
461
549
  min_col=start_col + 2,
462
- max_col=sheet.max_column,
550
+ max_col=max_col,
463
551
  values_only=False,
464
552
  ),
465
553
  start_col + 1,
@@ -473,13 +561,13 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
473
561
  if cell.value is None and not merged_range:
474
562
  break # Stop if the cell is empty and not merged
475
563
 
476
- # Expand max_col to include the merged range if applicable
564
+ # Expand table_max_col to include the merged range if applicable
477
565
  if merged_range:
478
- max_col = max(max_col, merged_range.max_col - 1)
566
+ table_max_col = max(table_max_col, merged_range.max_col - 1)
479
567
  else:
480
- max_col = rj
568
+ table_max_col = rj
481
569
 
482
- return max_col
570
+ return table_max_col
483
571
 
484
572
  def _find_images_in_sheet(
485
573
  self, doc: DoclingDocument, sheet: Worksheet
@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Set, Union
4
+ from typing import Union
5
5
 
6
6
  from docling_core.types.doc import (
7
7
  BoundingBox,
@@ -80,7 +80,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
80
80
  self.path_or_stream = None
81
81
 
82
82
  @classmethod
83
- def supported_formats(cls) -> Set[InputFormat]:
83
+ def supported_formats(cls) -> set[InputFormat]:
84
84
  return {InputFormat.PPTX}
85
85
 
86
86
  def convert(self) -> DoclingDocument:
@@ -3,7 +3,7 @@ import re
3
3
  from copy import deepcopy
4
4
  from io import BytesIO
5
5
  from pathlib import Path
6
- from typing import Any, Callable, List, Optional, Union
6
+ from typing import Any, Callable, Optional, Union
7
7
 
8
8
  from docling_core.types.doc import (
9
9
  DocItemLabel,
@@ -69,7 +69,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
69
69
  self.numbered_headers: dict[int, int] = {}
70
70
  self.equation_bookends: str = "<eq>{EQ}</eq>"
71
71
  # Track processed textbox elements to avoid duplication
72
- self.processed_textbox_elements: List[int] = []
72
+ self.processed_textbox_elements: list[int] = []
73
73
  self.docx_to_pdf_converter: Optional[Callable] = None
74
74
  self.docx_to_pdf_converter_init = False
75
75
  self.display_drawingml_warning = True
@@ -726,8 +726,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
726
726
  textbox_elements: list,
727
727
  docx_obj: DocxDocument,
728
728
  doc: DoclingDocument,
729
- ) -> List[RefItem]:
730
- elem_ref: List[RefItem] = []
729
+ ) -> list[RefItem]:
730
+ elem_ref: list[RefItem] = []
731
731
  """Process textbox content and add it to the document structure."""
732
732
  level = self._get_level()
733
733
  # Create a textbox group to contain all text from the textbox
@@ -856,8 +856,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
856
856
  element: BaseOxmlElement,
857
857
  docx_obj: DocxDocument,
858
858
  doc: DoclingDocument,
859
- ) -> List[RefItem]:
860
- elem_ref: List[RefItem] = []
859
+ ) -> list[RefItem]:
860
+ elem_ref: list[RefItem] = []
861
861
  paragraph = Paragraph(element, docx_obj)
862
862
  paragraph_elements = self._get_paragraph_elements(paragraph)
863
863
  text, equations = self._handle_equations_in_text(
@@ -1032,8 +1032,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1032
1032
  curr_level: Optional[int],
1033
1033
  text: str,
1034
1034
  is_numbered_style: bool = False,
1035
- ) -> List[RefItem]:
1036
- elem_ref: List[RefItem] = []
1035
+ ) -> list[RefItem]:
1036
+ elem_ref: list[RefItem] = []
1037
1037
  level = self._get_level()
1038
1038
  if isinstance(curr_level, int):
1039
1039
  if curr_level > level:
@@ -1102,8 +1102,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1102
1102
  marker: str,
1103
1103
  enumerated: bool,
1104
1104
  level: int,
1105
- ) -> List[RefItem]:
1106
- elem_ref: List[RefItem] = []
1105
+ ) -> list[RefItem]:
1106
+ elem_ref: list[RefItem] = []
1107
1107
  # This should not happen by construction
1108
1108
  if not isinstance(self.parents[level], ListGroup):
1109
1109
  return elem_ref
@@ -1148,8 +1148,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1148
1148
  ilevel: int,
1149
1149
  elements: list,
1150
1150
  is_numbered: bool = False,
1151
- ) -> List[RefItem]:
1152
- elem_ref: List[RefItem] = []
1151
+ ) -> list[RefItem]:
1152
+ elem_ref: list[RefItem] = []
1153
1153
  # this method is always called with is_numbered. Numbered lists should be properly addressed.
1154
1154
  if not elements:
1155
1155
  return elem_ref
@@ -1244,8 +1244,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1244
1244
  element: BaseOxmlElement,
1245
1245
  docx_obj: DocxDocument,
1246
1246
  doc: DoclingDocument,
1247
- ) -> List[RefItem]:
1248
- elem_ref: List[RefItem] = []
1247
+ ) -> list[RefItem]:
1248
+ elem_ref: list[RefItem] = []
1249
1249
  table: Table = Table(element, docx_obj)
1250
1250
  num_rows = len(table.rows)
1251
1251
  num_cols = len(table.columns)
@@ -1299,13 +1299,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1299
1299
  else:
1300
1300
  text = text.replace("<eq>", "$").replace("</eq>", "$")
1301
1301
 
1302
- provs_in_cell: List[RefItem] = []
1302
+ provs_in_cell: list[RefItem] = []
1303
1303
  _, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
1304
1304
  ref_for_rich_cell = provs_in_cell[0]
1305
1305
  rich_table_cell = False
1306
1306
 
1307
1307
  def group_cell_elements(
1308
- group_name: str, doc: DoclingDocument, provs_in_cell: List[RefItem]
1308
+ group_name: str, doc: DoclingDocument, provs_in_cell: list[RefItem]
1309
1309
  ) -> RefItem:
1310
1310
  group_element = doc.add_group(
1311
1311
  label=GroupLabel.UNSPECIFIED,
@@ -1379,7 +1379,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1379
1379
 
1380
1380
  def _handle_pictures(
1381
1381
  self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
1382
- ) -> List[RefItem]:
1382
+ ) -> list[RefItem]:
1383
1383
  def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
1384
1384
  image_data: Optional[bytes] = None
1385
1385
  rId = drawing_blip[0].get(
@@ -1391,7 +1391,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1391
1391
  image_data = image_part.blob # Get the binary image data
1392
1392
  return image_data
1393
1393
 
1394
- elem_ref: List[RefItem] = []
1394
+ elem_ref: list[RefItem] = []
1395
1395
  level = self._get_level()
1396
1396
  # Open the BytesIO object with PIL to create an Image
1397
1397
  image_data: Optional[bytes] = get_docx_image(drawing_blip)
@@ -9,6 +9,7 @@ from docling_core.types.doc.page import SegmentedPdfPage, TextCell
9
9
  from PIL import Image
10
10
 
11
11
  from docling.backend.abstract_backend import PaginatedDocumentBackend
12
+ from docling.datamodel.backend_options import PdfBackendOptions
12
13
  from docling.datamodel.base_models import InputFormat
13
14
  from docling.datamodel.document import InputDocument
14
15
 
@@ -50,8 +51,14 @@ class PdfPageBackend(ABC):
50
51
 
51
52
 
52
53
  class PdfDocumentBackend(PaginatedDocumentBackend):
53
- def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
54
- super().__init__(in_doc, path_or_stream)
54
+ def __init__(
55
+ self,
56
+ in_doc: InputDocument,
57
+ path_or_stream: Union[BytesIO, Path],
58
+ options: PdfBackendOptions = PdfBackendOptions(),
59
+ ):
60
+ super().__init__(in_doc, path_or_stream, options)
61
+ self.options: PdfBackendOptions
55
62
 
56
63
  if self.input_format is not InputFormat.PDF:
57
64
  if self.input_format is InputFormat.IMAGE:
@@ -20,6 +20,7 @@ from pypdfium2 import PdfTextPage
20
20
  from pypdfium2._helpers.misc import PdfiumError
21
21
 
22
22
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
23
+ from docling.datamodel.backend_options import PdfBackendOptions
23
24
  from docling.utils.locks import pypdfium2_lock
24
25
 
25
26
 
@@ -370,12 +371,20 @@ class PyPdfiumPageBackend(PdfPageBackend):
370
371
 
371
372
 
372
373
  class PyPdfiumDocumentBackend(PdfDocumentBackend):
373
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
374
- super().__init__(in_doc, path_or_stream)
374
+ def __init__(
375
+ self,
376
+ in_doc: "InputDocument",
377
+ path_or_stream: Union[BytesIO, Path],
378
+ options: PdfBackendOptions = PdfBackendOptions(),
379
+ ):
380
+ super().__init__(in_doc, path_or_stream, options)
375
381
 
382
+ password = (
383
+ self.options.password.get_secret_value() if self.options.password else None
384
+ )
376
385
  try:
377
386
  with pypdfium2_lock:
378
- self._pdoc = pdfium.PdfDocument(self.path_or_stream)
387
+ self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
379
388
  except PdfiumError as e:
380
389
  raise RuntimeError(
381
390
  f"pypdfium could not load document with hash {self.document_hash}"
docling/cli/main.py CHANGED
@@ -32,13 +32,26 @@ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
32
32
  from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
33
33
  from docling.datamodel.asr_model_specs import (
34
34
  WHISPER_BASE,
35
+ WHISPER_BASE_MLX,
36
+ WHISPER_BASE_NATIVE,
35
37
  WHISPER_LARGE,
38
+ WHISPER_LARGE_MLX,
39
+ WHISPER_LARGE_NATIVE,
36
40
  WHISPER_MEDIUM,
41
+ WHISPER_MEDIUM_MLX,
42
+ WHISPER_MEDIUM_NATIVE,
37
43
  WHISPER_SMALL,
44
+ WHISPER_SMALL_MLX,
45
+ WHISPER_SMALL_NATIVE,
38
46
  WHISPER_TINY,
47
+ WHISPER_TINY_MLX,
48
+ WHISPER_TINY_NATIVE,
39
49
  WHISPER_TURBO,
50
+ WHISPER_TURBO_MLX,
51
+ WHISPER_TURBO_NATIVE,
40
52
  AsrModelType,
41
53
  )
54
+ from docling.datamodel.backend_options import PdfBackendOptions
42
55
  from docling.datamodel.base_models import (
43
56
  ConversionStatus,
44
57
  FormatToExtensions,
@@ -391,7 +404,10 @@ def convert( # noqa: C901
391
404
  ] = None,
392
405
  pdf_backend: Annotated[
393
406
  PdfBackend, typer.Option(..., help="The PDF backend to use.")
394
- ] = PdfBackend.DLPARSE_V2,
407
+ ] = PdfBackend.DLPARSE_V4,
408
+ pdf_password: Annotated[
409
+ Optional[str], typer.Option(..., help="Password for protected PDF documents")
410
+ ] = None,
395
411
  table_mode: Annotated[
396
412
  TableFormerMode,
397
413
  typer.Option(..., help="The mode to use in the table structure model."),
@@ -611,10 +627,14 @@ def convert( # noqa: C901
611
627
  ocr_options.psm = psm
612
628
 
613
629
  accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
630
+
614
631
  # pipeline_options: PaginatedPipelineOptions
615
632
  pipeline_options: PipelineOptions
616
633
 
617
634
  format_options: Dict[InputFormat, FormatOption] = {}
635
+ pdf_backend_options: Optional[PdfBackendOptions] = PdfBackendOptions(
636
+ password=pdf_password
637
+ )
618
638
 
619
639
  if pipeline == ProcessingPipeline.STANDARD:
620
640
  pipeline_options = PdfPipelineOptions(
@@ -645,8 +665,10 @@ def convert( # noqa: C901
645
665
  backend: Type[PdfDocumentBackend]
646
666
  if pdf_backend == PdfBackend.DLPARSE_V1:
647
667
  backend = DoclingParseDocumentBackend
668
+ pdf_backend_options = None
648
669
  elif pdf_backend == PdfBackend.DLPARSE_V2:
649
670
  backend = DoclingParseV2DocumentBackend
671
+ pdf_backend_options = None
650
672
  elif pdf_backend == PdfBackend.DLPARSE_V4:
651
673
  backend = DoclingParseV4DocumentBackend # type: ignore
652
674
  elif pdf_backend == PdfBackend.PYPDFIUM2:
@@ -657,6 +679,7 @@ def convert( # noqa: C901
657
679
  pdf_format_option = PdfFormatOption(
658
680
  pipeline_options=pipeline_options,
659
681
  backend=backend, # pdf_backend
682
+ backend_options=pdf_backend_options,
660
683
  )
661
684
 
662
685
  # METS GBS options
@@ -747,42 +770,74 @@ def convert( # noqa: C901
747
770
  InputFormat.IMAGE: pdf_format_option,
748
771
  }
749
772
 
750
- elif pipeline == ProcessingPipeline.ASR:
751
- pipeline_options = AsrPipelineOptions(
752
- # enable_remote_services=enable_remote_services,
753
- # artifacts_path = artifacts_path
754
- )
773
+ # Set ASR options
774
+ asr_pipeline_options = AsrPipelineOptions(
775
+ accelerator_options=AcceleratorOptions(
776
+ device=device,
777
+ num_threads=num_threads,
778
+ ),
779
+ # enable_remote_services=enable_remote_services,
780
+ # artifacts_path = artifacts_path
781
+ )
755
782
 
756
- if asr_model == AsrModelType.WHISPER_TINY:
757
- pipeline_options.asr_options = WHISPER_TINY
758
- elif asr_model == AsrModelType.WHISPER_SMALL:
759
- pipeline_options.asr_options = WHISPER_SMALL
760
- elif asr_model == AsrModelType.WHISPER_MEDIUM:
761
- pipeline_options.asr_options = WHISPER_MEDIUM
762
- elif asr_model == AsrModelType.WHISPER_BASE:
763
- pipeline_options.asr_options = WHISPER_BASE
764
- elif asr_model == AsrModelType.WHISPER_LARGE:
765
- pipeline_options.asr_options = WHISPER_LARGE
766
- elif asr_model == AsrModelType.WHISPER_TURBO:
767
- pipeline_options.asr_options = WHISPER_TURBO
768
- else:
769
- _log.error(f"{asr_model} is not known")
770
- raise ValueError(f"{asr_model} is not known")
783
+ # Auto-selecting models (choose best implementation for hardware)
784
+ if asr_model == AsrModelType.WHISPER_TINY:
785
+ asr_pipeline_options.asr_options = WHISPER_TINY
786
+ elif asr_model == AsrModelType.WHISPER_SMALL:
787
+ asr_pipeline_options.asr_options = WHISPER_SMALL
788
+ elif asr_model == AsrModelType.WHISPER_MEDIUM:
789
+ asr_pipeline_options.asr_options = WHISPER_MEDIUM
790
+ elif asr_model == AsrModelType.WHISPER_BASE:
791
+ asr_pipeline_options.asr_options = WHISPER_BASE
792
+ elif asr_model == AsrModelType.WHISPER_LARGE:
793
+ asr_pipeline_options.asr_options = WHISPER_LARGE
794
+ elif asr_model == AsrModelType.WHISPER_TURBO:
795
+ asr_pipeline_options.asr_options = WHISPER_TURBO
796
+
797
+ # Explicit MLX models (force MLX implementation)
798
+ elif asr_model == AsrModelType.WHISPER_TINY_MLX:
799
+ asr_pipeline_options.asr_options = WHISPER_TINY_MLX
800
+ elif asr_model == AsrModelType.WHISPER_SMALL_MLX:
801
+ asr_pipeline_options.asr_options = WHISPER_SMALL_MLX
802
+ elif asr_model == AsrModelType.WHISPER_MEDIUM_MLX:
803
+ asr_pipeline_options.asr_options = WHISPER_MEDIUM_MLX
804
+ elif asr_model == AsrModelType.WHISPER_BASE_MLX:
805
+ asr_pipeline_options.asr_options = WHISPER_BASE_MLX
806
+ elif asr_model == AsrModelType.WHISPER_LARGE_MLX:
807
+ asr_pipeline_options.asr_options = WHISPER_LARGE_MLX
808
+ elif asr_model == AsrModelType.WHISPER_TURBO_MLX:
809
+ asr_pipeline_options.asr_options = WHISPER_TURBO_MLX
810
+
811
+ # Explicit Native models (force native implementation)
812
+ elif asr_model == AsrModelType.WHISPER_TINY_NATIVE:
813
+ asr_pipeline_options.asr_options = WHISPER_TINY_NATIVE
814
+ elif asr_model == AsrModelType.WHISPER_SMALL_NATIVE:
815
+ asr_pipeline_options.asr_options = WHISPER_SMALL_NATIVE
816
+ elif asr_model == AsrModelType.WHISPER_MEDIUM_NATIVE:
817
+ asr_pipeline_options.asr_options = WHISPER_MEDIUM_NATIVE
818
+ elif asr_model == AsrModelType.WHISPER_BASE_NATIVE:
819
+ asr_pipeline_options.asr_options = WHISPER_BASE_NATIVE
820
+ elif asr_model == AsrModelType.WHISPER_LARGE_NATIVE:
821
+ asr_pipeline_options.asr_options = WHISPER_LARGE_NATIVE
822
+ elif asr_model == AsrModelType.WHISPER_TURBO_NATIVE:
823
+ asr_pipeline_options.asr_options = WHISPER_TURBO_NATIVE
771
824
 
772
- _log.info(f"pipeline_options: {pipeline_options}")
825
+ else:
826
+ _log.error(f"{asr_model} is not known")
827
+ raise ValueError(f"{asr_model} is not known")
773
828
 
774
- audio_format_option = AudioFormatOption(
775
- pipeline_cls=AsrPipeline,
776
- pipeline_options=pipeline_options,
777
- )
829
+ _log.debug(f"ASR pipeline_options: {asr_pipeline_options}")
778
830
 
779
- format_options = {
780
- InputFormat.AUDIO: audio_format_option,
781
- }
831
+ audio_format_option = AudioFormatOption(
832
+ pipeline_cls=AsrPipeline,
833
+ pipeline_options=asr_pipeline_options,
834
+ )
835
+ format_options[InputFormat.AUDIO] = audio_format_option
782
836
 
837
+ # Common options for all pipelines
783
838
  if artifacts_path is not None:
784
839
  pipeline_options.artifacts_path = artifacts_path
785
- # audio_pipeline_options.artifacts_path = artifacts_path
840
+ asr_pipeline_options.artifacts_path = artifacts_path
786
841
 
787
842
  doc_converter = DocumentConverter(
788
843
  allowed_formats=from_formats,