docling-2.57.0-py3-none-any.whl → docling-2.58.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/backend/abstract_backend.py +24 -3
- docling/backend/asciidoc_backend.py +3 -3
- docling/backend/docling_parse_v4_backend.py +15 -4
- docling/backend/html_backend.py +130 -20
- docling/backend/md_backend.py +27 -5
- docling/backend/msexcel_backend.py +115 -27
- docling/backend/mspowerpoint_backend.py +2 -2
- docling/backend/msword_backend.py +18 -18
- docling/backend/pdf_backend.py +9 -2
- docling/backend/pypdfium2_backend.py +12 -3
- docling/cli/main.py +85 -30
- docling/datamodel/asr_model_specs.py +408 -6
- docling/datamodel/backend_options.py +82 -0
- docling/datamodel/base_models.py +17 -2
- docling/datamodel/document.py +81 -48
- docling/datamodel/pipeline_options_asr_model.py +21 -1
- docling/document_converter.py +37 -45
- docling/document_extractor.py +12 -11
- docling/models/readingorder_model.py +6 -7
- docling/pipeline/asr_pipeline.py +139 -3
- docling/utils/api_image_request.py +4 -4
- docling/utils/layout_postprocessor.py +23 -24
- {docling-2.57.0.dist-info → docling-2.58.0.dist-info}/METADATA +4 -2
- {docling-2.57.0.dist-info → docling-2.58.0.dist-info}/RECORD +28 -27
- {docling-2.57.0.dist-info → docling-2.58.0.dist-info}/WHEEL +0 -0
- {docling-2.57.0.dist-info → docling-2.58.0.dist-info}/entry_points.txt +0 -0
- {docling-2.57.0.dist-info → docling-2.58.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.57.0.dist-info → docling-2.58.0.dist-info}/top_level.txt +0 -0
|
docling/backend/msexcel_backend.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from io import BytesIO
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import Any, Optional, Union, cast
|
|
4
|
+
from typing import Annotated, Any, Optional, Union, cast
|
|
5
5
|
|
|
6
6
|
from docling_core.types.doc import (
|
|
7
7
|
BoundingBox,
|
|
@@ -23,7 +23,8 @@ from openpyxl.drawing.image import Image
|
|
|
23
23
|
from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
|
|
24
24
|
from openpyxl.worksheet.worksheet import Worksheet
|
|
25
25
|
from PIL import Image as PILImage
|
|
26
|
-
from pydantic import BaseModel, NonNegativeInt, PositiveInt
|
|
26
|
+
from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt
|
|
27
|
+
from pydantic.dataclasses import dataclass
|
|
27
28
|
from typing_extensions import override
|
|
28
29
|
|
|
29
30
|
from docling.backend.abstract_backend import (
|
|
@@ -36,6 +37,32 @@ from docling.datamodel.document import InputDocument
|
|
|
36
37
|
_log = logging.getLogger(__name__)
|
|
37
38
|
|
|
38
39
|
|
|
40
|
+
@dataclass
|
|
41
|
+
class DataRegion:
|
|
42
|
+
"""Represents the bounding rectangle of non-empty cells in a worksheet."""
|
|
43
|
+
|
|
44
|
+
min_row: Annotated[
|
|
45
|
+
PositiveInt, Field(description="Smallest row index (1-based index).")
|
|
46
|
+
]
|
|
47
|
+
max_row: Annotated[
|
|
48
|
+
PositiveInt, Field(description="Largest row index (1-based index).")
|
|
49
|
+
]
|
|
50
|
+
min_col: Annotated[
|
|
51
|
+
PositiveInt, Field(description="Smallest column index (1-based index).")
|
|
52
|
+
]
|
|
53
|
+
max_col: Annotated[
|
|
54
|
+
PositiveInt, Field(description="Largest column index (1-based index).")
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
def width(self) -> PositiveInt:
|
|
58
|
+
"""Number of columns in the data region."""
|
|
59
|
+
return self.max_col - self.min_col + 1
|
|
60
|
+
|
|
61
|
+
def height(self) -> PositiveInt:
|
|
62
|
+
"""Number of rows in the data region."""
|
|
63
|
+
return self.max_row - self.min_row + 1
|
|
64
|
+
|
|
65
|
+
|
|
39
66
|
class ExcelCell(BaseModel):
|
|
40
67
|
"""Represents an Excel cell.
|
|
41
68
|
|
|
@@ -294,6 +321,48 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
294
321
|
|
|
295
322
|
return doc
|
|
296
323
|
|
|
324
|
+
def _find_true_data_bounds(self, sheet: Worksheet) -> DataRegion:
|
|
325
|
+
"""Find the true data boundaries (min/max rows and columns) in a worksheet.
|
|
326
|
+
|
|
327
|
+
This function scans all cells to find the smallest rectangular region that contains
|
|
328
|
+
all non-empty cells or merged cell ranges. It returns the minimal and maximal
|
|
329
|
+
row/column indices that bound the actual data region.
|
|
330
|
+
|
|
331
|
+
Args:
|
|
332
|
+
sheet: The worksheet to analyze.
|
|
333
|
+
|
|
334
|
+
Returns:
|
|
335
|
+
A data region representing the smallest rectangle that covers all data and merged cells.
|
|
336
|
+
If the sheet is empty, returns (1, 1, 1, 1) by default.
|
|
337
|
+
"""
|
|
338
|
+
min_row, min_col = None, None
|
|
339
|
+
max_row, max_col = 0, 0
|
|
340
|
+
|
|
341
|
+
for cell in sheet._cells.values():
|
|
342
|
+
if cell.value is not None:
|
|
343
|
+
r, c = cell.row, cell.column
|
|
344
|
+
min_row = r if min_row is None else min(min_row, r)
|
|
345
|
+
min_col = c if min_col is None else min(min_col, c)
|
|
346
|
+
max_row = max(max_row, r)
|
|
347
|
+
max_col = max(max_col, c)
|
|
348
|
+
|
|
349
|
+
# Expand bounds to include merged cells
|
|
350
|
+
for merged in sheet.merged_cells.ranges:
|
|
351
|
+
min_row = (
|
|
352
|
+
merged.min_row if min_row is None else min(min_row, merged.min_row)
|
|
353
|
+
)
|
|
354
|
+
min_col = (
|
|
355
|
+
merged.min_col if min_col is None else min(min_col, merged.min_col)
|
|
356
|
+
)
|
|
357
|
+
max_row = max(max_row, merged.max_row)
|
|
358
|
+
max_col = max(max_col, merged.max_col)
|
|
359
|
+
|
|
360
|
+
# If no data found, default to (1, 1, 1, 1)
|
|
361
|
+
if min_row is None or min_col is None:
|
|
362
|
+
min_row = min_col = max_row = max_col = 1
|
|
363
|
+
|
|
364
|
+
return DataRegion(min_row, max_row, min_col, max_col)
|
|
365
|
+
|
|
297
366
|
def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
|
|
298
367
|
"""Find all compact rectangular data tables in an Excel worksheet.
|
|
299
368
|
|
|
@@ -303,18 +372,31 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
303
372
|
Returns:
|
|
304
373
|
A list of ExcelTable objects representing the data tables.
|
|
305
374
|
"""
|
|
375
|
+
bounds: DataRegion = self._find_true_data_bounds(
|
|
376
|
+
sheet
|
|
377
|
+
) # The true data boundaries
|
|
306
378
|
tables: list[ExcelTable] = [] # List to store found tables
|
|
307
379
|
visited: set[tuple[int, int]] = set() # Track already visited cells
|
|
308
380
|
|
|
309
|
-
#
|
|
310
|
-
for ri, row in enumerate(
|
|
311
|
-
|
|
312
|
-
|
|
381
|
+
# Limit scan to actual data bounds
|
|
382
|
+
for ri, row in enumerate(
|
|
383
|
+
sheet.iter_rows(
|
|
384
|
+
min_row=bounds.min_row,
|
|
385
|
+
max_row=bounds.max_row,
|
|
386
|
+
min_col=bounds.min_col,
|
|
387
|
+
max_col=bounds.max_col,
|
|
388
|
+
values_only=False,
|
|
389
|
+
),
|
|
390
|
+
start=bounds.min_row - 1,
|
|
391
|
+
):
|
|
392
|
+
for rj, cell in enumerate(row, start=bounds.min_col - 1):
|
|
313
393
|
if cell.value is None or (ri, rj) in visited:
|
|
314
394
|
continue
|
|
315
395
|
|
|
316
396
|
# If the cell starts a new table, find its bounds
|
|
317
|
-
table_bounds, visited_cells = self._find_table_bounds(
|
|
397
|
+
table_bounds, visited_cells = self._find_table_bounds(
|
|
398
|
+
sheet, ri, rj, bounds.max_row, bounds.max_col
|
|
399
|
+
)
|
|
318
400
|
|
|
319
401
|
visited.update(visited_cells) # Mark these cells as visited
|
|
320
402
|
tables.append(table_bounds)
|
|
@@ -326,6 +408,8 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
326
408
|
sheet: Worksheet,
|
|
327
409
|
start_row: int,
|
|
328
410
|
start_col: int,
|
|
411
|
+
max_row: int,
|
|
412
|
+
max_col: int,
|
|
329
413
|
) -> tuple[ExcelTable, set[tuple[int, int]]]:
|
|
330
414
|
"""Determine the bounds of a compact rectangular table.
|
|
331
415
|
|
|
@@ -333,14 +417,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
333
417
|
sheet: The Excel worksheet to be parsed.
|
|
334
418
|
start_row: The row number of the starting cell.
|
|
335
419
|
start_col: The column number of the starting cell.
|
|
420
|
+
max_row: Maximum row boundary from true data bounds.
|
|
421
|
+
max_col: Maximum column boundary from true data bounds.
|
|
336
422
|
|
|
337
423
|
Returns:
|
|
338
424
|
A tuple with an Excel table and a set of cell coordinates.
|
|
339
425
|
"""
|
|
340
426
|
_log.debug("find_table_bounds")
|
|
341
427
|
|
|
342
|
-
|
|
343
|
-
|
|
428
|
+
table_max_row = self._find_table_bottom(sheet, start_row, start_col, max_row)
|
|
429
|
+
table_max_col = self._find_table_right(sheet, start_row, start_col, max_col)
|
|
344
430
|
|
|
345
431
|
# Collect the data within the bounds
|
|
346
432
|
data = []
|
|
@@ -348,9 +434,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
348
434
|
for ri, row in enumerate(
|
|
349
435
|
sheet.iter_rows(
|
|
350
436
|
min_row=start_row + 1, # start_row is 0-based but iter_rows is 1-based
|
|
351
|
-
max_row=
|
|
437
|
+
max_row=table_max_row + 1,
|
|
352
438
|
min_col=start_col + 1,
|
|
353
|
-
max_col=
|
|
439
|
+
max_col=table_max_col + 1,
|
|
354
440
|
values_only=False,
|
|
355
441
|
),
|
|
356
442
|
start_row,
|
|
@@ -390,15 +476,15 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
390
476
|
return (
|
|
391
477
|
ExcelTable(
|
|
392
478
|
anchor=(start_col, start_row),
|
|
393
|
-
num_rows=
|
|
394
|
-
num_cols=
|
|
479
|
+
num_rows=table_max_row + 1 - start_row,
|
|
480
|
+
num_cols=table_max_col + 1 - start_col,
|
|
395
481
|
data=data,
|
|
396
482
|
),
|
|
397
483
|
visited_cells,
|
|
398
484
|
)
|
|
399
485
|
|
|
400
486
|
def _find_table_bottom(
|
|
401
|
-
self, sheet: Worksheet, start_row: int, start_col: int
|
|
487
|
+
self, sheet: Worksheet, start_row: int, start_col: int, max_row: int
|
|
402
488
|
) -> int:
|
|
403
489
|
"""Find the bottom boundary of a table.
|
|
404
490
|
|
|
@@ -406,16 +492,17 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
406
492
|
sheet: The Excel worksheet to be parsed.
|
|
407
493
|
start_row: The starting row of the table.
|
|
408
494
|
start_col: The starting column of the table.
|
|
495
|
+
max_row: Maximum row boundary from true data bounds.
|
|
409
496
|
|
|
410
497
|
Returns:
|
|
411
498
|
The row index representing the bottom boundary of the table.
|
|
412
499
|
"""
|
|
413
|
-
|
|
500
|
+
table_max_row: int = start_row
|
|
414
501
|
|
|
415
502
|
for ri, (cell,) in enumerate(
|
|
416
503
|
sheet.iter_rows(
|
|
417
504
|
min_row=start_row + 2,
|
|
418
|
-
max_row=
|
|
505
|
+
max_row=max_row,
|
|
419
506
|
min_col=start_col + 1,
|
|
420
507
|
max_col=start_col + 1,
|
|
421
508
|
values_only=False,
|
|
@@ -431,16 +518,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
431
518
|
if cell.value is None and not merged_range:
|
|
432
519
|
break # Stop if the cell is empty and not merged
|
|
433
520
|
|
|
434
|
-
# Expand
|
|
521
|
+
# Expand table_max_row to include the merged range if applicable
|
|
435
522
|
if merged_range:
|
|
436
|
-
|
|
523
|
+
table_max_row = max(table_max_row, merged_range.max_row - 1)
|
|
437
524
|
else:
|
|
438
|
-
|
|
525
|
+
table_max_row = ri
|
|
439
526
|
|
|
440
|
-
return
|
|
527
|
+
return table_max_row
|
|
441
528
|
|
|
442
529
|
def _find_table_right(
|
|
443
|
-
self, sheet: Worksheet, start_row: int, start_col: int
|
|
530
|
+
self, sheet: Worksheet, start_row: int, start_col: int, max_col: int
|
|
444
531
|
) -> int:
|
|
445
532
|
"""Find the right boundary of a table.
|
|
446
533
|
|
|
@@ -448,18 +535,19 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
448
535
|
sheet: The Excel worksheet to be parsed.
|
|
449
536
|
start_row: The starting row of the table.
|
|
450
537
|
start_col: The starting column of the table.
|
|
538
|
+
max_col: The actual max column of the table.
|
|
451
539
|
|
|
452
540
|
Returns:
|
|
453
541
|
The column index representing the right boundary of the table."
|
|
454
542
|
"""
|
|
455
|
-
|
|
543
|
+
table_max_col: int = start_col
|
|
456
544
|
|
|
457
545
|
for rj, (cell,) in enumerate(
|
|
458
546
|
sheet.iter_cols(
|
|
459
547
|
min_row=start_row + 1,
|
|
460
548
|
max_row=start_row + 1,
|
|
461
549
|
min_col=start_col + 2,
|
|
462
|
-
max_col=
|
|
550
|
+
max_col=max_col,
|
|
463
551
|
values_only=False,
|
|
464
552
|
),
|
|
465
553
|
start_col + 1,
|
|
@@ -473,13 +561,13 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
473
561
|
if cell.value is None and not merged_range:
|
|
474
562
|
break # Stop if the cell is empty and not merged
|
|
475
563
|
|
|
476
|
-
# Expand
|
|
564
|
+
# Expand table_max_col to include the merged range if applicable
|
|
477
565
|
if merged_range:
|
|
478
|
-
|
|
566
|
+
table_max_col = max(table_max_col, merged_range.max_col - 1)
|
|
479
567
|
else:
|
|
480
|
-
|
|
568
|
+
table_max_col = rj
|
|
481
569
|
|
|
482
|
-
return
|
|
570
|
+
return table_max_col
|
|
483
571
|
|
|
484
572
|
def _find_images_in_sheet(
|
|
485
573
|
self, doc: DoclingDocument, sheet: Worksheet
|
|
docling/backend/mspowerpoint_backend.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from io import BytesIO
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import
|
|
4
|
+
from typing import Union
|
|
5
5
|
|
|
6
6
|
from docling_core.types.doc import (
|
|
7
7
|
BoundingBox,
|
|
@@ -80,7 +80,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
|
80
80
|
self.path_or_stream = None
|
|
81
81
|
|
|
82
82
|
@classmethod
|
|
83
|
-
def supported_formats(cls) ->
|
|
83
|
+
def supported_formats(cls) -> set[InputFormat]:
|
|
84
84
|
return {InputFormat.PPTX}
|
|
85
85
|
|
|
86
86
|
def convert(self) -> DoclingDocument:
|
|
docling/backend/msword_backend.py
CHANGED
|
@@ -3,7 +3,7 @@ import re
|
|
|
3
3
|
from copy import deepcopy
|
|
4
4
|
from io import BytesIO
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import Any, Callable,
|
|
6
|
+
from typing import Any, Callable, Optional, Union
|
|
7
7
|
|
|
8
8
|
from docling_core.types.doc import (
|
|
9
9
|
DocItemLabel,
|
|
@@ -69,7 +69,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
69
69
|
self.numbered_headers: dict[int, int] = {}
|
|
70
70
|
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
|
71
71
|
# Track processed textbox elements to avoid duplication
|
|
72
|
-
self.processed_textbox_elements:
|
|
72
|
+
self.processed_textbox_elements: list[int] = []
|
|
73
73
|
self.docx_to_pdf_converter: Optional[Callable] = None
|
|
74
74
|
self.docx_to_pdf_converter_init = False
|
|
75
75
|
self.display_drawingml_warning = True
|
|
@@ -726,8 +726,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
726
726
|
textbox_elements: list,
|
|
727
727
|
docx_obj: DocxDocument,
|
|
728
728
|
doc: DoclingDocument,
|
|
729
|
-
) ->
|
|
730
|
-
elem_ref:
|
|
729
|
+
) -> list[RefItem]:
|
|
730
|
+
elem_ref: list[RefItem] = []
|
|
731
731
|
"""Process textbox content and add it to the document structure."""
|
|
732
732
|
level = self._get_level()
|
|
733
733
|
# Create a textbox group to contain all text from the textbox
|
|
@@ -856,8 +856,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
856
856
|
element: BaseOxmlElement,
|
|
857
857
|
docx_obj: DocxDocument,
|
|
858
858
|
doc: DoclingDocument,
|
|
859
|
-
) ->
|
|
860
|
-
elem_ref:
|
|
859
|
+
) -> list[RefItem]:
|
|
860
|
+
elem_ref: list[RefItem] = []
|
|
861
861
|
paragraph = Paragraph(element, docx_obj)
|
|
862
862
|
paragraph_elements = self._get_paragraph_elements(paragraph)
|
|
863
863
|
text, equations = self._handle_equations_in_text(
|
|
@@ -1032,8 +1032,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1032
1032
|
curr_level: Optional[int],
|
|
1033
1033
|
text: str,
|
|
1034
1034
|
is_numbered_style: bool = False,
|
|
1035
|
-
) ->
|
|
1036
|
-
elem_ref:
|
|
1035
|
+
) -> list[RefItem]:
|
|
1036
|
+
elem_ref: list[RefItem] = []
|
|
1037
1037
|
level = self._get_level()
|
|
1038
1038
|
if isinstance(curr_level, int):
|
|
1039
1039
|
if curr_level > level:
|
|
@@ -1102,8 +1102,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1102
1102
|
marker: str,
|
|
1103
1103
|
enumerated: bool,
|
|
1104
1104
|
level: int,
|
|
1105
|
-
) ->
|
|
1106
|
-
elem_ref:
|
|
1105
|
+
) -> list[RefItem]:
|
|
1106
|
+
elem_ref: list[RefItem] = []
|
|
1107
1107
|
# This should not happen by construction
|
|
1108
1108
|
if not isinstance(self.parents[level], ListGroup):
|
|
1109
1109
|
return elem_ref
|
|
@@ -1148,8 +1148,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1148
1148
|
ilevel: int,
|
|
1149
1149
|
elements: list,
|
|
1150
1150
|
is_numbered: bool = False,
|
|
1151
|
-
) ->
|
|
1152
|
-
elem_ref:
|
|
1151
|
+
) -> list[RefItem]:
|
|
1152
|
+
elem_ref: list[RefItem] = []
|
|
1153
1153
|
# this method is always called with is_numbered. Numbered lists should be properly addressed.
|
|
1154
1154
|
if not elements:
|
|
1155
1155
|
return elem_ref
|
|
@@ -1244,8 +1244,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1244
1244
|
element: BaseOxmlElement,
|
|
1245
1245
|
docx_obj: DocxDocument,
|
|
1246
1246
|
doc: DoclingDocument,
|
|
1247
|
-
) ->
|
|
1248
|
-
elem_ref:
|
|
1247
|
+
) -> list[RefItem]:
|
|
1248
|
+
elem_ref: list[RefItem] = []
|
|
1249
1249
|
table: Table = Table(element, docx_obj)
|
|
1250
1250
|
num_rows = len(table.rows)
|
|
1251
1251
|
num_cols = len(table.columns)
|
|
@@ -1299,13 +1299,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1299
1299
|
else:
|
|
1300
1300
|
text = text.replace("<eq>", "$").replace("</eq>", "$")
|
|
1301
1301
|
|
|
1302
|
-
provs_in_cell:
|
|
1302
|
+
provs_in_cell: list[RefItem] = []
|
|
1303
1303
|
_, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
|
|
1304
1304
|
ref_for_rich_cell = provs_in_cell[0]
|
|
1305
1305
|
rich_table_cell = False
|
|
1306
1306
|
|
|
1307
1307
|
def group_cell_elements(
|
|
1308
|
-
group_name: str, doc: DoclingDocument, provs_in_cell:
|
|
1308
|
+
group_name: str, doc: DoclingDocument, provs_in_cell: list[RefItem]
|
|
1309
1309
|
) -> RefItem:
|
|
1310
1310
|
group_element = doc.add_group(
|
|
1311
1311
|
label=GroupLabel.UNSPECIFIED,
|
|
@@ -1379,7 +1379,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1379
1379
|
|
|
1380
1380
|
def _handle_pictures(
|
|
1381
1381
|
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
|
|
1382
|
-
) ->
|
|
1382
|
+
) -> list[RefItem]:
|
|
1383
1383
|
def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
|
|
1384
1384
|
image_data: Optional[bytes] = None
|
|
1385
1385
|
rId = drawing_blip[0].get(
|
|
@@ -1391,7 +1391,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1391
1391
|
image_data = image_part.blob # Get the binary image data
|
|
1392
1392
|
return image_data
|
|
1393
1393
|
|
|
1394
|
-
elem_ref:
|
|
1394
|
+
elem_ref: list[RefItem] = []
|
|
1395
1395
|
level = self._get_level()
|
|
1396
1396
|
# Open the BytesIO object with PIL to create an Image
|
|
1397
1397
|
image_data: Optional[bytes] = get_docx_image(drawing_blip)
|
docling/backend/pdf_backend.py
CHANGED
|
@@ -9,6 +9,7 @@ from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
|
|
9
9
|
from PIL import Image
|
|
10
10
|
|
|
11
11
|
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
|
12
|
+
from docling.datamodel.backend_options import PdfBackendOptions
|
|
12
13
|
from docling.datamodel.base_models import InputFormat
|
|
13
14
|
from docling.datamodel.document import InputDocument
|
|
14
15
|
|
|
@@ -50,8 +51,14 @@ class PdfPageBackend(ABC):
|
|
|
50
51
|
|
|
51
52
|
|
|
52
53
|
class PdfDocumentBackend(PaginatedDocumentBackend):
|
|
53
|
-
def __init__(
|
|
54
|
-
|
|
54
|
+
def __init__(
|
|
55
|
+
self,
|
|
56
|
+
in_doc: InputDocument,
|
|
57
|
+
path_or_stream: Union[BytesIO, Path],
|
|
58
|
+
options: PdfBackendOptions = PdfBackendOptions(),
|
|
59
|
+
):
|
|
60
|
+
super().__init__(in_doc, path_or_stream, options)
|
|
61
|
+
self.options: PdfBackendOptions
|
|
55
62
|
|
|
56
63
|
if self.input_format is not InputFormat.PDF:
|
|
57
64
|
if self.input_format is InputFormat.IMAGE:
|
|
docling/backend/pypdfium2_backend.py
CHANGED
|
@@ -20,6 +20,7 @@ from pypdfium2 import PdfTextPage
|
|
|
20
20
|
from pypdfium2._helpers.misc import PdfiumError
|
|
21
21
|
|
|
22
22
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
|
23
|
+
from docling.datamodel.backend_options import PdfBackendOptions
|
|
23
24
|
from docling.utils.locks import pypdfium2_lock
|
|
24
25
|
|
|
25
26
|
|
|
@@ -370,12 +371,20 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
|
370
371
|
|
|
371
372
|
|
|
372
373
|
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
|
373
|
-
def __init__(
|
|
374
|
-
|
|
374
|
+
def __init__(
|
|
375
|
+
self,
|
|
376
|
+
in_doc: "InputDocument",
|
|
377
|
+
path_or_stream: Union[BytesIO, Path],
|
|
378
|
+
options: PdfBackendOptions = PdfBackendOptions(),
|
|
379
|
+
):
|
|
380
|
+
super().__init__(in_doc, path_or_stream, options)
|
|
375
381
|
|
|
382
|
+
password = (
|
|
383
|
+
self.options.password.get_secret_value() if self.options.password else None
|
|
384
|
+
)
|
|
376
385
|
try:
|
|
377
386
|
with pypdfium2_lock:
|
|
378
|
-
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
|
387
|
+
self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
|
|
379
388
|
except PdfiumError as e:
|
|
380
389
|
raise RuntimeError(
|
|
381
390
|
f"pypdfium could not load document with hash {self.document_hash}"
|
docling/cli/main.py
CHANGED
|
@@ -32,13 +32,26 @@ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
|
|
32
32
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
|
33
33
|
from docling.datamodel.asr_model_specs import (
|
|
34
34
|
WHISPER_BASE,
|
|
35
|
+
WHISPER_BASE_MLX,
|
|
36
|
+
WHISPER_BASE_NATIVE,
|
|
35
37
|
WHISPER_LARGE,
|
|
38
|
+
WHISPER_LARGE_MLX,
|
|
39
|
+
WHISPER_LARGE_NATIVE,
|
|
36
40
|
WHISPER_MEDIUM,
|
|
41
|
+
WHISPER_MEDIUM_MLX,
|
|
42
|
+
WHISPER_MEDIUM_NATIVE,
|
|
37
43
|
WHISPER_SMALL,
|
|
44
|
+
WHISPER_SMALL_MLX,
|
|
45
|
+
WHISPER_SMALL_NATIVE,
|
|
38
46
|
WHISPER_TINY,
|
|
47
|
+
WHISPER_TINY_MLX,
|
|
48
|
+
WHISPER_TINY_NATIVE,
|
|
39
49
|
WHISPER_TURBO,
|
|
50
|
+
WHISPER_TURBO_MLX,
|
|
51
|
+
WHISPER_TURBO_NATIVE,
|
|
40
52
|
AsrModelType,
|
|
41
53
|
)
|
|
54
|
+
from docling.datamodel.backend_options import PdfBackendOptions
|
|
42
55
|
from docling.datamodel.base_models import (
|
|
43
56
|
ConversionStatus,
|
|
44
57
|
FormatToExtensions,
|
|
@@ -391,7 +404,10 @@ def convert( # noqa: C901
|
|
|
391
404
|
] = None,
|
|
392
405
|
pdf_backend: Annotated[
|
|
393
406
|
PdfBackend, typer.Option(..., help="The PDF backend to use.")
|
|
394
|
-
] = PdfBackend.
|
|
407
|
+
] = PdfBackend.DLPARSE_V4,
|
|
408
|
+
pdf_password: Annotated[
|
|
409
|
+
Optional[str], typer.Option(..., help="Password for protected PDF documents")
|
|
410
|
+
] = None,
|
|
395
411
|
table_mode: Annotated[
|
|
396
412
|
TableFormerMode,
|
|
397
413
|
typer.Option(..., help="The mode to use in the table structure model."),
|
|
@@ -611,10 +627,14 @@ def convert( # noqa: C901
|
|
|
611
627
|
ocr_options.psm = psm
|
|
612
628
|
|
|
613
629
|
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
|
630
|
+
|
|
614
631
|
# pipeline_options: PaginatedPipelineOptions
|
|
615
632
|
pipeline_options: PipelineOptions
|
|
616
633
|
|
|
617
634
|
format_options: Dict[InputFormat, FormatOption] = {}
|
|
635
|
+
pdf_backend_options: Optional[PdfBackendOptions] = PdfBackendOptions(
|
|
636
|
+
password=pdf_password
|
|
637
|
+
)
|
|
618
638
|
|
|
619
639
|
if pipeline == ProcessingPipeline.STANDARD:
|
|
620
640
|
pipeline_options = PdfPipelineOptions(
|
|
@@ -645,8 +665,10 @@ def convert( # noqa: C901
|
|
|
645
665
|
backend: Type[PdfDocumentBackend]
|
|
646
666
|
if pdf_backend == PdfBackend.DLPARSE_V1:
|
|
647
667
|
backend = DoclingParseDocumentBackend
|
|
668
|
+
pdf_backend_options = None
|
|
648
669
|
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
|
649
670
|
backend = DoclingParseV2DocumentBackend
|
|
671
|
+
pdf_backend_options = None
|
|
650
672
|
elif pdf_backend == PdfBackend.DLPARSE_V4:
|
|
651
673
|
backend = DoclingParseV4DocumentBackend # type: ignore
|
|
652
674
|
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
|
@@ -657,6 +679,7 @@ def convert( # noqa: C901
|
|
|
657
679
|
pdf_format_option = PdfFormatOption(
|
|
658
680
|
pipeline_options=pipeline_options,
|
|
659
681
|
backend=backend, # pdf_backend
|
|
682
|
+
backend_options=pdf_backend_options,
|
|
660
683
|
)
|
|
661
684
|
|
|
662
685
|
# METS GBS options
|
|
@@ -747,42 +770,74 @@ def convert( # noqa: C901
|
|
|
747
770
|
InputFormat.IMAGE: pdf_format_option,
|
|
748
771
|
}
|
|
749
772
|
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
773
|
+
# Set ASR options
|
|
774
|
+
asr_pipeline_options = AsrPipelineOptions(
|
|
775
|
+
accelerator_options=AcceleratorOptions(
|
|
776
|
+
device=device,
|
|
777
|
+
num_threads=num_threads,
|
|
778
|
+
),
|
|
779
|
+
# enable_remote_services=enable_remote_services,
|
|
780
|
+
# artifacts_path = artifacts_path
|
|
781
|
+
)
|
|
755
782
|
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
783
|
+
# Auto-selecting models (choose best implementation for hardware)
|
|
784
|
+
if asr_model == AsrModelType.WHISPER_TINY:
|
|
785
|
+
asr_pipeline_options.asr_options = WHISPER_TINY
|
|
786
|
+
elif asr_model == AsrModelType.WHISPER_SMALL:
|
|
787
|
+
asr_pipeline_options.asr_options = WHISPER_SMALL
|
|
788
|
+
elif asr_model == AsrModelType.WHISPER_MEDIUM:
|
|
789
|
+
asr_pipeline_options.asr_options = WHISPER_MEDIUM
|
|
790
|
+
elif asr_model == AsrModelType.WHISPER_BASE:
|
|
791
|
+
asr_pipeline_options.asr_options = WHISPER_BASE
|
|
792
|
+
elif asr_model == AsrModelType.WHISPER_LARGE:
|
|
793
|
+
asr_pipeline_options.asr_options = WHISPER_LARGE
|
|
794
|
+
elif asr_model == AsrModelType.WHISPER_TURBO:
|
|
795
|
+
asr_pipeline_options.asr_options = WHISPER_TURBO
|
|
796
|
+
|
|
797
|
+
# Explicit MLX models (force MLX implementation)
|
|
798
|
+
elif asr_model == AsrModelType.WHISPER_TINY_MLX:
|
|
799
|
+
asr_pipeline_options.asr_options = WHISPER_TINY_MLX
|
|
800
|
+
elif asr_model == AsrModelType.WHISPER_SMALL_MLX:
|
|
801
|
+
asr_pipeline_options.asr_options = WHISPER_SMALL_MLX
|
|
802
|
+
elif asr_model == AsrModelType.WHISPER_MEDIUM_MLX:
|
|
803
|
+
asr_pipeline_options.asr_options = WHISPER_MEDIUM_MLX
|
|
804
|
+
elif asr_model == AsrModelType.WHISPER_BASE_MLX:
|
|
805
|
+
asr_pipeline_options.asr_options = WHISPER_BASE_MLX
|
|
806
|
+
elif asr_model == AsrModelType.WHISPER_LARGE_MLX:
|
|
807
|
+
asr_pipeline_options.asr_options = WHISPER_LARGE_MLX
|
|
808
|
+
elif asr_model == AsrModelType.WHISPER_TURBO_MLX:
|
|
809
|
+
asr_pipeline_options.asr_options = WHISPER_TURBO_MLX
|
|
810
|
+
|
|
811
|
+
# Explicit Native models (force native implementation)
|
|
812
|
+
elif asr_model == AsrModelType.WHISPER_TINY_NATIVE:
|
|
813
|
+
asr_pipeline_options.asr_options = WHISPER_TINY_NATIVE
|
|
814
|
+
elif asr_model == AsrModelType.WHISPER_SMALL_NATIVE:
|
|
815
|
+
asr_pipeline_options.asr_options = WHISPER_SMALL_NATIVE
|
|
816
|
+
elif asr_model == AsrModelType.WHISPER_MEDIUM_NATIVE:
|
|
817
|
+
asr_pipeline_options.asr_options = WHISPER_MEDIUM_NATIVE
|
|
818
|
+
elif asr_model == AsrModelType.WHISPER_BASE_NATIVE:
|
|
819
|
+
asr_pipeline_options.asr_options = WHISPER_BASE_NATIVE
|
|
820
|
+
elif asr_model == AsrModelType.WHISPER_LARGE_NATIVE:
|
|
821
|
+
asr_pipeline_options.asr_options = WHISPER_LARGE_NATIVE
|
|
822
|
+
elif asr_model == AsrModelType.WHISPER_TURBO_NATIVE:
|
|
823
|
+
asr_pipeline_options.asr_options = WHISPER_TURBO_NATIVE
|
|
771
824
|
|
|
772
|
-
|
|
825
|
+
else:
|
|
826
|
+
_log.error(f"{asr_model} is not known")
|
|
827
|
+
raise ValueError(f"{asr_model} is not known")
|
|
773
828
|
|
|
774
|
-
|
|
775
|
-
pipeline_cls=AsrPipeline,
|
|
776
|
-
pipeline_options=pipeline_options,
|
|
777
|
-
)
|
|
829
|
+
_log.debug(f"ASR pipeline_options: {asr_pipeline_options}")
|
|
778
830
|
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
831
|
+
audio_format_option = AudioFormatOption(
|
|
832
|
+
pipeline_cls=AsrPipeline,
|
|
833
|
+
pipeline_options=asr_pipeline_options,
|
|
834
|
+
)
|
|
835
|
+
format_options[InputFormat.AUDIO] = audio_format_option
|
|
782
836
|
|
|
837
|
+
# Common options for all pipelines
|
|
783
838
|
if artifacts_path is not None:
|
|
784
839
|
pipeline_options.artifacts_path = artifacts_path
|
|
785
|
-
|
|
840
|
+
asr_pipeline_options.artifacts_path = artifacts_path
|
|
786
841
|
|
|
787
842
|
doc_converter = DocumentConverter(
|
|
788
843
|
allowed_formats=from_formats,
|