docling-2.57.0-py3-none-any.whl → docling-2.59.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/backend/abstract_backend.py +24 -3
- docling/backend/asciidoc_backend.py +3 -3
- docling/backend/docling_parse_v4_backend.py +15 -4
- docling/backend/html_backend.py +130 -20
- docling/backend/md_backend.py +27 -5
- docling/backend/msexcel_backend.py +121 -29
- docling/backend/mspowerpoint_backend.py +2 -2
- docling/backend/msword_backend.py +18 -18
- docling/backend/pdf_backend.py +9 -2
- docling/backend/pypdfium2_backend.py +12 -3
- docling/cli/main.py +104 -38
- docling/datamodel/asr_model_specs.py +408 -6
- docling/datamodel/backend_options.py +82 -0
- docling/datamodel/base_models.py +19 -2
- docling/datamodel/document.py +81 -48
- docling/datamodel/pipeline_options_asr_model.py +21 -1
- docling/datamodel/pipeline_options_vlm_model.py +1 -0
- docling/document_converter.py +37 -45
- docling/document_extractor.py +12 -11
- docling/models/api_vlm_model.py +5 -3
- docling/models/picture_description_vlm_model.py +5 -1
- docling/models/readingorder_model.py +6 -7
- docling/models/vlm_models_inline/hf_transformers_model.py +13 -3
- docling/models/vlm_models_inline/mlx_model.py +9 -3
- docling/models/vlm_models_inline/nuextract_transformers_model.py +13 -3
- docling/models/vlm_models_inline/vllm_model.py +42 -8
- docling/pipeline/asr_pipeline.py +149 -6
- docling/utils/api_image_request.py +20 -9
- docling/utils/layout_postprocessor.py +23 -24
- {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/METADATA +11 -8
- {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/RECORD +35 -34
- {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/WHEEL +0 -0
- {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/entry_points.txt +0 -0
- {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/top_level.txt +0 -0
|
docling/backend/msexcel_backend.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from io import BytesIO
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import Any, Optional, Union, cast
|
|
4
|
+
from typing import Annotated, Any, Optional, Union, cast
|
|
5
5
|
|
|
6
6
|
from docling_core.types.doc import (
|
|
7
7
|
BoundingBox,
|
|
@@ -23,7 +23,8 @@ from openpyxl.drawing.image import Image
|
|
|
23
23
|
from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
|
|
24
24
|
from openpyxl.worksheet.worksheet import Worksheet
|
|
25
25
|
from PIL import Image as PILImage
|
|
26
|
-
from pydantic import BaseModel, NonNegativeInt, PositiveInt
|
|
26
|
+
from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt
|
|
27
|
+
from pydantic.dataclasses import dataclass
|
|
27
28
|
from typing_extensions import override
|
|
28
29
|
|
|
29
30
|
from docling.backend.abstract_backend import (
|
|
@@ -36,6 +37,32 @@ from docling.datamodel.document import InputDocument
|
|
|
36
37
|
_log = logging.getLogger(__name__)
|
|
37
38
|
|
|
38
39
|
|
|
40
|
+
@dataclass
class DataRegion:
    """Rectangle that encloses the non-empty cells of a worksheet.

    The four bounds are 1-based indices; ``width`` and ``height`` count both
    the minimum and the maximum index as part of the region.
    """

    # Row bounds of the region.
    min_row: Annotated[
        PositiveInt, Field(description="Smallest row index (1-based index).")
    ]
    max_row: Annotated[
        PositiveInt, Field(description="Largest row index (1-based index).")
    ]
    # Column bounds of the region.
    min_col: Annotated[
        PositiveInt, Field(description="Smallest column index (1-based index).")
    ]
    max_col: Annotated[
        PositiveInt, Field(description="Largest column index (1-based index).")
    ]

    def width(self) -> PositiveInt:
        """Return how many columns the region spans."""
        column_span = self.max_col - self.min_col
        return column_span + 1

    def height(self) -> PositiveInt:
        """Return how many rows the region spans."""
        row_span = self.max_row - self.min_row
        return row_span + 1
|
|
64
|
+
|
|
65
|
+
|
|
39
66
|
class ExcelCell(BaseModel):
|
|
40
67
|
"""Represents an Excel cell.
|
|
41
68
|
|
|
@@ -112,10 +139,14 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
112
139
|
self.workbook = None
|
|
113
140
|
try:
|
|
114
141
|
if isinstance(self.path_or_stream, BytesIO):
|
|
115
|
-
self.workbook = load_workbook(
|
|
142
|
+
self.workbook = load_workbook(
|
|
143
|
+
filename=self.path_or_stream, data_only=True
|
|
144
|
+
)
|
|
116
145
|
|
|
117
146
|
elif isinstance(self.path_or_stream, Path):
|
|
118
|
-
self.workbook = load_workbook(
|
|
147
|
+
self.workbook = load_workbook(
|
|
148
|
+
filename=str(self.path_or_stream), data_only=True
|
|
149
|
+
)
|
|
119
150
|
|
|
120
151
|
self.valid = self.workbook is not None
|
|
121
152
|
except Exception as e:
|
|
@@ -294,6 +325,48 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
294
325
|
|
|
295
326
|
return doc
|
|
296
327
|
|
|
328
|
+
def _find_true_data_bounds(self, sheet: Worksheet) -> DataRegion:
    """Compute the tightest rectangle around a worksheet's actual content.

    Every cell holding a value and every merged cell range contributes to
    the rectangle; the result is the smallest region containing all of them.

    Args:
        sheet: The worksheet to analyze.

    Returns:
        A data region covering all valued cells and merged ranges. When the
        sheet has neither, the region (1, 1, 1, 1) is returned.
    """
    top = None
    left = None
    bottom = 0
    right = 0

    # Walk the worksheet's internal cell mapping rather than a row iterator
    # (presumably so that only cells that actually exist are visited -- TODO confirm).
    valued_cells = (c for c in sheet._cells.values() if c.value is not None)
    for cell in valued_cells:
        row_idx = cell.row
        col_idx = cell.column
        top = row_idx if top is None else min(top, row_idx)
        left = col_idx if left is None else min(left, col_idx)
        bottom = max(bottom, row_idx)
        right = max(right, col_idx)

    # Merged ranges count as content even where their cells carry no value.
    for span in sheet.merged_cells.ranges:
        top = span.min_row if top is None else min(top, span.min_row)
        left = span.min_col if left is None else min(left, span.min_col)
        bottom = max(bottom, span.max_row)
        right = max(right, span.max_col)

    # Nothing found at all: fall back to the single-cell region (1, 1, 1, 1).
    if top is None or left is None:
        return DataRegion(1, 1, 1, 1)

    # Positional order follows the DataRegion fields:
    # min_row, max_row, min_col, max_col.
    return DataRegion(top, bottom, left, right)
|
|
369
|
+
|
|
297
370
|
def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
|
|
298
371
|
"""Find all compact rectangular data tables in an Excel worksheet.
|
|
299
372
|
|
|
@@ -303,18 +376,31 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
303
376
|
Returns:
|
|
304
377
|
A list of ExcelTable objects representing the data tables.
|
|
305
378
|
"""
|
|
379
|
+
bounds: DataRegion = self._find_true_data_bounds(
|
|
380
|
+
sheet
|
|
381
|
+
) # The true data boundaries
|
|
306
382
|
tables: list[ExcelTable] = [] # List to store found tables
|
|
307
383
|
visited: set[tuple[int, int]] = set() # Track already visited cells
|
|
308
384
|
|
|
309
|
-
#
|
|
310
|
-
for ri, row in enumerate(
|
|
311
|
-
|
|
312
|
-
|
|
385
|
+
# Limit scan to actual data bounds
|
|
386
|
+
for ri, row in enumerate(
|
|
387
|
+
sheet.iter_rows(
|
|
388
|
+
min_row=bounds.min_row,
|
|
389
|
+
max_row=bounds.max_row,
|
|
390
|
+
min_col=bounds.min_col,
|
|
391
|
+
max_col=bounds.max_col,
|
|
392
|
+
values_only=False,
|
|
393
|
+
),
|
|
394
|
+
start=bounds.min_row - 1,
|
|
395
|
+
):
|
|
396
|
+
for rj, cell in enumerate(row, start=bounds.min_col - 1):
|
|
313
397
|
if cell.value is None or (ri, rj) in visited:
|
|
314
398
|
continue
|
|
315
399
|
|
|
316
400
|
# If the cell starts a new table, find its bounds
|
|
317
|
-
table_bounds, visited_cells = self._find_table_bounds(
|
|
401
|
+
table_bounds, visited_cells = self._find_table_bounds(
|
|
402
|
+
sheet, ri, rj, bounds.max_row, bounds.max_col
|
|
403
|
+
)
|
|
318
404
|
|
|
319
405
|
visited.update(visited_cells) # Mark these cells as visited
|
|
320
406
|
tables.append(table_bounds)
|
|
@@ -326,6 +412,8 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
326
412
|
sheet: Worksheet,
|
|
327
413
|
start_row: int,
|
|
328
414
|
start_col: int,
|
|
415
|
+
max_row: int,
|
|
416
|
+
max_col: int,
|
|
329
417
|
) -> tuple[ExcelTable, set[tuple[int, int]]]:
|
|
330
418
|
"""Determine the bounds of a compact rectangular table.
|
|
331
419
|
|
|
@@ -333,14 +421,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
333
421
|
sheet: The Excel worksheet to be parsed.
|
|
334
422
|
start_row: The row number of the starting cell.
|
|
335
423
|
start_col: The column number of the starting cell.
|
|
424
|
+
max_row: Maximum row boundary from true data bounds.
|
|
425
|
+
max_col: Maximum column boundary from true data bounds.
|
|
336
426
|
|
|
337
427
|
Returns:
|
|
338
428
|
A tuple with an Excel table and a set of cell coordinates.
|
|
339
429
|
"""
|
|
340
430
|
_log.debug("find_table_bounds")
|
|
341
431
|
|
|
342
|
-
|
|
343
|
-
|
|
432
|
+
table_max_row = self._find_table_bottom(sheet, start_row, start_col, max_row)
|
|
433
|
+
table_max_col = self._find_table_right(sheet, start_row, start_col, max_col)
|
|
344
434
|
|
|
345
435
|
# Collect the data within the bounds
|
|
346
436
|
data = []
|
|
@@ -348,9 +438,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
348
438
|
for ri, row in enumerate(
|
|
349
439
|
sheet.iter_rows(
|
|
350
440
|
min_row=start_row + 1, # start_row is 0-based but iter_rows is 1-based
|
|
351
|
-
max_row=
|
|
441
|
+
max_row=table_max_row + 1,
|
|
352
442
|
min_col=start_col + 1,
|
|
353
|
-
max_col=
|
|
443
|
+
max_col=table_max_col + 1,
|
|
354
444
|
values_only=False,
|
|
355
445
|
),
|
|
356
446
|
start_row,
|
|
@@ -390,15 +480,15 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
390
480
|
return (
|
|
391
481
|
ExcelTable(
|
|
392
482
|
anchor=(start_col, start_row),
|
|
393
|
-
num_rows=
|
|
394
|
-
num_cols=
|
|
483
|
+
num_rows=table_max_row + 1 - start_row,
|
|
484
|
+
num_cols=table_max_col + 1 - start_col,
|
|
395
485
|
data=data,
|
|
396
486
|
),
|
|
397
487
|
visited_cells,
|
|
398
488
|
)
|
|
399
489
|
|
|
400
490
|
def _find_table_bottom(
|
|
401
|
-
self, sheet: Worksheet, start_row: int, start_col: int
|
|
491
|
+
self, sheet: Worksheet, start_row: int, start_col: int, max_row: int
|
|
402
492
|
) -> int:
|
|
403
493
|
"""Find the bottom boundary of a table.
|
|
404
494
|
|
|
@@ -406,16 +496,17 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
406
496
|
sheet: The Excel worksheet to be parsed.
|
|
407
497
|
start_row: The starting row of the table.
|
|
408
498
|
start_col: The starting column of the table.
|
|
499
|
+
max_row: Maximum row boundary from true data bounds.
|
|
409
500
|
|
|
410
501
|
Returns:
|
|
411
502
|
The row index representing the bottom boundary of the table.
|
|
412
503
|
"""
|
|
413
|
-
|
|
504
|
+
table_max_row: int = start_row
|
|
414
505
|
|
|
415
506
|
for ri, (cell,) in enumerate(
|
|
416
507
|
sheet.iter_rows(
|
|
417
508
|
min_row=start_row + 2,
|
|
418
|
-
max_row=
|
|
509
|
+
max_row=max_row,
|
|
419
510
|
min_col=start_col + 1,
|
|
420
511
|
max_col=start_col + 1,
|
|
421
512
|
values_only=False,
|
|
@@ -431,16 +522,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
431
522
|
if cell.value is None and not merged_range:
|
|
432
523
|
break # Stop if the cell is empty and not merged
|
|
433
524
|
|
|
434
|
-
# Expand
|
|
525
|
+
# Expand table_max_row to include the merged range if applicable
|
|
435
526
|
if merged_range:
|
|
436
|
-
|
|
527
|
+
table_max_row = max(table_max_row, merged_range.max_row - 1)
|
|
437
528
|
else:
|
|
438
|
-
|
|
529
|
+
table_max_row = ri
|
|
439
530
|
|
|
440
|
-
return
|
|
531
|
+
return table_max_row
|
|
441
532
|
|
|
442
533
|
def _find_table_right(
|
|
443
|
-
self, sheet: Worksheet, start_row: int, start_col: int
|
|
534
|
+
self, sheet: Worksheet, start_row: int, start_col: int, max_col: int
|
|
444
535
|
) -> int:
|
|
445
536
|
"""Find the right boundary of a table.
|
|
446
537
|
|
|
@@ -448,18 +539,19 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
448
539
|
sheet: The Excel worksheet to be parsed.
|
|
449
540
|
start_row: The starting row of the table.
|
|
450
541
|
start_col: The starting column of the table.
|
|
542
|
+
max_col: The actual max column of the table.
|
|
451
543
|
|
|
452
544
|
Returns:
|
|
453
545
|
The column index representing the right boundary of the table."
|
|
454
546
|
"""
|
|
455
|
-
|
|
547
|
+
table_max_col: int = start_col
|
|
456
548
|
|
|
457
549
|
for rj, (cell,) in enumerate(
|
|
458
550
|
sheet.iter_cols(
|
|
459
551
|
min_row=start_row + 1,
|
|
460
552
|
max_row=start_row + 1,
|
|
461
553
|
min_col=start_col + 2,
|
|
462
|
-
max_col=
|
|
554
|
+
max_col=max_col,
|
|
463
555
|
values_only=False,
|
|
464
556
|
),
|
|
465
557
|
start_col + 1,
|
|
@@ -473,13 +565,13 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
473
565
|
if cell.value is None and not merged_range:
|
|
474
566
|
break # Stop if the cell is empty and not merged
|
|
475
567
|
|
|
476
|
-
# Expand
|
|
568
|
+
# Expand table_max_col to include the merged range if applicable
|
|
477
569
|
if merged_range:
|
|
478
|
-
|
|
570
|
+
table_max_col = max(table_max_col, merged_range.max_col - 1)
|
|
479
571
|
else:
|
|
480
|
-
|
|
572
|
+
table_max_col = rj
|
|
481
573
|
|
|
482
|
-
return
|
|
574
|
+
return table_max_col
|
|
483
575
|
|
|
484
576
|
def _find_images_in_sheet(
|
|
485
577
|
self, doc: DoclingDocument, sheet: Worksheet
|
|
docling/backend/mspowerpoint_backend.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from io import BytesIO
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import
|
|
4
|
+
from typing import Union
|
|
5
5
|
|
|
6
6
|
from docling_core.types.doc import (
|
|
7
7
|
BoundingBox,
|
|
@@ -80,7 +80,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
|
80
80
|
self.path_or_stream = None
|
|
81
81
|
|
|
82
82
|
@classmethod
|
|
83
|
-
def supported_formats(cls) ->
|
|
83
|
+
def supported_formats(cls) -> set[InputFormat]:
|
|
84
84
|
return {InputFormat.PPTX}
|
|
85
85
|
|
|
86
86
|
def convert(self) -> DoclingDocument:
|
|
docling/backend/msword_backend.py
CHANGED
|
@@ -3,7 +3,7 @@ import re
|
|
|
3
3
|
from copy import deepcopy
|
|
4
4
|
from io import BytesIO
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import Any, Callable,
|
|
6
|
+
from typing import Any, Callable, Optional, Union
|
|
7
7
|
|
|
8
8
|
from docling_core.types.doc import (
|
|
9
9
|
DocItemLabel,
|
|
@@ -69,7 +69,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
69
69
|
self.numbered_headers: dict[int, int] = {}
|
|
70
70
|
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
|
71
71
|
# Track processed textbox elements to avoid duplication
|
|
72
|
-
self.processed_textbox_elements:
|
|
72
|
+
self.processed_textbox_elements: list[int] = []
|
|
73
73
|
self.docx_to_pdf_converter: Optional[Callable] = None
|
|
74
74
|
self.docx_to_pdf_converter_init = False
|
|
75
75
|
self.display_drawingml_warning = True
|
|
@@ -726,8 +726,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
726
726
|
textbox_elements: list,
|
|
727
727
|
docx_obj: DocxDocument,
|
|
728
728
|
doc: DoclingDocument,
|
|
729
|
-
) ->
|
|
730
|
-
elem_ref:
|
|
729
|
+
) -> list[RefItem]:
|
|
730
|
+
elem_ref: list[RefItem] = []
|
|
731
731
|
"""Process textbox content and add it to the document structure."""
|
|
732
732
|
level = self._get_level()
|
|
733
733
|
# Create a textbox group to contain all text from the textbox
|
|
@@ -856,8 +856,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
856
856
|
element: BaseOxmlElement,
|
|
857
857
|
docx_obj: DocxDocument,
|
|
858
858
|
doc: DoclingDocument,
|
|
859
|
-
) ->
|
|
860
|
-
elem_ref:
|
|
859
|
+
) -> list[RefItem]:
|
|
860
|
+
elem_ref: list[RefItem] = []
|
|
861
861
|
paragraph = Paragraph(element, docx_obj)
|
|
862
862
|
paragraph_elements = self._get_paragraph_elements(paragraph)
|
|
863
863
|
text, equations = self._handle_equations_in_text(
|
|
@@ -1032,8 +1032,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1032
1032
|
curr_level: Optional[int],
|
|
1033
1033
|
text: str,
|
|
1034
1034
|
is_numbered_style: bool = False,
|
|
1035
|
-
) ->
|
|
1036
|
-
elem_ref:
|
|
1035
|
+
) -> list[RefItem]:
|
|
1036
|
+
elem_ref: list[RefItem] = []
|
|
1037
1037
|
level = self._get_level()
|
|
1038
1038
|
if isinstance(curr_level, int):
|
|
1039
1039
|
if curr_level > level:
|
|
@@ -1102,8 +1102,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1102
1102
|
marker: str,
|
|
1103
1103
|
enumerated: bool,
|
|
1104
1104
|
level: int,
|
|
1105
|
-
) ->
|
|
1106
|
-
elem_ref:
|
|
1105
|
+
) -> list[RefItem]:
|
|
1106
|
+
elem_ref: list[RefItem] = []
|
|
1107
1107
|
# This should not happen by construction
|
|
1108
1108
|
if not isinstance(self.parents[level], ListGroup):
|
|
1109
1109
|
return elem_ref
|
|
@@ -1148,8 +1148,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1148
1148
|
ilevel: int,
|
|
1149
1149
|
elements: list,
|
|
1150
1150
|
is_numbered: bool = False,
|
|
1151
|
-
) ->
|
|
1152
|
-
elem_ref:
|
|
1151
|
+
) -> list[RefItem]:
|
|
1152
|
+
elem_ref: list[RefItem] = []
|
|
1153
1153
|
# this method is always called with is_numbered. Numbered lists should be properly addressed.
|
|
1154
1154
|
if not elements:
|
|
1155
1155
|
return elem_ref
|
|
@@ -1244,8 +1244,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1244
1244
|
element: BaseOxmlElement,
|
|
1245
1245
|
docx_obj: DocxDocument,
|
|
1246
1246
|
doc: DoclingDocument,
|
|
1247
|
-
) ->
|
|
1248
|
-
elem_ref:
|
|
1247
|
+
) -> list[RefItem]:
|
|
1248
|
+
elem_ref: list[RefItem] = []
|
|
1249
1249
|
table: Table = Table(element, docx_obj)
|
|
1250
1250
|
num_rows = len(table.rows)
|
|
1251
1251
|
num_cols = len(table.columns)
|
|
@@ -1299,13 +1299,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1299
1299
|
else:
|
|
1300
1300
|
text = text.replace("<eq>", "$").replace("</eq>", "$")
|
|
1301
1301
|
|
|
1302
|
-
provs_in_cell:
|
|
1302
|
+
provs_in_cell: list[RefItem] = []
|
|
1303
1303
|
_, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
|
|
1304
1304
|
ref_for_rich_cell = provs_in_cell[0]
|
|
1305
1305
|
rich_table_cell = False
|
|
1306
1306
|
|
|
1307
1307
|
def group_cell_elements(
|
|
1308
|
-
group_name: str, doc: DoclingDocument, provs_in_cell:
|
|
1308
|
+
group_name: str, doc: DoclingDocument, provs_in_cell: list[RefItem]
|
|
1309
1309
|
) -> RefItem:
|
|
1310
1310
|
group_element = doc.add_group(
|
|
1311
1311
|
label=GroupLabel.UNSPECIFIED,
|
|
@@ -1379,7 +1379,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1379
1379
|
|
|
1380
1380
|
def _handle_pictures(
|
|
1381
1381
|
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
|
|
1382
|
-
) ->
|
|
1382
|
+
) -> list[RefItem]:
|
|
1383
1383
|
def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
|
|
1384
1384
|
image_data: Optional[bytes] = None
|
|
1385
1385
|
rId = drawing_blip[0].get(
|
|
@@ -1391,7 +1391,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1391
1391
|
image_data = image_part.blob # Get the binary image data
|
|
1392
1392
|
return image_data
|
|
1393
1393
|
|
|
1394
|
-
elem_ref:
|
|
1394
|
+
elem_ref: list[RefItem] = []
|
|
1395
1395
|
level = self._get_level()
|
|
1396
1396
|
# Open the BytesIO object with PIL to create an Image
|
|
1397
1397
|
image_data: Optional[bytes] = get_docx_image(drawing_blip)
|
docling/backend/pdf_backend.py
CHANGED
|
@@ -9,6 +9,7 @@ from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
|
|
9
9
|
from PIL import Image
|
|
10
10
|
|
|
11
11
|
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
|
12
|
+
from docling.datamodel.backend_options import PdfBackendOptions
|
|
12
13
|
from docling.datamodel.base_models import InputFormat
|
|
13
14
|
from docling.datamodel.document import InputDocument
|
|
14
15
|
|
|
@@ -50,8 +51,14 @@ class PdfPageBackend(ABC):
|
|
|
50
51
|
|
|
51
52
|
|
|
52
53
|
class PdfDocumentBackend(PaginatedDocumentBackend):
|
|
53
|
-
def __init__(
|
|
54
|
-
|
|
54
|
+
def __init__(
|
|
55
|
+
self,
|
|
56
|
+
in_doc: InputDocument,
|
|
57
|
+
path_or_stream: Union[BytesIO, Path],
|
|
58
|
+
options: PdfBackendOptions = PdfBackendOptions(),
|
|
59
|
+
):
|
|
60
|
+
super().__init__(in_doc, path_or_stream, options)
|
|
61
|
+
self.options: PdfBackendOptions
|
|
55
62
|
|
|
56
63
|
if self.input_format is not InputFormat.PDF:
|
|
57
64
|
if self.input_format is InputFormat.IMAGE:
|
|
docling/backend/pypdfium2_backend.py
CHANGED
|
@@ -20,6 +20,7 @@ from pypdfium2 import PdfTextPage
|
|
|
20
20
|
from pypdfium2._helpers.misc import PdfiumError
|
|
21
21
|
|
|
22
22
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
|
23
|
+
from docling.datamodel.backend_options import PdfBackendOptions
|
|
23
24
|
from docling.utils.locks import pypdfium2_lock
|
|
24
25
|
|
|
25
26
|
|
|
@@ -370,12 +371,20 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
|
370
371
|
|
|
371
372
|
|
|
372
373
|
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
|
373
|
-
def __init__(
|
|
374
|
-
|
|
374
|
+
def __init__(
|
|
375
|
+
self,
|
|
376
|
+
in_doc: "InputDocument",
|
|
377
|
+
path_or_stream: Union[BytesIO, Path],
|
|
378
|
+
options: PdfBackendOptions = PdfBackendOptions(),
|
|
379
|
+
):
|
|
380
|
+
super().__init__(in_doc, path_or_stream, options)
|
|
375
381
|
|
|
382
|
+
password = (
|
|
383
|
+
self.options.password.get_secret_value() if self.options.password else None
|
|
384
|
+
)
|
|
376
385
|
try:
|
|
377
386
|
with pypdfium2_lock:
|
|
378
|
-
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
|
387
|
+
self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
|
|
379
388
|
except PdfiumError as e:
|
|
380
389
|
raise RuntimeError(
|
|
381
390
|
f"pypdfium could not load document with hash {self.document_hash}"
|