docling 2.56.1__py3-none-any.whl → 2.58.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/backend/abstract_backend.py +24 -3
- docling/backend/asciidoc_backend.py +3 -3
- docling/backend/docling_parse_v4_backend.py +15 -4
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/html_backend.py +130 -20
- docling/backend/md_backend.py +27 -5
- docling/backend/msexcel_backend.py +115 -27
- docling/backend/mspowerpoint_backend.py +2 -2
- docling/backend/msword_backend.py +104 -29
- docling/backend/pdf_backend.py +9 -2
- docling/backend/pypdfium2_backend.py +12 -3
- docling/cli/main.py +85 -30
- docling/datamodel/asr_model_specs.py +408 -6
- docling/datamodel/backend_options.py +82 -0
- docling/datamodel/base_models.py +17 -2
- docling/datamodel/document.py +81 -48
- docling/datamodel/pipeline_options_asr_model.py +21 -1
- docling/document_converter.py +37 -45
- docling/document_extractor.py +12 -11
- docling/models/readingorder_model.py +6 -7
- docling/pipeline/asr_pipeline.py +139 -3
- docling/pipeline/vlm_pipeline.py +53 -33
- docling/utils/api_image_request.py +4 -4
- docling/utils/layout_postprocessor.py +23 -24
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/METADATA +4 -2
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/RECORD +30 -28
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/WHEEL +0 -0
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/entry_points.txt +0 -0
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from io import BytesIO
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import Any, Optional, Union, cast
|
|
4
|
+
from typing import Annotated, Any, Optional, Union, cast
|
|
5
5
|
|
|
6
6
|
from docling_core.types.doc import (
|
|
7
7
|
BoundingBox,
|
|
@@ -23,7 +23,8 @@ from openpyxl.drawing.image import Image
|
|
|
23
23
|
from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
|
|
24
24
|
from openpyxl.worksheet.worksheet import Worksheet
|
|
25
25
|
from PIL import Image as PILImage
|
|
26
|
-
from pydantic import BaseModel, NonNegativeInt, PositiveInt
|
|
26
|
+
from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt
|
|
27
|
+
from pydantic.dataclasses import dataclass
|
|
27
28
|
from typing_extensions import override
|
|
28
29
|
|
|
29
30
|
from docling.backend.abstract_backend import (
|
|
@@ -36,6 +37,32 @@ from docling.datamodel.document import InputDocument
|
|
|
36
37
|
_log = logging.getLogger(__name__)
|
|
37
38
|
|
|
38
39
|
|
|
40
|
+
@dataclass
class DataRegion:
    """Rectangular block of worksheet cells that contain data.

    The rectangle is described by its first and last row and column. All four
    indices are 1-based, and both edges count as part of the region.
    """

    # First row of the region.
    min_row: Annotated[
        PositiveInt, Field(description="Smallest row index (1-based index).")
    ]
    # Last row of the region.
    max_row: Annotated[
        PositiveInt, Field(description="Largest row index (1-based index).")
    ]
    # First column of the region.
    min_col: Annotated[
        PositiveInt, Field(description="Smallest column index (1-based index).")
    ]
    # Last column of the region.
    max_col: Annotated[
        PositiveInt, Field(description="Largest column index (1-based index).")
    ]

    def width(self) -> PositiveInt:
        """Return how many columns the region spans, both edges included."""
        first_col, last_col = self.min_col, self.max_col
        return last_col - first_col + 1

    def height(self) -> PositiveInt:
        """Return how many rows the region spans, both edges included."""
        first_row, last_row = self.min_row, self.max_row
        return last_row - first_row + 1
|
|
64
|
+
|
|
65
|
+
|
|
39
66
|
class ExcelCell(BaseModel):
|
|
40
67
|
"""Represents an Excel cell.
|
|
41
68
|
|
|
@@ -294,6 +321,48 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
294
321
|
|
|
295
322
|
return doc
|
|
296
323
|
|
|
324
|
+
def _find_true_data_bounds(self, sheet: Worksheet) -> DataRegion:
    """Compute the tightest rectangle that encloses a worksheet's content.

    Every cell holding a value and every merged cell range contributes to
    the rectangle; the result is the smallest region that contains them all.

    Args:
        sheet: The worksheet to analyze.

    Returns:
        The data region covering all valued cells and merged ranges. When the
        sheet has neither, the single-cell region (1, 1, 1, 1) is returned.
    """
    top: Optional[int] = None
    left: Optional[int] = None
    bottom = right = 0

    def absorb(row_lo: int, row_hi: int, col_lo: int, col_hi: int) -> None:
        # Grow the running rectangle so it also covers the given span.
        nonlocal top, left, bottom, right
        top = row_lo if top is None else min(top, row_lo)
        left = col_lo if left is None else min(left, col_lo)
        bottom = max(bottom, row_hi)
        right = max(right, col_hi)

    # Walk the worksheet's private cell mapping directly.
    # NOTE(review): presumably this holds only cells openpyxl has
    # materialized, which avoids visiting the full grid -- confirm.
    for cell in sheet._cells.values():
        if cell.value is None:
            continue
        row, col = cell.row, cell.column
        absorb(row, row, col, col)

    # Merged ranges count as content even where their cells carry no value.
    for span in sheet.merged_cells.ranges:
        absorb(span.min_row, span.max_row, span.min_col, span.max_col)

    # Nothing was absorbed: fall back to the top-left cell.
    if top is None or left is None:
        return DataRegion(1, 1, 1, 1)

    return DataRegion(top, bottom, left, right)
|
|
365
|
+
|
|
297
366
|
def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
|
|
298
367
|
"""Find all compact rectangular data tables in an Excel worksheet.
|
|
299
368
|
|
|
@@ -303,18 +372,31 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
303
372
|
Returns:
|
|
304
373
|
A list of ExcelTable objects representing the data tables.
|
|
305
374
|
"""
|
|
375
|
+
bounds: DataRegion = self._find_true_data_bounds(
|
|
376
|
+
sheet
|
|
377
|
+
) # The true data boundaries
|
|
306
378
|
tables: list[ExcelTable] = [] # List to store found tables
|
|
307
379
|
visited: set[tuple[int, int]] = set() # Track already visited cells
|
|
308
380
|
|
|
309
|
-
#
|
|
310
|
-
for ri, row in enumerate(
|
|
311
|
-
|
|
312
|
-
|
|
381
|
+
# Limit scan to actual data bounds
|
|
382
|
+
for ri, row in enumerate(
|
|
383
|
+
sheet.iter_rows(
|
|
384
|
+
min_row=bounds.min_row,
|
|
385
|
+
max_row=bounds.max_row,
|
|
386
|
+
min_col=bounds.min_col,
|
|
387
|
+
max_col=bounds.max_col,
|
|
388
|
+
values_only=False,
|
|
389
|
+
),
|
|
390
|
+
start=bounds.min_row - 1,
|
|
391
|
+
):
|
|
392
|
+
for rj, cell in enumerate(row, start=bounds.min_col - 1):
|
|
313
393
|
if cell.value is None or (ri, rj) in visited:
|
|
314
394
|
continue
|
|
315
395
|
|
|
316
396
|
# If the cell starts a new table, find its bounds
|
|
317
|
-
table_bounds, visited_cells = self._find_table_bounds(
|
|
397
|
+
table_bounds, visited_cells = self._find_table_bounds(
|
|
398
|
+
sheet, ri, rj, bounds.max_row, bounds.max_col
|
|
399
|
+
)
|
|
318
400
|
|
|
319
401
|
visited.update(visited_cells) # Mark these cells as visited
|
|
320
402
|
tables.append(table_bounds)
|
|
@@ -326,6 +408,8 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
326
408
|
sheet: Worksheet,
|
|
327
409
|
start_row: int,
|
|
328
410
|
start_col: int,
|
|
411
|
+
max_row: int,
|
|
412
|
+
max_col: int,
|
|
329
413
|
) -> tuple[ExcelTable, set[tuple[int, int]]]:
|
|
330
414
|
"""Determine the bounds of a compact rectangular table.
|
|
331
415
|
|
|
@@ -333,14 +417,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
333
417
|
sheet: The Excel worksheet to be parsed.
|
|
334
418
|
start_row: The row number of the starting cell.
|
|
335
419
|
start_col: The column number of the starting cell.
|
|
420
|
+
max_row: Maximum row boundary from true data bounds.
|
|
421
|
+
max_col: Maximum column boundary from true data bounds.
|
|
336
422
|
|
|
337
423
|
Returns:
|
|
338
424
|
A tuple with an Excel table and a set of cell coordinates.
|
|
339
425
|
"""
|
|
340
426
|
_log.debug("find_table_bounds")
|
|
341
427
|
|
|
342
|
-
|
|
343
|
-
|
|
428
|
+
table_max_row = self._find_table_bottom(sheet, start_row, start_col, max_row)
|
|
429
|
+
table_max_col = self._find_table_right(sheet, start_row, start_col, max_col)
|
|
344
430
|
|
|
345
431
|
# Collect the data within the bounds
|
|
346
432
|
data = []
|
|
@@ -348,9 +434,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
348
434
|
for ri, row in enumerate(
|
|
349
435
|
sheet.iter_rows(
|
|
350
436
|
min_row=start_row + 1, # start_row is 0-based but iter_rows is 1-based
|
|
351
|
-
max_row=
|
|
437
|
+
max_row=table_max_row + 1,
|
|
352
438
|
min_col=start_col + 1,
|
|
353
|
-
max_col=
|
|
439
|
+
max_col=table_max_col + 1,
|
|
354
440
|
values_only=False,
|
|
355
441
|
),
|
|
356
442
|
start_row,
|
|
@@ -390,15 +476,15 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
390
476
|
return (
|
|
391
477
|
ExcelTable(
|
|
392
478
|
anchor=(start_col, start_row),
|
|
393
|
-
num_rows=
|
|
394
|
-
num_cols=
|
|
479
|
+
num_rows=table_max_row + 1 - start_row,
|
|
480
|
+
num_cols=table_max_col + 1 - start_col,
|
|
395
481
|
data=data,
|
|
396
482
|
),
|
|
397
483
|
visited_cells,
|
|
398
484
|
)
|
|
399
485
|
|
|
400
486
|
def _find_table_bottom(
|
|
401
|
-
self, sheet: Worksheet, start_row: int, start_col: int
|
|
487
|
+
self, sheet: Worksheet, start_row: int, start_col: int, max_row: int
|
|
402
488
|
) -> int:
|
|
403
489
|
"""Find the bottom boundary of a table.
|
|
404
490
|
|
|
@@ -406,16 +492,17 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
406
492
|
sheet: The Excel worksheet to be parsed.
|
|
407
493
|
start_row: The starting row of the table.
|
|
408
494
|
start_col: The starting column of the table.
|
|
495
|
+
max_row: Maximum row boundary from true data bounds.
|
|
409
496
|
|
|
410
497
|
Returns:
|
|
411
498
|
The row index representing the bottom boundary of the table.
|
|
412
499
|
"""
|
|
413
|
-
|
|
500
|
+
table_max_row: int = start_row
|
|
414
501
|
|
|
415
502
|
for ri, (cell,) in enumerate(
|
|
416
503
|
sheet.iter_rows(
|
|
417
504
|
min_row=start_row + 2,
|
|
418
|
-
max_row=
|
|
505
|
+
max_row=max_row,
|
|
419
506
|
min_col=start_col + 1,
|
|
420
507
|
max_col=start_col + 1,
|
|
421
508
|
values_only=False,
|
|
@@ -431,16 +518,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
431
518
|
if cell.value is None and not merged_range:
|
|
432
519
|
break # Stop if the cell is empty and not merged
|
|
433
520
|
|
|
434
|
-
# Expand
|
|
521
|
+
# Expand table_max_row to include the merged range if applicable
|
|
435
522
|
if merged_range:
|
|
436
|
-
|
|
523
|
+
table_max_row = max(table_max_row, merged_range.max_row - 1)
|
|
437
524
|
else:
|
|
438
|
-
|
|
525
|
+
table_max_row = ri
|
|
439
526
|
|
|
440
|
-
return
|
|
527
|
+
return table_max_row
|
|
441
528
|
|
|
442
529
|
def _find_table_right(
|
|
443
|
-
self, sheet: Worksheet, start_row: int, start_col: int
|
|
530
|
+
self, sheet: Worksheet, start_row: int, start_col: int, max_col: int
|
|
444
531
|
) -> int:
|
|
445
532
|
"""Find the right boundary of a table.
|
|
446
533
|
|
|
@@ -448,18 +535,19 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
448
535
|
sheet: The Excel worksheet to be parsed.
|
|
449
536
|
start_row: The starting row of the table.
|
|
450
537
|
start_col: The starting column of the table.
|
|
538
|
+
max_col: The actual max column of the table.
|
|
451
539
|
|
|
452
540
|
Returns:
|
|
453
541
|
The column index representing the right boundary of the table."
|
|
454
542
|
"""
|
|
455
|
-
|
|
543
|
+
table_max_col: int = start_col
|
|
456
544
|
|
|
457
545
|
for rj, (cell,) in enumerate(
|
|
458
546
|
sheet.iter_cols(
|
|
459
547
|
min_row=start_row + 1,
|
|
460
548
|
max_row=start_row + 1,
|
|
461
549
|
min_col=start_col + 2,
|
|
462
|
-
max_col=
|
|
550
|
+
max_col=max_col,
|
|
463
551
|
values_only=False,
|
|
464
552
|
),
|
|
465
553
|
start_col + 1,
|
|
@@ -473,13 +561,13 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
|
473
561
|
if cell.value is None and not merged_range:
|
|
474
562
|
break # Stop if the cell is empty and not merged
|
|
475
563
|
|
|
476
|
-
# Expand
|
|
564
|
+
# Expand table_max_col to include the merged range if applicable
|
|
477
565
|
if merged_range:
|
|
478
|
-
|
|
566
|
+
table_max_col = max(table_max_col, merged_range.max_col - 1)
|
|
479
567
|
else:
|
|
480
|
-
|
|
568
|
+
table_max_col = rj
|
|
481
569
|
|
|
482
|
-
return
|
|
570
|
+
return table_max_col
|
|
483
571
|
|
|
484
572
|
def _find_images_in_sheet(
|
|
485
573
|
self, doc: DoclingDocument, sheet: Worksheet
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from io import BytesIO
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import
|
|
4
|
+
from typing import Union
|
|
5
5
|
|
|
6
6
|
from docling_core.types.doc import (
|
|
7
7
|
BoundingBox,
|
|
@@ -80,7 +80,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
|
80
80
|
self.path_or_stream = None
|
|
81
81
|
|
|
82
82
|
@classmethod
|
|
83
|
-
def supported_formats(cls) ->
|
|
83
|
+
def supported_formats(cls) -> set[InputFormat]:
|
|
84
84
|
return {InputFormat.PPTX}
|
|
85
85
|
|
|
86
86
|
def convert(self) -> DoclingDocument:
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import re
|
|
3
|
+
from copy import deepcopy
|
|
3
4
|
from io import BytesIO
|
|
4
5
|
from pathlib import Path
|
|
5
|
-
from typing import Any,
|
|
6
|
+
from typing import Any, Callable, Optional, Union
|
|
6
7
|
|
|
7
8
|
from docling_core.types.doc import (
|
|
8
9
|
DocItemLabel,
|
|
@@ -33,6 +34,11 @@ from pydantic import AnyUrl
|
|
|
33
34
|
from typing_extensions import override
|
|
34
35
|
|
|
35
36
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
|
37
|
+
from docling.backend.docx.drawingml.utils import (
|
|
38
|
+
get_docx_to_pdf_converter,
|
|
39
|
+
get_libreoffice_cmd,
|
|
40
|
+
get_pil_from_dml_docx,
|
|
41
|
+
)
|
|
36
42
|
from docling.backend.docx.latex.omml import oMath2Latex
|
|
37
43
|
from docling.datamodel.base_models import InputFormat
|
|
38
44
|
from docling.datamodel.document import InputDocument
|
|
@@ -63,7 +69,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
63
69
|
self.numbered_headers: dict[int, int] = {}
|
|
64
70
|
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
|
65
71
|
# Track processed textbox elements to avoid duplication
|
|
66
|
-
self.processed_textbox_elements:
|
|
72
|
+
self.processed_textbox_elements: list[int] = []
|
|
73
|
+
self.docx_to_pdf_converter: Optional[Callable] = None
|
|
74
|
+
self.docx_to_pdf_converter_init = False
|
|
75
|
+
self.display_drawingml_warning = True
|
|
67
76
|
|
|
68
77
|
for i in range(-1, self.max_levels):
|
|
69
78
|
self.parents[i] = None
|
|
@@ -80,18 +89,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
80
89
|
"indents": [None],
|
|
81
90
|
}
|
|
82
91
|
|
|
83
|
-
self.docx_obj =
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
elif isinstance(self.path_or_stream, Path):
|
|
88
|
-
self.docx_obj = Document(str(self.path_or_stream))
|
|
89
|
-
|
|
92
|
+
self.docx_obj = self.load_msword_file(
|
|
93
|
+
path_or_stream=self.path_or_stream, document_hash=self.document_hash
|
|
94
|
+
)
|
|
95
|
+
if self.docx_obj:
|
|
90
96
|
self.valid = True
|
|
91
|
-
except Exception as e:
|
|
92
|
-
raise RuntimeError(
|
|
93
|
-
f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
|
|
94
|
-
) from e
|
|
95
97
|
|
|
96
98
|
@override
|
|
97
99
|
def is_valid(self) -> bool:
|
|
@@ -139,6 +141,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
139
141
|
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
|
140
142
|
)
|
|
141
143
|
|
|
144
|
+
@staticmethod
def load_msword_file(
    path_or_stream: Union[BytesIO, Path], document_hash: str
) -> Optional[DocxDocument]:
    """Open a Word document from an in-memory stream or a filesystem path.

    Args:
        path_or_stream: The document source. A ``BytesIO`` is handed to
            ``Document`` as-is; a ``Path`` is converted to ``str`` first.
        document_hash: Identifier of the document, used only in the error
            message when loading fails.

    Returns:
        The loaded document, or ``None`` when ``path_or_stream`` is neither a
        ``BytesIO`` nor a ``Path``.

    Raises:
        RuntimeError: If ``Document`` raises while loading; the original
            exception is chained as the cause.
    """
    try:
        if isinstance(path_or_stream, BytesIO):
            return Document(path_or_stream)
        if isinstance(path_or_stream, Path):
            return Document(str(path_or_stream))
    except Exception as e:
        raise RuntimeError(
            f"MsWordDocumentBackend could not load document with hash {document_hash}"
        ) from e

    # Unsupported source type: callers must be prepared for a missing document.
    return None
|
|
159
|
+
|
|
142
160
|
def _update_history(
|
|
143
161
|
self,
|
|
144
162
|
name: str,
|
|
@@ -195,6 +213,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
195
213
|
}
|
|
196
214
|
xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
|
|
197
215
|
drawing_blip = xpath_expr(element)
|
|
216
|
+
drawingml_els = element.findall(".//w:drawing", namespaces=namespaces)
|
|
198
217
|
|
|
199
218
|
# Check for textbox content - check multiple textbox formats
|
|
200
219
|
# Only process if the element hasn't been processed before
|
|
@@ -274,6 +293,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
274
293
|
):
|
|
275
294
|
te1 = self._handle_text_elements(element, docx_obj, doc)
|
|
276
295
|
added_elements.extend(te1)
|
|
296
|
+
# Check for DrawingML elements
|
|
297
|
+
elif drawingml_els:
|
|
298
|
+
if (
|
|
299
|
+
self.docx_to_pdf_converter is None
|
|
300
|
+
and self.docx_to_pdf_converter_init is False
|
|
301
|
+
):
|
|
302
|
+
self.docx_to_pdf_converter = get_docx_to_pdf_converter()
|
|
303
|
+
self.docx_to_pdf_converter_init = True
|
|
304
|
+
|
|
305
|
+
if self.docx_to_pdf_converter is None:
|
|
306
|
+
if self.display_drawingml_warning:
|
|
307
|
+
if self.docx_to_pdf_converter is None:
|
|
308
|
+
_log.warning(
|
|
309
|
+
"Found DrawingML elements in document, but no DOCX to PDF converters. "
|
|
310
|
+
"If you want these exported, make sure you have "
|
|
311
|
+
"LibreOffice binary in PATH or specify its path with DOCLING_LIBREOFFICE_CMD."
|
|
312
|
+
)
|
|
313
|
+
self.display_drawingml_warning = False
|
|
314
|
+
else:
|
|
315
|
+
self._handle_drawingml(doc=doc, drawingml_els=drawingml_els)
|
|
277
316
|
# Check for the sdt containers, like table of contents
|
|
278
317
|
elif tag_name in ["sdt"]:
|
|
279
318
|
sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
|
|
@@ -687,8 +726,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
687
726
|
textbox_elements: list,
|
|
688
727
|
docx_obj: DocxDocument,
|
|
689
728
|
doc: DoclingDocument,
|
|
690
|
-
) ->
|
|
691
|
-
elem_ref:
|
|
729
|
+
) -> list[RefItem]:
|
|
730
|
+
elem_ref: list[RefItem] = []
|
|
692
731
|
"""Process textbox content and add it to the document structure."""
|
|
693
732
|
level = self._get_level()
|
|
694
733
|
# Create a textbox group to contain all text from the textbox
|
|
@@ -817,8 +856,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
817
856
|
element: BaseOxmlElement,
|
|
818
857
|
docx_obj: DocxDocument,
|
|
819
858
|
doc: DoclingDocument,
|
|
820
|
-
) ->
|
|
821
|
-
elem_ref:
|
|
859
|
+
) -> list[RefItem]:
|
|
860
|
+
elem_ref: list[RefItem] = []
|
|
822
861
|
paragraph = Paragraph(element, docx_obj)
|
|
823
862
|
paragraph_elements = self._get_paragraph_elements(paragraph)
|
|
824
863
|
text, equations = self._handle_equations_in_text(
|
|
@@ -993,8 +1032,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
993
1032
|
curr_level: Optional[int],
|
|
994
1033
|
text: str,
|
|
995
1034
|
is_numbered_style: bool = False,
|
|
996
|
-
) ->
|
|
997
|
-
elem_ref:
|
|
1035
|
+
) -> list[RefItem]:
|
|
1036
|
+
elem_ref: list[RefItem] = []
|
|
998
1037
|
level = self._get_level()
|
|
999
1038
|
if isinstance(curr_level, int):
|
|
1000
1039
|
if curr_level > level:
|
|
@@ -1063,8 +1102,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1063
1102
|
marker: str,
|
|
1064
1103
|
enumerated: bool,
|
|
1065
1104
|
level: int,
|
|
1066
|
-
) ->
|
|
1067
|
-
elem_ref:
|
|
1105
|
+
) -> list[RefItem]:
|
|
1106
|
+
elem_ref: list[RefItem] = []
|
|
1068
1107
|
# This should not happen by construction
|
|
1069
1108
|
if not isinstance(self.parents[level], ListGroup):
|
|
1070
1109
|
return elem_ref
|
|
@@ -1109,8 +1148,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1109
1148
|
ilevel: int,
|
|
1110
1149
|
elements: list,
|
|
1111
1150
|
is_numbered: bool = False,
|
|
1112
|
-
) ->
|
|
1113
|
-
elem_ref:
|
|
1151
|
+
) -> list[RefItem]:
|
|
1152
|
+
elem_ref: list[RefItem] = []
|
|
1114
1153
|
# this method is always called with is_numbered. Numbered lists should be properly addressed.
|
|
1115
1154
|
if not elements:
|
|
1116
1155
|
return elem_ref
|
|
@@ -1205,8 +1244,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1205
1244
|
element: BaseOxmlElement,
|
|
1206
1245
|
docx_obj: DocxDocument,
|
|
1207
1246
|
doc: DoclingDocument,
|
|
1208
|
-
) ->
|
|
1209
|
-
elem_ref:
|
|
1247
|
+
) -> list[RefItem]:
|
|
1248
|
+
elem_ref: list[RefItem] = []
|
|
1210
1249
|
table: Table = Table(element, docx_obj)
|
|
1211
1250
|
num_rows = len(table.rows)
|
|
1212
1251
|
num_cols = len(table.columns)
|
|
@@ -1260,13 +1299,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1260
1299
|
else:
|
|
1261
1300
|
text = text.replace("<eq>", "$").replace("</eq>", "$")
|
|
1262
1301
|
|
|
1263
|
-
provs_in_cell:
|
|
1302
|
+
provs_in_cell: list[RefItem] = []
|
|
1264
1303
|
_, provs_in_cell = self._walk_linear(cell._element, docx_obj, doc)
|
|
1265
1304
|
ref_for_rich_cell = provs_in_cell[0]
|
|
1266
1305
|
rich_table_cell = False
|
|
1267
1306
|
|
|
1268
1307
|
def group_cell_elements(
|
|
1269
|
-
group_name: str, doc: DoclingDocument, provs_in_cell:
|
|
1308
|
+
group_name: str, doc: DoclingDocument, provs_in_cell: list[RefItem]
|
|
1270
1309
|
) -> RefItem:
|
|
1271
1310
|
group_element = doc.add_group(
|
|
1272
1311
|
label=GroupLabel.UNSPECIFIED,
|
|
@@ -1340,7 +1379,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1340
1379
|
|
|
1341
1380
|
def _handle_pictures(
|
|
1342
1381
|
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
|
|
1343
|
-
) ->
|
|
1382
|
+
) -> list[RefItem]:
|
|
1344
1383
|
def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
|
|
1345
1384
|
image_data: Optional[bytes] = None
|
|
1346
1385
|
rId = drawing_blip[0].get(
|
|
@@ -1352,7 +1391,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1352
1391
|
image_data = image_part.blob # Get the binary image data
|
|
1353
1392
|
return image_data
|
|
1354
1393
|
|
|
1355
|
-
elem_ref:
|
|
1394
|
+
elem_ref: list[RefItem] = []
|
|
1356
1395
|
level = self._get_level()
|
|
1357
1396
|
# Open the BytesIO object with PIL to create an Image
|
|
1358
1397
|
image_data: Optional[bytes] = get_docx_image(drawing_blip)
|
|
@@ -1381,3 +1420,39 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
|
1381
1420
|
)
|
|
1382
1421
|
elem_ref.append(p3.get_ref())
|
|
1383
1422
|
return elem_ref
|
|
1423
|
+
|
|
1424
|
+
def _handle_drawingml(self, doc: DoclingDocument, drawingml_els: Any):
    """Add a picture item for a group of DrawingML elements.

    The source document is loaded a second time, its body is emptied, and
    deep copies of the DrawingML elements are placed into a single new run.
    That stripped-down document is passed to ``get_pil_from_dml_docx`` along
    with the configured DOCX-to-PDF converter to obtain an image. If no image
    is produced, or loading it fails, a picture without image data is added
    instead, so the document still records the drawing.

    Args:
        doc: The document that receives the picture item.
        drawingml_els: The DrawingML elements to copy and render.
    """
    # 1) Start from a second load of the source file and strip its body, so
    #    the copy contains nothing but what is added below.
    scratch_doc = self.load_msword_file(self.path_or_stream, self.document_hash)
    scratch_body = scratch_doc._element.body
    for node in list(scratch_body):  # snapshot: the body is mutated in the loop
        scratch_body.remove(node)

    # 2) Put copies of the DrawingML elements into one fresh run; deepcopy
    #    keeps the elements of the original document untouched.
    run = scratch_doc.add_paragraph().add_run()
    for drawing in drawingml_els:
        run._r.append(deepcopy(drawing))

    # 3) Render the scratch document to an image (DOCX->PDF->PNG) and store
    #    it in the DoclingDocument.
    level = self._get_level()
    try:
        rendered = get_pil_from_dml_docx(
            scratch_doc, converter=self.docx_to_pdf_converter
        )
        if rendered is None:
            # Route "no image produced" through the same fallback as a
            # failed image load.
            raise UnidentifiedImageError

        doc.add_picture(
            parent=self.parents[level - 1],
            image=ImageRef.from_pil(image=rendered, dpi=72),
            caption=None,
        )
    except (UnidentifiedImageError, OSError):
        _log.warning("Warning: DrawingML image cannot be loaded by Pillow")
        # Fallback: add the picture without image data.
        doc.add_picture(
            parent=self.parents[level - 1],
            caption=None,
        )
|
docling/backend/pdf_backend.py
CHANGED
|
@@ -9,6 +9,7 @@ from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
|
|
9
9
|
from PIL import Image
|
|
10
10
|
|
|
11
11
|
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
|
12
|
+
from docling.datamodel.backend_options import PdfBackendOptions
|
|
12
13
|
from docling.datamodel.base_models import InputFormat
|
|
13
14
|
from docling.datamodel.document import InputDocument
|
|
14
15
|
|
|
@@ -50,8 +51,14 @@ class PdfPageBackend(ABC):
|
|
|
50
51
|
|
|
51
52
|
|
|
52
53
|
class PdfDocumentBackend(PaginatedDocumentBackend):
|
|
53
|
-
def __init__(
|
|
54
|
-
|
|
54
|
+
def __init__(
|
|
55
|
+
self,
|
|
56
|
+
in_doc: InputDocument,
|
|
57
|
+
path_or_stream: Union[BytesIO, Path],
|
|
58
|
+
options: PdfBackendOptions = PdfBackendOptions(),
|
|
59
|
+
):
|
|
60
|
+
super().__init__(in_doc, path_or_stream, options)
|
|
61
|
+
self.options: PdfBackendOptions
|
|
55
62
|
|
|
56
63
|
if self.input_format is not InputFormat.PDF:
|
|
57
64
|
if self.input_format is InputFormat.IMAGE:
|
|
@@ -20,6 +20,7 @@ from pypdfium2 import PdfTextPage
|
|
|
20
20
|
from pypdfium2._helpers.misc import PdfiumError
|
|
21
21
|
|
|
22
22
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
|
23
|
+
from docling.datamodel.backend_options import PdfBackendOptions
|
|
23
24
|
from docling.utils.locks import pypdfium2_lock
|
|
24
25
|
|
|
25
26
|
|
|
@@ -370,12 +371,20 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
|
370
371
|
|
|
371
372
|
|
|
372
373
|
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
|
373
|
-
def __init__(
|
|
374
|
-
|
|
374
|
+
def __init__(
|
|
375
|
+
self,
|
|
376
|
+
in_doc: "InputDocument",
|
|
377
|
+
path_or_stream: Union[BytesIO, Path],
|
|
378
|
+
options: PdfBackendOptions = PdfBackendOptions(),
|
|
379
|
+
):
|
|
380
|
+
super().__init__(in_doc, path_or_stream, options)
|
|
375
381
|
|
|
382
|
+
password = (
|
|
383
|
+
self.options.password.get_secret_value() if self.options.password else None
|
|
384
|
+
)
|
|
376
385
|
try:
|
|
377
386
|
with pypdfium2_lock:
|
|
378
|
-
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
|
387
|
+
self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
|
|
379
388
|
except PdfiumError as e:
|
|
380
389
|
raise RuntimeError(
|
|
381
390
|
f"pypdfium could not load document with hash {self.document_hash}"
|