docling 2.29.0__py3-none-any.whl → 2.31.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +7 -15
- docling/backend/csv_backend.py +1 -1
- docling/backend/docling_parse_backend.py +2 -2
- docling/backend/docling_parse_v2_backend.py +2 -2
- docling/backend/docling_parse_v4_backend.py +3 -4
- docling/backend/docx/latex/latex_dict.py +0 -5
- docling/backend/docx/latex/omml.py +4 -7
- docling/backend/html_backend.py +26 -9
- docling/backend/md_backend.py +5 -7
- docling/backend/msexcel_backend.py +271 -95
- docling/backend/mspowerpoint_backend.py +4 -7
- docling/backend/msword_backend.py +23 -15
- docling/backend/pdf_backend.py +2 -1
- docling/backend/pypdfium2_backend.py +3 -3
- docling/backend/xml/jats_backend.py +10 -13
- docling/backend/xml/uspto_backend.py +15 -19
- docling/cli/main.py +27 -9
- docling/cli/models.py +2 -3
- docling/datamodel/base_models.py +40 -5
- docling/datamodel/document.py +18 -10
- docling/datamodel/pipeline_options.py +29 -4
- docling/document_converter.py +5 -5
- docling/models/api_vlm_model.py +66 -0
- docling/models/base_model.py +2 -4
- docling/models/base_ocr_model.py +2 -2
- docling/models/code_formula_model.py +2 -1
- docling/models/document_picture_classifier.py +2 -1
- docling/models/easyocr_model.py +10 -11
- docling/models/factories/__init__.py +2 -2
- docling/models/factories/base_factory.py +1 -1
- docling/models/hf_mlx_model.py +4 -6
- docling/models/hf_vlm_model.py +7 -5
- docling/models/layout_model.py +2 -2
- docling/models/ocr_mac_model.py +3 -4
- docling/models/page_assemble_model.py +7 -12
- docling/models/page_preprocessing_model.py +2 -1
- docling/models/picture_description_api_model.py +9 -75
- docling/models/picture_description_base_model.py +16 -5
- docling/models/picture_description_vlm_model.py +2 -3
- docling/models/rapid_ocr_model.py +2 -3
- docling/models/readingorder_model.py +8 -23
- docling/models/table_structure_model.py +2 -6
- docling/models/tesseract_ocr_cli_model.py +17 -16
- docling/models/tesseract_ocr_model.py +8 -6
- docling/pipeline/base_pipeline.py +4 -8
- docling/pipeline/simple_pipeline.py +0 -1
- docling/pipeline/standard_pdf_pipeline.py +6 -3
- docling/pipeline/vlm_pipeline.py +27 -20
- docling/utils/api_image_request.py +61 -0
- docling/utils/export.py +2 -4
- docling/utils/glm_utils.py +2 -2
- docling/utils/layout_postprocessor.py +4 -2
- docling/utils/model_downloader.py +7 -7
- docling/utils/utils.py +1 -1
- {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/METADATA +4 -3
- docling-2.31.0.dist-info/RECORD +86 -0
- docling-2.29.0.dist-info/RECORD +0 -84
- {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/LICENSE +0 -0
- {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/WHEEL +0 -0
- {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/entry_points.txt +0 -0
@@ -1,36 +1,50 @@
|
|
1
1
|
import logging
|
2
2
|
from io import BytesIO
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import
|
4
|
+
from typing import Any, Union, cast
|
5
5
|
|
6
6
|
from docling_core.types.doc import (
|
7
|
+
BoundingBox,
|
8
|
+
CoordOrigin,
|
9
|
+
DocItem,
|
7
10
|
DoclingDocument,
|
8
11
|
DocumentOrigin,
|
9
12
|
GroupLabel,
|
10
13
|
ImageRef,
|
14
|
+
ProvenanceItem,
|
15
|
+
Size,
|
11
16
|
TableCell,
|
12
17
|
TableData,
|
13
18
|
)
|
14
|
-
|
15
|
-
# from lxml import etree
|
16
|
-
from openpyxl import Workbook, load_workbook
|
17
|
-
from openpyxl.cell.cell import Cell
|
19
|
+
from openpyxl import load_workbook
|
18
20
|
from openpyxl.drawing.image import Image
|
21
|
+
from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
|
19
22
|
from openpyxl.worksheet.worksheet import Worksheet
|
23
|
+
from PIL import Image as PILImage
|
24
|
+
from pydantic import BaseModel, NonNegativeInt, PositiveInt
|
25
|
+
from typing_extensions import override
|
20
26
|
|
21
|
-
from docling.backend.abstract_backend import
|
27
|
+
from docling.backend.abstract_backend import (
|
28
|
+
DeclarativeDocumentBackend,
|
29
|
+
PaginatedDocumentBackend,
|
30
|
+
)
|
22
31
|
from docling.datamodel.base_models import InputFormat
|
23
32
|
from docling.datamodel.document import InputDocument
|
24
33
|
|
25
34
|
_log = logging.getLogger(__name__)
|
26
35
|
|
27
|
-
from typing import Any, List
|
28
36
|
|
29
|
-
|
30
|
-
|
37
|
+
class ExcelCell(BaseModel):
|
38
|
+
"""Represents an Excel cell.
|
31
39
|
|
40
|
+
Attributes:
|
41
|
+
row: The row number of the cell.
|
42
|
+
col: The column number of the cell.
|
43
|
+
text: The text content of the cell.
|
44
|
+
row_span: The number of rows the cell spans.
|
45
|
+
col_span: The number of columns the cell spans.
|
46
|
+
"""
|
32
47
|
|
33
|
-
class ExcelCell(BaseModel):
|
34
48
|
row: int
|
35
49
|
col: int
|
36
50
|
text: str
|
@@ -39,19 +53,57 @@ class ExcelCell(BaseModel):
|
|
39
53
|
|
40
54
|
|
41
55
|
class ExcelTable(BaseModel):
|
56
|
+
"""Represents an Excel table on a worksheet.
|
57
|
+
|
58
|
+
Attributes:
|
59
|
+
anchor: The column and row indices of the upper-left cell of the table
|
60
|
+
(0-based index).
|
61
|
+
num_rows: The number of rows in the table.
|
62
|
+
num_cols: The number of columns in the table.
|
63
|
+
data: The data in the table, represented as a list of ExcelCell objects.
|
64
|
+
"""
|
65
|
+
|
66
|
+
anchor: tuple[NonNegativeInt, NonNegativeInt]
|
42
67
|
num_rows: int
|
43
68
|
num_cols: int
|
44
|
-
data:
|
69
|
+
data: list[ExcelCell]
|
70
|
+
|
45
71
|
|
72
|
+
class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
|
73
|
+
"""Backend for parsing Excel workbooks.
|
46
74
|
|
47
|
-
|
48
|
-
|
75
|
+
The backend converts an Excel workbook into a DoclingDocument object.
|
76
|
+
Each worksheet is converted into a separate page.
|
77
|
+
The following elements are parsed:
|
78
|
+
- Cell contents, parsed as tables. If two groups of cells are disconnected
|
79
|
+
between each other, they will be parsed as two different tables.
|
80
|
+
- Images, parsed as PictureItem objects.
|
81
|
+
|
82
|
+
The DoclingDocument tables and pictures have their provenance information, including
|
83
|
+
the position in their original Excel worksheet. The position is represented by a
|
84
|
+
bounding box object with the cell indices as units (0-based index). The size of this
|
85
|
+
bounding box is the number of columns and rows that the table or picture spans.
|
86
|
+
"""
|
87
|
+
|
88
|
+
@override
|
89
|
+
def __init__(
|
90
|
+
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
|
91
|
+
) -> None:
|
92
|
+
"""Initialize the MsExcelDocumentBackend object.
|
93
|
+
|
94
|
+
Parameters:
|
95
|
+
in_doc: The input document object.
|
96
|
+
path_or_stream: The path or stream to the Excel file.
|
97
|
+
|
98
|
+
Raises:
|
99
|
+
RuntimeError: An error occurred parsing the file.
|
100
|
+
"""
|
49
101
|
super().__init__(in_doc, path_or_stream)
|
50
102
|
|
51
103
|
# Initialise the parents for the hierarchy
|
52
104
|
self.max_levels = 10
|
53
105
|
|
54
|
-
self.parents:
|
106
|
+
self.parents: dict[int, Any] = {}
|
55
107
|
for i in range(-1, self.max_levels):
|
56
108
|
self.parents[i] = None
|
57
109
|
|
@@ -63,35 +115,47 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
63
115
|
elif isinstance(self.path_or_stream, Path):
|
64
116
|
self.workbook = load_workbook(filename=str(self.path_or_stream))
|
65
117
|
|
66
|
-
self.valid =
|
118
|
+
self.valid = self.workbook is not None
|
67
119
|
except Exception as e:
|
68
120
|
self.valid = False
|
69
121
|
|
70
122
|
raise RuntimeError(
|
71
|
-
f"
|
123
|
+
f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
|
72
124
|
) from e
|
73
125
|
|
126
|
+
@override
|
74
127
|
def is_valid(self) -> bool:
|
75
|
-
_log.
|
128
|
+
_log.debug(f"valid: {self.valid}")
|
76
129
|
return self.valid
|
77
130
|
|
78
131
|
@classmethod
|
132
|
+
@override
|
79
133
|
def supports_pagination(cls) -> bool:
|
80
134
|
return True
|
81
135
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
136
|
+
@override
|
137
|
+
def page_count(self) -> int:
|
138
|
+
if self.is_valid() and self.workbook:
|
139
|
+
return len(self.workbook.sheetnames)
|
140
|
+
else:
|
141
|
+
return 0
|
87
142
|
|
88
143
|
@classmethod
|
89
|
-
|
144
|
+
@override
|
145
|
+
def supported_formats(cls) -> set[InputFormat]:
|
90
146
|
return {InputFormat.XLSX}
|
91
147
|
|
148
|
+
@override
|
92
149
|
def convert(self) -> DoclingDocument:
|
93
|
-
|
150
|
+
"""Parse the Excel workbook into a DoclingDocument object.
|
151
|
+
|
152
|
+
Raises:
|
153
|
+
RuntimeError: Unable to run the conversion since the backend object failed to
|
154
|
+
initialize.
|
94
155
|
|
156
|
+
Returns:
|
157
|
+
The DoclingDocument object representing the Excel workbook.
|
158
|
+
"""
|
95
159
|
origin = DocumentOrigin(
|
96
160
|
filename=self.file.name or "file.xlsx",
|
97
161
|
mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
@@ -110,29 +174,48 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
110
174
|
return doc
|
111
175
|
|
112
176
|
def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
|
177
|
+
"""Parse the Excel workbook and attach its structure to a DoclingDocument.
|
113
178
|
|
114
|
-
|
179
|
+
Args:
|
180
|
+
doc: A DoclingDocument object.
|
115
181
|
|
182
|
+
Returns:
|
183
|
+
A DoclingDocument object with the parsed items.
|
184
|
+
"""
|
185
|
+
|
186
|
+
if self.workbook is not None:
|
116
187
|
# Iterate over all sheets
|
117
188
|
for sheet_name in self.workbook.sheetnames:
|
118
189
|
_log.info(f"Processing sheet: {sheet_name}")
|
119
190
|
|
120
|
-
# Access the sheet by name
|
121
191
|
sheet = self.workbook[sheet_name]
|
192
|
+
page_no = self.workbook.index(sheet) + 1
|
193
|
+
# do not rely on sheet.max_column, sheet.max_row if there are images
|
194
|
+
page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))
|
122
195
|
|
123
196
|
self.parents[0] = doc.add_group(
|
124
197
|
parent=None,
|
125
198
|
label=GroupLabel.SECTION,
|
126
199
|
name=f"sheet: {sheet_name}",
|
127
200
|
)
|
128
|
-
|
129
201
|
doc = self._convert_sheet(doc, sheet)
|
202
|
+
width, height = self._find_page_size(doc, page_no)
|
203
|
+
page.size = Size(width=width, height=height)
|
130
204
|
else:
|
131
205
|
_log.error("Workbook is not initialized.")
|
132
206
|
|
133
207
|
return doc
|
134
208
|
|
135
|
-
def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
|
209
|
+
def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
|
210
|
+
"""Parse an Excel worksheet and attach its structure to a DoclingDocument
|
211
|
+
|
212
|
+
Args:
|
213
|
+
doc: The DoclingDocument to be updated.
|
214
|
+
sheet: The Excel worksheet to be parsed.
|
215
|
+
|
216
|
+
Returns:
|
217
|
+
The updated DoclingDocument.
|
218
|
+
"""
|
136
219
|
|
137
220
|
doc = self._find_tables_in_sheet(doc, sheet)
|
138
221
|
|
@@ -140,60 +223,90 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
140
223
|
|
141
224
|
return doc
|
142
225
|
|
143
|
-
def _find_tables_in_sheet(
|
144
|
-
|
145
|
-
|
226
|
+
def _find_tables_in_sheet(
|
227
|
+
self, doc: DoclingDocument, sheet: Worksheet
|
228
|
+
) -> DoclingDocument:
|
229
|
+
"""Find all tables in an Excel sheet and attach them to a DoclingDocument.
|
146
230
|
|
147
|
-
|
148
|
-
|
149
|
-
|
231
|
+
Args:
|
232
|
+
doc: The DoclingDocument to be updated.
|
233
|
+
sheet: The Excel worksheet to be parsed.
|
150
234
|
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
table_cells=[],
|
155
|
-
)
|
235
|
+
Returns:
|
236
|
+
The updated DoclingDocument.
|
237
|
+
"""
|
156
238
|
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
239
|
+
if self.workbook is not None:
|
240
|
+
tables = self._find_data_tables(sheet)
|
241
|
+
|
242
|
+
for excel_table in tables:
|
243
|
+
origin_col = excel_table.anchor[0]
|
244
|
+
origin_row = excel_table.anchor[1]
|
245
|
+
num_rows = excel_table.num_rows
|
246
|
+
num_cols = excel_table.num_cols
|
247
|
+
|
248
|
+
table_data = TableData(
|
249
|
+
num_rows=num_rows,
|
250
|
+
num_cols=num_cols,
|
251
|
+
table_cells=[],
|
169
252
|
)
|
170
|
-
table_data.table_cells.append(cell)
|
171
253
|
|
172
|
-
|
254
|
+
for excel_cell in excel_table.data:
|
255
|
+
cell = TableCell(
|
256
|
+
text=excel_cell.text,
|
257
|
+
row_span=excel_cell.row_span,
|
258
|
+
col_span=excel_cell.col_span,
|
259
|
+
start_row_offset_idx=excel_cell.row,
|
260
|
+
end_row_offset_idx=excel_cell.row + excel_cell.row_span,
|
261
|
+
start_col_offset_idx=excel_cell.col,
|
262
|
+
end_col_offset_idx=excel_cell.col + excel_cell.col_span,
|
263
|
+
column_header=excel_cell.row == 0,
|
264
|
+
row_header=False,
|
265
|
+
)
|
266
|
+
table_data.table_cells.append(cell)
|
267
|
+
|
268
|
+
page_no = self.workbook.index(sheet) + 1
|
269
|
+
doc.add_table(
|
270
|
+
data=table_data,
|
271
|
+
parent=self.parents[0],
|
272
|
+
prov=ProvenanceItem(
|
273
|
+
page_no=page_no,
|
274
|
+
charspan=(0, 0),
|
275
|
+
bbox=BoundingBox.from_tuple(
|
276
|
+
(
|
277
|
+
origin_col,
|
278
|
+
origin_row,
|
279
|
+
origin_col + num_cols,
|
280
|
+
origin_row + num_rows,
|
281
|
+
),
|
282
|
+
origin=CoordOrigin.TOPLEFT,
|
283
|
+
),
|
284
|
+
),
|
285
|
+
)
|
173
286
|
|
174
287
|
return doc
|
175
288
|
|
176
|
-
def _find_data_tables(self, sheet: Worksheet) ->
|
177
|
-
"""
|
178
|
-
Find all compact rectangular data tables in a sheet.
|
179
|
-
"""
|
180
|
-
# _log.info("find_data_tables")
|
289
|
+
def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
|
290
|
+
"""Find all compact rectangular data tables in an Excel worksheet.
|
181
291
|
|
182
|
-
|
183
|
-
|
292
|
+
Args:
|
293
|
+
sheet: The Excel worksheet to be parsed.
|
294
|
+
|
295
|
+
Returns:
|
296
|
+
A list of ExcelTable objects representing the data tables.
|
297
|
+
"""
|
298
|
+
tables: list[ExcelTable] = [] # List to store found tables
|
299
|
+
visited: set[tuple[int, int]] = set() # Track already visited cells
|
184
300
|
|
185
301
|
# Iterate over all cells in the sheet
|
186
302
|
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
|
187
303
|
for rj, cell in enumerate(row):
|
188
|
-
|
189
304
|
# Skip empty or already visited cells
|
190
305
|
if cell.value is None or (ri, rj) in visited:
|
191
306
|
continue
|
192
307
|
|
193
308
|
# If the cell starts a new table, find its bounds
|
194
|
-
table_bounds, visited_cells = self._find_table_bounds(
|
195
|
-
sheet, ri, rj, visited
|
196
|
-
)
|
309
|
+
table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj)
|
197
310
|
|
198
311
|
visited.update(visited_cells) # Mark these cells as visited
|
199
312
|
tables.append(table_bounds)
|
@@ -205,41 +318,40 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
205
318
|
sheet: Worksheet,
|
206
319
|
start_row: int,
|
207
320
|
start_col: int,
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
321
|
+
) -> tuple[ExcelTable, set[tuple[int, int]]]:
|
322
|
+
"""Determine the bounds of a compact rectangular table.
|
323
|
+
|
324
|
+
Args:
|
325
|
+
sheet: The Excel worksheet to be parsed.
|
326
|
+
start_row: The row number of the starting cell.
|
327
|
+
start_col: The column number of the starting cell.
|
328
|
+
|
212
329
|
Returns:
|
213
|
-
|
214
|
-
- A set of visited cell coordinates.
|
330
|
+
A tuple with an Excel table and a set of cell coordinates.
|
215
331
|
"""
|
216
|
-
_log.
|
332
|
+
_log.debug("find_table_bounds")
|
217
333
|
|
218
334
|
max_row = self._find_table_bottom(sheet, start_row, start_col)
|
219
335
|
max_col = self._find_table_right(sheet, start_row, start_col)
|
220
336
|
|
221
337
|
# Collect the data within the bounds
|
222
338
|
data = []
|
223
|
-
visited_cells = set()
|
339
|
+
visited_cells: set[tuple[int, int]] = set()
|
224
340
|
for ri in range(start_row, max_row + 1):
|
225
341
|
for rj in range(start_col, max_col + 1):
|
226
|
-
|
227
342
|
cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
|
228
343
|
|
229
344
|
# Check if the cell belongs to a merged range
|
230
345
|
row_span = 1
|
231
346
|
col_span = 1
|
232
347
|
|
233
|
-
# _log.info(sheet.merged_cells.ranges)
|
234
348
|
for merged_range in sheet.merged_cells.ranges:
|
235
|
-
|
236
349
|
if (
|
237
350
|
merged_range.min_row <= ri + 1
|
238
351
|
and ri + 1 <= merged_range.max_row
|
239
352
|
and merged_range.min_col <= rj + 1
|
240
353
|
and rj + 1 <= merged_range.max_col
|
241
354
|
):
|
242
|
-
|
243
355
|
row_span = merged_range.max_row - merged_range.min_row + 1
|
244
356
|
col_span = merged_range.max_col - merged_range.min_col + 1
|
245
357
|
break
|
@@ -254,7 +366,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
254
366
|
col_span=col_span,
|
255
367
|
)
|
256
368
|
)
|
257
|
-
# _log.info(f"cell: {ri}, {rj} -> {ri - start_row}, {rj - start_col}, {row_span}, {col_span}: {str(cell.value)}")
|
258
369
|
|
259
370
|
# Mark all cells in the span as visited
|
260
371
|
for span_row in range(ri, ri + row_span):
|
@@ -263,6 +374,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
263
374
|
|
264
375
|
return (
|
265
376
|
ExcelTable(
|
377
|
+
anchor=(start_col, start_row),
|
266
378
|
num_rows=max_row + 1 - start_row,
|
267
379
|
num_cols=max_col + 1 - start_col,
|
268
380
|
data=data,
|
@@ -270,10 +382,20 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
270
382
|
visited_cells,
|
271
383
|
)
|
272
384
|
|
273
|
-
def _find_table_bottom(
|
274
|
-
|
385
|
+
def _find_table_bottom(
|
386
|
+
self, sheet: Worksheet, start_row: int, start_col: int
|
387
|
+
) -> int:
|
388
|
+
"""Find the bottom boundary of a table.
|
389
|
+
|
390
|
+
Args:
|
391
|
+
sheet: The Excel worksheet to be parsed.
|
392
|
+
start_row: The starting row of the table.
|
393
|
+
start_col: The starting column of the table.
|
275
394
|
|
276
|
-
|
395
|
+
Returns:
|
396
|
+
The row index representing the bottom boundary of the table.
|
397
|
+
"""
|
398
|
+
max_row: int = start_row
|
277
399
|
|
278
400
|
while max_row < sheet.max_row - 1:
|
279
401
|
# Get the cell value or check if it is part of a merged cell
|
@@ -296,10 +418,20 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
296
418
|
|
297
419
|
return max_row
|
298
420
|
|
299
|
-
def _find_table_right(
|
300
|
-
|
421
|
+
def _find_table_right(
|
422
|
+
self, sheet: Worksheet, start_row: int, start_col: int
|
423
|
+
) -> int:
|
424
|
+
"""Find the right boundary of a table.
|
301
425
|
|
302
|
-
|
426
|
+
Args:
|
427
|
+
sheet: The Excel worksheet to be parsed.
|
428
|
+
start_row: The starting row of the table.
|
429
|
+
start_col: The starting column of the table.
|
430
|
+
|
431
|
+
Returns:
|
432
|
+
The column index representing the right boundary of the table."
|
433
|
+
"""
|
434
|
+
max_col: int = start_col
|
303
435
|
|
304
436
|
while max_col < sheet.max_column - 1:
|
305
437
|
# Get the cell value or check if it is part of a merged cell
|
@@ -325,19 +457,63 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
325
457
|
def _find_images_in_sheet(
|
326
458
|
self, doc: DoclingDocument, sheet: Worksheet
|
327
459
|
) -> DoclingDocument:
|
460
|
+
"""Find images in the Excel sheet and attach them to the DoclingDocument.
|
328
461
|
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
try:
|
333
|
-
pil_image = PILImage.open(image.ref)
|
462
|
+
Args:
|
463
|
+
doc: The DoclingDocument to be updated.
|
464
|
+
sheet: The Excel worksheet to be parsed.
|
334
465
|
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
466
|
+
Returns:
|
467
|
+
The updated DoclingDocument.
|
468
|
+
"""
|
469
|
+
if self.workbook is not None:
|
470
|
+
# Iterate over byte images in the sheet
|
471
|
+
for item in sheet._images: # type: ignore[attr-defined]
|
472
|
+
try:
|
473
|
+
image: Image = cast(Image, item)
|
474
|
+
pil_image = PILImage.open(image.ref) # type: ignore[arg-type]
|
475
|
+
page_no = self.workbook.index(sheet) + 1
|
476
|
+
anchor = (0, 0, 0, 0)
|
477
|
+
if isinstance(image.anchor, TwoCellAnchor):
|
478
|
+
anchor = (
|
479
|
+
image.anchor._from.col,
|
480
|
+
image.anchor._from.row,
|
481
|
+
image.anchor.to.col + 1,
|
482
|
+
image.anchor.to.row + 1,
|
483
|
+
)
|
484
|
+
doc.add_picture(
|
485
|
+
parent=self.parents[0],
|
486
|
+
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
487
|
+
caption=None,
|
488
|
+
prov=ProvenanceItem(
|
489
|
+
page_no=page_no,
|
490
|
+
charspan=(0, 0),
|
491
|
+
bbox=BoundingBox.from_tuple(
|
492
|
+
anchor, origin=CoordOrigin.TOPLEFT
|
493
|
+
),
|
494
|
+
),
|
495
|
+
)
|
496
|
+
except Exception:
|
497
|
+
_log.error("could not extract the image from excel sheets")
|
342
498
|
|
343
499
|
return doc
|
500
|
+
|
501
|
+
@staticmethod
|
502
|
+
def _find_page_size(
|
503
|
+
doc: DoclingDocument, page_no: PositiveInt
|
504
|
+
) -> tuple[float, float]:
|
505
|
+
left: float = -1.0
|
506
|
+
top: float = -1.0
|
507
|
+
right: float = -1.0
|
508
|
+
bottom: float = -1.0
|
509
|
+
for item, _ in doc.iterate_items(traverse_pictures=True, page_no=page_no):
|
510
|
+
if not isinstance(item, DocItem):
|
511
|
+
continue
|
512
|
+
for provenance in item.prov:
|
513
|
+
bbox = provenance.bbox
|
514
|
+
left = min(left, bbox.l) if left != -1 else bbox.l
|
515
|
+
right = max(right, bbox.r) if right != -1 else bbox.r
|
516
|
+
top = min(top, bbox.t) if top != -1 else bbox.t
|
517
|
+
bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
|
518
|
+
|
519
|
+
return (right - left, bottom - top)
|
@@ -120,13 +120,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
120
120
|
|
121
121
|
return prov
|
122
122
|
|
123
|
-
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
|
123
|
+
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): # noqa: C901
|
124
124
|
is_a_list = False
|
125
125
|
is_list_group_created = False
|
126
126
|
enum_list_item_value = 0
|
127
127
|
new_list = None
|
128
128
|
bullet_type = "None"
|
129
|
-
list_text = ""
|
130
129
|
list_label = GroupLabel.LIST
|
131
130
|
doc_label = DocItemLabel.LIST_ITEM
|
132
131
|
prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
|
@@ -243,7 +242,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
243
242
|
enum_marker = str(enum_list_item_value) + "."
|
244
243
|
if not is_list_group_created:
|
245
244
|
new_list = doc.add_group(
|
246
|
-
label=list_label, name=
|
245
|
+
label=list_label, name="list", parent=parent_slide
|
247
246
|
)
|
248
247
|
is_list_group_created = True
|
249
248
|
doc.add_list_item(
|
@@ -368,11 +367,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
368
367
|
slide_width = pptx_obj.slide_width
|
369
368
|
slide_height = pptx_obj.slide_height
|
370
369
|
|
371
|
-
text_content = [] # type: ignore
|
372
|
-
|
373
370
|
max_levels = 10
|
374
371
|
parents = {} # type: ignore
|
375
|
-
for i in range(
|
372
|
+
for i in range(max_levels):
|
376
373
|
parents[i] = None
|
377
374
|
|
378
375
|
# Loop through each slide
|
@@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
383
380
|
)
|
384
381
|
|
385
382
|
slide_size = Size(width=slide_width, height=slide_height)
|
386
|
-
|
383
|
+
doc.add_page(page_no=slide_ind + 1, size=slide_size)
|
387
384
|
|
388
385
|
def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
|
389
386
|
handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
|
@@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
158
158
|
def _get_level(self) -> int:
|
159
159
|
"""Return the first None index."""
|
160
160
|
for k, v in self.parents.items():
|
161
|
-
if k >= 0 and v
|
161
|
+
if k >= 0 and v is None:
|
162
162
|
return k
|
163
163
|
return 0
|
164
164
|
|
@@ -418,7 +418,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
418
418
|
else prev_parent
|
419
419
|
)
|
420
420
|
|
421
|
-
def _handle_text_elements(
|
421
|
+
def _handle_text_elements( # noqa: C901
|
422
422
|
self,
|
423
423
|
element: BaseOxmlElement,
|
424
424
|
docx_obj: DocxDocument,
|
@@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
812
812
|
f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
|
813
813
|
)
|
814
814
|
if cell is None or cell._tc in cell_set:
|
815
|
-
_log.debug(
|
815
|
+
_log.debug(" skipped since repeated content")
|
816
816
|
col_idx += cell.grid_span
|
817
817
|
continue
|
818
818
|
else:
|
@@ -850,7 +850,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
850
850
|
def _handle_pictures(
|
851
851
|
self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
|
852
852
|
) -> None:
|
853
|
-
def get_docx_image(drawing_blip):
|
853
|
+
def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
|
854
|
+
image_data: Optional[bytes] = None
|
854
855
|
rId = drawing_blip[0].get(
|
855
856
|
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
|
856
857
|
)
|
@@ -862,19 +863,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
862
863
|
|
863
864
|
level = self._get_level()
|
864
865
|
# Open the BytesIO object with PIL to create an Image
|
865
|
-
|
866
|
-
|
867
|
-
|
868
|
-
pil_image = Image.open(image_bytes)
|
869
|
-
doc.add_picture(
|
870
|
-
parent=self.parents[level - 1],
|
871
|
-
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
872
|
-
caption=None,
|
873
|
-
)
|
874
|
-
except (UnidentifiedImageError, OSError) as e:
|
875
|
-
_log.warning("Warning: image cannot be loaded by Pillow")
|
866
|
+
image_data: Optional[bytes] = get_docx_image(drawing_blip)
|
867
|
+
if image_data is None:
|
868
|
+
_log.warning("Warning: image cannot be found")
|
876
869
|
doc.add_picture(
|
877
870
|
parent=self.parents[level - 1],
|
878
871
|
caption=None,
|
879
872
|
)
|
873
|
+
else:
|
874
|
+
try:
|
875
|
+
image_bytes = BytesIO(image_data)
|
876
|
+
pil_image = Image.open(image_bytes)
|
877
|
+
doc.add_picture(
|
878
|
+
parent=self.parents[level - 1],
|
879
|
+
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
880
|
+
caption=None,
|
881
|
+
)
|
882
|
+
except (UnidentifiedImageError, OSError):
|
883
|
+
_log.warning("Warning: image cannot be loaded by Pillow")
|
884
|
+
doc.add_picture(
|
885
|
+
parent=self.parents[level - 1],
|
886
|
+
caption=None,
|
887
|
+
)
|
880
888
|
return
|
docling/backend/pdf_backend.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
from abc import ABC, abstractmethod
|
2
|
+
from collections.abc import Iterable
|
2
3
|
from io import BytesIO
|
3
4
|
from pathlib import Path
|
4
|
-
from typing import
|
5
|
+
from typing import Optional, Set, Union
|
5
6
|
|
6
7
|
from docling_core.types.doc import BoundingBox, Size
|
7
8
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|