docling 2.28.4__py3-none-any.whl → 2.30.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docx/latex/latex_dict.py +3 -0
- docling/backend/docx/latex/omml.py +14 -14
- docling/backend/html_backend.py +2 -1
- docling/backend/msexcel_backend.py +272 -90
- docling/backend/mspowerpoint_backend.py +4 -3
- docling/backend/msword_backend.py +320 -118
- docling/cli/main.py +70 -2
- docling/datamodel/base_models.py +33 -0
- docling/datamodel/document.py +7 -0
- docling/datamodel/pipeline_options.py +29 -3
- docling/models/api_vlm_model.py +67 -0
- docling/models/picture_description_api_model.py +8 -75
- docling/models/picture_description_base_model.py +14 -2
- docling/models/tesseract_ocr_cli_model.py +1 -1
- docling/pipeline/standard_pdf_pipeline.py +6 -2
- docling/pipeline/vlm_pipeline.py +27 -17
- docling/utils/api_image_request.py +61 -0
- {docling-2.28.4.dist-info → docling-2.30.0.dist-info}/METADATA +3 -3
- {docling-2.28.4.dist-info → docling-2.30.0.dist-info}/RECORD +22 -20
- {docling-2.28.4.dist-info → docling-2.30.0.dist-info}/LICENSE +0 -0
- {docling-2.28.4.dist-info → docling-2.30.0.dist-info}/WHEEL +0 -0
- {docling-2.28.4.dist-info → docling-2.30.0.dist-info}/entry_points.txt +0 -0
@@ -5,6 +5,8 @@ Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
|
|
5
5
|
On 23/01/2025
|
6
6
|
"""
|
7
7
|
|
8
|
+
import logging
|
9
|
+
|
8
10
|
import lxml.etree as ET
|
9
11
|
from pylatexenc.latexencode import UnicodeToLatexEncoder
|
10
12
|
|
@@ -39,6 +41,8 @@ from docling.backend.docx.latex.latex_dict import (
|
|
39
41
|
|
40
42
|
OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
|
41
43
|
|
44
|
+
_log = logging.getLogger(__name__)
|
45
|
+
|
42
46
|
|
43
47
|
def load(stream):
|
44
48
|
tree = ET.parse(stream)
|
@@ -281,8 +285,10 @@ class oMath2Latex(Tag2Method):
|
|
281
285
|
if FUNC.get(t):
|
282
286
|
latex_chars.append(FUNC[t])
|
283
287
|
else:
|
284
|
-
|
285
|
-
|
288
|
+
_log.warning("Function not supported, will default to text: %s", t)
|
289
|
+
if isinstance(t, str):
|
290
|
+
latex_chars.append(t)
|
291
|
+
elif isinstance(t, str):
|
286
292
|
latex_chars.append(t)
|
287
293
|
t = BLANK.join(latex_chars)
|
288
294
|
return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this
|
@@ -382,8 +388,6 @@ class oMath2Latex(Tag2Method):
|
|
382
388
|
|
383
389
|
out_latex_str = self.u.unicode_to_latex(s)
|
384
390
|
|
385
|
-
# print(s, out_latex_str)
|
386
|
-
|
387
391
|
if (
|
388
392
|
s.startswith("{") is False
|
389
393
|
and out_latex_str.startswith("{")
|
@@ -392,19 +396,13 @@ class oMath2Latex(Tag2Method):
|
|
392
396
|
):
|
393
397
|
out_latex_str = f" {out_latex_str[1:-1]} "
|
394
398
|
|
395
|
-
# print(s, out_latex_str)
|
396
|
-
|
397
399
|
if "ensuremath" in out_latex_str:
|
398
400
|
out_latex_str = out_latex_str.replace("\\ensuremath{", " ")
|
399
401
|
out_latex_str = out_latex_str.replace("}", " ")
|
400
402
|
|
401
|
-
# print(s, out_latex_str)
|
402
|
-
|
403
403
|
if out_latex_str.strip().startswith("\\text"):
|
404
404
|
out_latex_str = f" \\text{{{out_latex_str}}} "
|
405
405
|
|
406
|
-
# print(s, out_latex_str)
|
407
|
-
|
408
406
|
return out_latex_str
|
409
407
|
|
410
408
|
def do_r(self, elm):
|
@@ -415,10 +413,12 @@ class oMath2Latex(Tag2Method):
|
|
415
413
|
"""
|
416
414
|
_str = []
|
417
415
|
_base_str = []
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
416
|
+
found_text = elm.findtext("./{0}t".format(OMML_NS))
|
417
|
+
if found_text:
|
418
|
+
for s in found_text:
|
419
|
+
out_latex_str = self.process_unicode(s)
|
420
|
+
_str.append(out_latex_str)
|
421
|
+
_base_str.append(s)
|
422
422
|
|
423
423
|
proc_str = escape_latex(BLANK.join(_str))
|
424
424
|
base_proc_str = BLANK.join(_base_str)
|
docling/backend/html_backend.py
CHANGED
@@ -34,6 +34,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
|
|
34
34
|
"h6",
|
35
35
|
"p",
|
36
36
|
"pre",
|
37
|
+
"code",
|
37
38
|
"ul",
|
38
39
|
"ol",
|
39
40
|
"li",
|
@@ -165,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
165
166
|
self.handle_header(tag, doc)
|
166
167
|
elif tag.name in ["p"]:
|
167
168
|
self.handle_paragraph(tag, doc)
|
168
|
-
elif tag.name in ["pre"]:
|
169
|
+
elif tag.name in ["pre", "code"]:
|
169
170
|
self.handle_code(tag, doc)
|
170
171
|
elif tag.name in ["ul", "ol"]:
|
171
172
|
self.handle_list(tag, doc)
|
@@ -1,36 +1,50 @@
|
|
1
1
|
import logging
|
2
2
|
from io import BytesIO
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import
|
4
|
+
from typing import Any, Union, cast
|
5
5
|
|
6
6
|
from docling_core.types.doc import (
|
7
|
+
BoundingBox,
|
8
|
+
CoordOrigin,
|
9
|
+
DocItem,
|
7
10
|
DoclingDocument,
|
8
11
|
DocumentOrigin,
|
9
12
|
GroupLabel,
|
10
13
|
ImageRef,
|
14
|
+
ProvenanceItem,
|
15
|
+
Size,
|
11
16
|
TableCell,
|
12
17
|
TableData,
|
13
18
|
)
|
14
|
-
|
15
|
-
# from lxml import etree
|
16
|
-
from openpyxl import Workbook, load_workbook
|
17
|
-
from openpyxl.cell.cell import Cell
|
19
|
+
from openpyxl import load_workbook
|
18
20
|
from openpyxl.drawing.image import Image
|
21
|
+
from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
|
19
22
|
from openpyxl.worksheet.worksheet import Worksheet
|
23
|
+
from PIL import Image as PILImage
|
24
|
+
from pydantic import BaseModel, NonNegativeInt, PositiveInt
|
25
|
+
from typing_extensions import override
|
20
26
|
|
21
|
-
from docling.backend.abstract_backend import
|
27
|
+
from docling.backend.abstract_backend import (
|
28
|
+
DeclarativeDocumentBackend,
|
29
|
+
PaginatedDocumentBackend,
|
30
|
+
)
|
22
31
|
from docling.datamodel.base_models import InputFormat
|
23
32
|
from docling.datamodel.document import InputDocument
|
24
33
|
|
25
34
|
_log = logging.getLogger(__name__)
|
26
35
|
|
27
|
-
from typing import Any, List
|
28
36
|
|
29
|
-
|
30
|
-
|
37
|
+
class ExcelCell(BaseModel):
|
38
|
+
"""Represents an Excel cell.
|
31
39
|
|
40
|
+
Attributes:
|
41
|
+
row: The row number of the cell.
|
42
|
+
col: The column number of the cell.
|
43
|
+
text: The text content of the cell.
|
44
|
+
row_span: The number of rows the cell spans.
|
45
|
+
col_span: The number of columns the cell spans.
|
46
|
+
"""
|
32
47
|
|
33
|
-
class ExcelCell(BaseModel):
|
34
48
|
row: int
|
35
49
|
col: int
|
36
50
|
text: str
|
@@ -39,19 +53,57 @@ class ExcelCell(BaseModel):
|
|
39
53
|
|
40
54
|
|
41
55
|
class ExcelTable(BaseModel):
|
56
|
+
"""Represents an Excel table on a worksheet.
|
57
|
+
|
58
|
+
Attributes:
|
59
|
+
anchor: The column and row indices of the upper-left cell of the table
|
60
|
+
(0-based index).
|
61
|
+
num_rows: The number of rows in the table.
|
62
|
+
num_cols: The number of columns in the table.
|
63
|
+
data: The data in the table, represented as a list of ExcelCell objects.
|
64
|
+
"""
|
65
|
+
|
66
|
+
anchor: tuple[NonNegativeInt, NonNegativeInt]
|
42
67
|
num_rows: int
|
43
68
|
num_cols: int
|
44
|
-
data:
|
69
|
+
data: list[ExcelCell]
|
45
70
|
|
46
71
|
|
47
|
-
class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
48
|
-
|
72
|
+
class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
|
73
|
+
"""Backend for parsing Excel workbooks.
|
74
|
+
|
75
|
+
The backend converts an Excel workbook into a DoclingDocument object.
|
76
|
+
Each worksheet is converted into a separate page.
|
77
|
+
The following elements are parsed:
|
78
|
+
- Cell contents, parsed as tables. If two groups of cells are disconnected
|
79
|
+
between each other, they will be parsed as two different tables.
|
80
|
+
- Images, parsed as PictureItem objects.
|
81
|
+
|
82
|
+
The DoclingDocument tables and pictures have their provenance information, including
|
83
|
+
the position in their original Excel worksheet. The position is represented by a
|
84
|
+
bounding box object with the cell indices as units (0-based index). The size of this
|
85
|
+
bounding box is the number of columns and rows that the table or picture spans.
|
86
|
+
"""
|
87
|
+
|
88
|
+
@override
|
89
|
+
def __init__(
|
90
|
+
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
|
91
|
+
) -> None:
|
92
|
+
"""Initialize the MsExcelDocumentBackend object.
|
93
|
+
|
94
|
+
Parameters:
|
95
|
+
in_doc: The input document object.
|
96
|
+
path_or_stream: The path or stream to the Excel file.
|
97
|
+
|
98
|
+
Raises:
|
99
|
+
RuntimeError: An error occurred parsing the file.
|
100
|
+
"""
|
49
101
|
super().__init__(in_doc, path_or_stream)
|
50
102
|
|
51
103
|
# Initialise the parents for the hierarchy
|
52
104
|
self.max_levels = 10
|
53
105
|
|
54
|
-
self.parents:
|
106
|
+
self.parents: dict[int, Any] = {}
|
55
107
|
for i in range(-1, self.max_levels):
|
56
108
|
self.parents[i] = None
|
57
109
|
|
@@ -63,35 +115,47 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
63
115
|
elif isinstance(self.path_or_stream, Path):
|
64
116
|
self.workbook = load_workbook(filename=str(self.path_or_stream))
|
65
117
|
|
66
|
-
self.valid =
|
118
|
+
self.valid = self.workbook is not None
|
67
119
|
except Exception as e:
|
68
120
|
self.valid = False
|
69
121
|
|
70
122
|
raise RuntimeError(
|
71
|
-
f"
|
123
|
+
f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
|
72
124
|
) from e
|
73
125
|
|
126
|
+
@override
|
74
127
|
def is_valid(self) -> bool:
|
75
|
-
_log.
|
128
|
+
_log.debug(f"valid: {self.valid}")
|
76
129
|
return self.valid
|
77
130
|
|
78
131
|
@classmethod
|
132
|
+
@override
|
79
133
|
def supports_pagination(cls) -> bool:
|
80
134
|
return True
|
81
135
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
136
|
+
@override
|
137
|
+
def page_count(self) -> int:
|
138
|
+
if self.is_valid() and self.workbook:
|
139
|
+
return len(self.workbook.sheetnames)
|
140
|
+
else:
|
141
|
+
return 0
|
87
142
|
|
88
143
|
@classmethod
|
89
|
-
|
144
|
+
@override
|
145
|
+
def supported_formats(cls) -> set[InputFormat]:
|
90
146
|
return {InputFormat.XLSX}
|
91
147
|
|
148
|
+
@override
|
92
149
|
def convert(self) -> DoclingDocument:
|
93
|
-
|
150
|
+
"""Parse the Excel workbook into a DoclingDocument object.
|
94
151
|
|
152
|
+
Raises:
|
153
|
+
RuntimeError: Unable to run the conversion since the backend object failed to
|
154
|
+
initialize.
|
155
|
+
|
156
|
+
Returns:
|
157
|
+
The DoclingDocument object representing the Excel workbook.
|
158
|
+
"""
|
95
159
|
origin = DocumentOrigin(
|
96
160
|
filename=self.file.name or "file.xlsx",
|
97
161
|
mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
@@ -110,6 +174,14 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
110
174
|
return doc
|
111
175
|
|
112
176
|
def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
|
177
|
+
"""Parse the Excel workbook and attach its structure to a DoclingDocument.
|
178
|
+
|
179
|
+
Args:
|
180
|
+
doc: A DoclingDocument object.
|
181
|
+
|
182
|
+
Returns:
|
183
|
+
A DoclingDocument object with the parsed items.
|
184
|
+
"""
|
113
185
|
|
114
186
|
if self.workbook is not None:
|
115
187
|
|
@@ -117,22 +189,34 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
117
189
|
for sheet_name in self.workbook.sheetnames:
|
118
190
|
_log.info(f"Processing sheet: {sheet_name}")
|
119
191
|
|
120
|
-
# Access the sheet by name
|
121
192
|
sheet = self.workbook[sheet_name]
|
193
|
+
page_no = self.workbook.index(sheet) + 1
|
194
|
+
# do not rely on sheet.max_column, sheet.max_row if there are images
|
195
|
+
page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))
|
122
196
|
|
123
197
|
self.parents[0] = doc.add_group(
|
124
198
|
parent=None,
|
125
199
|
label=GroupLabel.SECTION,
|
126
200
|
name=f"sheet: {sheet_name}",
|
127
201
|
)
|
128
|
-
|
129
202
|
doc = self._convert_sheet(doc, sheet)
|
203
|
+
width, height = self._find_page_size(doc, page_no)
|
204
|
+
page.size = Size(width=width, height=height)
|
130
205
|
else:
|
131
206
|
_log.error("Workbook is not initialized.")
|
132
207
|
|
133
208
|
return doc
|
134
209
|
|
135
|
-
def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
|
210
|
+
def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
|
211
|
+
"""Parse an Excel worksheet and attach its structure to a DoclingDocument
|
212
|
+
|
213
|
+
Args:
|
214
|
+
doc: The DoclingDocument to be updated.
|
215
|
+
sheet: The Excel worksheet to be parsed.
|
216
|
+
|
217
|
+
Returns:
|
218
|
+
The updated DoclingDocument.
|
219
|
+
"""
|
136
220
|
|
137
221
|
doc = self._find_tables_in_sheet(doc, sheet)
|
138
222
|
|
@@ -140,47 +224,81 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
140
224
|
|
141
225
|
return doc
|
142
226
|
|
143
|
-
def _find_tables_in_sheet(
|
144
|
-
|
145
|
-
|
227
|
+
def _find_tables_in_sheet(
|
228
|
+
self, doc: DoclingDocument, sheet: Worksheet
|
229
|
+
) -> DoclingDocument:
|
230
|
+
"""Find all tables in an Excel sheet and attach them to a DoclingDocument.
|
146
231
|
|
147
|
-
|
148
|
-
|
149
|
-
|
232
|
+
Args:
|
233
|
+
doc: The DoclingDocument to be updated.
|
234
|
+
sheet: The Excel worksheet to be parsed.
|
150
235
|
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
table_cells=[],
|
155
|
-
)
|
236
|
+
Returns:
|
237
|
+
The updated DoclingDocument.
|
238
|
+
"""
|
156
239
|
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
240
|
+
if self.workbook is not None:
|
241
|
+
tables = self._find_data_tables(sheet)
|
242
|
+
|
243
|
+
for excel_table in tables:
|
244
|
+
origin_col = excel_table.anchor[0]
|
245
|
+
origin_row = excel_table.anchor[1]
|
246
|
+
num_rows = excel_table.num_rows
|
247
|
+
num_cols = excel_table.num_cols
|
248
|
+
|
249
|
+
table_data = TableData(
|
250
|
+
num_rows=num_rows,
|
251
|
+
num_cols=num_cols,
|
252
|
+
table_cells=[],
|
169
253
|
)
|
170
|
-
table_data.table_cells.append(cell)
|
171
254
|
|
172
|
-
|
255
|
+
for excel_cell in excel_table.data:
|
256
|
+
|
257
|
+
cell = TableCell(
|
258
|
+
text=excel_cell.text,
|
259
|
+
row_span=excel_cell.row_span,
|
260
|
+
col_span=excel_cell.col_span,
|
261
|
+
start_row_offset_idx=excel_cell.row,
|
262
|
+
end_row_offset_idx=excel_cell.row + excel_cell.row_span,
|
263
|
+
start_col_offset_idx=excel_cell.col,
|
264
|
+
end_col_offset_idx=excel_cell.col + excel_cell.col_span,
|
265
|
+
column_header=excel_cell.row == 0,
|
266
|
+
row_header=False,
|
267
|
+
)
|
268
|
+
table_data.table_cells.append(cell)
|
269
|
+
|
270
|
+
page_no = self.workbook.index(sheet) + 1
|
271
|
+
doc.add_table(
|
272
|
+
data=table_data,
|
273
|
+
parent=self.parents[0],
|
274
|
+
prov=ProvenanceItem(
|
275
|
+
page_no=page_no,
|
276
|
+
charspan=(0, 0),
|
277
|
+
bbox=BoundingBox.from_tuple(
|
278
|
+
(
|
279
|
+
origin_col,
|
280
|
+
origin_row,
|
281
|
+
origin_col + num_cols,
|
282
|
+
origin_row + num_rows,
|
283
|
+
),
|
284
|
+
origin=CoordOrigin.TOPLEFT,
|
285
|
+
),
|
286
|
+
),
|
287
|
+
)
|
173
288
|
|
174
289
|
return doc
|
175
290
|
|
176
|
-
def _find_data_tables(self, sheet: Worksheet) ->
|
177
|
-
"""
|
178
|
-
|
179
|
-
|
180
|
-
|
291
|
+
def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
|
292
|
+
"""Find all compact rectangular data tables in an Excel worksheet.
|
293
|
+
|
294
|
+
Args:
|
295
|
+
sheet: The Excel worksheet to be parsed.
|
181
296
|
|
182
|
-
|
183
|
-
|
297
|
+
Returns:
|
298
|
+
A list of ExcelTable objects representing the data tables.
|
299
|
+
"""
|
300
|
+
tables: list[ExcelTable] = [] # List to store found tables
|
301
|
+
visited: set[tuple[int, int]] = set() # Track already visited cells
|
184
302
|
|
185
303
|
# Iterate over all cells in the sheet
|
186
304
|
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
|
@@ -191,9 +309,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
191
309
|
continue
|
192
310
|
|
193
311
|
# If the cell starts a new table, find its bounds
|
194
|
-
table_bounds, visited_cells = self._find_table_bounds(
|
195
|
-
sheet, ri, rj, visited
|
196
|
-
)
|
312
|
+
table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj)
|
197
313
|
|
198
314
|
visited.update(visited_cells) # Mark these cells as visited
|
199
315
|
tables.append(table_bounds)
|
@@ -205,22 +321,25 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
205
321
|
sheet: Worksheet,
|
206
322
|
start_row: int,
|
207
323
|
start_col: int,
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
324
|
+
) -> tuple[ExcelTable, set[tuple[int, int]]]:
|
325
|
+
"""Determine the bounds of a compact rectangular table.
|
326
|
+
|
327
|
+
Args:
|
328
|
+
sheet: The Excel worksheet to be parsed.
|
329
|
+
start_row: The row number of the starting cell.
|
330
|
+
start_col: The column number of the starting cell.
|
331
|
+
|
212
332
|
Returns:
|
213
|
-
|
214
|
-
- A set of visited cell coordinates.
|
333
|
+
A tuple with an Excel table and a set of cell coordinates.
|
215
334
|
"""
|
216
|
-
_log.
|
335
|
+
_log.debug("find_table_bounds")
|
217
336
|
|
218
337
|
max_row = self._find_table_bottom(sheet, start_row, start_col)
|
219
338
|
max_col = self._find_table_right(sheet, start_row, start_col)
|
220
339
|
|
221
340
|
# Collect the data within the bounds
|
222
341
|
data = []
|
223
|
-
visited_cells = set()
|
342
|
+
visited_cells: set[tuple[int, int]] = set()
|
224
343
|
for ri in range(start_row, max_row + 1):
|
225
344
|
for rj in range(start_col, max_col + 1):
|
226
345
|
|
@@ -230,7 +349,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
230
349
|
row_span = 1
|
231
350
|
col_span = 1
|
232
351
|
|
233
|
-
# _log.info(sheet.merged_cells.ranges)
|
234
352
|
for merged_range in sheet.merged_cells.ranges:
|
235
353
|
|
236
354
|
if (
|
@@ -254,7 +372,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
254
372
|
col_span=col_span,
|
255
373
|
)
|
256
374
|
)
|
257
|
-
# _log.info(f"cell: {ri}, {rj} -> {ri - start_row}, {rj - start_col}, {row_span}, {col_span}: {str(cell.value)}")
|
258
375
|
|
259
376
|
# Mark all cells in the span as visited
|
260
377
|
for span_row in range(ri, ri + row_span):
|
@@ -263,6 +380,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
263
380
|
|
264
381
|
return (
|
265
382
|
ExcelTable(
|
383
|
+
anchor=(start_col, start_row),
|
266
384
|
num_rows=max_row + 1 - start_row,
|
267
385
|
num_cols=max_col + 1 - start_col,
|
268
386
|
data=data,
|
@@ -270,10 +388,20 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
270
388
|
visited_cells,
|
271
389
|
)
|
272
390
|
|
273
|
-
def _find_table_bottom(
|
274
|
-
|
391
|
+
def _find_table_bottom(
|
392
|
+
self, sheet: Worksheet, start_row: int, start_col: int
|
393
|
+
) -> int:
|
394
|
+
"""Find the bottom boundary of a table.
|
275
395
|
|
276
|
-
|
396
|
+
Args:
|
397
|
+
sheet: The Excel worksheet to be parsed.
|
398
|
+
start_row: The starting row of the table.
|
399
|
+
start_col: The starting column of the table.
|
400
|
+
|
401
|
+
Returns:
|
402
|
+
The row index representing the bottom boundary of the table.
|
403
|
+
"""
|
404
|
+
max_row: int = start_row
|
277
405
|
|
278
406
|
while max_row < sheet.max_row - 1:
|
279
407
|
# Get the cell value or check if it is part of a merged cell
|
@@ -296,10 +424,20 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
296
424
|
|
297
425
|
return max_row
|
298
426
|
|
299
|
-
def _find_table_right(
|
300
|
-
|
427
|
+
def _find_table_right(
|
428
|
+
self, sheet: Worksheet, start_row: int, start_col: int
|
429
|
+
) -> int:
|
430
|
+
"""Find the right boundary of a table.
|
431
|
+
|
432
|
+
Args:
|
433
|
+
sheet: The Excel worksheet to be parsed.
|
434
|
+
start_row: The starting row of the table.
|
435
|
+
start_col: The starting column of the table.
|
301
436
|
|
302
|
-
|
437
|
+
Returns:
|
438
|
+
The column index representing the right boundary of the table."
|
439
|
+
"""
|
440
|
+
max_col: int = start_col
|
303
441
|
|
304
442
|
while max_col < sheet.max_column - 1:
|
305
443
|
# Get the cell value or check if it is part of a merged cell
|
@@ -325,19 +463,63 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
|
325
463
|
def _find_images_in_sheet(
|
326
464
|
self, doc: DoclingDocument, sheet: Worksheet
|
327
465
|
) -> DoclingDocument:
|
466
|
+
"""Find images in the Excel sheet and attach them to the DoclingDocument.
|
328
467
|
|
329
|
-
|
330
|
-
|
468
|
+
Args:
|
469
|
+
doc: The DoclingDocument to be updated.
|
470
|
+
sheet: The Excel worksheet to be parsed.
|
331
471
|
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
472
|
+
Returns:
|
473
|
+
The updated DoclingDocument.
|
474
|
+
"""
|
475
|
+
if self.workbook is not None:
|
476
|
+
# Iterate over byte images in the sheet
|
477
|
+
for item in sheet._images: # type: ignore[attr-defined]
|
478
|
+
try:
|
479
|
+
image: Image = cast(Image, item)
|
480
|
+
pil_image = PILImage.open(image.ref) # type: ignore[arg-type]
|
481
|
+
page_no = self.workbook.index(sheet) + 1
|
482
|
+
anchor = (0, 0, 0, 0)
|
483
|
+
if isinstance(image.anchor, TwoCellAnchor):
|
484
|
+
anchor = (
|
485
|
+
image.anchor._from.col,
|
486
|
+
image.anchor._from.row,
|
487
|
+
image.anchor.to.col + 1,
|
488
|
+
image.anchor.to.row + 1,
|
489
|
+
)
|
490
|
+
doc.add_picture(
|
491
|
+
parent=self.parents[0],
|
492
|
+
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
493
|
+
caption=None,
|
494
|
+
prov=ProvenanceItem(
|
495
|
+
page_no=page_no,
|
496
|
+
charspan=(0, 0),
|
497
|
+
bbox=BoundingBox.from_tuple(
|
498
|
+
anchor, origin=CoordOrigin.TOPLEFT
|
499
|
+
),
|
500
|
+
),
|
501
|
+
)
|
502
|
+
except:
|
503
|
+
_log.error("could not extract the image from excel sheets")
|
342
504
|
|
343
505
|
return doc
|
506
|
+
|
507
|
+
@staticmethod
|
508
|
+
def _find_page_size(
|
509
|
+
doc: DoclingDocument, page_no: PositiveInt
|
510
|
+
) -> tuple[float, float]:
|
511
|
+
left: float = -1.0
|
512
|
+
top: float = -1.0
|
513
|
+
right: float = -1.0
|
514
|
+
bottom: float = -1.0
|
515
|
+
for item, _ in doc.iterate_items(traverse_pictures=True, page_no=page_no):
|
516
|
+
if not isinstance(item, DocItem):
|
517
|
+
continue
|
518
|
+
for provenance in item.prov:
|
519
|
+
bbox = provenance.bbox
|
520
|
+
left = min(left, bbox.l) if left != -1 else bbox.l
|
521
|
+
right = max(right, bbox.r) if right != -1 else bbox.r
|
522
|
+
top = min(top, bbox.t) if top != -1 else bbox.t
|
523
|
+
bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
|
524
|
+
|
525
|
+
return (right - left, bottom - top)
|
@@ -392,9 +392,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
392
392
|
self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
|
393
393
|
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
394
394
|
# Handle Pictures
|
395
|
-
|
396
|
-
|
397
|
-
|
395
|
+
if hasattr(shape, "image"):
|
396
|
+
self.handle_pictures(
|
397
|
+
shape, parent_slide, slide_ind, doc, slide_size
|
398
|
+
)
|
398
399
|
# If shape doesn't have any text, move on to the next shape
|
399
400
|
if not hasattr(shape, "text"):
|
400
401
|
return
|