aspose-cells-foss 25.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aspose/__init__.py +14 -0
- aspose/cells/__init__.py +31 -0
- aspose/cells/cell.py +350 -0
- aspose/cells/constants.py +44 -0
- aspose/cells/converters/__init__.py +13 -0
- aspose/cells/converters/csv_converter.py +55 -0
- aspose/cells/converters/json_converter.py +46 -0
- aspose/cells/converters/markdown_converter.py +453 -0
- aspose/cells/drawing/__init__.py +17 -0
- aspose/cells/drawing/anchor.py +172 -0
- aspose/cells/drawing/collection.py +233 -0
- aspose/cells/drawing/image.py +338 -0
- aspose/cells/formats.py +80 -0
- aspose/cells/formula/__init__.py +10 -0
- aspose/cells/formula/evaluator.py +360 -0
- aspose/cells/formula/functions.py +433 -0
- aspose/cells/formula/tokenizer.py +340 -0
- aspose/cells/io/__init__.py +27 -0
- aspose/cells/io/csv/__init__.py +8 -0
- aspose/cells/io/csv/reader.py +88 -0
- aspose/cells/io/csv/writer.py +98 -0
- aspose/cells/io/factory.py +138 -0
- aspose/cells/io/interfaces.py +48 -0
- aspose/cells/io/json/__init__.py +8 -0
- aspose/cells/io/json/reader.py +126 -0
- aspose/cells/io/json/writer.py +119 -0
- aspose/cells/io/md/__init__.py +8 -0
- aspose/cells/io/md/reader.py +161 -0
- aspose/cells/io/md/writer.py +334 -0
- aspose/cells/io/models.py +64 -0
- aspose/cells/io/xlsx/__init__.py +9 -0
- aspose/cells/io/xlsx/constants.py +312 -0
- aspose/cells/io/xlsx/image_writer.py +311 -0
- aspose/cells/io/xlsx/reader.py +284 -0
- aspose/cells/io/xlsx/writer.py +931 -0
- aspose/cells/plugins/__init__.py +6 -0
- aspose/cells/plugins/docling_backend/__init__.py +7 -0
- aspose/cells/plugins/docling_backend/backend.py +535 -0
- aspose/cells/plugins/markitdown_plugin/__init__.py +15 -0
- aspose/cells/plugins/markitdown_plugin/plugin.py +128 -0
- aspose/cells/range.py +210 -0
- aspose/cells/style.py +287 -0
- aspose/cells/utils/__init__.py +54 -0
- aspose/cells/utils/coordinates.py +68 -0
- aspose/cells/utils/exceptions.py +43 -0
- aspose/cells/utils/validation.py +102 -0
- aspose/cells/workbook.py +352 -0
- aspose/cells/worksheet.py +670 -0
- aspose_cells_foss-25.12.1.dist-info/METADATA +189 -0
- aspose_cells_foss-25.12.1.dist-info/RECORD +53 -0
- aspose_cells_foss-25.12.1.dist-info/WHEEL +5 -0
- aspose_cells_foss-25.12.1.dist-info/entry_points.txt +2 -0
- aspose_cells_foss-25.12.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,535 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Docling backend using Aspose.Cells for Excel processing.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
from io import BytesIO
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Union, cast
|
|
10
|
+
|
|
11
|
+
from docling_core.types.doc import (
|
|
12
|
+
BoundingBox,
|
|
13
|
+
CoordOrigin,
|
|
14
|
+
DocItem,
|
|
15
|
+
DoclingDocument,
|
|
16
|
+
DocumentOrigin,
|
|
17
|
+
GroupLabel,
|
|
18
|
+
ImageRef,
|
|
19
|
+
ProvenanceItem,
|
|
20
|
+
Size,
|
|
21
|
+
TableCell,
|
|
22
|
+
TableData,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class AsposeCellsDoclingDocument(DoclingDocument):
    """DoclingDocument that can return Markdown pre-rendered by Aspose.Cells.

    The Markdown text is kept in ``_aspose_markdown_content``. When that value
    is present and non-empty, :meth:`export_to_markdown` returns it verbatim;
    otherwise the stock docling exporter is used.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # None means "no pre-rendered Markdown has been attached yet".
        self._aspose_markdown_content = None

    def export_to_markdown(self, **kwargs) -> str:
        """Return the attached Markdown, or defer to docling's own export.

        Args:
            **kwargs: Passed through to ``DoclingDocument.export_to_markdown``
                when no attached Markdown is available.

        Returns:
            The Markdown representation of the document.
        """
        # getattr with a default tolerates the attribute being absent.
        cached = getattr(self, "_aspose_markdown_content", None)
        if not cached:
            # Fallback to original docling export
            return super().export_to_markdown(**kwargs)
        return cached
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
from PIL import Image as PILImage
|
|
43
|
+
from pydantic import BaseModel, NonNegativeInt, PositiveInt
|
|
44
|
+
from typing_extensions import override
|
|
45
|
+
|
|
46
|
+
from docling.backend.abstract_backend import (
|
|
47
|
+
DeclarativeDocumentBackend,
|
|
48
|
+
PaginatedDocumentBackend,
|
|
49
|
+
)
|
|
50
|
+
from docling.datamodel.base_models import InputFormat
|
|
51
|
+
from docling.datamodel.document import InputDocument
|
|
52
|
+
|
|
53
|
+
# Import our Aspose.Cells modules
|
|
54
|
+
from aspose.cells import Workbook, Worksheet
|
|
55
|
+
|
|
56
|
+
_log = logging.getLogger(__name__)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class ExcelCell(BaseModel):
    """One cell of a table extracted from a worksheet.

    Attributes:
        row: Row index of the cell.
        col: Column index of the cell.
        text: Cell content as text.
        row_span: How many rows the cell covers.
        col_span: How many columns the cell covers.
    """

    # Position of the cell.
    row: int
    col: int
    # Textual content.
    text: str
    # Extent of the cell in rows and columns.
    row_span: int
    col_span: int
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class ExcelTable(BaseModel):
    """A rectangular block of cells found on a worksheet.

    Attributes:
        anchor: ``(column, row)`` indices of the table's upper-left cell
            (0-based index).
        num_rows: Row count of the table.
        num_cols: Column count of the table.
        data: The table's cells as a list of ExcelCell objects.
    """

    # (column, row) of the upper-left cell; both values must be >= 0.
    anchor: tuple[NonNegativeInt, NonNegativeInt]
    # Table dimensions.
    num_rows: int
    num_cols: int
    # Cell payload.
    data: list[ExcelCell]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class CellsDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
|
|
95
|
+
"""Backend for parsing Excel workbooks using Aspose.Cells.
|
|
96
|
+
|
|
97
|
+
The backend converts an Excel workbook into a DoclingDocument object.
|
|
98
|
+
Each worksheet is converted into a separate page.
|
|
99
|
+
The following elements are parsed:
|
|
100
|
+
- Cell contents, parsed as tables. If two groups of cells are disconnected
|
|
101
|
+
between each other, they will be parsed as two different tables.
|
|
102
|
+
- Images, parsed as PictureItem objects.
|
|
103
|
+
|
|
104
|
+
The DoclingDocument tables and pictures have their provenance information, including
|
|
105
|
+
the position in their original Excel worksheet. The position is represented by a
|
|
106
|
+
bounding box object with the cell indices as units (0-based index). The size of this
|
|
107
|
+
bounding box is the number of columns and rows that the table or picture spans.
|
|
108
|
+
"""
|
|
109
|
+
|
|
110
|
+
@override
def __init__(
    self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path], **kwargs
) -> None:
    """Initialize the CellsDocumentBackend object.

    Args:
        in_doc: The input document object.
        path_or_stream: The path or stream to the Excel file.
        **kwargs: Conversion options, stored on ``self.conversion_kwargs``
            for later use during conversion.

    Raises:
        RuntimeError: An error occurred parsing the file.
    """
    super().__init__(in_doc, path_or_stream)

    # Store conversion parameters
    self.conversion_kwargs = kwargs

    # Initialise the parents for the hierarchy (levels -1 .. max_levels - 1)
    self.max_levels = 10
    self.parents: dict[int, Any] = dict.fromkeys(range(-1, self.max_levels))

    self.workbook = None
    try:
        if isinstance(self.path_or_stream, BytesIO):
            # Workbook.load is given a file path, so the in-memory stream is
            # first persisted to a temporary file.
            import os
            import tempfile

            tmp = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False)
            try:
                with tmp:
                    tmp.write(self.path_or_stream.getvalue())
                self.workbook = Workbook.load(tmp.name)
            finally:
                # Remove the temp file even when writing or loading fails,
                # so a bad workbook does not leave files behind.
                os.unlink(tmp.name)

        elif isinstance(self.path_or_stream, Path):
            self.workbook = Workbook.load(str(self.path_or_stream))

        self.valid = self.workbook is not None
    except Exception as e:
        self.valid = False
        raise RuntimeError(
            f"CellsDocumentBackend could not load document with hash {self.document_hash}"
        ) from e
|
|
156
|
+
|
|
157
|
+
@override
def is_valid(self) -> bool:
    """Report whether the workbook was loaded successfully.

    Returns:
        The ``valid`` flag set during initialization.
    """
    state = self.valid
    _log.debug(f"valid: {state}")
    return state
|
|
161
|
+
|
|
162
|
+
@classmethod
@override
def supports_pagination(cls) -> bool:
    """Return True: this backend always reports pagination support."""
    return True
|
|
166
|
+
|
|
167
|
+
@override
def page_count(self) -> int:
    """Return the number of pages, i.e. the number of sheet names.

    Returns:
        ``len(self.workbook.sheetnames)`` when the backend is valid and a
        workbook is loaded, otherwise 0.
    """
    if not (self.is_valid() and self.workbook):
        return 0
    return len(self.workbook.sheetnames)
|
|
173
|
+
|
|
174
|
+
@classmethod
@override
def supported_formats(cls) -> set[InputFormat]:
    """Return the input formats handled by this backend (XLSX only)."""
    formats = {InputFormat.XLSX}
    return formats
|
|
178
|
+
|
|
179
|
+
@override
def convert(self, **kwargs) -> DoclingDocument:
    """Build a DoclingDocument from the loaded Excel workbook.

    Args:
        **kwargs: Accepted for interface compatibility; not used here.

    Raises:
        RuntimeError: The backend failed to initialize, so there is no
            workbook to convert.

    Returns:
        The DoclingDocument object representing the Excel workbook.
    """
    fallback_name = "file.xlsx"
    origin = DocumentOrigin(
        filename=self.file.name or fallback_name,
        mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        binary_hash=self.document_hash,
    )
    doc = AsposeCellsDoclingDocument(
        name=self.file.stem or fallback_name, origin=origin
    )

    if not self.is_valid():
        raise RuntimeError(
            f"Cannot convert doc with {self.document_hash} because the backend failed to init."
        )

    return self._convert_workbook_with_markdown(doc)
|
|
206
|
+
|
|
207
|
+
def _convert_workbook_with_markdown(
    self, doc: AsposeCellsDoclingDocument
) -> AsposeCellsDoclingDocument:
    """Render the workbook to Markdown and attach it to the document.

    The Markdown produced by ``MarkdownConverter.convert_workbook`` is stored
    on ``doc._aspose_markdown_content``; afterwards the regular docling
    structure is still added through ``_convert_workbook``.

    Args:
        doc: The document that receives the Markdown and the parsed items.

    Returns:
        The document returned by ``_convert_workbook``.
    """
    # Use our MarkdownConverter instead of custom docling logic
    from ...converters.markdown_converter import MarkdownConverter

    # Recognised options and the value used when the caller did not set them.
    option_defaults = {
        "sheet_name": None,
        "include_metadata": True,
        "value_mode": "value",
        "include_hyperlinks": True,
    }
    options = {
        name: self.conversion_kwargs.get(name, default)
        for name, default in option_defaults.items()
    }

    # Store the markdown content in the document for export
    doc._aspose_markdown_content = MarkdownConverter().convert_workbook(
        self.workbook, **options
    )

    # Still add basic docling structure for compatibility
    return self._convert_workbook(doc)
|
|
233
|
+
|
|
234
|
+
def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
    """Add one page and one section group per worksheet to the document.

    Args:
        doc: A DoclingDocument object.

    Returns:
        The DoclingDocument with the parsed items; returned without new items
        (after logging an error) when no workbook is loaded.
    """
    workbook = self.workbook
    if workbook is None:
        _log.error("Workbook is not initialized.")
        return doc

    # Pages are numbered from 1 in sheet order.
    for page_no, sheet_name in enumerate(workbook.sheetnames, start=1):
        _log.info(f"Processing sheet: {sheet_name}")

        sheet = workbook.worksheets[sheet_name]
        # The real size is only known once the sheet's items have been added.
        page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))

        self.parents[0] = doc.add_group(
            parent=None,
            label=GroupLabel.SECTION,
            name=f"sheet: {sheet_name}",
        )
        doc = self._convert_sheet(doc, sheet, page_no)

        width, height = self._find_page_size(doc, page_no)
        page.size = Size(width=width, height=height)

    return doc
|
|
266
|
+
|
|
267
|
+
def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet, page_no: int) -> DoclingDocument:
|
|
268
|
+
"""Parse an Excel worksheet and attach its structure to a DoclingDocument
|
|
269
|
+
|
|
270
|
+
Args:
|
|
271
|
+
doc: The DoclingDocument to be updated.
|
|
272
|
+
sheet: The Excel worksheet to be parsed.
|
|
273
|
+
page_no: The page number for this sheet.
|
|
274
|
+
|
|
275
|
+
Returns:
|
|
276
|
+
The updated DoclingDocument.
|
|
277
|
+
"""
|
|
278
|
+
|
|
279
|
+
doc = self._find_tables_in_sheet(doc, sheet, page_no)
|
|
280
|
+
doc = self._find_images_in_sheet(doc, sheet, page_no)
|
|
281
|
+
|
|
282
|
+
return doc
|
|
283
|
+
|
|
284
|
+
def _find_tables_in_sheet(
|
|
285
|
+
self, doc: DoclingDocument, sheet: Worksheet, page_no: int
|
|
286
|
+
) -> DoclingDocument:
|
|
287
|
+
"""Find all tables in an Excel sheet and attach them to a DoclingDocument.
|
|
288
|
+
|
|
289
|
+
Args:
|
|
290
|
+
doc: The DoclingDocument to be updated.
|
|
291
|
+
sheet: The Excel worksheet to be parsed.
|
|
292
|
+
page_no: The page number for this sheet.
|
|
293
|
+
|
|
294
|
+
Returns:
|
|
295
|
+
The updated DoclingDocument.
|
|
296
|
+
"""
|
|
297
|
+
|
|
298
|
+
if self.workbook is not None:
|
|
299
|
+
tables = self._find_data_tables(sheet)
|
|
300
|
+
|
|
301
|
+
for excel_table in tables:
|
|
302
|
+
origin_col = excel_table.anchor[0]
|
|
303
|
+
origin_row = excel_table.anchor[1]
|
|
304
|
+
num_rows = excel_table.num_rows
|
|
305
|
+
num_cols = excel_table.num_cols
|
|
306
|
+
|
|
307
|
+
table_data = TableData(
|
|
308
|
+
num_rows=num_rows,
|
|
309
|
+
num_cols=num_cols,
|
|
310
|
+
table_cells=[],
|
|
311
|
+
)
|
|
312
|
+
|
|
313
|
+
for excel_cell in excel_table.data:
|
|
314
|
+
cell = TableCell(
|
|
315
|
+
text=excel_cell.text,
|
|
316
|
+
row_span=excel_cell.row_span,
|
|
317
|
+
col_span=excel_cell.col_span,
|
|
318
|
+
start_row_offset_idx=excel_cell.row,
|
|
319
|
+
end_row_offset_idx=excel_cell.row + excel_cell.row_span,
|
|
320
|
+
start_col_offset_idx=excel_cell.col,
|
|
321
|
+
end_col_offset_idx=excel_cell.col + excel_cell.col_span,
|
|
322
|
+
column_header=excel_cell.row == 0,
|
|
323
|
+
row_header=False,
|
|
324
|
+
)
|
|
325
|
+
table_data.table_cells.append(cell)
|
|
326
|
+
|
|
327
|
+
doc.add_table(
|
|
328
|
+
data=table_data,
|
|
329
|
+
parent=self.parents[0],
|
|
330
|
+
prov=ProvenanceItem(
|
|
331
|
+
page_no=page_no,
|
|
332
|
+
charspan=(0, 0),
|
|
333
|
+
bbox=BoundingBox.from_tuple(
|
|
334
|
+
(
|
|
335
|
+
origin_col,
|
|
336
|
+
origin_row,
|
|
337
|
+
origin_col + num_cols,
|
|
338
|
+
origin_row + num_rows,
|
|
339
|
+
),
|
|
340
|
+
origin=CoordOrigin.TOPLEFT,
|
|
341
|
+
),
|
|
342
|
+
),
|
|
343
|
+
)
|
|
344
|
+
|
|
345
|
+
return doc
|
|
346
|
+
|
|
347
|
+
def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
|
|
348
|
+
"""Find all compact rectangular data tables in an Excel worksheet.
|
|
349
|
+
|
|
350
|
+
Args:
|
|
351
|
+
sheet: The Excel worksheet to be parsed.
|
|
352
|
+
|
|
353
|
+
Returns:
|
|
354
|
+
A list of ExcelTable objects representing the data tables.
|
|
355
|
+
"""
|
|
356
|
+
tables: list[ExcelTable] = []
|
|
357
|
+
visited: set[tuple[int, int]] = set()
|
|
358
|
+
|
|
359
|
+
# Get all non-empty cells
|
|
360
|
+
non_empty_cells = []
|
|
361
|
+
for row in range(1, 1000): # Reasonable limit
|
|
362
|
+
for col in range(1, 100): # Reasonable limit
|
|
363
|
+
cell = sheet.cell(row, col)
|
|
364
|
+
if cell.value is not None and str(cell.value).strip():
|
|
365
|
+
non_empty_cells.append((row-1, col-1)) # Convert to 0-based
|
|
366
|
+
|
|
367
|
+
# Group adjacent cells into tables
|
|
368
|
+
for row, col in non_empty_cells:
|
|
369
|
+
if (row, col) in visited:
|
|
370
|
+
continue
|
|
371
|
+
|
|
372
|
+
# Find table bounds starting from this cell
|
|
373
|
+
table_bounds, visited_cells = self._find_table_bounds(sheet, row, col)
|
|
374
|
+
visited.update(visited_cells)
|
|
375
|
+
tables.append(table_bounds)
|
|
376
|
+
|
|
377
|
+
return tables
|
|
378
|
+
|
|
379
|
+
def _find_table_bounds(
    self,
    sheet: Worksheet,
    start_row: int,
    start_col: int,
) -> tuple[ExcelTable, set[tuple[int, int]]]:
    """Build the table that starts at the given cell.

    The extent comes from ``_find_table_bottom`` and ``_find_table_right``;
    every cell inside that rectangle is collected, empty ones as ``""``.

    Args:
        sheet: The Excel worksheet to be parsed.
        start_row: The row number of the starting cell (0-based).
        start_col: The column number of the starting cell (0-based).

    Returns:
        A tuple with an Excel table and a set of cell coordinates.
    """
    _log.debug("find_table_bounds")

    last_row = self._find_table_bottom(sheet, start_row, start_col)
    last_col = self._find_table_right(sheet, start_row, start_col)

    # Merged cells are not detected (simplified): every cell spans 1 x 1.
    row_span = 1
    col_span = 1

    cells = []
    covered: set[tuple[int, int]] = set()
    for row in range(start_row, last_row + 1):
        for col in range(start_col, last_col + 1):
            # Convert to 1-based for our cell access
            raw = sheet.cell(row + 1, col + 1).value

            if (row, col) in covered:
                continue

            cells.append(
                ExcelCell(
                    # Offsets relative to the table's top-left cell.
                    row=row - start_row,
                    col=col - start_col,
                    text=str("" if raw is None else raw),
                    row_span=row_span,
                    col_span=col_span,
                )
            )
            # Mark cells in span as visited
            covered.update(
                (span_row, span_col)
                for span_row in range(row, row + row_span)
                for span_col in range(col, col + col_span)
            )

    table = ExcelTable(
        anchor=(start_col, start_row),
        num_rows=last_row + 1 - start_row,
        num_cols=last_col + 1 - start_col,
        data=cells,
    )
    return table, covered
|
|
439
|
+
|
|
440
|
+
def _find_table_bottom(
|
|
441
|
+
self, sheet: Worksheet, start_row: int, start_col: int
|
|
442
|
+
) -> int:
|
|
443
|
+
"""Find the bottom boundary of a table."""
|
|
444
|
+
max_row = start_row
|
|
445
|
+
|
|
446
|
+
for row in range(start_row + 1, 1000): # Reasonable limit
|
|
447
|
+
cell = sheet.cell(row + 1, start_col + 1) # Convert to 1-based
|
|
448
|
+
if cell.value is None or not str(cell.value).strip():
|
|
449
|
+
break
|
|
450
|
+
max_row = row
|
|
451
|
+
|
|
452
|
+
return max_row
|
|
453
|
+
|
|
454
|
+
def _find_table_right(
|
|
455
|
+
self, sheet: Worksheet, start_row: int, start_col: int
|
|
456
|
+
) -> int:
|
|
457
|
+
"""Find the right boundary of a table."""
|
|
458
|
+
max_col = start_col
|
|
459
|
+
|
|
460
|
+
for col in range(start_col + 1, 100): # Reasonable limit
|
|
461
|
+
cell = sheet.cell(start_row + 1, col + 1) # Convert to 1-based
|
|
462
|
+
if cell.value is None or not str(cell.value).strip():
|
|
463
|
+
break
|
|
464
|
+
max_col = col
|
|
465
|
+
|
|
466
|
+
return max_col
|
|
467
|
+
|
|
468
|
+
def _find_images_in_sheet(
|
|
469
|
+
self, doc: DoclingDocument, sheet: Worksheet, page_no: int
|
|
470
|
+
) -> DoclingDocument:
|
|
471
|
+
"""Find images in the Excel sheet and attach them to the DoclingDocument.
|
|
472
|
+
|
|
473
|
+
Args:
|
|
474
|
+
doc: The DoclingDocument to be updated.
|
|
475
|
+
sheet: The Excel worksheet to be parsed.
|
|
476
|
+
page_no: The page number for this sheet.
|
|
477
|
+
|
|
478
|
+
Returns:
|
|
479
|
+
The updated DoclingDocument.
|
|
480
|
+
"""
|
|
481
|
+
if self.workbook is not None:
|
|
482
|
+
# Check if the sheet has images (simplified implementation)
|
|
483
|
+
if hasattr(sheet, 'images') and sheet.images:
|
|
484
|
+
for image in sheet.images:
|
|
485
|
+
try:
|
|
486
|
+
# Convert our Image to PIL Image for compatibility
|
|
487
|
+
if hasattr(image, 'data') and image.data:
|
|
488
|
+
pil_image = PILImage.open(BytesIO(image.data))
|
|
489
|
+
|
|
490
|
+
# Get anchor information (simplified)
|
|
491
|
+
anchor = (0, 0, 5, 5) # Default anchor
|
|
492
|
+
if hasattr(image, 'anchor') and image.anchor:
|
|
493
|
+
anchor = (
|
|
494
|
+
getattr(image.anchor, 'col', 0),
|
|
495
|
+
getattr(image.anchor, 'row', 0),
|
|
496
|
+
getattr(image.anchor, 'col', 0) + 5,
|
|
497
|
+
getattr(image.anchor, 'row', 0) + 5,
|
|
498
|
+
)
|
|
499
|
+
|
|
500
|
+
doc.add_picture(
|
|
501
|
+
parent=self.parents[0],
|
|
502
|
+
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
|
503
|
+
caption=None,
|
|
504
|
+
prov=ProvenanceItem(
|
|
505
|
+
page_no=page_no,
|
|
506
|
+
charspan=(0, 0),
|
|
507
|
+
bbox=BoundingBox.from_tuple(
|
|
508
|
+
anchor, origin=CoordOrigin.TOPLEFT
|
|
509
|
+
),
|
|
510
|
+
),
|
|
511
|
+
)
|
|
512
|
+
except Exception as e:
|
|
513
|
+
_log.warning(f"Could not extract image from sheet: {e}")
|
|
514
|
+
|
|
515
|
+
return doc
|
|
516
|
+
|
|
517
|
+
@staticmethod
|
|
518
|
+
def _find_page_size(
|
|
519
|
+
doc: DoclingDocument, page_no: PositiveInt
|
|
520
|
+
) -> tuple[float, float]:
|
|
521
|
+
left: float = -1.0
|
|
522
|
+
top: float = -1.0
|
|
523
|
+
right: float = -1.0
|
|
524
|
+
bottom: float = -1.0
|
|
525
|
+
for item, _ in doc.iterate_items(traverse_pictures=True, page_no=page_no):
|
|
526
|
+
if not isinstance(item, DocItem):
|
|
527
|
+
continue
|
|
528
|
+
for provenance in item.prov:
|
|
529
|
+
bbox = provenance.bbox
|
|
530
|
+
left = min(left, bbox.l) if left != -1 else bbox.l
|
|
531
|
+
right = max(right, bbox.r) if right != -1 else bbox.r
|
|
532
|
+
top = min(top, bbox.t) if top != -1 else bbox.t
|
|
533
|
+
bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
|
|
534
|
+
|
|
535
|
+
return (max(right - left, 10.0), max(bottom - top, 10.0))
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MarkItDown Excel Enhancement Plugin
|
|
3
|
+
|
|
4
|
+
Integrates Aspose.Cells.Python Excel-to-Markdown conversion with Microsoft MarkItDown.
|
|
5
|
+
Part of the Aspose.org open source ecosystem.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .plugin import register_converters # re-export for convenience
|
|
9
|
+
|
|
10
|
+
__version__ = "1.1.0"
|
|
11
|
+
__plugin_name__ = "Excel Enhancer"
|
|
12
|
+
__plugin_description__ = "Enhanced Excel processing for MarkItDown (.xlsx only)"
|
|
13
|
+
|
|
14
|
+
# MarkItDown plugin interface
|
|
15
|
+
__plugin_interface_version__ = 1
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MarkItDown Excel plugin that leverages Aspose.Cells.Python's Markdown converter.
|
|
3
|
+
|
|
4
|
+
This plugin integrates Aspose.Cells.Python with Microsoft MarkItDown to provide
|
|
5
|
+
enhanced Excel-to-Markdown conversion with metadata, multi-sheet support,
|
|
6
|
+
and professional formatting.
|
|
7
|
+
|
|
8
|
+
Part of the Aspose.org open source ecosystem.
|
|
9
|
+
"""
|
|
10
|
+
from typing import BinaryIO, Any
|
|
11
|
+
import tempfile
|
|
12
|
+
import os
|
|
13
|
+
import logging
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
__plugin_interface_version__ = 1
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def register_converters(markitdown, **kwargs):
    """Register Aspose.Cells.Python's enhanced Excel converter with MarkItDown.

    NOTE(review): this module defines ``register_converters`` a second time
    further down; that later definition is the one bound after import.

    Args:
        markitdown: Object exposing ``register_converter``.
        **kwargs: Accepted but not used.
    """
    converter = ExcelEnhancerConverter()
    markitdown.register_converter(converter)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ExcelEnhancerConverter:
    """Enhanced Excel converter using Aspose.Cells.Python's MarkdownConverter."""

    # Hints for MarkItDown converter discovery systems
    file_extensions = [".xlsx"]
    mimetypes = ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
    name = "Excel Enhancer"
    priority = 50  # Prefer this converter over generic ones for .xlsx

    def accepts(self, file_stream: BinaryIO, stream_info, **kwargs: Any) -> bool:
        """Return True if the stream describes an .xlsx file.

        Args:
            file_stream: The incoming stream; not inspected here.
            stream_info: Object that may expose ``extension`` or ``suffix``,
                ``filename`` and ``mimetype``; missing or ``None`` attributes
                are treated as empty strings.
            **kwargs: Accepted but not used.

        Returns:
            True when the extension is ``.xlsx``, the filename ends with
            ``.xlsx`` or the mimetype contains the XLSX mimetype
            (all compared case-insensitively).
        """
        # MarkItDown's StreamInfo may expose different fields depending on source
        extension = (
            getattr(stream_info, "extension", None)
            or getattr(stream_info, "suffix", None)
            or ""
        ).lower()
        filename = (getattr(stream_info, "filename", None) or "").lower()
        mimetype = (getattr(stream_info, "mimetype", None) or "").lower()

        # Only support modern .xlsx. A name ending in ".xlsx" can never end in
        # ".xls", so no separate exclusion of legacy files is needed.
        return (
            extension == ".xlsx"
            or filename.endswith(".xlsx")
            or "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" in mimetype
        )

    def convert(self, file_stream: BinaryIO, stream_info, **kwargs: Any):
        """Convert given Excel content to Markdown using our converter.

        The stream (or file path) is copied to a temporary ``.xlsx`` file,
        loaded as a workbook and rendered with ``MarkdownConverter``. The
        temporary file is removed whether or not the conversion succeeds.

        Args:
            file_stream: Readable binary stream, or a path to open when the
                object has no ``read`` method.
            stream_info: Not used by the conversion itself.
            **kwargs: Optional ``sheet_name``, ``include_metadata``,
                ``value_mode``, ``include_hyperlinks`` and
                ``include_generator_info``.

        Returns:
            A ``DocumentConverterResult`` holding the Markdown, or a Markdown
            error report when the conversion fails (no exception is raised).
        """
        try:
            from markitdown import DocumentConverterResult
        except ImportError:
            # Fallback lightweight result object if markitdown is not installed
            class DocumentConverterResult:  # type: ignore
                def __init__(self, text_content):
                    self.text_content = text_content

        tmp_path = None
        try:
            # Persist incoming stream to a temporary .xlsx file
            with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
                # Remember the path immediately so cleanup also covers
                # failures while copying the content.
                tmp_path = tmp.name
                if hasattr(file_stream, "read"):
                    content = file_stream.read()
                    if hasattr(file_stream, "seek"):
                        file_stream.seek(0)  # Reset for potential re-use elsewhere
                    tmp.write(content)
                else:
                    # file_stream may be a file path
                    with open(file_stream, "rb") as f:  # type: ignore[arg-type]
                        tmp.write(f.read())

            # Load workbook using our implementation
            from ...workbook import Workbook

            workbook = Workbook.load(tmp_path)

            # Convert to MarkItDown format optimized for LLMs using enhanced MarkdownConverter
            from ...converters.markdown_converter import MarkdownConverter

            converter = MarkdownConverter()

            convert_kwargs = {
                # Convert specific sheet by name, None means all sheets
                "sheet_name": kwargs.get("sheet_name", None),
                "include_metadata": kwargs.get("include_metadata", True),
                # "value" shows calculated results, "formula" shows formulas
                "value_mode": kwargs.get("value_mode", "value"),
                # Convert hyperlinks to markdown
                "include_hyperlinks": kwargs.get("include_hyperlinks", True),
            }
            markdown_content = converter.convert_workbook(workbook, **convert_kwargs)

            # Optional generator banner for disambiguation in outputs
            if kwargs.get("include_generator_info", False):
                banner = "<!-- Generator: Aspose.Cells.Python MarkItDown Plugin -->\n\n"
                markdown_content = banner + markdown_content

            logger.info("Converted .xlsx using enhanced Excel converter")
            return DocumentConverterResult(markdown_content)

        except Exception as e:  # pragma: no cover - defensive path
            logger.error("Excel conversion failed: %s", e)
            error_msg = (
                "# Excel conversion error\n\n"
                f"Conversion failed: {str(e)}\n\n"
                "Please verify the Excel file is a valid .xlsx workbook."
            )
            return DocumentConverterResult(error_msg)

        finally:
            # Cleanup temp file on success and on failure alike
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    logger.debug("Temp file already removed or locked: %s", tmp_path)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# MarkItDown plugin interface
|
|
120
|
+
__plugin_interface_version__ = 1
|
|
121
|
+
|
|
122
|
+
def register_converters(markitdown, **kwargs):
    """Register Aspose.Cells.Python's enhanced Excel converter with MarkItDown.

    This function is called by MarkItDown when enable_plugins=True.

    NOTE(review): an identical ``register_converters`` is defined earlier in
    this module; being the later definition, this one is the one bound after
    import.

    Args:
        markitdown: Object exposing ``register_converter``.
        **kwargs: Accepted but not used.
    """
    enhancer = ExcelEnhancerConverter()
    markitdown.register_converter(enhancer)
|