aspose-cells-foss 25.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. aspose/__init__.py +14 -0
  2. aspose/cells/__init__.py +31 -0
  3. aspose/cells/cell.py +350 -0
  4. aspose/cells/constants.py +44 -0
  5. aspose/cells/converters/__init__.py +13 -0
  6. aspose/cells/converters/csv_converter.py +55 -0
  7. aspose/cells/converters/json_converter.py +46 -0
  8. aspose/cells/converters/markdown_converter.py +453 -0
  9. aspose/cells/drawing/__init__.py +17 -0
  10. aspose/cells/drawing/anchor.py +172 -0
  11. aspose/cells/drawing/collection.py +233 -0
  12. aspose/cells/drawing/image.py +338 -0
  13. aspose/cells/formats.py +80 -0
  14. aspose/cells/formula/__init__.py +10 -0
  15. aspose/cells/formula/evaluator.py +360 -0
  16. aspose/cells/formula/functions.py +433 -0
  17. aspose/cells/formula/tokenizer.py +340 -0
  18. aspose/cells/io/__init__.py +27 -0
  19. aspose/cells/io/csv/__init__.py +8 -0
  20. aspose/cells/io/csv/reader.py +88 -0
  21. aspose/cells/io/csv/writer.py +98 -0
  22. aspose/cells/io/factory.py +138 -0
  23. aspose/cells/io/interfaces.py +48 -0
  24. aspose/cells/io/json/__init__.py +8 -0
  25. aspose/cells/io/json/reader.py +126 -0
  26. aspose/cells/io/json/writer.py +119 -0
  27. aspose/cells/io/md/__init__.py +8 -0
  28. aspose/cells/io/md/reader.py +161 -0
  29. aspose/cells/io/md/writer.py +334 -0
  30. aspose/cells/io/models.py +64 -0
  31. aspose/cells/io/xlsx/__init__.py +9 -0
  32. aspose/cells/io/xlsx/constants.py +312 -0
  33. aspose/cells/io/xlsx/image_writer.py +311 -0
  34. aspose/cells/io/xlsx/reader.py +284 -0
  35. aspose/cells/io/xlsx/writer.py +931 -0
  36. aspose/cells/plugins/__init__.py +6 -0
  37. aspose/cells/plugins/docling_backend/__init__.py +7 -0
  38. aspose/cells/plugins/docling_backend/backend.py +535 -0
  39. aspose/cells/plugins/markitdown_plugin/__init__.py +15 -0
  40. aspose/cells/plugins/markitdown_plugin/plugin.py +128 -0
  41. aspose/cells/range.py +210 -0
  42. aspose/cells/style.py +287 -0
  43. aspose/cells/utils/__init__.py +54 -0
  44. aspose/cells/utils/coordinates.py +68 -0
  45. aspose/cells/utils/exceptions.py +43 -0
  46. aspose/cells/utils/validation.py +102 -0
  47. aspose/cells/workbook.py +352 -0
  48. aspose/cells/worksheet.py +670 -0
  49. aspose_cells_foss-25.12.1.dist-info/METADATA +189 -0
  50. aspose_cells_foss-25.12.1.dist-info/RECORD +53 -0
  51. aspose_cells_foss-25.12.1.dist-info/WHEEL +5 -0
  52. aspose_cells_foss-25.12.1.dist-info/entry_points.txt +2 -0
  53. aspose_cells_foss-25.12.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,6 @@
1
+ """
2
+ Plugins module for external integrations.
3
+ """
4
+
5
+ # Import plugins for easier access
6
+ from .docling_backend import CellsDocumentBackend
@@ -0,0 +1,7 @@
1
+ """
2
+ Docling backend plugin for Aspose.Cells Excel processing.
3
+ """
4
+
5
+ from .backend import CellsDocumentBackend
6
+
7
+ __all__ = ["CellsDocumentBackend"]
@@ -0,0 +1,535 @@
1
+ """
2
+ Docling backend using Aspose.Cells for Excel processing.
3
+ """
4
+
5
+ import logging
6
+
7
+ from io import BytesIO
8
+ from pathlib import Path
9
+ from typing import Any, Union, cast
10
+
11
+ from docling_core.types.doc import (
12
+ BoundingBox,
13
+ CoordOrigin,
14
+ DocItem,
15
+ DoclingDocument,
16
+ DocumentOrigin,
17
+ GroupLabel,
18
+ ImageRef,
19
+ ProvenanceItem,
20
+ Size,
21
+ TableCell,
22
+ TableData,
23
+ )
24
+
25
+
26
class AsposeCellsDoclingDocument(DoclingDocument):
    """DoclingDocument whose Markdown export prefers pre-rendered Aspose.Cells output.

    If Markdown text has been stored in ``_aspose_markdown_content`` it is
    returned as-is by :meth:`export_to_markdown`; otherwise the regular
    docling exporter is used.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # None (or an empty string) means "no Aspose Markdown available".
        self._aspose_markdown_content = None

    def export_to_markdown(self, **kwargs) -> str:
        """Return the stored Aspose Markdown, or docling's own export when none is stored."""
        stored = getattr(self, '_aspose_markdown_content', None)
        if stored:
            return stored
        # Nothing stored (or the attribute is missing): use the docling default.
        return super().export_to_markdown(**kwargs)
40
+
41
+
42
+ from PIL import Image as PILImage
43
+ from pydantic import BaseModel, NonNegativeInt, PositiveInt
44
+ from typing_extensions import override
45
+
46
+ from docling.backend.abstract_backend import (
47
+ DeclarativeDocumentBackend,
48
+ PaginatedDocumentBackend,
49
+ )
50
+ from docling.datamodel.base_models import InputFormat
51
+ from docling.datamodel.document import InputDocument
52
+
53
+ # Import our Aspose.Cells modules
54
+ from aspose.cells import Workbook, Worksheet
55
+
56
+ _log = logging.getLogger(__name__)
57
+
58
+
59
class ExcelCell(BaseModel):
    """A single cell belonging to an :class:`ExcelTable`.

    Attributes:
        row: Row number of the cell.
        col: Column number of the cell.
        text: Cell content rendered as text.
        row_span: How many rows the cell covers.
        col_span: How many columns the cell covers.
    """

    row: int
    col: int
    text: str
    row_span: int
    col_span: int
75
+
76
+
77
class ExcelTable(BaseModel):
    """A rectangular block of cells detected on a worksheet.

    Attributes:
        anchor: ``(column, row)`` indices of the table's upper-left cell
            (0-based).
        num_rows: Number of rows covered by the table.
        num_cols: Number of columns covered by the table.
        data: The table content as a list of ExcelCell objects.
    """

    anchor: tuple[NonNegativeInt, NonNegativeInt]
    num_rows: int
    num_cols: int
    data: list[ExcelCell]
92
+
93
+
94
class CellsDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
    """Backend for parsing Excel workbooks using Aspose.Cells.

    The backend converts an Excel workbook into a DoclingDocument object.
    Each worksheet is converted into a separate page.
    The following elements are parsed:
    - Cell contents, parsed as tables. If two groups of cells are disconnected
    between each other, they will be parsed as two different tables.
    - Images, parsed as PictureItem objects.

    The DoclingDocument tables and pictures have their provenance information, including
    the position in their original Excel worksheet. The position is represented by a
    bounding box object with the cell indices as units (0-based index). The size of this
    bounding box is the number of columns and rows that the table or picture spans.

    Table detection only inspects a bounded window of each sheet; the window is
    controlled by ``_MAX_SCAN_ROWS`` and ``_MAX_SCAN_COLS`` and can be widened
    by overriding them in a subclass.
    """

    # Exclusive upper bounds used by the cell scans (previously the literals
    # 1000 and 100 repeated in three methods).
    _MAX_SCAN_ROWS = 1000
    _MAX_SCAN_COLS = 100

    @override
    def __init__(
        self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path], **kwargs
    ) -> None:
        """Initialize the CellsDocumentBackend object.

        Parameters:
            in_doc: The input document object.
            path_or_stream: The path or stream to the Excel file.
            **kwargs: Conversion options kept in ``conversion_kwargs`` and read
                later when the workbook is rendered to Markdown.

        Raises:
            RuntimeError: An error occurred parsing the file.
        """
        super().__init__(in_doc, path_or_stream)

        # Store conversion parameters
        self.conversion_kwargs = kwargs

        # Initialise the parents for the hierarchy (levels -1 .. max_levels - 1)
        self.max_levels = 10
        self.parents: dict[int, Any] = {
            level: None for level in range(-1, self.max_levels)
        }

        self.workbook = None
        try:
            if isinstance(self.path_or_stream, BytesIO):
                self.workbook = self._load_from_stream(self.path_or_stream)
            elif isinstance(self.path_or_stream, Path):
                self.workbook = Workbook.load(str(self.path_or_stream))

            # Any other input type leaves the workbook unset and the backend invalid.
            self.valid = self.workbook is not None
        except Exception as e:
            self.valid = False
            raise RuntimeError(
                f"CellsDocumentBackend could not load document with hash {self.document_hash}"
            ) from e

    @staticmethod
    def _load_from_stream(stream: BytesIO) -> Workbook:
        """Load a workbook from an in-memory stream via a temporary ``.xlsx`` file.

        The temporary file is removed even when loading fails, so a corrupt
        upload does not leave files behind in the temp directory.
        """
        import os
        import tempfile

        tmp = tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False)
        try:
            # Close the handle before loading so the file can be reopened by path.
            with tmp:
                tmp.write(stream.getvalue())
            return Workbook.load(tmp.name)
        finally:
            try:
                os.unlink(tmp.name)
            except OSError:
                _log.debug("Temp file already removed or locked: %s", tmp.name)

    @staticmethod
    def _has_content(value: Any) -> bool:
        """Return True when a cell value is neither None nor blank text."""
        return value is not None and bool(str(value).strip())

    @override
    def is_valid(self) -> bool:
        """Report whether the workbook was loaded successfully."""
        _log.debug("valid: %s", self.valid)
        return self.valid

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """Worksheets are exposed as pages, so pagination is supported."""
        return True

    @override
    def page_count(self) -> int:
        """Return the number of worksheets, or 0 when no workbook is loaded."""
        if self.is_valid() and self.workbook:
            return len(self.workbook.sheetnames)
        else:
            return 0

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the input formats handled by this backend (XLSX only)."""
        return {InputFormat.XLSX}

    @override
    def convert(self, **kwargs) -> DoclingDocument:
        """Parse the Excel workbook into a DoclingDocument object.

        Raises:
            RuntimeError: Unable to run the conversion since the backend object failed to
            initialize.

        Returns:
            The DoclingDocument object representing the Excel workbook.
        """
        origin = DocumentOrigin(
            filename=self.file.name or "file.xlsx",
            mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            binary_hash=self.document_hash,
        )

        doc = AsposeCellsDoclingDocument(name=self.file.stem or "file.xlsx", origin=origin)

        if self.is_valid():
            doc = self._convert_workbook_with_markdown(doc)
        else:
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )

        return doc

    def _convert_workbook_with_markdown(self, doc: AsposeCellsDoclingDocument) -> AsposeCellsDoclingDocument:
        """Convert workbook using our MarkdownConverter and embed result in DoclingDocument."""

        # Use our MarkdownConverter instead of custom docling logic
        from ...converters.markdown_converter import MarkdownConverter

        converter = MarkdownConverter()

        # Use same parameters as markitdown plugin
        convert_kwargs = {
            "sheet_name": self.conversion_kwargs.get("sheet_name", None),
            "include_metadata": self.conversion_kwargs.get("include_metadata", True),
            "value_mode": self.conversion_kwargs.get("value_mode", "value"),
            "include_hyperlinks": self.conversion_kwargs.get("include_hyperlinks", True),
        }

        # Convert workbook to markdown using our converter
        markdown_content = converter.convert_workbook(self.workbook, **convert_kwargs)

        # Store the markdown content in the document for export
        doc._aspose_markdown_content = markdown_content

        # Still add basic docling structure for compatibility
        doc = self._convert_workbook(doc)

        return doc

    def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
        """Parse the Excel workbook and attach its structure to a DoclingDocument.

        Args:
            doc: A DoclingDocument object.

        Returns:
            A DoclingDocument object with the parsed items.
        """

        if self.workbook is None:
            _log.error("Workbook is not initialized.")
            return doc

        # One page and one section group per sheet; pages are numbered from 1.
        for page_no, sheet_name in enumerate(self.workbook.sheetnames, start=1):
            _log.info("Processing sheet: %s", sheet_name)

            sheet = self.workbook.worksheets[sheet_name]
            # Add page with initial size; the real size is known only after parsing.
            page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))

            self.parents[0] = doc.add_group(
                parent=None,
                label=GroupLabel.SECTION,
                name=f"sheet: {sheet_name}",
            )
            doc = self._convert_sheet(doc, sheet, page_no)
            width, height = self._find_page_size(doc, page_no)
            page.size = Size(width=width, height=height)

        return doc

    def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet, page_no: int) -> DoclingDocument:
        """Parse an Excel worksheet and attach its structure to a DoclingDocument

        Args:
            doc: The DoclingDocument to be updated.
            sheet: The Excel worksheet to be parsed.
            page_no: The page number for this sheet.

        Returns:
            The updated DoclingDocument.
        """

        doc = self._find_tables_in_sheet(doc, sheet, page_no)
        doc = self._find_images_in_sheet(doc, sheet, page_no)

        return doc

    def _find_tables_in_sheet(
        self, doc: DoclingDocument, sheet: Worksheet, page_no: int
    ) -> DoclingDocument:
        """Find all tables in an Excel sheet and attach them to a DoclingDocument.

        Args:
            doc: The DoclingDocument to be updated.
            sheet: The Excel worksheet to be parsed.
            page_no: The page number for this sheet.

        Returns:
            The updated DoclingDocument.
        """

        if self.workbook is None:
            return doc

        for excel_table in self._find_data_tables(sheet):
            origin_col, origin_row = excel_table.anchor
            num_rows = excel_table.num_rows
            num_cols = excel_table.num_cols

            table_data = TableData(
                num_rows=num_rows,
                num_cols=num_cols,
                table_cells=[],
            )

            for excel_cell in excel_table.data:
                cell = TableCell(
                    text=excel_cell.text,
                    row_span=excel_cell.row_span,
                    col_span=excel_cell.col_span,
                    start_row_offset_idx=excel_cell.row,
                    end_row_offset_idx=excel_cell.row + excel_cell.row_span,
                    start_col_offset_idx=excel_cell.col,
                    end_col_offset_idx=excel_cell.col + excel_cell.col_span,
                    # The first row of every detected table is treated as its header.
                    column_header=excel_cell.row == 0,
                    row_header=False,
                )
                table_data.table_cells.append(cell)

            doc.add_table(
                data=table_data,
                parent=self.parents[0],
                prov=ProvenanceItem(
                    page_no=page_no,
                    charspan=(0, 0),
                    bbox=BoundingBox.from_tuple(
                        (
                            origin_col,
                            origin_row,
                            origin_col + num_cols,
                            origin_row + num_rows,
                        ),
                        origin=CoordOrigin.TOPLEFT,
                    ),
                ),
            )

        return doc

    def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
        """Find all compact rectangular data tables in an Excel worksheet.

        Args:
            sheet: The Excel worksheet to be parsed.

        Returns:
            A list of ExcelTable objects representing the data tables.
        """
        tables: list[ExcelTable] = []
        visited: set[tuple[int, int]] = set()

        # Collect the 0-based coordinates of every non-empty cell inside the
        # scan window (sheet.cell is addressed with 1-based indices).
        non_empty_cells = [
            (row - 1, col - 1)
            for row in range(1, self._MAX_SCAN_ROWS)
            for col in range(1, self._MAX_SCAN_COLS)
            if self._has_content(sheet.cell(row, col).value)
        ]

        # Group adjacent cells into tables
        for row, col in non_empty_cells:
            if (row, col) in visited:
                continue

            # Find table bounds starting from this cell
            table_bounds, visited_cells = self._find_table_bounds(sheet, row, col)
            visited.update(visited_cells)
            tables.append(table_bounds)

        return tables

    def _find_table_bounds(
        self,
        sheet: Worksheet,
        start_row: int,
        start_col: int,
    ) -> tuple[ExcelTable, set[tuple[int, int]]]:
        """Determine the bounds of a compact rectangular table.

        Args:
            sheet: The Excel worksheet to be parsed.
            start_row: The row number of the starting cell (0-based).
            start_col: The column number of the starting cell (0-based).

        Returns:
            A tuple with an Excel table and a set of cell coordinates.
        """
        _log.debug("find_table_bounds")

        max_row = self._find_table_bottom(sheet, start_row, start_col)
        max_col = self._find_table_right(sheet, start_row, start_col)

        # Collect the data within the bounds
        data = []
        visited_cells: set[tuple[int, int]] = set()

        for row in range(start_row, max_row + 1):
            for col in range(start_col, max_col + 1):
                # Convert to 1-based for our cell access
                cell = sheet.cell(row + 1, col + 1)

                # Check for merged cells (simplified - assume no merging for now)
                row_span = 1
                col_span = 1

                if (row, col) not in visited_cells:
                    cell_value = cell.value if cell.value is not None else ""
                    data.append(
                        ExcelCell(
                            row=row - start_row,
                            col=col - start_col,
                            text=str(cell_value),
                            row_span=row_span,
                            col_span=col_span,
                        )
                    )

                    # Mark cells in span as visited
                    for span_row in range(row, row + row_span):
                        for span_col in range(col, col + col_span):
                            visited_cells.add((span_row, span_col))

        return (
            ExcelTable(
                anchor=(start_col, start_row),
                num_rows=max_row + 1 - start_row,
                num_cols=max_col + 1 - start_col,
                data=data,
            ),
            visited_cells,
        )

    def _find_table_bottom(
        self, sheet: Worksheet, start_row: int, start_col: int
    ) -> int:
        """Find the bottom boundary of a table.

        Walks down the starting column from ``start_row`` (0-based) and returns
        the index of the last consecutive row whose cell has content.
        """
        max_row = start_row

        for row in range(start_row + 1, self._MAX_SCAN_ROWS):
            cell = sheet.cell(row + 1, start_col + 1)  # Convert to 1-based
            if not self._has_content(cell.value):
                break
            max_row = row

        return max_row

    def _find_table_right(
        self, sheet: Worksheet, start_row: int, start_col: int
    ) -> int:
        """Find the right boundary of a table.

        Walks right along the starting row from ``start_col`` (0-based) and
        returns the index of the last consecutive column whose cell has content.
        """
        max_col = start_col

        for col in range(start_col + 1, self._MAX_SCAN_COLS):
            cell = sheet.cell(start_row + 1, col + 1)  # Convert to 1-based
            if not self._has_content(cell.value):
                break
            max_col = col

        return max_col

    def _find_images_in_sheet(
        self, doc: DoclingDocument, sheet: Worksheet, page_no: int
    ) -> DoclingDocument:
        """Find images in the Excel sheet and attach them to the DoclingDocument.

        Images that cannot be decoded are skipped with a warning so that one bad
        picture does not abort the conversion of the whole sheet.

        Args:
            doc: The DoclingDocument to be updated.
            sheet: The Excel worksheet to be parsed.
            page_no: The page number for this sheet.

        Returns:
            The updated DoclingDocument.
        """
        if self.workbook is None:
            return doc

        # Check if the sheet has images (simplified implementation)
        if not (hasattr(sheet, 'images') and sheet.images):
            return doc

        for image in sheet.images:
            try:
                # Only images that carry raw bytes can be converted.
                if not (hasattr(image, 'data') and image.data):
                    continue

                # Convert our Image to PIL Image for compatibility
                pil_image = PILImage.open(BytesIO(image.data))

                # Get anchor information (simplified): the real extent is not
                # read, so every picture is given a nominal 5x5 cell box.
                anchor = (0, 0, 5, 5)  # Default anchor
                if hasattr(image, 'anchor') and image.anchor:
                    anchor = (
                        getattr(image.anchor, 'col', 0),
                        getattr(image.anchor, 'row', 0),
                        getattr(image.anchor, 'col', 0) + 5,
                        getattr(image.anchor, 'row', 0) + 5,
                    )

                doc.add_picture(
                    parent=self.parents[0],
                    image=ImageRef.from_pil(image=pil_image, dpi=72),
                    caption=None,
                    prov=ProvenanceItem(
                        page_no=page_no,
                        charspan=(0, 0),
                        bbox=BoundingBox.from_tuple(
                            anchor, origin=CoordOrigin.TOPLEFT
                        ),
                    ),
                )
            except Exception as e:
                _log.warning("Could not extract image from sheet: %s", e)

        return doc

    @staticmethod
    def _find_page_size(
        doc: DoclingDocument, page_no: PositiveInt
    ) -> tuple[float, float]:
        """Return ``(width, height)`` of the box enclosing all items on a page.

        The size is the union of the provenance bounding boxes of every DocItem
        on ``page_no``; each dimension is at least 10.0.
        """
        # -1 marks "no bounding box seen yet" for each edge.
        left: float = -1.0
        top: float = -1.0
        right: float = -1.0
        bottom: float = -1.0
        for item, _ in doc.iterate_items(traverse_pictures=True, page_no=page_no):
            if not isinstance(item, DocItem):
                continue
            for provenance in item.prov:
                bbox = provenance.bbox
                left = min(left, bbox.l) if left != -1 else bbox.l
                right = max(right, bbox.r) if right != -1 else bbox.r
                top = min(top, bbox.t) if top != -1 else bbox.t
                bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b

        return (max(right - left, 10.0), max(bottom - top, 10.0))
@@ -0,0 +1,15 @@
1
"""
MarkItDown Excel Enhancement Plugin.

Package entry point that connects the Aspose.Cells.Python Excel-to-Markdown
converter to Microsoft MarkItDown.
Part of the Aspose.org open source ecosystem.
"""

# Re-exported so the package itself exposes register_converters.
from .plugin import register_converters

# Plugin metadata
__version__ = "1.1.0"
__plugin_name__ = "Excel Enhancer"
__plugin_description__ = "Enhanced Excel processing for MarkItDown (.xlsx only)"

# MarkItDown plugin interface
__plugin_interface_version__ = 1
@@ -0,0 +1,128 @@
1
+ """
2
+ MarkItDown Excel plugin that leverages Aspose.Cells.Python's Markdown converter.
3
+
4
+ This plugin integrates Aspose.Cells.Python with Microsoft MarkItDown to provide
5
+ enhanced Excel-to-Markdown conversion with metadata, multi-sheet support,
6
+ and professional formatting.
7
+
8
+ Part of the Aspose.org open source ecosystem.
9
+ """
10
+ from typing import BinaryIO, Any
11
+ import tempfile
12
+ import os
13
+ import logging
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ __plugin_interface_version__ = 1
18
+
19
+
20
def register_converters(markitdown, **kwargs):
    """Register Aspose.Cells.Python's enhanced Excel converter with MarkItDown."""
    # NOTE: this module defines register_converters a second time after
    # ExcelEnhancerConverter; at import time that later definition is the one
    # bound to the name. Both bodies perform the same registration.
    enhancer = ExcelEnhancerConverter()
    markitdown.register_converter(enhancer)
23
+
24
+
25
class ExcelEnhancerConverter:
    """Enhanced Excel converter using Aspose.Cells.Python's MarkItDownConverter."""

    # Hints for MarkItDown converter discovery systems
    file_extensions = [".xlsx"]
    mimetypes = ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
    name = "Excel Enhancer"
    priority = 50  # Prefer this converter over generic ones for .xlsx

    def accepts(self, file_stream: BinaryIO, stream_info, **kwargs: Any) -> bool:
        """Return True if the stream describes an .xlsx file.

        The decision is based only on ``stream_info`` metadata (extension or
        suffix, filename, mimetype); ``file_stream`` itself is not read.
        """
        # MarkItDown's StreamInfo may expose different fields depending on source
        extension = (
            getattr(stream_info, "extension", None)
            or getattr(stream_info, "suffix", None)
            or ""
        ).lower()
        filename = (getattr(stream_info, "filename", None) or "").lower()
        mimetype = (getattr(stream_info, "mimetype", None) or "").lower()

        # Only support modern .xlsx. A name ending in ".xlsx" can never also end
        # in ".xls", so no separate legacy-format exclusion is required.
        return (
            extension == ".xlsx"
            or filename.endswith(".xlsx")
            or "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" in mimetype
        )

    def convert(self, file_stream: BinaryIO, stream_info, **kwargs: Any):
        """Convert given Excel content to Markdown using our converter.

        The content is spooled to a temporary ``.xlsx`` file, loaded as a
        workbook and rendered with MarkdownConverter. The temporary file is
        removed whether or not the conversion succeeds.

        Recognised keyword options: ``sheet_name``, ``include_metadata``,
        ``value_mode``, ``include_hyperlinks`` and ``include_generator_info``.

        Returns:
            A DocumentConverterResult holding the Markdown. On failure the
            result holds a Markdown-formatted error message instead of the
            exception being raised.
        """
        try:
            from markitdown import DocumentConverterResult
        except ImportError:
            # Fallback lightweight result object if markitdown is not installed
            class DocumentConverterResult:  # type: ignore
                def __init__(self, text_content):
                    self.text_content = text_content

        tmp_path = None
        try:
            # Persist incoming stream to a temporary .xlsx file
            with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
                # Record the path first so cleanup still runs if writing fails.
                tmp_path = tmp.name
                if hasattr(file_stream, "read"):
                    content = file_stream.read()
                    if hasattr(file_stream, "seek"):
                        file_stream.seek(0)  # Reset for potential re-use elsewhere
                    tmp.write(content)
                else:
                    # file_stream may be a file path
                    with open(file_stream, "rb") as f:  # type: ignore[arg-type]
                        tmp.write(f.read())

            # Load workbook using our implementation
            from ...workbook import Workbook

            workbook = Workbook.load(tmp_path)

            # Convert to MarkItDown format optimized for LLMs using enhanced MarkdownConverter
            from ...converters.markdown_converter import MarkdownConverter

            converter = MarkdownConverter()

            # Simplified and optimized parameters for better user experience
            convert_kwargs = {
                "sheet_name": kwargs.get("sheet_name", None),  # Convert specific sheet by name, None means all sheets
                "include_metadata": kwargs.get("include_metadata", True),
                "value_mode": kwargs.get("value_mode", "value"),  # "value" shows calculated results, "formula" shows formulas
                "include_hyperlinks": kwargs.get("include_hyperlinks", True),  # Convert hyperlinks to markdown
            }
            markdown_content = converter.convert_workbook(workbook, **convert_kwargs)

            # Optional generator banner for disambiguation in outputs
            if kwargs.get("include_generator_info", False):
                banner = "<!-- Generator: Aspose.Cells.Python MarkItDown Plugin -->\n\n"
                markdown_content = banner + markdown_content

            logger.info("Converted .xlsx using enhanced Excel converter")
            return DocumentConverterResult(markdown_content)

        except Exception as e:  # pragma: no cover - defensive path
            logger.error("Excel conversion failed: %s", e)
            error_msg = (
                "# Excel conversion error\n\n"
                f"Conversion failed: {str(e)}\n\n"
                "Please verify the Excel file is a valid .xlsx workbook."
            )
            return DocumentConverterResult(error_msg)

        finally:
            # Cleanup temp file on every path, including failed conversions
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    logger.debug("Temp file already removed or locked: %s", tmp_path)
117
+
118
+
119
+ # MarkItDown plugin interface
120
+ __plugin_interface_version__ = 1
121
+
122
def register_converters(markitdown, **kwargs):
    """
    Hook Aspose.Cells.Python's enhanced Excel converter into MarkItDown.

    MarkItDown calls this function when enable_plugins=True.
    """
    excel_converter = ExcelEnhancerConverter()
    markitdown.register_converter(excel_converter)