docling 2.28.4__tar.gz → 2.30.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. {docling-2.28.4 → docling-2.30.0}/PKG-INFO +3 -3
  2. {docling-2.28.4 → docling-2.30.0}/docling/backend/docx/latex/latex_dict.py +3 -0
  3. {docling-2.28.4 → docling-2.30.0}/docling/backend/docx/latex/omml.py +14 -14
  4. {docling-2.28.4 → docling-2.30.0}/docling/backend/html_backend.py +2 -1
  5. docling-2.30.0/docling/backend/msexcel_backend.py +525 -0
  6. {docling-2.28.4 → docling-2.30.0}/docling/backend/mspowerpoint_backend.py +4 -3
  7. {docling-2.28.4 → docling-2.30.0}/docling/backend/msword_backend.py +320 -118
  8. {docling-2.28.4 → docling-2.30.0}/docling/cli/main.py +70 -2
  9. {docling-2.28.4 → docling-2.30.0}/docling/datamodel/base_models.py +33 -0
  10. {docling-2.28.4 → docling-2.30.0}/docling/datamodel/document.py +7 -0
  11. {docling-2.28.4 → docling-2.30.0}/docling/datamodel/pipeline_options.py +29 -3
  12. docling-2.30.0/docling/models/api_vlm_model.py +67 -0
  13. docling-2.30.0/docling/models/picture_description_api_model.py +58 -0
  14. {docling-2.28.4 → docling-2.30.0}/docling/models/picture_description_base_model.py +14 -2
  15. {docling-2.28.4 → docling-2.30.0}/docling/models/tesseract_ocr_cli_model.py +1 -1
  16. {docling-2.28.4 → docling-2.30.0}/docling/pipeline/standard_pdf_pipeline.py +6 -2
  17. {docling-2.28.4 → docling-2.30.0}/docling/pipeline/vlm_pipeline.py +27 -17
  18. docling-2.30.0/docling/utils/api_image_request.py +61 -0
  19. {docling-2.28.4 → docling-2.30.0}/pyproject.toml +3 -3
  20. docling-2.28.4/docling/backend/msexcel_backend.py +0 -343
  21. docling-2.28.4/docling/models/picture_description_api_model.py +0 -125
  22. {docling-2.28.4 → docling-2.30.0}/LICENSE +0 -0
  23. {docling-2.28.4 → docling-2.30.0}/README.md +0 -0
  24. {docling-2.28.4 → docling-2.30.0}/docling/__init__.py +0 -0
  25. {docling-2.28.4 → docling-2.30.0}/docling/backend/__init__.py +0 -0
  26. {docling-2.28.4 → docling-2.30.0}/docling/backend/abstract_backend.py +0 -0
  27. {docling-2.28.4 → docling-2.30.0}/docling/backend/asciidoc_backend.py +0 -0
  28. {docling-2.28.4 → docling-2.30.0}/docling/backend/csv_backend.py +0 -0
  29. {docling-2.28.4 → docling-2.30.0}/docling/backend/docling_parse_backend.py +0 -0
  30. {docling-2.28.4 → docling-2.30.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  31. {docling-2.28.4 → docling-2.30.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  32. {docling-2.28.4 → docling-2.30.0}/docling/backend/docx/__init__.py +0 -0
  33. {docling-2.28.4 → docling-2.30.0}/docling/backend/docx/latex/__init__.py +0 -0
  34. {docling-2.28.4 → docling-2.30.0}/docling/backend/json/__init__.py +0 -0
  35. {docling-2.28.4 → docling-2.30.0}/docling/backend/json/docling_json_backend.py +0 -0
  36. {docling-2.28.4 → docling-2.30.0}/docling/backend/md_backend.py +0 -0
  37. {docling-2.28.4 → docling-2.30.0}/docling/backend/pdf_backend.py +0 -0
  38. {docling-2.28.4 → docling-2.30.0}/docling/backend/pypdfium2_backend.py +0 -0
  39. {docling-2.28.4 → docling-2.30.0}/docling/backend/xml/__init__.py +0 -0
  40. {docling-2.28.4 → docling-2.30.0}/docling/backend/xml/jats_backend.py +0 -0
  41. {docling-2.28.4 → docling-2.30.0}/docling/backend/xml/uspto_backend.py +0 -0
  42. {docling-2.28.4 → docling-2.30.0}/docling/chunking/__init__.py +0 -0
  43. {docling-2.28.4 → docling-2.30.0}/docling/cli/__init__.py +0 -0
  44. {docling-2.28.4 → docling-2.30.0}/docling/cli/models.py +0 -0
  45. {docling-2.28.4 → docling-2.30.0}/docling/cli/tools.py +0 -0
  46. {docling-2.28.4 → docling-2.30.0}/docling/datamodel/__init__.py +0 -0
  47. {docling-2.28.4 → docling-2.30.0}/docling/datamodel/settings.py +0 -0
  48. {docling-2.28.4 → docling-2.30.0}/docling/document_converter.py +0 -0
  49. {docling-2.28.4 → docling-2.30.0}/docling/exceptions.py +0 -0
  50. {docling-2.28.4 → docling-2.30.0}/docling/models/__init__.py +0 -0
  51. {docling-2.28.4 → docling-2.30.0}/docling/models/base_model.py +0 -0
  52. {docling-2.28.4 → docling-2.30.0}/docling/models/base_ocr_model.py +0 -0
  53. {docling-2.28.4 → docling-2.30.0}/docling/models/code_formula_model.py +0 -0
  54. {docling-2.28.4 → docling-2.30.0}/docling/models/document_picture_classifier.py +0 -0
  55. {docling-2.28.4 → docling-2.30.0}/docling/models/easyocr_model.py +0 -0
  56. {docling-2.28.4 → docling-2.30.0}/docling/models/factories/__init__.py +0 -0
  57. {docling-2.28.4 → docling-2.30.0}/docling/models/factories/base_factory.py +0 -0
  58. {docling-2.28.4 → docling-2.30.0}/docling/models/factories/ocr_factory.py +0 -0
  59. {docling-2.28.4 → docling-2.30.0}/docling/models/factories/picture_description_factory.py +0 -0
  60. {docling-2.28.4 → docling-2.30.0}/docling/models/hf_mlx_model.py +0 -0
  61. {docling-2.28.4 → docling-2.30.0}/docling/models/hf_vlm_model.py +0 -0
  62. {docling-2.28.4 → docling-2.30.0}/docling/models/layout_model.py +0 -0
  63. {docling-2.28.4 → docling-2.30.0}/docling/models/ocr_mac_model.py +0 -0
  64. {docling-2.28.4 → docling-2.30.0}/docling/models/page_assemble_model.py +0 -0
  65. {docling-2.28.4 → docling-2.30.0}/docling/models/page_preprocessing_model.py +0 -0
  66. {docling-2.28.4 → docling-2.30.0}/docling/models/picture_description_vlm_model.py +0 -0
  67. {docling-2.28.4 → docling-2.30.0}/docling/models/plugins/__init__.py +0 -0
  68. {docling-2.28.4 → docling-2.30.0}/docling/models/plugins/defaults.py +0 -0
  69. {docling-2.28.4 → docling-2.30.0}/docling/models/rapid_ocr_model.py +0 -0
  70. {docling-2.28.4 → docling-2.30.0}/docling/models/readingorder_model.py +0 -0
  71. {docling-2.28.4 → docling-2.30.0}/docling/models/table_structure_model.py +0 -0
  72. {docling-2.28.4 → docling-2.30.0}/docling/models/tesseract_ocr_model.py +0 -0
  73. {docling-2.28.4 → docling-2.30.0}/docling/pipeline/__init__.py +0 -0
  74. {docling-2.28.4 → docling-2.30.0}/docling/pipeline/base_pipeline.py +0 -0
  75. {docling-2.28.4 → docling-2.30.0}/docling/pipeline/simple_pipeline.py +0 -0
  76. {docling-2.28.4 → docling-2.30.0}/docling/py.typed +0 -0
  77. {docling-2.28.4 → docling-2.30.0}/docling/utils/__init__.py +0 -0
  78. {docling-2.28.4 → docling-2.30.0}/docling/utils/accelerator_utils.py +0 -0
  79. {docling-2.28.4 → docling-2.30.0}/docling/utils/export.py +0 -0
  80. {docling-2.28.4 → docling-2.30.0}/docling/utils/glm_utils.py +0 -0
  81. {docling-2.28.4 → docling-2.30.0}/docling/utils/layout_postprocessor.py +0 -0
  82. {docling-2.28.4 → docling-2.30.0}/docling/utils/locks.py +0 -0
  83. {docling-2.28.4 → docling-2.30.0}/docling/utils/model_downloader.py +0 -0
  84. {docling-2.28.4 → docling-2.30.0}/docling/utils/ocr_utils.py +0 -0
  85. {docling-2.28.4 → docling-2.30.0}/docling/utils/profiling.py +0 -0
  86. {docling-2.28.4 → docling-2.30.0}/docling/utils/utils.py +0 -0
  87. {docling-2.28.4 → docling-2.30.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.28.4
3
+ Version: 2.30.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: vlm
28
28
  Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
29
29
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
30
30
  Requires-Dist: certifi (>=2024.7.4)
31
- Requires-Dist: docling-core[chunking] (>=2.24.1,<3.0.0)
31
+ Requires-Dist: docling-core[chunking] (>=2.26.0,<3.0.0)
32
32
  Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
33
33
  Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
34
34
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -58,7 +58,7 @@ Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
58
58
  Requires-Dist: tqdm (>=4.65.0,<5.0.0)
59
59
  Requires-Dist: transformers (>=4.42.0,<4.43.0) ; (sys_platform == "darwin" and platform_machine == "x86_64") and (extra == "vlm")
60
60
  Requires-Dist: transformers (>=4.46.0,<5.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
61
- Requires-Dist: typer (>=0.12.5,<0.13.0)
61
+ Requires-Dist: typer (>=0.12.5,<0.16.0)
62
62
  Project-URL: Repository, https://github.com/docling-project/docling
63
63
  Description-Content-Type: text/markdown
64
64
 
@@ -215,6 +215,9 @@ FUNC = {
215
215
  "coth": "\\coth({fe})",
216
216
  "sec": "\\sec({fe})",
217
217
  "csc": "\\csc({fe})",
218
+ "mod": "\\mod {fe}",
219
+ "max": "\\max({fe})",
220
+ "min": "\\min({fe})",
218
221
  }
219
222
 
220
223
  FUNC_PLACE = "{fe}"
@@ -5,6 +5,8 @@ Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
5
5
  On 23/01/2025
6
6
  """
7
7
 
8
+ import logging
9
+
8
10
  import lxml.etree as ET
9
11
  from pylatexenc.latexencode import UnicodeToLatexEncoder
10
12
 
@@ -39,6 +41,8 @@ from docling.backend.docx.latex.latex_dict import (
39
41
 
40
42
  OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
41
43
 
44
+ _log = logging.getLogger(__name__)
45
+
42
46
 
43
47
  def load(stream):
44
48
  tree = ET.parse(stream)
@@ -281,8 +285,10 @@ class oMath2Latex(Tag2Method):
281
285
  if FUNC.get(t):
282
286
  latex_chars.append(FUNC[t])
283
287
  else:
284
- raise NotSupport("Not support func %s" % t)
285
- else:
288
+ _log.warning("Function not supported, will default to text: %s", t)
289
+ if isinstance(t, str):
290
+ latex_chars.append(t)
291
+ elif isinstance(t, str):
286
292
  latex_chars.append(t)
287
293
  t = BLANK.join(latex_chars)
288
294
  return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this
@@ -382,8 +388,6 @@ class oMath2Latex(Tag2Method):
382
388
 
383
389
  out_latex_str = self.u.unicode_to_latex(s)
384
390
 
385
- # print(s, out_latex_str)
386
-
387
391
  if (
388
392
  s.startswith("{") is False
389
393
  and out_latex_str.startswith("{")
@@ -392,19 +396,13 @@ class oMath2Latex(Tag2Method):
392
396
  ):
393
397
  out_latex_str = f" {out_latex_str[1:-1]} "
394
398
 
395
- # print(s, out_latex_str)
396
-
397
399
  if "ensuremath" in out_latex_str:
398
400
  out_latex_str = out_latex_str.replace("\\ensuremath{", " ")
399
401
  out_latex_str = out_latex_str.replace("}", " ")
400
402
 
401
- # print(s, out_latex_str)
402
-
403
403
  if out_latex_str.strip().startswith("\\text"):
404
404
  out_latex_str = f" \\text{{{out_latex_str}}} "
405
405
 
406
- # print(s, out_latex_str)
407
-
408
406
  return out_latex_str
409
407
 
410
408
  def do_r(self, elm):
@@ -415,10 +413,12 @@ class oMath2Latex(Tag2Method):
415
413
  """
416
414
  _str = []
417
415
  _base_str = []
418
- for s in elm.findtext("./{0}t".format(OMML_NS)):
419
- out_latex_str = self.process_unicode(s)
420
- _str.append(out_latex_str)
421
- _base_str.append(s)
416
+ found_text = elm.findtext("./{0}t".format(OMML_NS))
417
+ if found_text:
418
+ for s in found_text:
419
+ out_latex_str = self.process_unicode(s)
420
+ _str.append(out_latex_str)
421
+ _base_str.append(s)
422
422
 
423
423
  proc_str = escape_latex(BLANK.join(_str))
424
424
  base_proc_str = BLANK.join(_base_str)
@@ -34,6 +34,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
34
34
  "h6",
35
35
  "p",
36
36
  "pre",
37
+ "code",
37
38
  "ul",
38
39
  "ol",
39
40
  "li",
@@ -165,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
165
166
  self.handle_header(tag, doc)
166
167
  elif tag.name in ["p"]:
167
168
  self.handle_paragraph(tag, doc)
168
- elif tag.name in ["pre"]:
169
+ elif tag.name in ["pre", "code"]:
169
170
  self.handle_code(tag, doc)
170
171
  elif tag.name in ["ul", "ol"]:
171
172
  self.handle_list(tag, doc)
@@ -0,0 +1,525 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Any, Union, cast
5
+
6
+ from docling_core.types.doc import (
7
+ BoundingBox,
8
+ CoordOrigin,
9
+ DocItem,
10
+ DoclingDocument,
11
+ DocumentOrigin,
12
+ GroupLabel,
13
+ ImageRef,
14
+ ProvenanceItem,
15
+ Size,
16
+ TableCell,
17
+ TableData,
18
+ )
19
+ from openpyxl import load_workbook
20
+ from openpyxl.drawing.image import Image
21
+ from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
22
+ from openpyxl.worksheet.worksheet import Worksheet
23
+ from PIL import Image as PILImage
24
+ from pydantic import BaseModel, NonNegativeInt, PositiveInt
25
+ from typing_extensions import override
26
+
27
+ from docling.backend.abstract_backend import (
28
+ DeclarativeDocumentBackend,
29
+ PaginatedDocumentBackend,
30
+ )
31
+ from docling.datamodel.base_models import InputFormat
32
+ from docling.datamodel.document import InputDocument
33
+
34
+ _log = logging.getLogger(__name__)
35
+
36
+
37
+ class ExcelCell(BaseModel):
38
+ """Represents an Excel cell.
39
+
40
+ Attributes:
41
+ row: The row number of the cell.
42
+ col: The column number of the cell.
43
+ text: The text content of the cell.
44
+ row_span: The number of rows the cell spans.
45
+ col_span: The number of columns the cell spans.
46
+ """
47
+
48
+ row: int
49
+ col: int
50
+ text: str
51
+ row_span: int
52
+ col_span: int
53
+
54
+
55
+ class ExcelTable(BaseModel):
56
+ """Represents an Excel table on a worksheet.
57
+
58
+ Attributes:
59
+ anchor: The column and row indices of the upper-left cell of the table
60
+ (0-based index).
61
+ num_rows: The number of rows in the table.
62
+ num_cols: The number of columns in the table.
63
+ data: The data in the table, represented as a list of ExcelCell objects.
64
+ """
65
+
66
+ anchor: tuple[NonNegativeInt, NonNegativeInt]
67
+ num_rows: int
68
+ num_cols: int
69
+ data: list[ExcelCell]
70
+
71
+
72
+ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
73
+ """Backend for parsing Excel workbooks.
74
+
75
+ The backend converts an Excel workbook into a DoclingDocument object.
76
+ Each worksheet is converted into a separate page.
77
+ The following elements are parsed:
78
+ - Cell contents, parsed as tables. If two groups of cells are disconnected
79
+ between each other, they will be parsed as two different tables.
80
+ - Images, parsed as PictureItem objects.
81
+
82
+ The DoclingDocument tables and pictures have their provenance information, including
83
+ the position in their original Excel worksheet. The position is represented by a
84
+ bounding box object with the cell indices as units (0-based index). The size of this
85
+ bounding box is the number of columns and rows that the table or picture spans.
86
+ """
87
+
88
+ @override
89
+ def __init__(
90
+ self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
91
+ ) -> None:
92
+ """Initialize the MsExcelDocumentBackend object.
93
+
94
+ Parameters:
95
+ in_doc: The input document object.
96
+ path_or_stream: The path or stream to the Excel file.
97
+
98
+ Raises:
99
+ RuntimeError: An error occurred parsing the file.
100
+ """
101
+ super().__init__(in_doc, path_or_stream)
102
+
103
+ # Initialise the parents for the hierarchy
104
+ self.max_levels = 10
105
+
106
+ self.parents: dict[int, Any] = {}
107
+ for i in range(-1, self.max_levels):
108
+ self.parents[i] = None
109
+
110
+ self.workbook = None
111
+ try:
112
+ if isinstance(self.path_or_stream, BytesIO):
113
+ self.workbook = load_workbook(filename=self.path_or_stream)
114
+
115
+ elif isinstance(self.path_or_stream, Path):
116
+ self.workbook = load_workbook(filename=str(self.path_or_stream))
117
+
118
+ self.valid = self.workbook is not None
119
+ except Exception as e:
120
+ self.valid = False
121
+
122
+ raise RuntimeError(
123
+ f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
124
+ ) from e
125
+
126
+ @override
127
+ def is_valid(self) -> bool:
128
+ _log.debug(f"valid: {self.valid}")
129
+ return self.valid
130
+
131
+ @classmethod
132
+ @override
133
+ def supports_pagination(cls) -> bool:
134
+ return True
135
+
136
+ @override
137
+ def page_count(self) -> int:
138
+ if self.is_valid() and self.workbook:
139
+ return len(self.workbook.sheetnames)
140
+ else:
141
+ return 0
142
+
143
+ @classmethod
144
+ @override
145
+ def supported_formats(cls) -> set[InputFormat]:
146
+ return {InputFormat.XLSX}
147
+
148
+ @override
149
+ def convert(self) -> DoclingDocument:
150
+ """Parse the Excel workbook into a DoclingDocument object.
151
+
152
+ Raises:
153
+ RuntimeError: Unable to run the conversion since the backend object failed to
154
+ initialize.
155
+
156
+ Returns:
157
+ The DoclingDocument object representing the Excel workbook.
158
+ """
159
+ origin = DocumentOrigin(
160
+ filename=self.file.name or "file.xlsx",
161
+ mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
162
+ binary_hash=self.document_hash,
163
+ )
164
+
165
+ doc = DoclingDocument(name=self.file.stem or "file.xlsx", origin=origin)
166
+
167
+ if self.is_valid():
168
+ doc = self._convert_workbook(doc)
169
+ else:
170
+ raise RuntimeError(
171
+ f"Cannot convert doc with {self.document_hash} because the backend failed to init."
172
+ )
173
+
174
+ return doc
175
+
176
+ def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
177
+ """Parse the Excel workbook and attach its structure to a DoclingDocument.
178
+
179
+ Args:
180
+ doc: A DoclingDocument object.
181
+
182
+ Returns:
183
+ A DoclingDocument object with the parsed items.
184
+ """
185
+
186
+ if self.workbook is not None:
187
+
188
+ # Iterate over all sheets
189
+ for sheet_name in self.workbook.sheetnames:
190
+ _log.info(f"Processing sheet: {sheet_name}")
191
+
192
+ sheet = self.workbook[sheet_name]
193
+ page_no = self.workbook.index(sheet) + 1
194
+ # do not rely on sheet.max_column, sheet.max_row if there are images
195
+ page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))
196
+
197
+ self.parents[0] = doc.add_group(
198
+ parent=None,
199
+ label=GroupLabel.SECTION,
200
+ name=f"sheet: {sheet_name}",
201
+ )
202
+ doc = self._convert_sheet(doc, sheet)
203
+ width, height = self._find_page_size(doc, page_no)
204
+ page.size = Size(width=width, height=height)
205
+ else:
206
+ _log.error("Workbook is not initialized.")
207
+
208
+ return doc
209
+
210
+ def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
211
+ """Parse an Excel worksheet and attach its structure to a DoclingDocument
212
+
213
+ Args:
214
+ doc: The DoclingDocument to be updated.
215
+ sheet: The Excel worksheet to be parsed.
216
+
217
+ Returns:
218
+ The updated DoclingDocument.
219
+ """
220
+
221
+ doc = self._find_tables_in_sheet(doc, sheet)
222
+
223
+ doc = self._find_images_in_sheet(doc, sheet)
224
+
225
+ return doc
226
+
227
+ def _find_tables_in_sheet(
228
+ self, doc: DoclingDocument, sheet: Worksheet
229
+ ) -> DoclingDocument:
230
+ """Find all tables in an Excel sheet and attach them to a DoclingDocument.
231
+
232
+ Args:
233
+ doc: The DoclingDocument to be updated.
234
+ sheet: The Excel worksheet to be parsed.
235
+
236
+ Returns:
237
+ The updated DoclingDocument.
238
+ """
239
+
240
+ if self.workbook is not None:
241
+ tables = self._find_data_tables(sheet)
242
+
243
+ for excel_table in tables:
244
+ origin_col = excel_table.anchor[0]
245
+ origin_row = excel_table.anchor[1]
246
+ num_rows = excel_table.num_rows
247
+ num_cols = excel_table.num_cols
248
+
249
+ table_data = TableData(
250
+ num_rows=num_rows,
251
+ num_cols=num_cols,
252
+ table_cells=[],
253
+ )
254
+
255
+ for excel_cell in excel_table.data:
256
+
257
+ cell = TableCell(
258
+ text=excel_cell.text,
259
+ row_span=excel_cell.row_span,
260
+ col_span=excel_cell.col_span,
261
+ start_row_offset_idx=excel_cell.row,
262
+ end_row_offset_idx=excel_cell.row + excel_cell.row_span,
263
+ start_col_offset_idx=excel_cell.col,
264
+ end_col_offset_idx=excel_cell.col + excel_cell.col_span,
265
+ column_header=excel_cell.row == 0,
266
+ row_header=False,
267
+ )
268
+ table_data.table_cells.append(cell)
269
+
270
+ page_no = self.workbook.index(sheet) + 1
271
+ doc.add_table(
272
+ data=table_data,
273
+ parent=self.parents[0],
274
+ prov=ProvenanceItem(
275
+ page_no=page_no,
276
+ charspan=(0, 0),
277
+ bbox=BoundingBox.from_tuple(
278
+ (
279
+ origin_col,
280
+ origin_row,
281
+ origin_col + num_cols,
282
+ origin_row + num_rows,
283
+ ),
284
+ origin=CoordOrigin.TOPLEFT,
285
+ ),
286
+ ),
287
+ )
288
+
289
+ return doc
290
+
291
+ def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
292
+ """Find all compact rectangular data tables in an Excel worksheet.
293
+
294
+ Args:
295
+ sheet: The Excel worksheet to be parsed.
296
+
297
+ Returns:
298
+ A list of ExcelTable objects representing the data tables.
299
+ """
300
+ tables: list[ExcelTable] = [] # List to store found tables
301
+ visited: set[tuple[int, int]] = set() # Track already visited cells
302
+
303
+ # Iterate over all cells in the sheet
304
+ for ri, row in enumerate(sheet.iter_rows(values_only=False)):
305
+ for rj, cell in enumerate(row):
306
+
307
+ # Skip empty or already visited cells
308
+ if cell.value is None or (ri, rj) in visited:
309
+ continue
310
+
311
+ # If the cell starts a new table, find its bounds
312
+ table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj)
313
+
314
+ visited.update(visited_cells) # Mark these cells as visited
315
+ tables.append(table_bounds)
316
+
317
+ return tables
318
+
319
+ def _find_table_bounds(
320
+ self,
321
+ sheet: Worksheet,
322
+ start_row: int,
323
+ start_col: int,
324
+ ) -> tuple[ExcelTable, set[tuple[int, int]]]:
325
+ """Determine the bounds of a compact rectangular table.
326
+
327
+ Args:
328
+ sheet: The Excel worksheet to be parsed.
329
+ start_row: The row number of the starting cell.
330
+ start_col: The column number of the starting cell.
331
+
332
+ Returns:
333
+ A tuple with an Excel table and a set of cell coordinates.
334
+ """
335
+ _log.debug("find_table_bounds")
336
+
337
+ max_row = self._find_table_bottom(sheet, start_row, start_col)
338
+ max_col = self._find_table_right(sheet, start_row, start_col)
339
+
340
+ # Collect the data within the bounds
341
+ data = []
342
+ visited_cells: set[tuple[int, int]] = set()
343
+ for ri in range(start_row, max_row + 1):
344
+ for rj in range(start_col, max_col + 1):
345
+
346
+ cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
347
+
348
+ # Check if the cell belongs to a merged range
349
+ row_span = 1
350
+ col_span = 1
351
+
352
+ for merged_range in sheet.merged_cells.ranges:
353
+
354
+ if (
355
+ merged_range.min_row <= ri + 1
356
+ and ri + 1 <= merged_range.max_row
357
+ and merged_range.min_col <= rj + 1
358
+ and rj + 1 <= merged_range.max_col
359
+ ):
360
+
361
+ row_span = merged_range.max_row - merged_range.min_row + 1
362
+ col_span = merged_range.max_col - merged_range.min_col + 1
363
+ break
364
+
365
+ if (ri, rj) not in visited_cells:
366
+ data.append(
367
+ ExcelCell(
368
+ row=ri - start_row,
369
+ col=rj - start_col,
370
+ text=str(cell.value),
371
+ row_span=row_span,
372
+ col_span=col_span,
373
+ )
374
+ )
375
+
376
+ # Mark all cells in the span as visited
377
+ for span_row in range(ri, ri + row_span):
378
+ for span_col in range(rj, rj + col_span):
379
+ visited_cells.add((span_row, span_col))
380
+
381
+ return (
382
+ ExcelTable(
383
+ anchor=(start_col, start_row),
384
+ num_rows=max_row + 1 - start_row,
385
+ num_cols=max_col + 1 - start_col,
386
+ data=data,
387
+ ),
388
+ visited_cells,
389
+ )
390
+
391
+ def _find_table_bottom(
392
+ self, sheet: Worksheet, start_row: int, start_col: int
393
+ ) -> int:
394
+ """Find the bottom boundary of a table.
395
+
396
+ Args:
397
+ sheet: The Excel worksheet to be parsed.
398
+ start_row: The starting row of the table.
399
+ start_col: The starting column of the table.
400
+
401
+ Returns:
402
+ The row index representing the bottom boundary of the table.
403
+ """
404
+ max_row: int = start_row
405
+
406
+ while max_row < sheet.max_row - 1:
407
+ # Get the cell value or check if it is part of a merged cell
408
+ cell = sheet.cell(row=max_row + 2, column=start_col + 1)
409
+
410
+ # Check if the cell is part of a merged range
411
+ merged_range = next(
412
+ (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
413
+ None,
414
+ )
415
+
416
+ if cell.value is None and not merged_range:
417
+ break # Stop if the cell is empty and not merged
418
+
419
+ # Expand max_row to include the merged range if applicable
420
+ if merged_range:
421
+ max_row = max(max_row, merged_range.max_row - 1)
422
+ else:
423
+ max_row += 1
424
+
425
+ return max_row
426
+
427
+ def _find_table_right(
428
+ self, sheet: Worksheet, start_row: int, start_col: int
429
+ ) -> int:
430
+ """Find the right boundary of a table.
431
+
432
+ Args:
433
+ sheet: The Excel worksheet to be parsed.
434
+ start_row: The starting row of the table.
435
+ start_col: The starting column of the table.
436
+
437
+ Returns:
438
+ The column index representing the right boundary of the table."
439
+ """
440
+ max_col: int = start_col
441
+
442
+ while max_col < sheet.max_column - 1:
443
+ # Get the cell value or check if it is part of a merged cell
444
+ cell = sheet.cell(row=start_row + 1, column=max_col + 2)
445
+
446
+ # Check if the cell is part of a merged range
447
+ merged_range = next(
448
+ (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
449
+ None,
450
+ )
451
+
452
+ if cell.value is None and not merged_range:
453
+ break # Stop if the cell is empty and not merged
454
+
455
+ # Expand max_col to include the merged range if applicable
456
+ if merged_range:
457
+ max_col = max(max_col, merged_range.max_col - 1)
458
+ else:
459
+ max_col += 1
460
+
461
+ return max_col
462
+
463
+ def _find_images_in_sheet(
464
+ self, doc: DoclingDocument, sheet: Worksheet
465
+ ) -> DoclingDocument:
466
+ """Find images in the Excel sheet and attach them to the DoclingDocument.
467
+
468
+ Args:
469
+ doc: The DoclingDocument to be updated.
470
+ sheet: The Excel worksheet to be parsed.
471
+
472
+ Returns:
473
+ The updated DoclingDocument.
474
+ """
475
+ if self.workbook is not None:
476
+ # Iterate over byte images in the sheet
477
+ for item in sheet._images: # type: ignore[attr-defined]
478
+ try:
479
+ image: Image = cast(Image, item)
480
+ pil_image = PILImage.open(image.ref) # type: ignore[arg-type]
481
+ page_no = self.workbook.index(sheet) + 1
482
+ anchor = (0, 0, 0, 0)
483
+ if isinstance(image.anchor, TwoCellAnchor):
484
+ anchor = (
485
+ image.anchor._from.col,
486
+ image.anchor._from.row,
487
+ image.anchor.to.col + 1,
488
+ image.anchor.to.row + 1,
489
+ )
490
+ doc.add_picture(
491
+ parent=self.parents[0],
492
+ image=ImageRef.from_pil(image=pil_image, dpi=72),
493
+ caption=None,
494
+ prov=ProvenanceItem(
495
+ page_no=page_no,
496
+ charspan=(0, 0),
497
+ bbox=BoundingBox.from_tuple(
498
+ anchor, origin=CoordOrigin.TOPLEFT
499
+ ),
500
+ ),
501
+ )
502
+ except:
503
+ _log.error("could not extract the image from excel sheets")
504
+
505
+ return doc
506
+
507
+ @staticmethod
508
+ def _find_page_size(
509
+ doc: DoclingDocument, page_no: PositiveInt
510
+ ) -> tuple[float, float]:
511
+ left: float = -1.0
512
+ top: float = -1.0
513
+ right: float = -1.0
514
+ bottom: float = -1.0
515
+ for item, _ in doc.iterate_items(traverse_pictures=True, page_no=page_no):
516
+ if not isinstance(item, DocItem):
517
+ continue
518
+ for provenance in item.prov:
519
+ bbox = provenance.bbox
520
+ left = min(left, bbox.l) if left != -1 else bbox.l
521
+ right = max(right, bbox.r) if right != -1 else bbox.r
522
+ top = min(top, bbox.t) if top != -1 else bbox.t
523
+ bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
524
+
525
+ return (right - left, bottom - top)
@@ -392,9 +392,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
392
392
  self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
393
393
  if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
394
394
  # Handle Pictures
395
- self.handle_pictures(
396
- shape, parent_slide, slide_ind, doc, slide_size
397
- )
395
+ if hasattr(shape, "image"):
396
+ self.handle_pictures(
397
+ shape, parent_slide, slide_ind, doc, slide_size
398
+ )
398
399
  # If shape doesn't have any text, move on to the next shape
399
400
  if not hasattr(shape, "text"):
400
401
  return