docling 2.28.4__py3-none-any.whl → 2.30.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -215,6 +215,9 @@ FUNC = {
215
215
  "coth": "\\coth({fe})",
216
216
  "sec": "\\sec({fe})",
217
217
  "csc": "\\csc({fe})",
218
+ "mod": "\\mod {fe}",
219
+ "max": "\\max({fe})",
220
+ "min": "\\min({fe})",
218
221
  }
219
222
 
220
223
  FUNC_PLACE = "{fe}"
@@ -5,6 +5,8 @@ Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
5
5
  On 23/01/2025
6
6
  """
7
7
 
8
+ import logging
9
+
8
10
  import lxml.etree as ET
9
11
  from pylatexenc.latexencode import UnicodeToLatexEncoder
10
12
 
@@ -39,6 +41,8 @@ from docling.backend.docx.latex.latex_dict import (
39
41
 
40
42
  OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
41
43
 
44
+ _log = logging.getLogger(__name__)
45
+
42
46
 
43
47
  def load(stream):
44
48
  tree = ET.parse(stream)
@@ -281,8 +285,10 @@ class oMath2Latex(Tag2Method):
281
285
  if FUNC.get(t):
282
286
  latex_chars.append(FUNC[t])
283
287
  else:
284
- raise NotSupport("Not support func %s" % t)
285
- else:
288
+ _log.warning("Function not supported, will default to text: %s", t)
289
+ if isinstance(t, str):
290
+ latex_chars.append(t)
291
+ elif isinstance(t, str):
286
292
  latex_chars.append(t)
287
293
  t = BLANK.join(latex_chars)
288
294
  return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this
@@ -382,8 +388,6 @@ class oMath2Latex(Tag2Method):
382
388
 
383
389
  out_latex_str = self.u.unicode_to_latex(s)
384
390
 
385
- # print(s, out_latex_str)
386
-
387
391
  if (
388
392
  s.startswith("{") is False
389
393
  and out_latex_str.startswith("{")
@@ -392,19 +396,13 @@ class oMath2Latex(Tag2Method):
392
396
  ):
393
397
  out_latex_str = f" {out_latex_str[1:-1]} "
394
398
 
395
- # print(s, out_latex_str)
396
-
397
399
  if "ensuremath" in out_latex_str:
398
400
  out_latex_str = out_latex_str.replace("\\ensuremath{", " ")
399
401
  out_latex_str = out_latex_str.replace("}", " ")
400
402
 
401
- # print(s, out_latex_str)
402
-
403
403
  if out_latex_str.strip().startswith("\\text"):
404
404
  out_latex_str = f" \\text{{{out_latex_str}}} "
405
405
 
406
- # print(s, out_latex_str)
407
-
408
406
  return out_latex_str
409
407
 
410
408
  def do_r(self, elm):
@@ -415,10 +413,12 @@ class oMath2Latex(Tag2Method):
415
413
  """
416
414
  _str = []
417
415
  _base_str = []
418
- for s in elm.findtext("./{0}t".format(OMML_NS)):
419
- out_latex_str = self.process_unicode(s)
420
- _str.append(out_latex_str)
421
- _base_str.append(s)
416
+ found_text = elm.findtext("./{0}t".format(OMML_NS))
417
+ if found_text:
418
+ for s in found_text:
419
+ out_latex_str = self.process_unicode(s)
420
+ _str.append(out_latex_str)
421
+ _base_str.append(s)
422
422
 
423
423
  proc_str = escape_latex(BLANK.join(_str))
424
424
  base_proc_str = BLANK.join(_base_str)
@@ -34,6 +34,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
34
34
  "h6",
35
35
  "p",
36
36
  "pre",
37
+ "code",
37
38
  "ul",
38
39
  "ol",
39
40
  "li",
@@ -165,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
165
166
  self.handle_header(tag, doc)
166
167
  elif tag.name in ["p"]:
167
168
  self.handle_paragraph(tag, doc)
168
- elif tag.name in ["pre"]:
169
+ elif tag.name in ["pre", "code"]:
169
170
  self.handle_code(tag, doc)
170
171
  elif tag.name in ["ul", "ol"]:
171
172
  self.handle_list(tag, doc)
@@ -1,36 +1,50 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Dict, Set, Tuple, Union
4
+ from typing import Any, Union, cast
5
5
 
6
6
  from docling_core.types.doc import (
7
+ BoundingBox,
8
+ CoordOrigin,
9
+ DocItem,
7
10
  DoclingDocument,
8
11
  DocumentOrigin,
9
12
  GroupLabel,
10
13
  ImageRef,
14
+ ProvenanceItem,
15
+ Size,
11
16
  TableCell,
12
17
  TableData,
13
18
  )
14
-
15
- # from lxml import etree
16
- from openpyxl import Workbook, load_workbook
17
- from openpyxl.cell.cell import Cell
19
+ from openpyxl import load_workbook
18
20
  from openpyxl.drawing.image import Image
21
+ from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
19
22
  from openpyxl.worksheet.worksheet import Worksheet
23
+ from PIL import Image as PILImage
24
+ from pydantic import BaseModel, NonNegativeInt, PositiveInt
25
+ from typing_extensions import override
20
26
 
21
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
27
+ from docling.backend.abstract_backend import (
28
+ DeclarativeDocumentBackend,
29
+ PaginatedDocumentBackend,
30
+ )
22
31
  from docling.datamodel.base_models import InputFormat
23
32
  from docling.datamodel.document import InputDocument
24
33
 
25
34
  _log = logging.getLogger(__name__)
26
35
 
27
- from typing import Any, List
28
36
 
29
- from PIL import Image as PILImage
30
- from pydantic import BaseModel
37
+ class ExcelCell(BaseModel):
38
+ """Represents an Excel cell.
31
39
 
40
+ Attributes:
41
+ row: The row number of the cell.
42
+ col: The column number of the cell.
43
+ text: The text content of the cell.
44
+ row_span: The number of rows the cell spans.
45
+ col_span: The number of columns the cell spans.
46
+ """
32
47
 
33
- class ExcelCell(BaseModel):
34
48
  row: int
35
49
  col: int
36
50
  text: str
@@ -39,19 +53,57 @@ class ExcelCell(BaseModel):
39
53
 
40
54
 
41
55
  class ExcelTable(BaseModel):
56
+ """Represents an Excel table on a worksheet.
57
+
58
+ Attributes:
59
+ anchor: The column and row indices of the upper-left cell of the table
60
+ (0-based index).
61
+ num_rows: The number of rows in the table.
62
+ num_cols: The number of columns in the table.
63
+ data: The data in the table, represented as a list of ExcelCell objects.
64
+ """
65
+
66
+ anchor: tuple[NonNegativeInt, NonNegativeInt]
42
67
  num_rows: int
43
68
  num_cols: int
44
- data: List[ExcelCell]
69
+ data: list[ExcelCell]
45
70
 
46
71
 
47
- class MsExcelDocumentBackend(DeclarativeDocumentBackend):
48
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
72
+ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
73
+ """Backend for parsing Excel workbooks.
74
+
75
+ The backend converts an Excel workbook into a DoclingDocument object.
76
+ Each worksheet is converted into a separate page.
77
+ The following elements are parsed:
78
+ - Cell contents, parsed as tables. If two groups of cells are disconnected
79
+ between each other, they will be parsed as two different tables.
80
+ - Images, parsed as PictureItem objects.
81
+
82
+ The DoclingDocument tables and pictures have their provenance information, including
83
+ the position in their original Excel worksheet. The position is represented by a
84
+ bounding box object with the cell indices as units (0-based index). The size of this
85
+ bounding box is the number of columns and rows that the table or picture spans.
86
+ """
87
+
88
+ @override
89
+ def __init__(
90
+ self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
91
+ ) -> None:
92
+ """Initialize the MsExcelDocumentBackend object.
93
+
94
+ Parameters:
95
+ in_doc: The input document object.
96
+ path_or_stream: The path or stream to the Excel file.
97
+
98
+ Raises:
99
+ RuntimeError: An error occurred parsing the file.
100
+ """
49
101
  super().__init__(in_doc, path_or_stream)
50
102
 
51
103
  # Initialise the parents for the hierarchy
52
104
  self.max_levels = 10
53
105
 
54
- self.parents: Dict[int, Any] = {}
106
+ self.parents: dict[int, Any] = {}
55
107
  for i in range(-1, self.max_levels):
56
108
  self.parents[i] = None
57
109
 
@@ -63,35 +115,47 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
63
115
  elif isinstance(self.path_or_stream, Path):
64
116
  self.workbook = load_workbook(filename=str(self.path_or_stream))
65
117
 
66
- self.valid = True
118
+ self.valid = self.workbook is not None
67
119
  except Exception as e:
68
120
  self.valid = False
69
121
 
70
122
  raise RuntimeError(
71
- f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
123
+ f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
72
124
  ) from e
73
125
 
126
+ @override
74
127
  def is_valid(self) -> bool:
75
- _log.info(f"valid: {self.valid}")
128
+ _log.debug(f"valid: {self.valid}")
76
129
  return self.valid
77
130
 
78
131
  @classmethod
132
+ @override
79
133
  def supports_pagination(cls) -> bool:
80
134
  return True
81
135
 
82
- def unload(self):
83
- if isinstance(self.path_or_stream, BytesIO):
84
- self.path_or_stream.close()
85
-
86
- self.path_or_stream = None
136
+ @override
137
+ def page_count(self) -> int:
138
+ if self.is_valid() and self.workbook:
139
+ return len(self.workbook.sheetnames)
140
+ else:
141
+ return 0
87
142
 
88
143
  @classmethod
89
- def supported_formats(cls) -> Set[InputFormat]:
144
+ @override
145
+ def supported_formats(cls) -> set[InputFormat]:
90
146
  return {InputFormat.XLSX}
91
147
 
148
+ @override
92
149
  def convert(self) -> DoclingDocument:
93
- # Parses the XLSX into a structured document model.
150
+ """Parse the Excel workbook into a DoclingDocument object.
94
151
 
152
+ Raises:
153
+ RuntimeError: Unable to run the conversion since the backend object failed to
154
+ initialize.
155
+
156
+ Returns:
157
+ The DoclingDocument object representing the Excel workbook.
158
+ """
95
159
  origin = DocumentOrigin(
96
160
  filename=self.file.name or "file.xlsx",
97
161
  mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
@@ -110,6 +174,14 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
110
174
  return doc
111
175
 
112
176
  def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
177
+ """Parse the Excel workbook and attach its structure to a DoclingDocument.
178
+
179
+ Args:
180
+ doc: A DoclingDocument object.
181
+
182
+ Returns:
183
+ A DoclingDocument object with the parsed items.
184
+ """
113
185
 
114
186
  if self.workbook is not None:
115
187
 
@@ -117,22 +189,34 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
117
189
  for sheet_name in self.workbook.sheetnames:
118
190
  _log.info(f"Processing sheet: {sheet_name}")
119
191
 
120
- # Access the sheet by name
121
192
  sheet = self.workbook[sheet_name]
193
+ page_no = self.workbook.index(sheet) + 1
194
+ # do not rely on sheet.max_column, sheet.max_row if there are images
195
+ page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))
122
196
 
123
197
  self.parents[0] = doc.add_group(
124
198
  parent=None,
125
199
  label=GroupLabel.SECTION,
126
200
  name=f"sheet: {sheet_name}",
127
201
  )
128
-
129
202
  doc = self._convert_sheet(doc, sheet)
203
+ width, height = self._find_page_size(doc, page_no)
204
+ page.size = Size(width=width, height=height)
130
205
  else:
131
206
  _log.error("Workbook is not initialized.")
132
207
 
133
208
  return doc
134
209
 
135
- def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
210
+ def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
211
+ """Parse an Excel worksheet and attach its structure to a DoclingDocument
212
+
213
+ Args:
214
+ doc: The DoclingDocument to be updated.
215
+ sheet: The Excel worksheet to be parsed.
216
+
217
+ Returns:
218
+ The updated DoclingDocument.
219
+ """
136
220
 
137
221
  doc = self._find_tables_in_sheet(doc, sheet)
138
222
 
@@ -140,47 +224,81 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
140
224
 
141
225
  return doc
142
226
 
143
- def _find_tables_in_sheet(self, doc: DoclingDocument, sheet: Worksheet):
144
-
145
- tables = self._find_data_tables(sheet)
227
+ def _find_tables_in_sheet(
228
+ self, doc: DoclingDocument, sheet: Worksheet
229
+ ) -> DoclingDocument:
230
+ """Find all tables in an Excel sheet and attach them to a DoclingDocument.
146
231
 
147
- for excel_table in tables:
148
- num_rows = excel_table.num_rows
149
- num_cols = excel_table.num_cols
232
+ Args:
233
+ doc: The DoclingDocument to be updated.
234
+ sheet: The Excel worksheet to be parsed.
150
235
 
151
- table_data = TableData(
152
- num_rows=num_rows,
153
- num_cols=num_cols,
154
- table_cells=[],
155
- )
236
+ Returns:
237
+ The updated DoclingDocument.
238
+ """
156
239
 
157
- for excel_cell in excel_table.data:
158
-
159
- cell = TableCell(
160
- text=excel_cell.text,
161
- row_span=excel_cell.row_span,
162
- col_span=excel_cell.col_span,
163
- start_row_offset_idx=excel_cell.row,
164
- end_row_offset_idx=excel_cell.row + excel_cell.row_span,
165
- start_col_offset_idx=excel_cell.col,
166
- end_col_offset_idx=excel_cell.col + excel_cell.col_span,
167
- column_header=excel_cell.row == 0,
168
- row_header=False,
240
+ if self.workbook is not None:
241
+ tables = self._find_data_tables(sheet)
242
+
243
+ for excel_table in tables:
244
+ origin_col = excel_table.anchor[0]
245
+ origin_row = excel_table.anchor[1]
246
+ num_rows = excel_table.num_rows
247
+ num_cols = excel_table.num_cols
248
+
249
+ table_data = TableData(
250
+ num_rows=num_rows,
251
+ num_cols=num_cols,
252
+ table_cells=[],
169
253
  )
170
- table_data.table_cells.append(cell)
171
254
 
172
- doc.add_table(data=table_data, parent=self.parents[0])
255
+ for excel_cell in excel_table.data:
256
+
257
+ cell = TableCell(
258
+ text=excel_cell.text,
259
+ row_span=excel_cell.row_span,
260
+ col_span=excel_cell.col_span,
261
+ start_row_offset_idx=excel_cell.row,
262
+ end_row_offset_idx=excel_cell.row + excel_cell.row_span,
263
+ start_col_offset_idx=excel_cell.col,
264
+ end_col_offset_idx=excel_cell.col + excel_cell.col_span,
265
+ column_header=excel_cell.row == 0,
266
+ row_header=False,
267
+ )
268
+ table_data.table_cells.append(cell)
269
+
270
+ page_no = self.workbook.index(sheet) + 1
271
+ doc.add_table(
272
+ data=table_data,
273
+ parent=self.parents[0],
274
+ prov=ProvenanceItem(
275
+ page_no=page_no,
276
+ charspan=(0, 0),
277
+ bbox=BoundingBox.from_tuple(
278
+ (
279
+ origin_col,
280
+ origin_row,
281
+ origin_col + num_cols,
282
+ origin_row + num_rows,
283
+ ),
284
+ origin=CoordOrigin.TOPLEFT,
285
+ ),
286
+ ),
287
+ )
173
288
 
174
289
  return doc
175
290
 
176
- def _find_data_tables(self, sheet: Worksheet) -> List[ExcelTable]:
177
- """
178
- Find all compact rectangular data tables in a sheet.
179
- """
180
- # _log.info("find_data_tables")
291
+ def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
292
+ """Find all compact rectangular data tables in an Excel worksheet.
293
+
294
+ Args:
295
+ sheet: The Excel worksheet to be parsed.
181
296
 
182
- tables = [] # List to store found tables
183
- visited: set[Tuple[int, int]] = set() # Track already visited cells
297
+ Returns:
298
+ A list of ExcelTable objects representing the data tables.
299
+ """
300
+ tables: list[ExcelTable] = [] # List to store found tables
301
+ visited: set[tuple[int, int]] = set() # Track already visited cells
184
302
 
185
303
  # Iterate over all cells in the sheet
186
304
  for ri, row in enumerate(sheet.iter_rows(values_only=False)):
@@ -191,9 +309,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
191
309
  continue
192
310
 
193
311
  # If the cell starts a new table, find its bounds
194
- table_bounds, visited_cells = self._find_table_bounds(
195
- sheet, ri, rj, visited
196
- )
312
+ table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj)
197
313
 
198
314
  visited.update(visited_cells) # Mark these cells as visited
199
315
  tables.append(table_bounds)
@@ -205,22 +321,25 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
205
321
  sheet: Worksheet,
206
322
  start_row: int,
207
323
  start_col: int,
208
- visited: set[Tuple[int, int]],
209
- ):
210
- """
211
- Determine the bounds of a compact rectangular table.
324
+ ) -> tuple[ExcelTable, set[tuple[int, int]]]:
325
+ """Determine the bounds of a compact rectangular table.
326
+
327
+ Args:
328
+ sheet: The Excel worksheet to be parsed.
329
+ start_row: The row number of the starting cell.
330
+ start_col: The column number of the starting cell.
331
+
212
332
  Returns:
213
- - A dictionary with the bounds and data.
214
- - A set of visited cell coordinates.
333
+ A tuple with an Excel table and a set of cell coordinates.
215
334
  """
216
- _log.info("find_table_bounds")
335
+ _log.debug("find_table_bounds")
217
336
 
218
337
  max_row = self._find_table_bottom(sheet, start_row, start_col)
219
338
  max_col = self._find_table_right(sheet, start_row, start_col)
220
339
 
221
340
  # Collect the data within the bounds
222
341
  data = []
223
- visited_cells = set()
342
+ visited_cells: set[tuple[int, int]] = set()
224
343
  for ri in range(start_row, max_row + 1):
225
344
  for rj in range(start_col, max_col + 1):
226
345
 
@@ -230,7 +349,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
230
349
  row_span = 1
231
350
  col_span = 1
232
351
 
233
- # _log.info(sheet.merged_cells.ranges)
234
352
  for merged_range in sheet.merged_cells.ranges:
235
353
 
236
354
  if (
@@ -254,7 +372,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
254
372
  col_span=col_span,
255
373
  )
256
374
  )
257
- # _log.info(f"cell: {ri}, {rj} -> {ri - start_row}, {rj - start_col}, {row_span}, {col_span}: {str(cell.value)}")
258
375
 
259
376
  # Mark all cells in the span as visited
260
377
  for span_row in range(ri, ri + row_span):
@@ -263,6 +380,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
263
380
 
264
381
  return (
265
382
  ExcelTable(
383
+ anchor=(start_col, start_row),
266
384
  num_rows=max_row + 1 - start_row,
267
385
  num_cols=max_col + 1 - start_col,
268
386
  data=data,
@@ -270,10 +388,20 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
270
388
  visited_cells,
271
389
  )
272
390
 
273
- def _find_table_bottom(self, sheet: Worksheet, start_row: int, start_col: int):
274
- """Function to find the bottom boundary of the table"""
391
+ def _find_table_bottom(
392
+ self, sheet: Worksheet, start_row: int, start_col: int
393
+ ) -> int:
394
+ """Find the bottom boundary of a table.
275
395
 
276
- max_row = start_row
396
+ Args:
397
+ sheet: The Excel worksheet to be parsed.
398
+ start_row: The starting row of the table.
399
+ start_col: The starting column of the table.
400
+
401
+ Returns:
402
+ The row index representing the bottom boundary of the table.
403
+ """
404
+ max_row: int = start_row
277
405
 
278
406
  while max_row < sheet.max_row - 1:
279
407
  # Get the cell value or check if it is part of a merged cell
@@ -296,10 +424,20 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
296
424
 
297
425
  return max_row
298
426
 
299
- def _find_table_right(self, sheet: Worksheet, start_row: int, start_col: int):
300
- """Function to find the right boundary of the table"""
427
+ def _find_table_right(
428
+ self, sheet: Worksheet, start_row: int, start_col: int
429
+ ) -> int:
430
+ """Find the right boundary of a table.
431
+
432
+ Args:
433
+ sheet: The Excel worksheet to be parsed.
434
+ start_row: The starting row of the table.
435
+ start_col: The starting column of the table.
301
436
 
302
- max_col = start_col
437
+ Returns:
438
+ The column index representing the right boundary of the table."
439
+ """
440
+ max_col: int = start_col
303
441
 
304
442
  while max_col < sheet.max_column - 1:
305
443
  # Get the cell value or check if it is part of a merged cell
@@ -325,19 +463,63 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
325
463
  def _find_images_in_sheet(
326
464
  self, doc: DoclingDocument, sheet: Worksheet
327
465
  ) -> DoclingDocument:
466
+ """Find images in the Excel sheet and attach them to the DoclingDocument.
328
467
 
329
- # Iterate over byte images in the sheet
330
- for idx, image in enumerate(sheet._images): # type: ignore
468
+ Args:
469
+ doc: The DoclingDocument to be updated.
470
+ sheet: The Excel worksheet to be parsed.
331
471
 
332
- try:
333
- pil_image = PILImage.open(image.ref)
334
-
335
- doc.add_picture(
336
- parent=self.parents[0],
337
- image=ImageRef.from_pil(image=pil_image, dpi=72),
338
- caption=None,
339
- )
340
- except:
341
- _log.error("could not extract the image from excel sheets")
472
+ Returns:
473
+ The updated DoclingDocument.
474
+ """
475
+ if self.workbook is not None:
476
+ # Iterate over byte images in the sheet
477
+ for item in sheet._images: # type: ignore[attr-defined]
478
+ try:
479
+ image: Image = cast(Image, item)
480
+ pil_image = PILImage.open(image.ref) # type: ignore[arg-type]
481
+ page_no = self.workbook.index(sheet) + 1
482
+ anchor = (0, 0, 0, 0)
483
+ if isinstance(image.anchor, TwoCellAnchor):
484
+ anchor = (
485
+ image.anchor._from.col,
486
+ image.anchor._from.row,
487
+ image.anchor.to.col + 1,
488
+ image.anchor.to.row + 1,
489
+ )
490
+ doc.add_picture(
491
+ parent=self.parents[0],
492
+ image=ImageRef.from_pil(image=pil_image, dpi=72),
493
+ caption=None,
494
+ prov=ProvenanceItem(
495
+ page_no=page_no,
496
+ charspan=(0, 0),
497
+ bbox=BoundingBox.from_tuple(
498
+ anchor, origin=CoordOrigin.TOPLEFT
499
+ ),
500
+ ),
501
+ )
502
+ except:
503
+ _log.error("could not extract the image from excel sheets")
342
504
 
343
505
  return doc
506
+
507
+ @staticmethod
508
+ def _find_page_size(
509
+ doc: DoclingDocument, page_no: PositiveInt
510
+ ) -> tuple[float, float]:
511
+ left: float = -1.0
512
+ top: float = -1.0
513
+ right: float = -1.0
514
+ bottom: float = -1.0
515
+ for item, _ in doc.iterate_items(traverse_pictures=True, page_no=page_no):
516
+ if not isinstance(item, DocItem):
517
+ continue
518
+ for provenance in item.prov:
519
+ bbox = provenance.bbox
520
+ left = min(left, bbox.l) if left != -1 else bbox.l
521
+ right = max(right, bbox.r) if right != -1 else bbox.r
522
+ top = min(top, bbox.t) if top != -1 else bbox.t
523
+ bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
524
+
525
+ return (right - left, bottom - top)
@@ -392,9 +392,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
392
392
  self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
393
393
  if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
394
394
  # Handle Pictures
395
- self.handle_pictures(
396
- shape, parent_slide, slide_ind, doc, slide_size
397
- )
395
+ if hasattr(shape, "image"):
396
+ self.handle_pictures(
397
+ shape, parent_slide, slide_ind, doc, slide_size
398
+ )
398
399
  # If shape doesn't have any text, move on to the next shape
399
400
  if not hasattr(shape, "text"):
400
401
  return