docling 2.29.0__py3-none-any.whl → 2.31.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. docling/backend/asciidoc_backend.py +7 -15
  2. docling/backend/csv_backend.py +1 -1
  3. docling/backend/docling_parse_backend.py +2 -2
  4. docling/backend/docling_parse_v2_backend.py +2 -2
  5. docling/backend/docling_parse_v4_backend.py +3 -4
  6. docling/backend/docx/latex/latex_dict.py +0 -5
  7. docling/backend/docx/latex/omml.py +4 -7
  8. docling/backend/html_backend.py +26 -9
  9. docling/backend/md_backend.py +5 -7
  10. docling/backend/msexcel_backend.py +271 -95
  11. docling/backend/mspowerpoint_backend.py +4 -7
  12. docling/backend/msword_backend.py +23 -15
  13. docling/backend/pdf_backend.py +2 -1
  14. docling/backend/pypdfium2_backend.py +3 -3
  15. docling/backend/xml/jats_backend.py +10 -13
  16. docling/backend/xml/uspto_backend.py +15 -19
  17. docling/cli/main.py +27 -9
  18. docling/cli/models.py +2 -3
  19. docling/datamodel/base_models.py +40 -5
  20. docling/datamodel/document.py +18 -10
  21. docling/datamodel/pipeline_options.py +29 -4
  22. docling/document_converter.py +5 -5
  23. docling/models/api_vlm_model.py +66 -0
  24. docling/models/base_model.py +2 -4
  25. docling/models/base_ocr_model.py +2 -2
  26. docling/models/code_formula_model.py +2 -1
  27. docling/models/document_picture_classifier.py +2 -1
  28. docling/models/easyocr_model.py +10 -11
  29. docling/models/factories/__init__.py +2 -2
  30. docling/models/factories/base_factory.py +1 -1
  31. docling/models/hf_mlx_model.py +4 -6
  32. docling/models/hf_vlm_model.py +7 -5
  33. docling/models/layout_model.py +2 -2
  34. docling/models/ocr_mac_model.py +3 -4
  35. docling/models/page_assemble_model.py +7 -12
  36. docling/models/page_preprocessing_model.py +2 -1
  37. docling/models/picture_description_api_model.py +9 -75
  38. docling/models/picture_description_base_model.py +16 -5
  39. docling/models/picture_description_vlm_model.py +2 -3
  40. docling/models/rapid_ocr_model.py +2 -3
  41. docling/models/readingorder_model.py +8 -23
  42. docling/models/table_structure_model.py +2 -6
  43. docling/models/tesseract_ocr_cli_model.py +17 -16
  44. docling/models/tesseract_ocr_model.py +8 -6
  45. docling/pipeline/base_pipeline.py +4 -8
  46. docling/pipeline/simple_pipeline.py +0 -1
  47. docling/pipeline/standard_pdf_pipeline.py +6 -3
  48. docling/pipeline/vlm_pipeline.py +27 -20
  49. docling/utils/api_image_request.py +61 -0
  50. docling/utils/export.py +2 -4
  51. docling/utils/glm_utils.py +2 -2
  52. docling/utils/layout_postprocessor.py +4 -2
  53. docling/utils/model_downloader.py +7 -7
  54. docling/utils/utils.py +1 -1
  55. {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/METADATA +4 -3
  56. docling-2.31.0.dist-info/RECORD +86 -0
  57. docling-2.29.0.dist-info/RECORD +0 -84
  58. {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/LICENSE +0 -0
  59. {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/WHEEL +0 -0
  60. {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/entry_points.txt +0 -0
@@ -1,36 +1,50 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Dict, Set, Tuple, Union
4
+ from typing import Any, Union, cast
5
5
 
6
6
  from docling_core.types.doc import (
7
+ BoundingBox,
8
+ CoordOrigin,
9
+ DocItem,
7
10
  DoclingDocument,
8
11
  DocumentOrigin,
9
12
  GroupLabel,
10
13
  ImageRef,
14
+ ProvenanceItem,
15
+ Size,
11
16
  TableCell,
12
17
  TableData,
13
18
  )
14
-
15
- # from lxml import etree
16
- from openpyxl import Workbook, load_workbook
17
- from openpyxl.cell.cell import Cell
19
+ from openpyxl import load_workbook
18
20
  from openpyxl.drawing.image import Image
21
+ from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
19
22
  from openpyxl.worksheet.worksheet import Worksheet
23
+ from PIL import Image as PILImage
24
+ from pydantic import BaseModel, NonNegativeInt, PositiveInt
25
+ from typing_extensions import override
20
26
 
21
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
27
+ from docling.backend.abstract_backend import (
28
+ DeclarativeDocumentBackend,
29
+ PaginatedDocumentBackend,
30
+ )
22
31
  from docling.datamodel.base_models import InputFormat
23
32
  from docling.datamodel.document import InputDocument
24
33
 
25
34
  _log = logging.getLogger(__name__)
26
35
 
27
- from typing import Any, List
28
36
 
29
- from PIL import Image as PILImage
30
- from pydantic import BaseModel
37
+ class ExcelCell(BaseModel):
38
+ """Represents an Excel cell.
31
39
 
40
+ Attributes:
41
+ row: The row number of the cell.
42
+ col: The column number of the cell.
43
+ text: The text content of the cell.
44
+ row_span: The number of rows the cell spans.
45
+ col_span: The number of columns the cell spans.
46
+ """
32
47
 
33
- class ExcelCell(BaseModel):
34
48
  row: int
35
49
  col: int
36
50
  text: str
@@ -39,19 +53,57 @@ class ExcelCell(BaseModel):
39
53
 
40
54
 
41
55
  class ExcelTable(BaseModel):
56
+ """Represents an Excel table on a worksheet.
57
+
58
+ Attributes:
59
+ anchor: The column and row indices of the upper-left cell of the table
60
+ (0-based index).
61
+ num_rows: The number of rows in the table.
62
+ num_cols: The number of columns in the table.
63
+ data: The data in the table, represented as a list of ExcelCell objects.
64
+ """
65
+
66
+ anchor: tuple[NonNegativeInt, NonNegativeInt]
42
67
  num_rows: int
43
68
  num_cols: int
44
- data: List[ExcelCell]
69
+ data: list[ExcelCell]
70
+
45
71
 
72
+ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
73
+ """Backend for parsing Excel workbooks.
46
74
 
47
- class MsExcelDocumentBackend(DeclarativeDocumentBackend):
48
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
75
+ The backend converts an Excel workbook into a DoclingDocument object.
76
+ Each worksheet is converted into a separate page.
77
+ The following elements are parsed:
78
+ - Cell contents, parsed as tables. If two groups of cells are disconnected
79
+ from each other, they will be parsed as two different tables.
80
+ - Images, parsed as PictureItem objects.
81
+
82
+ The DoclingDocument tables and pictures have their provenance information, including
83
+ the position in their original Excel worksheet. The position is represented by a
84
+ bounding box object with the cell indices as units (0-based index). The size of this
85
+ bounding box is the number of columns and rows that the table or picture spans.
86
+ """
87
+
88
+ @override
89
+ def __init__(
90
+ self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
91
+ ) -> None:
92
+ """Initialize the MsExcelDocumentBackend object.
93
+
94
+ Parameters:
95
+ in_doc: The input document object.
96
+ path_or_stream: The path or stream to the Excel file.
97
+
98
+ Raises:
99
+ RuntimeError: An error occurred parsing the file.
100
+ """
49
101
  super().__init__(in_doc, path_or_stream)
50
102
 
51
103
  # Initialise the parents for the hierarchy
52
104
  self.max_levels = 10
53
105
 
54
- self.parents: Dict[int, Any] = {}
106
+ self.parents: dict[int, Any] = {}
55
107
  for i in range(-1, self.max_levels):
56
108
  self.parents[i] = None
57
109
 
@@ -63,35 +115,47 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
63
115
  elif isinstance(self.path_or_stream, Path):
64
116
  self.workbook = load_workbook(filename=str(self.path_or_stream))
65
117
 
66
- self.valid = True
118
+ self.valid = self.workbook is not None
67
119
  except Exception as e:
68
120
  self.valid = False
69
121
 
70
122
  raise RuntimeError(
71
- f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
123
+ f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
72
124
  ) from e
73
125
 
126
+ @override
74
127
  def is_valid(self) -> bool:
75
- _log.info(f"valid: {self.valid}")
128
+ _log.debug(f"valid: {self.valid}")
76
129
  return self.valid
77
130
 
78
131
  @classmethod
132
+ @override
79
133
  def supports_pagination(cls) -> bool:
80
134
  return True
81
135
 
82
- def unload(self):
83
- if isinstance(self.path_or_stream, BytesIO):
84
- self.path_or_stream.close()
85
-
86
- self.path_or_stream = None
136
+ @override
137
+ def page_count(self) -> int:
138
+ if self.is_valid() and self.workbook:
139
+ return len(self.workbook.sheetnames)
140
+ else:
141
+ return 0
87
142
 
88
143
  @classmethod
89
- def supported_formats(cls) -> Set[InputFormat]:
144
+ @override
145
+ def supported_formats(cls) -> set[InputFormat]:
90
146
  return {InputFormat.XLSX}
91
147
 
148
+ @override
92
149
  def convert(self) -> DoclingDocument:
93
- # Parses the XLSX into a structured document model.
150
+ """Parse the Excel workbook into a DoclingDocument object.
151
+
152
+ Raises:
153
+ RuntimeError: Unable to run the conversion since the backend object failed to
154
+ initialize.
94
155
 
156
+ Returns:
157
+ The DoclingDocument object representing the Excel workbook.
158
+ """
95
159
  origin = DocumentOrigin(
96
160
  filename=self.file.name or "file.xlsx",
97
161
  mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
@@ -110,29 +174,48 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
110
174
  return doc
111
175
 
112
176
  def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
177
+ """Parse the Excel workbook and attach its structure to a DoclingDocument.
113
178
 
114
- if self.workbook is not None:
179
+ Args:
180
+ doc: A DoclingDocument object.
115
181
 
182
+ Returns:
183
+ A DoclingDocument object with the parsed items.
184
+ """
185
+
186
+ if self.workbook is not None:
116
187
  # Iterate over all sheets
117
188
  for sheet_name in self.workbook.sheetnames:
118
189
  _log.info(f"Processing sheet: {sheet_name}")
119
190
 
120
- # Access the sheet by name
121
191
  sheet = self.workbook[sheet_name]
192
+ page_no = self.workbook.index(sheet) + 1
193
+ # do not rely on sheet.max_column, sheet.max_row if there are images
194
+ page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))
122
195
 
123
196
  self.parents[0] = doc.add_group(
124
197
  parent=None,
125
198
  label=GroupLabel.SECTION,
126
199
  name=f"sheet: {sheet_name}",
127
200
  )
128
-
129
201
  doc = self._convert_sheet(doc, sheet)
202
+ width, height = self._find_page_size(doc, page_no)
203
+ page.size = Size(width=width, height=height)
130
204
  else:
131
205
  _log.error("Workbook is not initialized.")
132
206
 
133
207
  return doc
134
208
 
135
- def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
209
+ def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
210
+ """Parse an Excel worksheet and attach its structure to a DoclingDocument
211
+
212
+ Args:
213
+ doc: The DoclingDocument to be updated.
214
+ sheet: The Excel worksheet to be parsed.
215
+
216
+ Returns:
217
+ The updated DoclingDocument.
218
+ """
136
219
 
137
220
  doc = self._find_tables_in_sheet(doc, sheet)
138
221
 
@@ -140,60 +223,90 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
140
223
 
141
224
  return doc
142
225
 
143
- def _find_tables_in_sheet(self, doc: DoclingDocument, sheet: Worksheet):
144
-
145
- tables = self._find_data_tables(sheet)
226
+ def _find_tables_in_sheet(
227
+ self, doc: DoclingDocument, sheet: Worksheet
228
+ ) -> DoclingDocument:
229
+ """Find all tables in an Excel sheet and attach them to a DoclingDocument.
146
230
 
147
- for excel_table in tables:
148
- num_rows = excel_table.num_rows
149
- num_cols = excel_table.num_cols
231
+ Args:
232
+ doc: The DoclingDocument to be updated.
233
+ sheet: The Excel worksheet to be parsed.
150
234
 
151
- table_data = TableData(
152
- num_rows=num_rows,
153
- num_cols=num_cols,
154
- table_cells=[],
155
- )
235
+ Returns:
236
+ The updated DoclingDocument.
237
+ """
156
238
 
157
- for excel_cell in excel_table.data:
158
-
159
- cell = TableCell(
160
- text=excel_cell.text,
161
- row_span=excel_cell.row_span,
162
- col_span=excel_cell.col_span,
163
- start_row_offset_idx=excel_cell.row,
164
- end_row_offset_idx=excel_cell.row + excel_cell.row_span,
165
- start_col_offset_idx=excel_cell.col,
166
- end_col_offset_idx=excel_cell.col + excel_cell.col_span,
167
- column_header=excel_cell.row == 0,
168
- row_header=False,
239
+ if self.workbook is not None:
240
+ tables = self._find_data_tables(sheet)
241
+
242
+ for excel_table in tables:
243
+ origin_col = excel_table.anchor[0]
244
+ origin_row = excel_table.anchor[1]
245
+ num_rows = excel_table.num_rows
246
+ num_cols = excel_table.num_cols
247
+
248
+ table_data = TableData(
249
+ num_rows=num_rows,
250
+ num_cols=num_cols,
251
+ table_cells=[],
169
252
  )
170
- table_data.table_cells.append(cell)
171
253
 
172
- doc.add_table(data=table_data, parent=self.parents[0])
254
+ for excel_cell in excel_table.data:
255
+ cell = TableCell(
256
+ text=excel_cell.text,
257
+ row_span=excel_cell.row_span,
258
+ col_span=excel_cell.col_span,
259
+ start_row_offset_idx=excel_cell.row,
260
+ end_row_offset_idx=excel_cell.row + excel_cell.row_span,
261
+ start_col_offset_idx=excel_cell.col,
262
+ end_col_offset_idx=excel_cell.col + excel_cell.col_span,
263
+ column_header=excel_cell.row == 0,
264
+ row_header=False,
265
+ )
266
+ table_data.table_cells.append(cell)
267
+
268
+ page_no = self.workbook.index(sheet) + 1
269
+ doc.add_table(
270
+ data=table_data,
271
+ parent=self.parents[0],
272
+ prov=ProvenanceItem(
273
+ page_no=page_no,
274
+ charspan=(0, 0),
275
+ bbox=BoundingBox.from_tuple(
276
+ (
277
+ origin_col,
278
+ origin_row,
279
+ origin_col + num_cols,
280
+ origin_row + num_rows,
281
+ ),
282
+ origin=CoordOrigin.TOPLEFT,
283
+ ),
284
+ ),
285
+ )
173
286
 
174
287
  return doc
175
288
 
176
- def _find_data_tables(self, sheet: Worksheet) -> List[ExcelTable]:
177
- """
178
- Find all compact rectangular data tables in a sheet.
179
- """
180
- # _log.info("find_data_tables")
289
+ def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
290
+ """Find all compact rectangular data tables in an Excel worksheet.
181
291
 
182
- tables = [] # List to store found tables
183
- visited: set[Tuple[int, int]] = set() # Track already visited cells
292
+ Args:
293
+ sheet: The Excel worksheet to be parsed.
294
+
295
+ Returns:
296
+ A list of ExcelTable objects representing the data tables.
297
+ """
298
+ tables: list[ExcelTable] = [] # List to store found tables
299
+ visited: set[tuple[int, int]] = set() # Track already visited cells
184
300
 
185
301
  # Iterate over all cells in the sheet
186
302
  for ri, row in enumerate(sheet.iter_rows(values_only=False)):
187
303
  for rj, cell in enumerate(row):
188
-
189
304
  # Skip empty or already visited cells
190
305
  if cell.value is None or (ri, rj) in visited:
191
306
  continue
192
307
 
193
308
  # If the cell starts a new table, find its bounds
194
- table_bounds, visited_cells = self._find_table_bounds(
195
- sheet, ri, rj, visited
196
- )
309
+ table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj)
197
310
 
198
311
  visited.update(visited_cells) # Mark these cells as visited
199
312
  tables.append(table_bounds)
@@ -205,41 +318,40 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
205
318
  sheet: Worksheet,
206
319
  start_row: int,
207
320
  start_col: int,
208
- visited: set[Tuple[int, int]],
209
- ):
210
- """
211
- Determine the bounds of a compact rectangular table.
321
+ ) -> tuple[ExcelTable, set[tuple[int, int]]]:
322
+ """Determine the bounds of a compact rectangular table.
323
+
324
+ Args:
325
+ sheet: The Excel worksheet to be parsed.
326
+ start_row: The row number of the starting cell.
327
+ start_col: The column number of the starting cell.
328
+
212
329
  Returns:
213
- - A dictionary with the bounds and data.
214
- - A set of visited cell coordinates.
330
+ A tuple with an Excel table and a set of cell coordinates.
215
331
  """
216
- _log.info("find_table_bounds")
332
+ _log.debug("find_table_bounds")
217
333
 
218
334
  max_row = self._find_table_bottom(sheet, start_row, start_col)
219
335
  max_col = self._find_table_right(sheet, start_row, start_col)
220
336
 
221
337
  # Collect the data within the bounds
222
338
  data = []
223
- visited_cells = set()
339
+ visited_cells: set[tuple[int, int]] = set()
224
340
  for ri in range(start_row, max_row + 1):
225
341
  for rj in range(start_col, max_col + 1):
226
-
227
342
  cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
228
343
 
229
344
  # Check if the cell belongs to a merged range
230
345
  row_span = 1
231
346
  col_span = 1
232
347
 
233
- # _log.info(sheet.merged_cells.ranges)
234
348
  for merged_range in sheet.merged_cells.ranges:
235
-
236
349
  if (
237
350
  merged_range.min_row <= ri + 1
238
351
  and ri + 1 <= merged_range.max_row
239
352
  and merged_range.min_col <= rj + 1
240
353
  and rj + 1 <= merged_range.max_col
241
354
  ):
242
-
243
355
  row_span = merged_range.max_row - merged_range.min_row + 1
244
356
  col_span = merged_range.max_col - merged_range.min_col + 1
245
357
  break
@@ -254,7 +366,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
254
366
  col_span=col_span,
255
367
  )
256
368
  )
257
- # _log.info(f"cell: {ri}, {rj} -> {ri - start_row}, {rj - start_col}, {row_span}, {col_span}: {str(cell.value)}")
258
369
 
259
370
  # Mark all cells in the span as visited
260
371
  for span_row in range(ri, ri + row_span):
@@ -263,6 +374,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
263
374
 
264
375
  return (
265
376
  ExcelTable(
377
+ anchor=(start_col, start_row),
266
378
  num_rows=max_row + 1 - start_row,
267
379
  num_cols=max_col + 1 - start_col,
268
380
  data=data,
@@ -270,10 +382,20 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
270
382
  visited_cells,
271
383
  )
272
384
 
273
- def _find_table_bottom(self, sheet: Worksheet, start_row: int, start_col: int):
274
- """Function to find the bottom boundary of the table"""
385
+ def _find_table_bottom(
386
+ self, sheet: Worksheet, start_row: int, start_col: int
387
+ ) -> int:
388
+ """Find the bottom boundary of a table.
389
+
390
+ Args:
391
+ sheet: The Excel worksheet to be parsed.
392
+ start_row: The starting row of the table.
393
+ start_col: The starting column of the table.
275
394
 
276
- max_row = start_row
395
+ Returns:
396
+ The row index representing the bottom boundary of the table.
397
+ """
398
+ max_row: int = start_row
277
399
 
278
400
  while max_row < sheet.max_row - 1:
279
401
  # Get the cell value or check if it is part of a merged cell
@@ -296,10 +418,20 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
296
418
 
297
419
  return max_row
298
420
 
299
- def _find_table_right(self, sheet: Worksheet, start_row: int, start_col: int):
300
- """Function to find the right boundary of the table"""
421
+ def _find_table_right(
422
+ self, sheet: Worksheet, start_row: int, start_col: int
423
+ ) -> int:
424
+ """Find the right boundary of a table.
301
425
 
302
- max_col = start_col
426
+ Args:
427
+ sheet: The Excel worksheet to be parsed.
428
+ start_row: The starting row of the table.
429
+ start_col: The starting column of the table.
430
+
431
+ Returns:
432
+ The column index representing the right boundary of the table.
433
+ """
434
+ max_col: int = start_col
303
435
 
304
436
  while max_col < sheet.max_column - 1:
305
437
  # Get the cell value or check if it is part of a merged cell
@@ -325,19 +457,63 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
325
457
  def _find_images_in_sheet(
326
458
  self, doc: DoclingDocument, sheet: Worksheet
327
459
  ) -> DoclingDocument:
460
+ """Find images in the Excel sheet and attach them to the DoclingDocument.
328
461
 
329
- # Iterate over byte images in the sheet
330
- for idx, image in enumerate(sheet._images): # type: ignore
331
-
332
- try:
333
- pil_image = PILImage.open(image.ref)
462
+ Args:
463
+ doc: The DoclingDocument to be updated.
464
+ sheet: The Excel worksheet to be parsed.
334
465
 
335
- doc.add_picture(
336
- parent=self.parents[0],
337
- image=ImageRef.from_pil(image=pil_image, dpi=72),
338
- caption=None,
339
- )
340
- except:
341
- _log.error("could not extract the image from excel sheets")
466
+ Returns:
467
+ The updated DoclingDocument.
468
+ """
469
+ if self.workbook is not None:
470
+ # Iterate over byte images in the sheet
471
+ for item in sheet._images: # type: ignore[attr-defined]
472
+ try:
473
+ image: Image = cast(Image, item)
474
+ pil_image = PILImage.open(image.ref) # type: ignore[arg-type]
475
+ page_no = self.workbook.index(sheet) + 1
476
+ anchor = (0, 0, 0, 0)
477
+ if isinstance(image.anchor, TwoCellAnchor):
478
+ anchor = (
479
+ image.anchor._from.col,
480
+ image.anchor._from.row,
481
+ image.anchor.to.col + 1,
482
+ image.anchor.to.row + 1,
483
+ )
484
+ doc.add_picture(
485
+ parent=self.parents[0],
486
+ image=ImageRef.from_pil(image=pil_image, dpi=72),
487
+ caption=None,
488
+ prov=ProvenanceItem(
489
+ page_no=page_no,
490
+ charspan=(0, 0),
491
+ bbox=BoundingBox.from_tuple(
492
+ anchor, origin=CoordOrigin.TOPLEFT
493
+ ),
494
+ ),
495
+ )
496
+ except Exception:
497
+ _log.error("could not extract the image from excel sheets")
342
498
 
343
499
  return doc
500
+
501
+ @staticmethod
502
+ def _find_page_size(
503
+ doc: DoclingDocument, page_no: PositiveInt
504
+ ) -> tuple[float, float]:
505
+ left: float = -1.0
506
+ top: float = -1.0
507
+ right: float = -1.0
508
+ bottom: float = -1.0
509
+ for item, _ in doc.iterate_items(traverse_pictures=True, page_no=page_no):
510
+ if not isinstance(item, DocItem):
511
+ continue
512
+ for provenance in item.prov:
513
+ bbox = provenance.bbox
514
+ left = min(left, bbox.l) if left != -1 else bbox.l
515
+ right = max(right, bbox.r) if right != -1 else bbox.r
516
+ top = min(top, bbox.t) if top != -1 else bbox.t
517
+ bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
518
+
519
+ return (right - left, bottom - top)
@@ -120,13 +120,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
120
120
 
121
121
  return prov
122
122
 
123
- def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
123
+ def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): # noqa: C901
124
124
  is_a_list = False
125
125
  is_list_group_created = False
126
126
  enum_list_item_value = 0
127
127
  new_list = None
128
128
  bullet_type = "None"
129
- list_text = ""
130
129
  list_label = GroupLabel.LIST
131
130
  doc_label = DocItemLabel.LIST_ITEM
132
131
  prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
@@ -243,7 +242,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
243
242
  enum_marker = str(enum_list_item_value) + "."
244
243
  if not is_list_group_created:
245
244
  new_list = doc.add_group(
246
- label=list_label, name=f"list", parent=parent_slide
245
+ label=list_label, name="list", parent=parent_slide
247
246
  )
248
247
  is_list_group_created = True
249
248
  doc.add_list_item(
@@ -368,11 +367,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
368
367
  slide_width = pptx_obj.slide_width
369
368
  slide_height = pptx_obj.slide_height
370
369
 
371
- text_content = [] # type: ignore
372
-
373
370
  max_levels = 10
374
371
  parents = {} # type: ignore
375
- for i in range(0, max_levels):
372
+ for i in range(max_levels):
376
373
  parents[i] = None
377
374
 
378
375
  # Loop through each slide
@@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
383
380
  )
384
381
 
385
382
  slide_size = Size(width=slide_width, height=slide_height)
386
- parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
383
+ doc.add_page(page_no=slide_ind + 1, size=slide_size)
387
384
 
388
385
  def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
389
386
  handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
@@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
158
158
  def _get_level(self) -> int:
159
159
  """Return the first None index."""
160
160
  for k, v in self.parents.items():
161
- if k >= 0 and v == None:
161
+ if k >= 0 and v is None:
162
162
  return k
163
163
  return 0
164
164
 
@@ -418,7 +418,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
418
418
  else prev_parent
419
419
  )
420
420
 
421
- def _handle_text_elements(
421
+ def _handle_text_elements( # noqa: C901
422
422
  self,
423
423
  element: BaseOxmlElement,
424
424
  docx_obj: DocxDocument,
@@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
812
812
  f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
813
813
  )
814
814
  if cell is None or cell._tc in cell_set:
815
- _log.debug(f" skipped since repeated content")
815
+ _log.debug(" skipped since repeated content")
816
816
  col_idx += cell.grid_span
817
817
  continue
818
818
  else:
@@ -850,7 +850,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
850
850
  def _handle_pictures(
851
851
  self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
852
852
  ) -> None:
853
- def get_docx_image(drawing_blip):
853
+ def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
854
+ image_data: Optional[bytes] = None
854
855
  rId = drawing_blip[0].get(
855
856
  "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
856
857
  )
@@ -862,19 +863,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
862
863
 
863
864
  level = self._get_level()
864
865
  # Open the BytesIO object with PIL to create an Image
865
- try:
866
- image_data = get_docx_image(drawing_blip)
867
- image_bytes = BytesIO(image_data)
868
- pil_image = Image.open(image_bytes)
869
- doc.add_picture(
870
- parent=self.parents[level - 1],
871
- image=ImageRef.from_pil(image=pil_image, dpi=72),
872
- caption=None,
873
- )
874
- except (UnidentifiedImageError, OSError) as e:
875
- _log.warning("Warning: image cannot be loaded by Pillow")
866
+ image_data: Optional[bytes] = get_docx_image(drawing_blip)
867
+ if image_data is None:
868
+ _log.warning("Warning: image cannot be found")
876
869
  doc.add_picture(
877
870
  parent=self.parents[level - 1],
878
871
  caption=None,
879
872
  )
873
+ else:
874
+ try:
875
+ image_bytes = BytesIO(image_data)
876
+ pil_image = Image.open(image_bytes)
877
+ doc.add_picture(
878
+ parent=self.parents[level - 1],
879
+ image=ImageRef.from_pil(image=pil_image, dpi=72),
880
+ caption=None,
881
+ )
882
+ except (UnidentifiedImageError, OSError):
883
+ _log.warning("Warning: image cannot be loaded by Pillow")
884
+ doc.add_picture(
885
+ parent=self.parents[level - 1],
886
+ caption=None,
887
+ )
880
888
  return
@@ -1,7 +1,8 @@
1
1
  from abc import ABC, abstractmethod
2
+ from collections.abc import Iterable
2
3
  from io import BytesIO
3
4
  from pathlib import Path
4
- from typing import Iterable, Optional, Set, Union
5
+ from typing import Optional, Set, Union
5
6
 
6
7
  from docling_core.types.doc import BoundingBox, Size
7
8
  from docling_core.types.doc.page import SegmentedPdfPage, TextCell