docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,686 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Annotated, Any, Optional, Union, cast
5
+
6
+ from docling_core.types.doc import (
7
+ BoundingBox,
8
+ ContentLayer,
9
+ CoordOrigin,
10
+ DocItem,
11
+ DocItemLabel,
12
+ DoclingDocument,
13
+ DocumentOrigin,
14
+ GroupLabel,
15
+ ImageRef,
16
+ ProvenanceItem,
17
+ Size,
18
+ TableCell,
19
+ TableData,
20
+ )
21
+ from openpyxl import load_workbook
22
+ from openpyxl.chartsheet.chartsheet import Chartsheet
23
+ from openpyxl.drawing.image import Image
24
+ from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
25
+ from openpyxl.worksheet.worksheet import Worksheet
26
+ from PIL import Image as PILImage
27
+ from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt
28
+ from pydantic.dataclasses import dataclass
29
+ from typing_extensions import override
30
+
31
+ from docling.backend.abstract_backend import (
32
+ DeclarativeDocumentBackend,
33
+ PaginatedDocumentBackend,
34
+ )
35
+ from docling.datamodel.backend_options import MsExcelBackendOptions
36
+ from docling.datamodel.base_models import InputFormat
37
+ from docling.datamodel.document import InputDocument
38
+
39
+ _log = logging.getLogger(__name__)
40
+
41
+
42
@dataclass
class DataRegion:
    """Rectangle of worksheet cells described by its extreme row/column indices.

    The four fields are validated as positive integers and are declared as
    1-based indices. Both ends of each axis are counted as part of the region.
    """

    min_row: Annotated[
        PositiveInt, Field(description="Smallest row index (1-based index).")
    ]
    max_row: Annotated[
        PositiveInt, Field(description="Largest row index (1-based index).")
    ]
    min_col: Annotated[
        PositiveInt, Field(description="Smallest column index (1-based index).")
    ]
    max_col: Annotated[
        PositiveInt, Field(description="Largest column index (1-based index).")
    ]

    def width(self) -> PositiveInt:
        """Return the column count of the region, both end columns included."""
        return 1 + self.max_col - self.min_col

    def height(self) -> PositiveInt:
        """Return the row count of the region, both end rows included."""
        return 1 + self.max_row - self.min_row
66
+
67
+
68
class ExcelCell(BaseModel):
    """A single cell of an Excel table, including its merge extent.

    Attributes:
        row: Row number of the cell.
        col: Column number of the cell.
        text: Text content of the cell.
        row_span: How many rows the cell spans.
        col_span: How many columns the cell spans.
    """

    # Position of the cell.
    row: int
    col: int
    # Cell content as text.
    text: str
    # Extent of the cell along each axis.
    row_span: int
    col_span: int
84
+
85
+
86
class ExcelTable(BaseModel):
    """A rectangular table detected on a worksheet.

    Attributes:
        anchor: (column, row) indices of the table's upper-left cell
            (0-based index).
        num_rows: How many rows the table has.
        num_cols: How many columns the table has.
        data: The table content as a list of ExcelCell objects.
    """

    # (column, row) of the upper-left cell; validated as non-negative.
    anchor: tuple[NonNegativeInt, NonNegativeInt]
    # Table dimensions.
    num_rows: int
    num_cols: int
    # Cells belonging to the table.
    data: list[ExcelCell]
101
+
102
+
103
class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
    """Backend for parsing Excel workbooks.

    The backend converts an Excel workbook into a DoclingDocument object.
    Each worksheet is converted into a separate page.
    The following elements are parsed:
    - Cell contents, parsed as tables. If two groups of cells are disconnected
    between each other, they will be parsed as two different tables.
    - Images, parsed as PictureItem objects.

    The DoclingDocument tables and pictures have their provenance information, including
    the position in their original Excel worksheet. The position is represented by a
    bounding box object with the cell indices as units (0-based index). The size of this
    bounding box is the number of columns and rows that the table or picture spans.
    """

    @override
    def __init__(
        self,
        in_doc: "InputDocument",
        path_or_stream: Union[BytesIO, Path],
        options: MsExcelBackendOptions = MsExcelBackendOptions(),
    ) -> None:
        """Initialize the MsExcelDocumentBackend object.

        Parameters:
            in_doc: The input document object.
            path_or_stream: The path or stream to the Excel file.
            options: Backend options for Excel parsing.

        Raises:
            RuntimeError: An error occurred parsing the file.
        """
        super().__init__(in_doc, path_or_stream, options)

        # Initialise the parents for the hierarchy (levels -1 .. max_levels - 1)
        self.max_levels = 10
        self.parents: dict[int, Any] = {
            level: None for level in range(-1, self.max_levels)
        }

        self.workbook = None
        try:
            # data_only=True asks openpyxl for cached cell values instead of formulas
            if isinstance(self.path_or_stream, BytesIO):
                self.workbook = load_workbook(
                    filename=self.path_or_stream, data_only=True
                )
            elif isinstance(self.path_or_stream, Path):
                self.workbook = load_workbook(
                    filename=str(self.path_or_stream), data_only=True
                )

            # Any other input type leaves the workbook unset and the backend invalid
            self.valid = self.workbook is not None
        except Exception as e:
            self.valid = False

            raise RuntimeError(
                f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
            ) from e

    @override
    def is_valid(self) -> bool:
        """Return whether the workbook was loaded during initialization."""
        _log.debug("valid: %s", self.valid)
        return self.valid

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """Return True: each worksheet is exposed as one page."""
        return True

    @override
    def page_count(self) -> int:
        """Return the number of sheets in the workbook, or 0 if it is not loaded."""
        if self.is_valid() and self.workbook:
            return len(self.workbook.sheetnames)
        return 0

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the input formats handled by this backend (XLSX only)."""
        return {InputFormat.XLSX}

    @override
    def convert(self) -> DoclingDocument:
        """Parse the Excel workbook into a DoclingDocument object.

        Raises:
            RuntimeError: Unable to run the conversion since the backend object failed to
            initialize.

        Returns:
            The DoclingDocument object representing the Excel workbook.
        """
        origin = DocumentOrigin(
            filename=self.file.name or "file.xlsx",
            mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            binary_hash=self.document_hash,
        )

        doc = DoclingDocument(name=self.file.stem or "file.xlsx", origin=origin)

        if not self.is_valid():
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )

        return self._convert_workbook(doc)

    def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
        """Parse the Excel workbook and attach its structure to a DoclingDocument.

        Every sheet becomes one page (page numbers start at 1) holding one
        section group named after the sheet.

        Args:
            doc: A DoclingDocument object.

        Returns:
            A DoclingDocument object with the parsed items.
        """
        if self.workbook is None:
            _log.error("Workbook is not initialized.")
            return doc

        # Iterate over all sheets
        for idx, name in enumerate(self.workbook.sheetnames):
            _log.info("Processing sheet %s: %s", idx, name)

            sheet = self.workbook[name]
            page_no = idx + 1
            # do not rely on sheet.max_column, sheet.max_row if there are images;
            # the page starts empty and is resized once its items are known
            page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))

            self.parents[0] = doc.add_group(
                parent=None,
                label=GroupLabel.SECTION,
                name=f"sheet: {name}",
                content_layer=self._get_sheet_content_layer(sheet),
            )
            doc = self._convert_sheet(doc, sheet)
            width, height = self._find_page_size(doc, page_no)
            page.size = Size(width=width, height=height)

        return doc

    def _convert_sheet(
        self, doc: DoclingDocument, sheet: Union[Worksheet, Chartsheet]
    ) -> DoclingDocument:
        """Parse an Excel worksheet and attach its structure to a DoclingDocument.

        Only Worksheet objects are parsed; other sheet kinds are left untouched.

        Args:
            doc: The DoclingDocument to be updated.
            sheet: The Excel worksheet to be parsed.

        Returns:
            The updated DoclingDocument.
        """
        if isinstance(sheet, Worksheet):
            doc = self._find_tables_in_sheet(doc, sheet)
            doc = self._find_images_in_sheet(doc, sheet)

        # TODO: parse charts in sheet

        return doc

    @staticmethod
    def _make_provenance(
        page_no: int, bbox: tuple[float, float, float, float]
    ) -> ProvenanceItem:
        """Build a provenance item for a (left, top, right, bottom) cell-index box."""
        return ProvenanceItem(
            page_no=page_no,
            charspan=(0, 0),
            bbox=BoundingBox.from_tuple(bbox, origin=CoordOrigin.TOPLEFT),
        )

    @staticmethod
    def _merged_range_containing(sheet: Worksheet, coordinate: str) -> Any:
        """Return the first merged range of the sheet containing the coordinate, or None."""
        return next(
            (mr for mr in sheet.merged_cells.ranges if coordinate in mr),
            None,
        )

    def _find_tables_in_sheet(
        self, doc: DoclingDocument, sheet: Worksheet
    ) -> DoclingDocument:
        """Find all tables in an Excel sheet and attach them to a DoclingDocument.

        A 1x1 table is added as a text item instead of a table when the backend
        options enable `treat_singleton_as_text`.

        Args:
            doc: The DoclingDocument to be updated.
            sheet: The Excel worksheet to be parsed.

        Returns:
            The updated DoclingDocument.
        """
        if self.workbook is None:
            return doc

        content_layer = self._get_sheet_content_layer(sheet)
        # The page number is the same for every item of the sheet
        page_no = self.workbook.index(sheet) + 1

        treat_singleton_as_text = (
            isinstance(self.options, MsExcelBackendOptions)
            and self.options.treat_singleton_as_text
        )

        for excel_table in self._find_data_tables(sheet):
            origin_col, origin_row = excel_table.anchor
            num_rows = excel_table.num_rows
            num_cols = excel_table.num_cols

            prov = self._make_provenance(
                page_no,
                (
                    origin_col,
                    origin_row,
                    origin_col + num_cols,
                    origin_row + num_rows,
                ),
            )

            if (
                treat_singleton_as_text
                and num_rows == 1
                and num_cols == 1
                and excel_table.data
            ):
                doc.add_text(
                    text=excel_table.data[0].text,
                    label=DocItemLabel.TEXT,
                    parent=self.parents[0],
                    prov=prov,
                    content_layer=content_layer,
                )
                continue

            table_data = TableData(
                num_rows=num_rows,
                num_cols=num_cols,
                table_cells=[],
            )
            table_data.table_cells.extend(
                TableCell(
                    text=excel_cell.text,
                    row_span=excel_cell.row_span,
                    col_span=excel_cell.col_span,
                    start_row_offset_idx=excel_cell.row,
                    end_row_offset_idx=excel_cell.row + excel_cell.row_span,
                    start_col_offset_idx=excel_cell.col,
                    end_col_offset_idx=excel_cell.col + excel_cell.col_span,
                    # Only the first row of a table is flagged as header
                    column_header=excel_cell.row == 0,
                    row_header=False,
                )
                for excel_cell in excel_table.data
            )

            doc.add_table(
                data=table_data,
                parent=self.parents[0],
                prov=prov,
                content_layer=content_layer,
            )

        return doc

    def _find_true_data_bounds(self, sheet: Worksheet) -> DataRegion:
        """Find the true data boundaries (min/max rows and columns) in a worksheet.

        This function scans all cells to find the smallest rectangular region that contains
        all non-empty cells or merged cell ranges. It returns the minimal and maximal
        row/column indices that bound the actual data region.

        Args:
            sheet: The worksheet to analyze.

        Returns:
            A data region representing the smallest rectangle that covers all data and merged cells.
            If the sheet is empty, returns (1, 1, 1, 1) by default.
        """
        min_row, min_col = None, None
        max_row, max_col = 0, 0

        # Reads the worksheet's internal cell store (sheet._cells)
        for cell in sheet._cells.values():
            if cell.value is None:
                continue
            r, c = cell.row, cell.column
            min_row = r if min_row is None else min(min_row, r)
            min_col = c if min_col is None else min(min_col, c)
            max_row = max(max_row, r)
            max_col = max(max_col, c)

        # Expand bounds to include merged cells
        for merged in sheet.merged_cells.ranges:
            min_row = (
                merged.min_row if min_row is None else min(min_row, merged.min_row)
            )
            min_col = (
                merged.min_col if min_col is None else min(min_col, merged.min_col)
            )
            max_row = max(max_row, merged.max_row)
            max_col = max(max_col, merged.max_col)

        # If no data found, default to (1, 1, 1, 1)
        if min_row is None or min_col is None:
            min_row = min_col = max_row = max_col = 1

        return DataRegion(min_row, max_row, min_col, max_col)

    def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
        """Find all compact rectangular data tables in an Excel worksheet.

        Args:
            sheet: The Excel worksheet to be parsed.

        Returns:
            A list of ExcelTable objects representing the data tables.
        """
        bounds: DataRegion = self._find_true_data_bounds(sheet)
        tables: list[ExcelTable] = []
        # 0-based (row, col) coordinates already assigned to a table
        visited: set[tuple[int, int]] = set()

        # Limit scan to actual data bounds; ri/rj are 0-based while the
        # bounds (and iter_rows) are 1-based
        for ri, row in enumerate(
            sheet.iter_rows(
                min_row=bounds.min_row,
                max_row=bounds.max_row,
                min_col=bounds.min_col,
                max_col=bounds.max_col,
                values_only=False,
            ),
            start=bounds.min_row - 1,
        ):
            for rj, cell in enumerate(row, start=bounds.min_col - 1):
                if cell.value is None or (ri, rj) in visited:
                    continue

                # The cell starts a new table: find its bounds
                table_bounds, visited_cells = self._find_table_bounds(
                    sheet, ri, rj, bounds.max_row, bounds.max_col
                )

                visited.update(visited_cells)  # Mark these cells as visited
                tables.append(table_bounds)

        return tables

    def _find_table_bounds(
        self,
        sheet: Worksheet,
        start_row: int,
        start_col: int,
        max_row: int,
        max_col: int,
    ) -> tuple[ExcelTable, set[tuple[int, int]]]:
        """Determine the bounds of a compact rectangular table.

        Empty cells inside the table rectangle are emitted with an empty text
        (not the literal string "None").

        Args:
            sheet: The Excel worksheet to be parsed.
            start_row: The row number of the starting cell (0-based).
            start_col: The column number of the starting cell (0-based).
            max_row: Maximum row boundary from true data bounds.
            max_col: Maximum column boundary from true data bounds.

        Returns:
            A tuple with an Excel table and a set of cell coordinates.
        """
        _log.debug("find_table_bounds")

        table_max_row = self._find_table_bottom(sheet, start_row, start_col, max_row)
        table_max_col = self._find_table_right(sheet, start_row, start_col, max_col)

        # Snapshot the merged ranges once instead of re-reading them per cell
        merged_ranges = list(sheet.merged_cells.ranges)

        # Collect the data within the bounds
        data: list[ExcelCell] = []
        visited_cells: set[tuple[int, int]] = set()
        for ri, row in enumerate(
            sheet.iter_rows(
                min_row=start_row + 1,  # start_row is 0-based but iter_rows is 1-based
                max_row=table_max_row + 1,
                min_col=start_col + 1,
                max_col=table_max_col + 1,
                values_only=False,
            ),
            start_row,
        ):
            for rj, cell in enumerate(row, start_col):
                # Cells covered by an earlier merged span are already accounted for
                if (ri, rj) in visited_cells:
                    continue

                # Check if the cell belongs to a merged range
                row_span = 1
                col_span = 1
                for merged_range in merged_ranges:
                    if (
                        merged_range.min_row <= ri + 1 <= merged_range.max_row
                        and merged_range.min_col <= rj + 1 <= merged_range.max_col
                    ):
                        row_span = merged_range.max_row - merged_range.min_row + 1
                        col_span = merged_range.max_col - merged_range.min_col + 1
                        break

                data.append(
                    ExcelCell(
                        row=ri - start_row,
                        col=rj - start_col,
                        # str(None) would leak the text "None" into the table
                        text="" if cell.value is None else str(cell.value),
                        row_span=row_span,
                        col_span=col_span,
                    )
                )

                # Mark all cells in the span as visited
                for span_row in range(ri, ri + row_span):
                    for span_col in range(rj, rj + col_span):
                        visited_cells.add((span_row, span_col))

        return (
            ExcelTable(
                anchor=(start_col, start_row),
                num_rows=table_max_row + 1 - start_row,
                num_cols=table_max_col + 1 - start_col,
                data=data,
            ),
            visited_cells,
        )

    def _find_table_bottom(
        self, sheet: Worksheet, start_row: int, start_col: int, max_row: int
    ) -> int:
        """Find the bottom boundary of a table.

        Walks down the starting column until the first cell that is both empty
        and not part of a merged range.

        Args:
            sheet: The Excel worksheet to be parsed.
            start_row: The starting row of the table (0-based).
            start_col: The starting column of the table (0-based).
            max_row: Maximum row boundary from true data bounds.

        Returns:
            The row index representing the bottom boundary of the table.
        """
        table_max_row: int = start_row

        # Start one row below the starting cell (iter_rows is 1-based)
        for ri, (cell,) in enumerate(
            sheet.iter_rows(
                min_row=start_row + 2,
                max_row=max_row,
                min_col=start_col + 1,
                max_col=start_col + 1,
                values_only=False,
            ),
            start_row + 1,
        ):
            # Check if the cell is part of a merged range
            merged_range = self._merged_range_containing(sheet, cell.coordinate)

            if cell.value is None and not merged_range:
                break  # Stop if the cell is empty and not merged

            # Expand table_max_row to include the merged range if applicable
            if merged_range:
                table_max_row = max(table_max_row, merged_range.max_row - 1)
            else:
                table_max_row = ri

        return table_max_row

    def _find_table_right(
        self, sheet: Worksheet, start_row: int, start_col: int, max_col: int
    ) -> int:
        """Find the right boundary of a table.

        Walks right along the starting row until the first cell that is both
        empty and not part of a merged range.

        Args:
            sheet: The Excel worksheet to be parsed.
            start_row: The starting row of the table (0-based).
            start_col: The starting column of the table (0-based).
            max_col: The actual max column of the table.

        Returns:
            The column index representing the right boundary of the table.
        """
        table_max_col: int = start_col

        # Start one column right of the starting cell (iter_cols is 1-based)
        for rj, (cell,) in enumerate(
            sheet.iter_cols(
                min_row=start_row + 1,
                max_row=start_row + 1,
                min_col=start_col + 2,
                max_col=max_col,
                values_only=False,
            ),
            start_col + 1,
        ):
            # Check if the cell is part of a merged range
            merged_range = self._merged_range_containing(sheet, cell.coordinate)

            if cell.value is None and not merged_range:
                break  # Stop if the cell is empty and not merged

            # Expand table_max_col to include the merged range if applicable
            if merged_range:
                table_max_col = max(table_max_col, merged_range.max_col - 1)
            else:
                table_max_col = rj

        return table_max_col

    def _find_images_in_sheet(
        self, doc: DoclingDocument, sheet: Worksheet
    ) -> DoclingDocument:
        """Find images in the Excel sheet and attach them to the DoclingDocument.

        Extraction is best-effort: an image that fails to load is logged and
        skipped. Images whose anchor is not a TwoCellAnchor get the bounding
        box (0, 0, 0, 0).

        Args:
            doc: The DoclingDocument to be updated.
            sheet: The Excel worksheet to be parsed.

        Returns:
            The updated DoclingDocument.
        """
        if self.workbook is None:
            return doc

        content_layer = self._get_sheet_content_layer(sheet)
        # The page number is the same for every image of the sheet
        page_no = self.workbook.index(sheet) + 1

        # Iterate over byte images in the sheet
        for item in sheet._images:  # type: ignore[attr-defined]
            try:
                image: Image = cast(Image, item)
                pil_image = PILImage.open(image.ref)  # type: ignore[arg-type]
                anchor = (0, 0, 0, 0)
                if isinstance(image.anchor, TwoCellAnchor):
                    anchor = (
                        image.anchor._from.col,
                        image.anchor._from.row,
                        image.anchor.to.col + 1,
                        image.anchor.to.row + 1,
                    )
                doc.add_picture(
                    parent=self.parents[0],
                    image=ImageRef.from_pil(image=pil_image, dpi=72),
                    caption=None,
                    prov=self._make_provenance(page_no, anchor),
                    content_layer=content_layer,
                )
            except Exception:
                _log.error("could not extract the image from excel sheets")

        return doc

    @staticmethod
    def _find_page_size(
        doc: DoclingDocument, page_no: PositiveInt
    ) -> tuple[float, float]:
        """Compute the extent of the union of all item bounding boxes on a page.

        Args:
            doc: The DoclingDocument holding the items.
            page_no: The page number whose items are inspected.

        Returns:
            A (width, height) tuple; (0.0, 0.0) when the page has no items
            with provenance.
        """
        bboxes = [
            provenance.bbox
            for item, _ in doc.iterate_items(traverse_pictures=True, page_no=page_no)
            if isinstance(item, DocItem)
            for provenance in item.prov
        ]
        if not bboxes:
            return (0.0, 0.0)

        left = min(bbox.l for bbox in bboxes)
        right = max(bbox.r for bbox in bboxes)
        top = min(bbox.t for bbox in bboxes)
        bottom = max(bbox.b for bbox in bboxes)

        return (right - left, bottom - top)

    @staticmethod
    def _get_sheet_content_layer(sheet: Worksheet) -> Optional[ContentLayer]:
        """Return None for a visible sheet, otherwise ContentLayer.INVISIBLE."""
        if sheet.sheet_state == Worksheet.SHEETSTATE_VISIBLE:
            return None
        return ContentLayer.INVISIBLE