docling 2.29.0__tar.gz → 2.30.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. {docling-2.29.0 → docling-2.30.0}/PKG-INFO +3 -3
  2. docling-2.30.0/docling/backend/msexcel_backend.py +525 -0
  3. {docling-2.29.0 → docling-2.30.0}/docling/backend/msword_backend.py +20 -12
  4. {docling-2.29.0 → docling-2.30.0}/docling/cli/main.py +20 -2
  5. {docling-2.29.0 → docling-2.30.0}/docling/datamodel/base_models.py +33 -0
  6. {docling-2.29.0 → docling-2.30.0}/docling/datamodel/document.py +7 -0
  7. {docling-2.29.0 → docling-2.30.0}/docling/datamodel/pipeline_options.py +29 -3
  8. docling-2.30.0/docling/models/api_vlm_model.py +67 -0
  9. docling-2.30.0/docling/models/picture_description_api_model.py +58 -0
  10. {docling-2.29.0 → docling-2.30.0}/docling/models/picture_description_base_model.py +14 -2
  11. {docling-2.29.0 → docling-2.30.0}/docling/pipeline/standard_pdf_pipeline.py +6 -2
  12. {docling-2.29.0 → docling-2.30.0}/docling/pipeline/vlm_pipeline.py +27 -17
  13. docling-2.30.0/docling/utils/api_image_request.py +61 -0
  14. {docling-2.29.0 → docling-2.30.0}/pyproject.toml +3 -3
  15. docling-2.29.0/docling/backend/msexcel_backend.py +0 -343
  16. docling-2.29.0/docling/models/picture_description_api_model.py +0 -125
  17. {docling-2.29.0 → docling-2.30.0}/LICENSE +0 -0
  18. {docling-2.29.0 → docling-2.30.0}/README.md +0 -0
  19. {docling-2.29.0 → docling-2.30.0}/docling/__init__.py +0 -0
  20. {docling-2.29.0 → docling-2.30.0}/docling/backend/__init__.py +0 -0
  21. {docling-2.29.0 → docling-2.30.0}/docling/backend/abstract_backend.py +0 -0
  22. {docling-2.29.0 → docling-2.30.0}/docling/backend/asciidoc_backend.py +0 -0
  23. {docling-2.29.0 → docling-2.30.0}/docling/backend/csv_backend.py +0 -0
  24. {docling-2.29.0 → docling-2.30.0}/docling/backend/docling_parse_backend.py +0 -0
  25. {docling-2.29.0 → docling-2.30.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  26. {docling-2.29.0 → docling-2.30.0}/docling/backend/docling_parse_v4_backend.py +0 -0
  27. {docling-2.29.0 → docling-2.30.0}/docling/backend/docx/__init__.py +0 -0
  28. {docling-2.29.0 → docling-2.30.0}/docling/backend/docx/latex/__init__.py +0 -0
  29. {docling-2.29.0 → docling-2.30.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  30. {docling-2.29.0 → docling-2.30.0}/docling/backend/docx/latex/omml.py +0 -0
  31. {docling-2.29.0 → docling-2.30.0}/docling/backend/html_backend.py +0 -0
  32. {docling-2.29.0 → docling-2.30.0}/docling/backend/json/__init__.py +0 -0
  33. {docling-2.29.0 → docling-2.30.0}/docling/backend/json/docling_json_backend.py +0 -0
  34. {docling-2.29.0 → docling-2.30.0}/docling/backend/md_backend.py +0 -0
  35. {docling-2.29.0 → docling-2.30.0}/docling/backend/mspowerpoint_backend.py +0 -0
  36. {docling-2.29.0 → docling-2.30.0}/docling/backend/pdf_backend.py +0 -0
  37. {docling-2.29.0 → docling-2.30.0}/docling/backend/pypdfium2_backend.py +0 -0
  38. {docling-2.29.0 → docling-2.30.0}/docling/backend/xml/__init__.py +0 -0
  39. {docling-2.29.0 → docling-2.30.0}/docling/backend/xml/jats_backend.py +0 -0
  40. {docling-2.29.0 → docling-2.30.0}/docling/backend/xml/uspto_backend.py +0 -0
  41. {docling-2.29.0 → docling-2.30.0}/docling/chunking/__init__.py +0 -0
  42. {docling-2.29.0 → docling-2.30.0}/docling/cli/__init__.py +0 -0
  43. {docling-2.29.0 → docling-2.30.0}/docling/cli/models.py +0 -0
  44. {docling-2.29.0 → docling-2.30.0}/docling/cli/tools.py +0 -0
  45. {docling-2.29.0 → docling-2.30.0}/docling/datamodel/__init__.py +0 -0
  46. {docling-2.29.0 → docling-2.30.0}/docling/datamodel/settings.py +0 -0
  47. {docling-2.29.0 → docling-2.30.0}/docling/document_converter.py +0 -0
  48. {docling-2.29.0 → docling-2.30.0}/docling/exceptions.py +0 -0
  49. {docling-2.29.0 → docling-2.30.0}/docling/models/__init__.py +0 -0
  50. {docling-2.29.0 → docling-2.30.0}/docling/models/base_model.py +0 -0
  51. {docling-2.29.0 → docling-2.30.0}/docling/models/base_ocr_model.py +0 -0
  52. {docling-2.29.0 → docling-2.30.0}/docling/models/code_formula_model.py +0 -0
  53. {docling-2.29.0 → docling-2.30.0}/docling/models/document_picture_classifier.py +0 -0
  54. {docling-2.29.0 → docling-2.30.0}/docling/models/easyocr_model.py +0 -0
  55. {docling-2.29.0 → docling-2.30.0}/docling/models/factories/__init__.py +0 -0
  56. {docling-2.29.0 → docling-2.30.0}/docling/models/factories/base_factory.py +0 -0
  57. {docling-2.29.0 → docling-2.30.0}/docling/models/factories/ocr_factory.py +0 -0
  58. {docling-2.29.0 → docling-2.30.0}/docling/models/factories/picture_description_factory.py +0 -0
  59. {docling-2.29.0 → docling-2.30.0}/docling/models/hf_mlx_model.py +0 -0
  60. {docling-2.29.0 → docling-2.30.0}/docling/models/hf_vlm_model.py +0 -0
  61. {docling-2.29.0 → docling-2.30.0}/docling/models/layout_model.py +0 -0
  62. {docling-2.29.0 → docling-2.30.0}/docling/models/ocr_mac_model.py +0 -0
  63. {docling-2.29.0 → docling-2.30.0}/docling/models/page_assemble_model.py +0 -0
  64. {docling-2.29.0 → docling-2.30.0}/docling/models/page_preprocessing_model.py +0 -0
  65. {docling-2.29.0 → docling-2.30.0}/docling/models/picture_description_vlm_model.py +0 -0
  66. {docling-2.29.0 → docling-2.30.0}/docling/models/plugins/__init__.py +0 -0
  67. {docling-2.29.0 → docling-2.30.0}/docling/models/plugins/defaults.py +0 -0
  68. {docling-2.29.0 → docling-2.30.0}/docling/models/rapid_ocr_model.py +0 -0
  69. {docling-2.29.0 → docling-2.30.0}/docling/models/readingorder_model.py +0 -0
  70. {docling-2.29.0 → docling-2.30.0}/docling/models/table_structure_model.py +0 -0
  71. {docling-2.29.0 → docling-2.30.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  72. {docling-2.29.0 → docling-2.30.0}/docling/models/tesseract_ocr_model.py +0 -0
  73. {docling-2.29.0 → docling-2.30.0}/docling/pipeline/__init__.py +0 -0
  74. {docling-2.29.0 → docling-2.30.0}/docling/pipeline/base_pipeline.py +0 -0
  75. {docling-2.29.0 → docling-2.30.0}/docling/pipeline/simple_pipeline.py +0 -0
  76. {docling-2.29.0 → docling-2.30.0}/docling/py.typed +0 -0
  77. {docling-2.29.0 → docling-2.30.0}/docling/utils/__init__.py +0 -0
  78. {docling-2.29.0 → docling-2.30.0}/docling/utils/accelerator_utils.py +0 -0
  79. {docling-2.29.0 → docling-2.30.0}/docling/utils/export.py +0 -0
  80. {docling-2.29.0 → docling-2.30.0}/docling/utils/glm_utils.py +0 -0
  81. {docling-2.29.0 → docling-2.30.0}/docling/utils/layout_postprocessor.py +0 -0
  82. {docling-2.29.0 → docling-2.30.0}/docling/utils/locks.py +0 -0
  83. {docling-2.29.0 → docling-2.30.0}/docling/utils/model_downloader.py +0 -0
  84. {docling-2.29.0 → docling-2.30.0}/docling/utils/ocr_utils.py +0 -0
  85. {docling-2.29.0 → docling-2.30.0}/docling/utils/profiling.py +0 -0
  86. {docling-2.29.0 → docling-2.30.0}/docling/utils/utils.py +0 -0
  87. {docling-2.29.0 → docling-2.30.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.29.0
3
+ Version: 2.30.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: vlm
28
28
  Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
29
29
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
30
30
  Requires-Dist: certifi (>=2024.7.4)
31
- Requires-Dist: docling-core[chunking] (>=2.24.1,<3.0.0)
31
+ Requires-Dist: docling-core[chunking] (>=2.26.0,<3.0.0)
32
32
  Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
33
33
  Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
34
34
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -58,7 +58,7 @@ Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
58
58
  Requires-Dist: tqdm (>=4.65.0,<5.0.0)
59
59
  Requires-Dist: transformers (>=4.42.0,<4.43.0) ; (sys_platform == "darwin" and platform_machine == "x86_64") and (extra == "vlm")
60
60
  Requires-Dist: transformers (>=4.46.0,<5.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
61
- Requires-Dist: typer (>=0.12.5,<0.13.0)
61
+ Requires-Dist: typer (>=0.12.5,<0.16.0)
62
62
  Project-URL: Repository, https://github.com/docling-project/docling
63
63
  Description-Content-Type: text/markdown
64
64
 
@@ -0,0 +1,525 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Any, Union, cast
5
+
6
+ from docling_core.types.doc import (
7
+ BoundingBox,
8
+ CoordOrigin,
9
+ DocItem,
10
+ DoclingDocument,
11
+ DocumentOrigin,
12
+ GroupLabel,
13
+ ImageRef,
14
+ ProvenanceItem,
15
+ Size,
16
+ TableCell,
17
+ TableData,
18
+ )
19
+ from openpyxl import load_workbook
20
+ from openpyxl.drawing.image import Image
21
+ from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
22
+ from openpyxl.worksheet.worksheet import Worksheet
23
+ from PIL import Image as PILImage
24
+ from pydantic import BaseModel, NonNegativeInt, PositiveInt
25
+ from typing_extensions import override
26
+
27
+ from docling.backend.abstract_backend import (
28
+ DeclarativeDocumentBackend,
29
+ PaginatedDocumentBackend,
30
+ )
31
+ from docling.datamodel.base_models import InputFormat
32
+ from docling.datamodel.document import InputDocument
33
+
34
+ _log = logging.getLogger(__name__)
35
+
36
+
37
class ExcelCell(BaseModel):
    """A single cell extracted from an Excel worksheet.

    Attributes:
        row: Row index of the cell.
        col: Column index of the cell.
        text: Textual content of the cell.
        row_span: How many rows the cell covers (more than one when merged).
        col_span: How many columns the cell covers (more than one when merged).
    """

    # Position of the cell.
    row: int
    col: int

    # Content of the cell rendered as a string.
    text: str

    # Extent of the cell in rows and columns.
    row_span: int
    col_span: int
53
+
54
+
55
class ExcelTable(BaseModel):
    """A rectangular group of cells found on a worksheet.

    Attributes:
        anchor: ``(column, row)`` indices of the table's upper-left cell
            (0-based index).
        num_rows: Number of rows covered by the table.
        num_cols: Number of columns covered by the table.
        data: The cells of the table as a list of ExcelCell objects.
    """

    # (column, row) of the upper-left corner, 0-based.
    anchor: tuple[NonNegativeInt, NonNegativeInt]

    # Table dimensions.
    num_rows: int
    num_cols: int

    # Cells contained in the table.
    data: list[ExcelCell]
70
+
71
+
72
class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
    """Backend for parsing Excel workbooks.

    The backend converts an Excel workbook into a DoclingDocument object.
    Each worksheet is converted into a separate page.
    The following elements are parsed:
    - Cell contents, parsed as tables. If two groups of cells are disconnected
    between each other, they will be parsed as two different tables.
    - Images, parsed as PictureItem objects.

    The DoclingDocument tables and pictures have their provenance information, including
    the position in their original Excel worksheet. The position is represented by a
    bounding box object with the cell indices as units (0-based index). The size of this
    bounding box is the number of columns and rows that the table or picture spans.
    """

    @override
    def __init__(
        self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
    ) -> None:
        """Initialize the MsExcelDocumentBackend object.

        Args:
            in_doc: The input document object.
            path_or_stream: The path or stream to the Excel file.

        Raises:
            RuntimeError: An error occurred parsing the file.
        """
        super().__init__(in_doc, path_or_stream)

        # Initialise the parents for the hierarchy (levels -1 .. max_levels - 1).
        self.max_levels = 10
        self.parents: dict[int, Any] = {
            level: None for level in range(-1, self.max_levels)
        }

        self.workbook = None
        try:
            if isinstance(self.path_or_stream, BytesIO):
                self.workbook = load_workbook(filename=self.path_or_stream)
            elif isinstance(self.path_or_stream, Path):
                self.workbook = load_workbook(filename=str(self.path_or_stream))

            # Any other input type leaves the workbook unset and the backend invalid.
            self.valid = self.workbook is not None
        except Exception as e:
            self.valid = False

            raise RuntimeError(
                f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
            ) from e

    @override
    def is_valid(self) -> bool:
        """Return whether the workbook was loaded during initialization."""
        _log.debug("valid: %s", self.valid)
        return self.valid

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """Return True: every worksheet is exposed as one page."""
        return True

    @override
    def page_count(self) -> int:
        """Return the number of sheets in the workbook, or 0 if it is not loaded."""
        if self.is_valid() and self.workbook:
            return len(self.workbook.sheetnames)
        return 0

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the set of input formats handled by this backend (XLSX only)."""
        return {InputFormat.XLSX}

    @override
    def convert(self) -> DoclingDocument:
        """Parse the Excel workbook into a DoclingDocument object.

        Raises:
            RuntimeError: Unable to run the conversion since the backend object failed to
            initialize.

        Returns:
            The DoclingDocument object representing the Excel workbook.
        """
        # Fail before building any document objects when the backend is unusable.
        if not self.is_valid():
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )

        origin = DocumentOrigin(
            filename=self.file.name or "file.xlsx",
            mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file.xlsx", origin=origin)

        return self._convert_workbook(doc)

    def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
        """Parse the Excel workbook and attach its structure to a DoclingDocument.

        Args:
            doc: A DoclingDocument object.

        Returns:
            A DoclingDocument object with the parsed items.
        """
        if self.workbook is None:
            _log.error("Workbook is not initialized.")
            return doc

        # Iterate over all sheets
        for sheet_name in self.workbook.sheetnames:
            _log.info("Processing sheet: %s", sheet_name)

            sheet = self.workbook[sheet_name]
            page_no = self.workbook.index(sheet) + 1
            # do not rely on sheet.max_column, sheet.max_row if there are images;
            # the page is created empty and resized once its items are known.
            page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))

            self.parents[0] = doc.add_group(
                parent=None,
                label=GroupLabel.SECTION,
                name=f"sheet: {sheet_name}",
            )
            doc = self._convert_sheet(doc, sheet)
            width, height = self._find_page_size(doc, page_no)
            page.size = Size(width=width, height=height)

        return doc

    def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
        """Parse an Excel worksheet and attach its structure to a DoclingDocument.

        Args:
            doc: The DoclingDocument to be updated.
            sheet: The Excel worksheet to be parsed.

        Returns:
            The updated DoclingDocument.
        """
        doc = self._find_tables_in_sheet(doc, sheet)
        doc = self._find_images_in_sheet(doc, sheet)
        return doc

    def _find_tables_in_sheet(
        self, doc: DoclingDocument, sheet: Worksheet
    ) -> DoclingDocument:
        """Find all tables in an Excel sheet and attach them to a DoclingDocument.

        Args:
            doc: The DoclingDocument to be updated.
            sheet: The Excel worksheet to be parsed.

        Returns:
            The updated DoclingDocument.
        """
        if self.workbook is None:
            return doc

        for excel_table in self._find_data_tables(sheet):
            origin_col, origin_row = excel_table.anchor
            num_rows = excel_table.num_rows
            num_cols = excel_table.num_cols

            table_data = TableData(
                num_rows=num_rows,
                num_cols=num_cols,
                table_cells=[],
            )

            for excel_cell in excel_table.data:
                cell = TableCell(
                    text=excel_cell.text,
                    row_span=excel_cell.row_span,
                    col_span=excel_cell.col_span,
                    start_row_offset_idx=excel_cell.row,
                    end_row_offset_idx=excel_cell.row + excel_cell.row_span,
                    start_col_offset_idx=excel_cell.col,
                    end_col_offset_idx=excel_cell.col + excel_cell.col_span,
                    # The first row of every detected table is treated as its header.
                    column_header=excel_cell.row == 0,
                    row_header=False,
                )
                table_data.table_cells.append(cell)

            page_no = self.workbook.index(sheet) + 1
            doc.add_table(
                data=table_data,
                parent=self.parents[0],
                prov=ProvenanceItem(
                    page_no=page_no,
                    charspan=(0, 0),
                    # Bounding box expressed in cell indices (left, top, right, bottom).
                    bbox=BoundingBox.from_tuple(
                        (
                            origin_col,
                            origin_row,
                            origin_col + num_cols,
                            origin_row + num_rows,
                        ),
                        origin=CoordOrigin.TOPLEFT,
                    ),
                ),
            )

        return doc

    def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
        """Find all compact rectangular data tables in an Excel worksheet.

        Args:
            sheet: The Excel worksheet to be parsed.

        Returns:
            A list of ExcelTable objects representing the data tables.
        """
        tables: list[ExcelTable] = []  # List to store found tables
        visited: set[tuple[int, int]] = set()  # Track already visited cells

        # Iterate over all cells in the sheet
        for ri, row in enumerate(sheet.iter_rows(values_only=False)):
            for rj, cell in enumerate(row):
                # Skip empty or already visited cells
                if cell.value is None or (ri, rj) in visited:
                    continue

                # If the cell starts a new table, find its bounds
                table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj)

                visited.update(visited_cells)  # Mark these cells as visited
                tables.append(table_bounds)

        return tables

    def _find_table_bounds(
        self,
        sheet: Worksheet,
        start_row: int,
        start_col: int,
    ) -> tuple[ExcelTable, set[tuple[int, int]]]:
        """Determine the bounds of a compact rectangular table.

        Args:
            sheet: The Excel worksheet to be parsed.
            start_row: The row number of the starting cell (0-based).
            start_col: The column number of the starting cell (0-based).

        Returns:
            A tuple with an Excel table and a set of cell coordinates.
        """
        _log.debug("find_table_bounds")

        max_row = self._find_table_bottom(sheet, start_row, start_col)
        max_col = self._find_table_right(sheet, start_row, start_col)

        # Collect the data within the bounds
        data = []
        visited_cells: set[tuple[int, int]] = set()
        for ri in range(start_row, max_row + 1):
            for rj in range(start_col, max_col + 1):
                # Cells covered by an earlier merged cell are already represented.
                if (ri, rj) in visited_cells:
                    continue

                cell = sheet.cell(row=ri + 1, column=rj + 1)  # 1-based indexing

                # Check if the cell belongs to a merged range
                row_span = 1
                col_span = 1
                for merged_range in sheet.merged_cells.ranges:
                    if (
                        merged_range.min_row <= ri + 1 <= merged_range.max_row
                        and merged_range.min_col <= rj + 1 <= merged_range.max_col
                    ):
                        row_span = merged_range.max_row - merged_range.min_row + 1
                        col_span = merged_range.max_col - merged_range.min_col + 1
                        break

                # NOTE: an empty cell inside the rectangle yields the text "None",
                # because str() is applied to the raw cell value.
                data.append(
                    ExcelCell(
                        row=ri - start_row,
                        col=rj - start_col,
                        text=str(cell.value),
                        row_span=row_span,
                        col_span=col_span,
                    )
                )

                # Mark all cells in the span as visited. This is done only for the
                # cell that was emitted, so that the span is anchored on that cell
                # and never extends past the merged range.
                for span_row in range(ri, ri + row_span):
                    for span_col in range(rj, rj + col_span):
                        visited_cells.add((span_row, span_col))

        return (
            ExcelTable(
                anchor=(start_col, start_row),
                num_rows=max_row + 1 - start_row,
                num_cols=max_col + 1 - start_col,
                data=data,
            ),
            visited_cells,
        )

    def _find_table_bottom(
        self, sheet: Worksheet, start_row: int, start_col: int
    ) -> int:
        """Find the bottom boundary of a table.

        Args:
            sheet: The Excel worksheet to be parsed.
            start_row: The starting row of the table.
            start_col: The starting column of the table.

        Returns:
            The row index representing the bottom boundary of the table.
        """
        max_row: int = start_row

        while max_row < sheet.max_row - 1:
            # Look at the cell directly below the current bottom row
            # (max_row is 0-based, openpyxl is 1-based, hence "+ 2").
            cell = sheet.cell(row=max_row + 2, column=start_col + 1)

            # Check if the cell is part of a merged range
            merged_range = next(
                (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
                None,
            )

            if cell.value is None and not merged_range:
                break  # Stop if the cell is empty and not merged

            # Expand max_row to include the merged range if applicable
            if merged_range:
                max_row = max(max_row, merged_range.max_row - 1)
            else:
                max_row += 1

        return max_row

    def _find_table_right(
        self, sheet: Worksheet, start_row: int, start_col: int
    ) -> int:
        """Find the right boundary of a table.

        Args:
            sheet: The Excel worksheet to be parsed.
            start_row: The starting row of the table.
            start_col: The starting column of the table.

        Returns:
            The column index representing the right boundary of the table.
        """
        max_col: int = start_col

        while max_col < sheet.max_column - 1:
            # Look at the cell directly right of the current last column
            # (max_col is 0-based, openpyxl is 1-based, hence "+ 2").
            cell = sheet.cell(row=start_row + 1, column=max_col + 2)

            # Check if the cell is part of a merged range
            merged_range = next(
                (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
                None,
            )

            if cell.value is None and not merged_range:
                break  # Stop if the cell is empty and not merged

            # Expand max_col to include the merged range if applicable
            if merged_range:
                max_col = max(max_col, merged_range.max_col - 1)
            else:
                max_col += 1

        return max_col

    def _find_images_in_sheet(
        self, doc: DoclingDocument, sheet: Worksheet
    ) -> DoclingDocument:
        """Find images in the Excel sheet and attach them to the DoclingDocument.

        Extraction is best effort: an image that cannot be read is logged and
        skipped, and the remaining images are still processed.

        Args:
            doc: The DoclingDocument to be updated.
            sheet: The Excel worksheet to be parsed.

        Returns:
            The updated DoclingDocument.
        """
        if self.workbook is None:
            return doc

        # Iterate over byte images in the sheet
        for item in sheet._images:  # type: ignore[attr-defined]
            try:
                image: Image = cast(Image, item)
                pil_image = PILImage.open(image.ref)  # type: ignore[arg-type]
                page_no = self.workbook.index(sheet) + 1

                # Only two-cell anchors provide both corners; any other anchor
                # type falls back to an empty bounding box.
                anchor = (0, 0, 0, 0)
                if isinstance(image.anchor, TwoCellAnchor):
                    anchor = (
                        image.anchor._from.col,
                        image.anchor._from.row,
                        image.anchor.to.col + 1,
                        image.anchor.to.row + 1,
                    )

                doc.add_picture(
                    parent=self.parents[0],
                    image=ImageRef.from_pil(image=pil_image, dpi=72),
                    caption=None,
                    prov=ProvenanceItem(
                        page_no=page_no,
                        charspan=(0, 0),
                        bbox=BoundingBox.from_tuple(
                            anchor, origin=CoordOrigin.TOPLEFT
                        ),
                    ),
                )
            except Exception:
                # Catch Exception rather than everything, so that KeyboardInterrupt
                # and SystemExit still propagate; keep the traceback for diagnosis.
                _log.error(
                    "could not extract the image from excel sheets", exc_info=True
                )

        return doc

    @staticmethod
    def _find_page_size(
        doc: DoclingDocument, page_no: PositiveInt
    ) -> tuple[float, float]:
        """Compute the extent covered by all items placed on a page.

        Args:
            doc: The DoclingDocument holding the items.
            page_no: The number of the page to measure.

        Returns:
            A ``(width, height)`` tuple spanning the union of the bounding boxes
            of every provenance entry on the page, or ``(0.0, 0.0)`` when the
            page holds no items.
        """
        boxes = [
            provenance.bbox
            for item, _ in doc.iterate_items(traverse_pictures=True, page_no=page_no)
            if isinstance(item, DocItem)
            for provenance in item.prov
        ]
        if not boxes:
            return (0.0, 0.0)

        left = min(box.l for box in boxes)
        right = max(box.r for box in boxes)
        top = min(box.t for box in boxes)
        bottom = max(box.b for box in boxes)

        return (right - left, bottom - top)
@@ -850,7 +850,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
850
850
  def _handle_pictures(
851
851
  self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
852
852
  ) -> None:
853
- def get_docx_image(drawing_blip):
853
+ def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
854
+ image_data: Optional[bytes] = None
854
855
  rId = drawing_blip[0].get(
855
856
  "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
856
857
  )
@@ -862,19 +863,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
862
863
 
863
864
  level = self._get_level()
864
865
  # Open the BytesIO object with PIL to create an Image
865
- try:
866
- image_data = get_docx_image(drawing_blip)
867
- image_bytes = BytesIO(image_data)
868
- pil_image = Image.open(image_bytes)
869
- doc.add_picture(
870
- parent=self.parents[level - 1],
871
- image=ImageRef.from_pil(image=pil_image, dpi=72),
872
- caption=None,
873
- )
874
- except (UnidentifiedImageError, OSError) as e:
875
- _log.warning("Warning: image cannot be loaded by Pillow")
866
+ image_data: Optional[bytes] = get_docx_image(drawing_blip)
867
+ if image_data is None:
868
+ _log.warning("Warning: image cannot be found")
876
869
  doc.add_picture(
877
870
  parent=self.parents[level - 1],
878
871
  caption=None,
879
872
  )
873
+ else:
874
+ try:
875
+ image_bytes = BytesIO(image_data)
876
+ pil_image = Image.open(image_bytes)
877
+ doc.add_picture(
878
+ parent=self.parents[level - 1],
879
+ image=ImageRef.from_pil(image=pil_image, dpi=72),
880
+ caption=None,
881
+ )
882
+ except (UnidentifiedImageError, OSError) as e:
883
+ _log.warning("Warning: image cannot be loaded by Pillow")
884
+ doc.add_picture(
885
+ parent=self.parents[level - 1],
886
+ caption=None,
887
+ )
880
888
  return
@@ -40,6 +40,7 @@ from docling.datamodel.pipeline_options import (
40
40
  VlmModelType,
41
41
  VlmPipelineOptions,
42
42
  granite_vision_vlm_conversion_options,
43
+ granite_vision_vlm_ollama_conversion_options,
43
44
  smoldocling_vlm_conversion_options,
44
45
  smoldocling_vlm_mlx_conversion_options,
45
46
  )
@@ -153,6 +154,7 @@ def export_documents(
153
154
  output_dir: Path,
154
155
  export_json: bool,
155
156
  export_html: bool,
157
+ export_html_split_page: bool,
156
158
  export_md: bool,
157
159
  export_txt: bool,
158
160
  export_doctags: bool,
@@ -180,7 +182,15 @@ def export_documents(
180
182
  fname = output_dir / f"{doc_filename}.html"
181
183
  _log.info(f"writing HTML output to {fname}")
182
184
  conv_res.document.save_as_html(
183
- filename=fname, image_mode=image_export_mode
185
+ filename=fname, image_mode=image_export_mode, split_page_view=False
186
+ )
187
+
188
+ # Export HTML format:
189
+ if export_html_split_page:
190
+ fname = output_dir / f"{doc_filename}.html"
191
+ _log.info(f"writing HTML output to {fname}")
192
+ conv_res.document.save_as_html(
193
+ filename=fname, image_mode=image_export_mode, split_page_view=True
184
194
  )
185
195
 
186
196
  # Export Text format:
@@ -471,6 +481,7 @@ def convert(
471
481
 
472
482
  export_json = OutputFormat.JSON in to_formats
473
483
  export_html = OutputFormat.HTML in to_formats
484
+ export_html_split_page = OutputFormat.HTML_SPLIT_PAGE in to_formats
474
485
  export_md = OutputFormat.MARKDOWN in to_formats
475
486
  export_txt = OutputFormat.TEXT in to_formats
476
487
  export_doctags = OutputFormat.DOCTAGS in to_formats
@@ -531,10 +542,16 @@ def convert(
531
542
  backend=backend, # pdf_backend
532
543
  )
533
544
  elif pipeline == PdfPipeline.VLM:
534
- pipeline_options = VlmPipelineOptions()
545
+ pipeline_options = VlmPipelineOptions(
546
+ enable_remote_services=enable_remote_services,
547
+ )
535
548
 
536
549
  if vlm_model == VlmModelType.GRANITE_VISION:
537
550
  pipeline_options.vlm_options = granite_vision_vlm_conversion_options
551
+ elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
552
+ pipeline_options.vlm_options = (
553
+ granite_vision_vlm_ollama_conversion_options
554
+ )
538
555
  elif vlm_model == VlmModelType.SMOLDOCLING:
539
556
  pipeline_options.vlm_options = smoldocling_vlm_conversion_options
540
557
  if sys.platform == "darwin":
@@ -578,6 +595,7 @@ def convert(
578
595
  output_dir=output,
579
596
  export_json=export_json,
580
597
  export_html=export_html,
598
+ export_html_split_page=export_html_split_page,
581
599
  export_md=export_md,
582
600
  export_txt=export_txt,
583
601
  export_doctags=export_doctags,