docling 2.29.0__py3-none-any.whl → 2.30.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/msexcel_backend.py +272 -90
- docling/backend/msword_backend.py +20 -12
- docling/cli/main.py +20 -2
- docling/datamodel/base_models.py +33 -0
- docling/datamodel/document.py +7 -0
- docling/datamodel/pipeline_options.py +29 -3
- docling/models/api_vlm_model.py +67 -0
- docling/models/picture_description_api_model.py +8 -75
- docling/models/picture_description_base_model.py +14 -2
- docling/pipeline/standard_pdf_pipeline.py +6 -2
- docling/pipeline/vlm_pipeline.py +27 -17
- docling/utils/api_image_request.py +61 -0
- {docling-2.29.0.dist-info → docling-2.30.0.dist-info}/METADATA +3 -3
- {docling-2.29.0.dist-info → docling-2.30.0.dist-info}/RECORD +17 -15
- {docling-2.29.0.dist-info → docling-2.30.0.dist-info}/LICENSE +0 -0
- {docling-2.29.0.dist-info → docling-2.30.0.dist-info}/WHEEL +0 -0
- {docling-2.29.0.dist-info → docling-2.30.0.dist-info}/entry_points.txt +0 -0
docling/backend/msexcel_backend.py
CHANGED
@@ -1,36 +1,50 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import
+from typing import Any, Union, cast
 
 from docling_core.types.doc import (
+    BoundingBox,
+    CoordOrigin,
+    DocItem,
     DoclingDocument,
     DocumentOrigin,
     GroupLabel,
     ImageRef,
+    ProvenanceItem,
+    Size,
     TableCell,
     TableData,
 )
-
-# from lxml import etree
-from openpyxl import Workbook, load_workbook
-from openpyxl.cell.cell import Cell
+from openpyxl import load_workbook
 from openpyxl.drawing.image import Image
+from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
 from openpyxl.worksheet.worksheet import Worksheet
+from PIL import Image as PILImage
+from pydantic import BaseModel, NonNegativeInt, PositiveInt
+from typing_extensions import override
 
-from docling.backend.abstract_backend import
+from docling.backend.abstract_backend import (
+    DeclarativeDocumentBackend,
+    PaginatedDocumentBackend,
+)
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
 
 _log = logging.getLogger(__name__)
 
-from typing import Any, List
 
-
-
+class ExcelCell(BaseModel):
+    """Represents an Excel cell.
 
+    Attributes:
+        row: The row number of the cell.
+        col: The column number of the cell.
+        text: The text content of the cell.
+        row_span: The number of rows the cell spans.
+        col_span: The number of columns the cell spans.
+    """
 
-class ExcelCell(BaseModel):
     row: int
     col: int
     text: str
@@ -39,19 +53,57 @@ class ExcelCell(BaseModel):
 
 
 class ExcelTable(BaseModel):
+    """Represents an Excel table on a worksheet.
+
+    Attributes:
+        anchor: The column and row indices of the upper-left cell of the table
+            (0-based index).
+        num_rows: The number of rows in the table.
+        num_cols: The number of columns in the table.
+        data: The data in the table, represented as a list of ExcelCell objects.
+    """
+
+    anchor: tuple[NonNegativeInt, NonNegativeInt]
     num_rows: int
     num_cols: int
-    data:
+    data: list[ExcelCell]
 
 
-class MsExcelDocumentBackend(DeclarativeDocumentBackend):
-
+class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
+    """Backend for parsing Excel workbooks.
+
+    The backend converts an Excel workbook into a DoclingDocument object.
+    Each worksheet is converted into a separate page.
+    The following elements are parsed:
+    - Cell contents, parsed as tables. If two groups of cells are disconnected
+      between each other, they will be parsed as two different tables.
+    - Images, parsed as PictureItem objects.
+
+    The DoclingDocument tables and pictures have their provenance information, including
+    the position in their original Excel worksheet. The position is represented by a
+    bounding box object with the cell indices as units (0-based index). The size of this
+    bounding box is the number of columns and rows that the table or picture spans.
+    """
+
+    @override
+    def __init__(
+        self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
+    ) -> None:
+        """Initialize the MsExcelDocumentBackend object.
+
+        Parameters:
+            in_doc: The input document object.
+            path_or_stream: The path or stream to the Excel file.
+
+        Raises:
+            RuntimeError: An error occurred parsing the file.
+        """
         super().__init__(in_doc, path_or_stream)
 
         # Initialise the parents for the hierarchy
         self.max_levels = 10
 
-        self.parents:
+        self.parents: dict[int, Any] = {}
        for i in range(-1, self.max_levels):
            self.parents[i] = None
 
@@ -63,35 +115,47 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
            elif isinstance(self.path_or_stream, Path):
                self.workbook = load_workbook(filename=str(self.path_or_stream))
 
-            self.valid =
+            self.valid = self.workbook is not None
        except Exception as e:
            self.valid = False
 
            raise RuntimeError(
-                f"
+                f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
            ) from e
 
+    @override
    def is_valid(self) -> bool:
-        _log.
+        _log.debug(f"valid: {self.valid}")
        return self.valid
 
    @classmethod
+    @override
    def supports_pagination(cls) -> bool:
        return True
 
-
-
-
-
-
+    @override
+    def page_count(self) -> int:
+        if self.is_valid() and self.workbook:
+            return len(self.workbook.sheetnames)
+        else:
+            return 0
 
    @classmethod
-
+    @override
+    def supported_formats(cls) -> set[InputFormat]:
        return {InputFormat.XLSX}
 
+    @override
    def convert(self) -> DoclingDocument:
-
+        """Parse the Excel workbook into a DoclingDocument object.
 
+        Raises:
+            RuntimeError: Unable to run the conversion since the backend object failed to
+                initialize.
+
+        Returns:
+            The DoclingDocument object representing the Excel workbook.
+        """
        origin = DocumentOrigin(
            filename=self.file.name or "file.xlsx",
            mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
@@ -110,6 +174,14 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
        return doc
 
    def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
+        """Parse the Excel workbook and attach its structure to a DoclingDocument.
+
+        Args:
+            doc: A DoclingDocument object.
+
+        Returns:
+            A DoclingDocument object with the parsed items.
+        """
 
        if self.workbook is not None:
 
@@ -117,22 +189,34 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
            for sheet_name in self.workbook.sheetnames:
                _log.info(f"Processing sheet: {sheet_name}")
 
-                # Access the sheet by name
                sheet = self.workbook[sheet_name]
+                page_no = self.workbook.index(sheet) + 1
+                # do not rely on sheet.max_column, sheet.max_row if there are images
+                page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))
 
                self.parents[0] = doc.add_group(
                    parent=None,
                    label=GroupLabel.SECTION,
                    name=f"sheet: {sheet_name}",
                )
-
                doc = self._convert_sheet(doc, sheet)
+                width, height = self._find_page_size(doc, page_no)
+                page.size = Size(width=width, height=height)
        else:
            _log.error("Workbook is not initialized.")
 
        return doc
 
-    def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
+    def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
+        """Parse an Excel worksheet and attach its structure to a DoclingDocument
+
+        Args:
+            doc: The DoclingDocument to be updated.
+            sheet: The Excel worksheet to be parsed.
+
+        Returns:
+            The updated DoclingDocument.
+        """
 
        doc = self._find_tables_in_sheet(doc, sheet)
 
@@ -140,47 +224,81 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
 
        return doc
 
-    def _find_tables_in_sheet(
-
-
+    def _find_tables_in_sheet(
+        self, doc: DoclingDocument, sheet: Worksheet
+    ) -> DoclingDocument:
+        """Find all tables in an Excel sheet and attach them to a DoclingDocument.
 
-
-
-
+        Args:
+            doc: The DoclingDocument to be updated.
+            sheet: The Excel worksheet to be parsed.
 
-
-
-
-                table_cells=[],
-            )
+        Returns:
+            The updated DoclingDocument.
+        """
 
-
-
-
-
-
-
-
-
-
-
-
-
+        if self.workbook is not None:
+            tables = self._find_data_tables(sheet)
+
+            for excel_table in tables:
+                origin_col = excel_table.anchor[0]
+                origin_row = excel_table.anchor[1]
+                num_rows = excel_table.num_rows
+                num_cols = excel_table.num_cols
+
+                table_data = TableData(
+                    num_rows=num_rows,
+                    num_cols=num_cols,
+                    table_cells=[],
                )
-                table_data.table_cells.append(cell)
 
-
+                for excel_cell in excel_table.data:
+
+                    cell = TableCell(
+                        text=excel_cell.text,
+                        row_span=excel_cell.row_span,
+                        col_span=excel_cell.col_span,
+                        start_row_offset_idx=excel_cell.row,
+                        end_row_offset_idx=excel_cell.row + excel_cell.row_span,
+                        start_col_offset_idx=excel_cell.col,
+                        end_col_offset_idx=excel_cell.col + excel_cell.col_span,
+                        column_header=excel_cell.row == 0,
+                        row_header=False,
+                    )
+                    table_data.table_cells.append(cell)
+
+                page_no = self.workbook.index(sheet) + 1
+                doc.add_table(
+                    data=table_data,
+                    parent=self.parents[0],
+                    prov=ProvenanceItem(
+                        page_no=page_no,
+                        charspan=(0, 0),
+                        bbox=BoundingBox.from_tuple(
+                            (
+                                origin_col,
+                                origin_row,
+                                origin_col + num_cols,
+                                origin_row + num_rows,
+                            ),
+                            origin=CoordOrigin.TOPLEFT,
+                        ),
+                    ),
+                )
 
        return doc
 
-    def _find_data_tables(self, sheet: Worksheet) ->
-        """
-
-
-
+    def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
+        """Find all compact rectangular data tables in an Excel worksheet.
+
+        Args:
+            sheet: The Excel worksheet to be parsed.
 
-
-
+        Returns:
+            A list of ExcelTable objects representing the data tables.
+        """
+        tables: list[ExcelTable] = []  # List to store found tables
+        visited: set[tuple[int, int]] = set()  # Track already visited cells
 
        # Iterate over all cells in the sheet
        for ri, row in enumerate(sheet.iter_rows(values_only=False)):
@@ -191,9 +309,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
                    continue
 
                # If the cell starts a new table, find its bounds
-                table_bounds, visited_cells = self._find_table_bounds(
-                    sheet, ri, rj, visited
-                )
+                table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj)
 
                visited.update(visited_cells)  # Mark these cells as visited
                tables.append(table_bounds)
@@ -205,22 +321,25 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
        sheet: Worksheet,
        start_row: int,
        start_col: int,
-
-
-
-
+    ) -> tuple[ExcelTable, set[tuple[int, int]]]:
+        """Determine the bounds of a compact rectangular table.
+
+        Args:
+            sheet: The Excel worksheet to be parsed.
+            start_row: The row number of the starting cell.
+            start_col: The column number of the starting cell.
+
        Returns:
-
-        - A set of visited cell coordinates.
+            A tuple with an Excel table and a set of cell coordinates.
        """
-        _log.
+        _log.debug("find_table_bounds")
 
        max_row = self._find_table_bottom(sheet, start_row, start_col)
        max_col = self._find_table_right(sheet, start_row, start_col)
 
        # Collect the data within the bounds
        data = []
-        visited_cells = set()
+        visited_cells: set[tuple[int, int]] = set()
        for ri in range(start_row, max_row + 1):
            for rj in range(start_col, max_col + 1):
 
@@ -230,7 +349,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
                row_span = 1
                col_span = 1
 
-                # _log.info(sheet.merged_cells.ranges)
                for merged_range in sheet.merged_cells.ranges:
 
                    if (
@@ -254,7 +372,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
                        col_span=col_span,
                    )
                )
-                # _log.info(f"cell: {ri}, {rj} -> {ri - start_row}, {rj - start_col}, {row_span}, {col_span}: {str(cell.value)}")
 
                # Mark all cells in the span as visited
                for span_row in range(ri, ri + row_span):
@@ -263,6 +380,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
 
        return (
            ExcelTable(
+                anchor=(start_col, start_row),
                num_rows=max_row + 1 - start_row,
                num_cols=max_col + 1 - start_col,
                data=data,
@@ -270,10 +388,20 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
            visited_cells,
        )
 
-    def _find_table_bottom(
-
+    def _find_table_bottom(
+        self, sheet: Worksheet, start_row: int, start_col: int
+    ) -> int:
+        """Find the bottom boundary of a table.
 
-
+        Args:
+            sheet: The Excel worksheet to be parsed.
+            start_row: The starting row of the table.
+            start_col: The starting column of the table.
+
+        Returns:
+            The row index representing the bottom boundary of the table.
+        """
+        max_row: int = start_row
 
        while max_row < sheet.max_row - 1:
            # Get the cell value or check if it is part of a merged cell
@@ -296,10 +424,20 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
 
        return max_row
 
-    def _find_table_right(
-
+    def _find_table_right(
+        self, sheet: Worksheet, start_row: int, start_col: int
+    ) -> int:
+        """Find the right boundary of a table.
+
+        Args:
+            sheet: The Excel worksheet to be parsed.
+            start_row: The starting row of the table.
+            start_col: The starting column of the table.
 
-
+        Returns:
+            The column index representing the right boundary of the table."
+        """
+        max_col: int = start_col
 
        while max_col < sheet.max_column - 1:
            # Get the cell value or check if it is part of a merged cell
@@ -325,19 +463,63 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
    def _find_images_in_sheet(
        self, doc: DoclingDocument, sheet: Worksheet
    ) -> DoclingDocument:
+        """Find images in the Excel sheet and attach them to the DoclingDocument.
 
-
-
+        Args:
+            doc: The DoclingDocument to be updated.
+            sheet: The Excel worksheet to be parsed.
 
-
-
-
-
-
-
-
-
-
-
+        Returns:
+            The updated DoclingDocument.
+        """
+        if self.workbook is not None:
+            # Iterate over byte images in the sheet
+            for item in sheet._images:  # type: ignore[attr-defined]
+                try:
+                    image: Image = cast(Image, item)
+                    pil_image = PILImage.open(image.ref)  # type: ignore[arg-type]
+                    page_no = self.workbook.index(sheet) + 1
+                    anchor = (0, 0, 0, 0)
+                    if isinstance(image.anchor, TwoCellAnchor):
+                        anchor = (
+                            image.anchor._from.col,
+                            image.anchor._from.row,
+                            image.anchor.to.col + 1,
+                            image.anchor.to.row + 1,
+                        )
+                    doc.add_picture(
+                        parent=self.parents[0],
+                        image=ImageRef.from_pil(image=pil_image, dpi=72),
+                        caption=None,
+                        prov=ProvenanceItem(
+                            page_no=page_no,
+                            charspan=(0, 0),
+                            bbox=BoundingBox.from_tuple(
+                                anchor, origin=CoordOrigin.TOPLEFT
+                            ),
+                        ),
+                    )
+                except:
+                    _log.error("could not extract the image from excel sheets")
 
        return doc
+
+    @staticmethod
+    def _find_page_size(
+        doc: DoclingDocument, page_no: PositiveInt
+    ) -> tuple[float, float]:
+        left: float = -1.0
+        top: float = -1.0
+        right: float = -1.0
+        bottom: float = -1.0
+        for item, _ in doc.iterate_items(traverse_pictures=True, page_no=page_no):
+            if not isinstance(item, DocItem):
+                continue
+            for provenance in item.prov:
+                bbox = provenance.bbox
+                left = min(left, bbox.l) if left != -1 else bbox.l
+                right = max(right, bbox.r) if right != -1 else bbox.r
+                top = min(top, bbox.t) if top != -1 else bbox.t
+                bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
+
+        return (right - left, bottom - top)
docling/backend/msword_backend.py
CHANGED
@@ -850,7 +850,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def _handle_pictures(
        self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
    ) -> None:
-        def get_docx_image(drawing_blip):
+        def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
+            image_data: Optional[bytes] = None
            rId = drawing_blip[0].get(
                "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
            )
@@ -862,19 +863,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
 
        level = self._get_level()
        # Open the BytesIO object with PIL to create an Image
-
-
-
-            pil_image = Image.open(image_bytes)
-            doc.add_picture(
-                parent=self.parents[level - 1],
-                image=ImageRef.from_pil(image=pil_image, dpi=72),
-                caption=None,
-            )
-        except (UnidentifiedImageError, OSError) as e:
-            _log.warning("Warning: image cannot be loaded by Pillow")
+        image_data: Optional[bytes] = get_docx_image(drawing_blip)
+        if image_data is None:
+            _log.warning("Warning: image cannot be found")
            doc.add_picture(
                parent=self.parents[level - 1],
                caption=None,
            )
+        else:
+            try:
+                image_bytes = BytesIO(image_data)
+                pil_image = Image.open(image_bytes)
+                doc.add_picture(
+                    parent=self.parents[level - 1],
+                    image=ImageRef.from_pil(image=pil_image, dpi=72),
+                    caption=None,
+                )
+            except (UnidentifiedImageError, OSError) as e:
+                _log.warning("Warning: image cannot be loaded by Pillow")
+                doc.add_picture(
+                    parent=self.parents[level - 1],
+                    caption=None,
+                )
        return
docling/cli/main.py
CHANGED
@@ -40,6 +40,7 @@ from docling.datamodel.pipeline_options import (
    VlmModelType,
    VlmPipelineOptions,
    granite_vision_vlm_conversion_options,
+    granite_vision_vlm_ollama_conversion_options,
    smoldocling_vlm_conversion_options,
    smoldocling_vlm_mlx_conversion_options,
)
@@ -153,6 +154,7 @@ def export_documents(
    output_dir: Path,
    export_json: bool,
    export_html: bool,
+    export_html_split_page: bool,
    export_md: bool,
    export_txt: bool,
    export_doctags: bool,
@@ -180,7 +182,15 @@ def export_documents(
                fname = output_dir / f"{doc_filename}.html"
                _log.info(f"writing HTML output to {fname}")
                conv_res.document.save_as_html(
-                    filename=fname, image_mode=image_export_mode
+                    filename=fname, image_mode=image_export_mode, split_page_view=False
+                )
+
+            # Export HTML format:
+            if export_html_split_page:
+                fname = output_dir / f"{doc_filename}.html"
+                _log.info(f"writing HTML output to {fname}")
+                conv_res.document.save_as_html(
+                    filename=fname, image_mode=image_export_mode, split_page_view=True
                )
 
            # Export Text format:
@@ -471,6 +481,7 @@ def convert(
 
    export_json = OutputFormat.JSON in to_formats
    export_html = OutputFormat.HTML in to_formats
+    export_html_split_page = OutputFormat.HTML_SPLIT_PAGE in to_formats
    export_md = OutputFormat.MARKDOWN in to_formats
    export_txt = OutputFormat.TEXT in to_formats
    export_doctags = OutputFormat.DOCTAGS in to_formats
@@ -531,10 +542,16 @@ def convert(
            backend=backend,  # pdf_backend
        )
    elif pipeline == PdfPipeline.VLM:
-        pipeline_options = VlmPipelineOptions(
+        pipeline_options = VlmPipelineOptions(
+            enable_remote_services=enable_remote_services,
+        )
 
        if vlm_model == VlmModelType.GRANITE_VISION:
            pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+        elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
+            pipeline_options.vlm_options = (
+                granite_vision_vlm_ollama_conversion_options
+            )
        elif vlm_model == VlmModelType.SMOLDOCLING:
            pipeline_options.vlm_options = smoldocling_vlm_conversion_options
            if sys.platform == "darwin":
@@ -578,6 +595,7 @@ def convert(
        output_dir=output,
        export_json=export_json,
        export_html=export_html,
+        export_html_split_page=export_html_split_page,
        export_md=export_md,
        export_txt=export_txt,
        export_doctags=export_doctags,
docling/datamodel/base_models.py
CHANGED
@@ -50,6 +50,7 @@ class OutputFormat(str, Enum):
    MARKDOWN = "md"
    JSON = "json"
    HTML = "html"
+    HTML_SPLIT_PAGE = "html_split_page"
    TEXT = "text"
    DOCTAGS = "doctags"
 
@@ -262,3 +263,35 @@ class Page(BaseModel):
    @property
    def image(self) -> Optional[Image]:
        return self.get_image(scale=self._default_image_scale)
+
+
+## OpenAI API Request / Response Models ##
+
+
+class OpenAiChatMessage(BaseModel):
+    role: str
+    content: str
+
+
+class OpenAiResponseChoice(BaseModel):
+    index: int
+    message: OpenAiChatMessage
+    finish_reason: str
+
+
+class OpenAiResponseUsage(BaseModel):
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+
+
+class OpenAiApiResponse(BaseModel):
+    model_config = ConfigDict(
+        protected_namespaces=(),
+    )
+
+    id: str
+    model: Optional[str] = None  # returned by openai
+    choices: List[OpenAiResponseChoice]
+    created: int
+    usage: OpenAiResponseUsage
docling/datamodel/document.py
CHANGED
@@ -283,6 +283,13 @@ class _DocumentConversionInput(BaseModel):
            if mime is None:  # must guess from
                with obj.open("rb") as f:
                    content = f.read(1024)  # Read first 1KB
+            if mime is not None and mime.lower() == "application/zip":
+                if obj.suffixes[-1].lower() == ".xlsx":
+                    mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+                elif obj.suffixes[-1].lower() == ".docx":
+                    mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+                elif obj.suffixes[-1].lower() == ".pptx":
+                    mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
 
        elif isinstance(obj, DocumentStream):
            content = obj.stream.read(8192)
docling/datamodel/pipeline_options.py
CHANGED
@@ -213,8 +213,8 @@ class PictureDescriptionBaseOptions(BaseOptions):
    batch_size: int = 8
    scale: float = 2
 
-
-    0.
+    picture_area_threshold: float = (
+        0.05  # percentage of the area for a picture to processed with the models
    )
 
 
@@ -266,6 +266,7 @@ class ResponseFormat(str, Enum):
class InferenceFramework(str, Enum):
    MLX = "mlx"
    TRANSFORMERS = "transformers"
+    OPENAI = "openai"
 
 
class HuggingFaceVlmOptions(BaseVlmOptions):
@@ -284,6 +285,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
        return self.repo_id.replace("/", "--")
 
 
+class ApiVlmOptions(BaseVlmOptions):
+    kind: Literal["api_model_options"] = "api_model_options"
+
+    url: AnyUrl = AnyUrl(
+        "http://localhost:11434/v1/chat/completions"
+    )  # Default to ollama
+    headers: Dict[str, str] = {}
+    params: Dict[str, Any] = {}
+    scale: float = 2.0
+    timeout: float = 60
+    response_format: ResponseFormat
+
+
smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
    prompt="Convert this page to docling.",
@@ -307,10 +321,20 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
    inference_framework=InferenceFramework.TRANSFORMERS,
)
 
+granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
+    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
+    params={"model": "granite3.2-vision:2b"},
+    prompt="OCR the full page to markdown.",
+    scale=1.0,
+    timeout=120,
+    response_format=ResponseFormat.MARKDOWN,
+)
+
 
class VlmModelType(str, Enum):
    SMOLDOCLING = "smoldocling"
    GRANITE_VISION = "granite_vision"
+    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
 
 
# Define an enum for the backend options
@@ -362,7 +386,9 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
        False  # (To be used with vlms, or other generative models)
    )
    # If True, text from backend will be used instead of generated text
-    vlm_options: Union[HuggingFaceVlmOptions] =
+    vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
+        smoldocling_vlm_conversion_options
+    )
 
 
class PdfPipelineOptions(PaginatedPipelineOptions):
docling/models/api_vlm_model.py
ADDED
@@ -0,0 +1,67 @@
+from typing import Iterable
+
+from docling.datamodel.base_models import Page, VlmPrediction
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import ApiVlmOptions
+from docling.exceptions import OperationNotAllowed
+from docling.models.base_model import BasePageModel
+from docling.utils.api_image_request import api_image_request
+from docling.utils.profiling import TimeRecorder
+
+
+class ApiVlmModel(BasePageModel):
+
+    def __init__(
+        self,
+        enabled: bool,
+        enable_remote_services: bool,
+        vlm_options: ApiVlmOptions,
+    ):
+        self.enabled = enabled
+        self.vlm_options = vlm_options
+        if self.enabled:
+            if not enable_remote_services:
+                raise OperationNotAllowed(
+                    "Connections to remote services is only allowed when set explicitly. "
+                    "pipeline_options.enable_remote_services=True, or using the CLI "
+                    "--enable-remote-services."
+                )
+
+            self.timeout = self.vlm_options.timeout
+            self.prompt_content = (
+                f"This is a page from a document.\n{self.vlm_options.prompt}"
+            )
+            self.params = {
+                **self.vlm_options.params,
+                "temperature": 0,
+            }
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "vlm"):
+                    assert page.size is not None
+
+                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    assert hi_res_image is not None
+                    if hi_res_image:
+                        if hi_res_image.mode != "RGB":
+                            hi_res_image = hi_res_image.convert("RGB")
+
+                    page_tags = api_image_request(
+                        image=hi_res_image,
+                        prompt=self.prompt_content,
+                        url=self.vlm_options.url,
+                        timeout=self.timeout,
+                        headers=self.vlm_options.headers,
+                        **self.params,
+                    )
+
+                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
+
+                yield page
docling/models/picture_description_api_model.py
CHANGED
@@ -1,12 +1,7 @@
-import base64
-import io
-import logging
 from pathlib import Path
-from typing import Iterable,
+from typing import Iterable, Optional, Type, Union
 
-import requests
 from PIL import Image
-from pydantic import BaseModel, ConfigDict
 
 from docling.datamodel.pipeline_options import (
    AcceleratorOptions,
@@ -15,37 +10,7 @@ from docling.datamodel.pipeline_options import (
)
from docling.exceptions import OperationNotAllowed
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
-
-_log = logging.getLogger(__name__)
-
-
-class ChatMessage(BaseModel):
-    role: str
-    content: str
-
-
-class ResponseChoice(BaseModel):
-    index: int
-    message: ChatMessage
-    finish_reason: str
-
-
-class ResponseUsage(BaseModel):
-    prompt_tokens: int
-    completion_tokens: int
-    total_tokens: int
-
-
-class ApiResponse(BaseModel):
-    model_config = ConfigDict(
-        protected_namespaces=(),
-    )
-
-    id: str
-    model: Optional[str] = None  # returned by openai
-    choices: List[ResponseChoice]
-    created: int
-    usage: ResponseUsage
+from docling.utils.api_image_request import api_image_request
 
 
class PictureDescriptionApiModel(PictureDescriptionBaseModel):
@@ -83,43 +48,11 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
        # Note: technically we could make a batch request here,
        # but not all APIs will allow for it. For example, vllm won't allow more than 1.
        for image in images:
-
-
-
-
-            messages = [
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": self.options.prompt,
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:image/png;base64,{image_base64}"
-                            },
-                        },
-                    ],
-                }
-            ]
-
-            payload = {
-                "messages": messages,
-                **self.options.params,
-            }
-
-            r = requests.post(
-                str(self.options.url),
-                headers=self.options.headers,
-                json=payload,
+            yield api_image_request(
+                image=image,
+                prompt=self.options.prompt,
+                url=self.options.url,
                timeout=self.options.timeout,
+                headers=self.options.headers,
+                **self.options.params,
            )
-            if not r.ok:
-                _log.error(f"Error calling the API. Reponse was {r.text}")
-            r.raise_for_status()
-
-            api_resp = ApiResponse.model_validate_json(r.text)
-            generated_text = api_resp.choices[0].message.content.strip()
-            yield generated_text
docling/models/picture_description_base_model.py
CHANGED
@@ -63,8 +63,20 @@ class PictureDescriptionBaseModel(
        elements: List[PictureItem] = []
        for el in element_batch:
            assert isinstance(el.item, PictureItem)
-
-
+            describe_image = True
+            # Don't describe the image if it's smaller than the threshold
+            if len(el.item.prov) > 0:
+                prov = el.item.prov[0]  # PictureItems have at most a single provenance
+                page = doc.pages.get(prov.page_no)
+                if page is not None:
+                    page_area = page.size.width * page.size.height
+                    if page_area > 0:
+                        area_fraction = prov.bbox.area() / page_area
+                        if area_fraction < self.options.picture_area_threshold:
+                            describe_image = False
+            if describe_image:
+                elements.append(el.item)
+                images.append(el.image)
 
        outputs = self._annotate_images(images)
 
docling/pipeline/standard_pdf_pipeline.py
CHANGED
@@ -2,7 +2,7 @@ import logging
 import sys
 import warnings
 from pathlib import Path
-from typing import Optional
+from typing import Optional, cast
 
 from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
 
@@ -226,7 +226,11 @@ class StandardPdfPipeline(PaginatedPipeline):
                    and self.pipeline_options.generate_table_images
                ):
                    page_ix = element.prov[0].page_no - 1
-                    page =
+                    page = next(
+                        (p for p in conv_res.pages if p.page_no == page_ix),
+                        cast("Page", None),
+                    )
+                    assert page is not None
                    assert page.size is not None
                    assert page.image is not None
 
docling/pipeline/vlm_pipeline.py
CHANGED
@@ -15,11 +15,14 @@ from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import (
+    ApiVlmOptions,
+    HuggingFaceVlmOptions,
    InferenceFramework,
    ResponseFormat,
    VlmPipelineOptions,
)
from docling.datamodel.settings import settings
+from docling.models.api_vlm_model import ApiVlmModel
from docling.models.hf_mlx_model import HuggingFaceMlxModel
from docling.models.hf_vlm_model import HuggingFaceVlmModel
from docling.pipeline.base_pipeline import PaginatedPipeline
@@ -57,27 +60,34 @@ class VlmPipeline(PaginatedPipeline):
 
        self.keep_images = self.pipeline_options.generate_page_images
 
-        if (
-            self.pipeline_options.vlm_options.inference_framework
-            == InferenceFramework.MLX
-        ):
+        if isinstance(pipeline_options.vlm_options, ApiVlmOptions):
            self.build_pipe = [
-
+                ApiVlmModel(
                    enabled=True,  # must be always enabled for this pipeline to make sense.
-
-
-                    vlm_options=self.pipeline_options.vlm_options,
-                ),
-            ]
-        else:
-            self.build_pipe = [
-                HuggingFaceVlmModel(
-                    enabled=True,  # must be always enabled for this pipeline to make sense.
-                    artifacts_path=artifacts_path,
-                    accelerator_options=pipeline_options.accelerator_options,
-                    vlm_options=self.pipeline_options.vlm_options,
+                    enable_remote_services=self.pipeline_options.enable_remote_services,
+                    vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
                ),
            ]
+        elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
+            vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options)
+            if vlm_options.inference_framework == InferenceFramework.MLX:
+                self.build_pipe = [
+                    HuggingFaceMlxModel(
+                        enabled=True,  # must be always enabled for this pipeline to make sense.
+                        artifacts_path=artifacts_path,
+                        accelerator_options=pipeline_options.accelerator_options,
+                        vlm_options=vlm_options,
+                    ),
+                ]
+            else:
+                self.build_pipe = [
+                    HuggingFaceVlmModel(
+                        enabled=True,  # must be always enabled for this pipeline to make sense.
+                        artifacts_path=artifacts_path,
+                        accelerator_options=pipeline_options.accelerator_options,
+                        vlm_options=vlm_options,
+                    ),
+                ]
 
        self.enrichment_pipe = [
            # Other models working on `NodeItem` elements in the DoclingDocument
docling/utils/api_image_request.py
ADDED
@@ -0,0 +1,61 @@
+import base64
+import logging
+from io import BytesIO
+from typing import Dict, Optional
+
+import requests
+from PIL import Image
+from pydantic import AnyUrl
+
+from docling.datamodel.base_models import OpenAiApiResponse
+
+_log = logging.getLogger(__name__)
+
+
+def api_image_request(
+    image: Image.Image,
+    prompt: str,
+    url: AnyUrl,
+    timeout: float = 20,
+    headers: Optional[Dict[str, str]] = None,
+    **params,
+) -> str:
+    img_io = BytesIO()
+    image.save(img_io, "PNG")
+    image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
+                },
+                {
+                    "type": "text",
+                    "text": prompt,
+                },
+            ],
+        }
+    ]
+
+    payload = {
+        "messages": messages,
+        **params,
+    }
+
+    headers = headers or {}
+
+    r = requests.post(
+        str(url),
+        headers=headers,
+        json=payload,
+        timeout=timeout,
+    )
+    if not r.ok:
+        _log.error(f"Error calling the API. Response was {r.text}")
+    r.raise_for_status()
+
+    api_resp = OpenAiApiResponse.model_validate_json(r.text)
+    generated_text = api_resp.choices[0].message.content.strip()
+    return generated_text
{docling-2.29.0.dist-info → docling-2.30.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.
+Version: 2.30.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/docling-project/docling
 License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: vlm
 Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
-Requires-Dist: docling-core[chunking] (>=2.
+Requires-Dist: docling-core[chunking] (>=2.26.0,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
 Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
@@ -58,7 +58,7 @@ Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
 Requires-Dist: tqdm (>=4.65.0,<5.0.0)
 Requires-Dist: transformers (>=4.42.0,<4.43.0) ; (sys_platform == "darwin" and platform_machine == "x86_64") and (extra == "vlm")
 Requires-Dist: transformers (>=4.46.0,<5.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
-Requires-Dist: typer (>=0.12.5,<0.
+Requires-Dist: typer (>=0.12.5,<0.16.0)
 Project-URL: Repository, https://github.com/docling-project/docling
 Description-Content-Type: text/markdown
 
{docling-2.29.0.dist-info → docling-2.30.0.dist-info}/RECORD
CHANGED
@@ -14,9 +14,9 @@ docling/backend/html_backend.py,sha256=ghPLZfdBEPBzLIO9IWzzx0t1Os9B9r4VyGyEZtMsZ
 docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
 docling/backend/md_backend.py,sha256=lqDiKIBHGsA0u-H1n9oVpPlrcpVT4gYRuNXXcyGlftM,17219
-docling/backend/msexcel_backend.py,sha256=
+docling/backend/msexcel_backend.py,sha256=KRPoHRDv-mqko9RUHGQCzdRrvDo7g7zSU2Z5zoL_Hzo,18106
 docling/backend/mspowerpoint_backend.py,sha256=X55-1anXm562wxAuYn5uwQkqKjirmgrn1KfbeaKUbXw,17273
-docling/backend/msword_backend.py,sha256=
+docling/backend/msword_backend.py,sha256=CgNPjU8SQ7rkAYH_BGiUyv568MGhoH3R0M39WBT8gkc,32468
 docling/backend/pdf_backend.py,sha256=odWb1rxk3WCUIEJMhq-dYFNUQ1pSDuNHbU9wlTZIRAs,2211
 docling/backend/pypdfium2_backend.py,sha256=wRwhA5XHRqL7vyNhCAHM6P-ONkwtyjKG9LgC4NJ-4i8,10784
 docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -24,17 +24,18 @@ docling/backend/xml/jats_backend.py,sha256=HXailrDjiwu4swwFnXy3lNfRtLZmkBBp4yqaf
 docling/backend/xml/uspto_backend.py,sha256=H0jwIt2skOke_yEUk0wfXCtodrB-hrj2ygLtB3jMWaI,71056
 docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=
+docling/cli/main.py,sha256=TD-cEf4giuk1O5NPoB-heXHHteUqKoLsj4Rg4xsBUrs,26119
 docling/cli/models.py,sha256=tM_qbMM3YOPxFU7JlME96MLbtd1CX_bOAK7FS-NhJvY,3979
 docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/datamodel/base_models.py,sha256=
-docling/datamodel/document.py,sha256=
-docling/datamodel/pipeline_options.py,sha256=
+docling/datamodel/base_models.py,sha256=fJfFMaHXc-CUrAVfhPF8lKrdb-gaXr2tohx6dHldvRU,7926
+docling/datamodel/document.py,sha256=V0iK1MYOkPIzd4eQa-G8unp-t01fktlG9wwQ1IwE6Zg,15109
+docling/datamodel/pipeline_options.py,sha256=iGLijZR-YOtmg0RQs59pqoG_1uGsDYbg5wMDD0FWYx4,13351
 docling/datamodel/settings.py,sha256=bNMdowIKv7RUchabQTo4rFNEsxfB6pGg2LoZSY634zo,1869
 docling/document_converter.py,sha256=LCX92FzgmXNJLFVSQfjqH9SGe3zA7FGwARedSigFIpY,13798
 docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+docling/models/api_vlm_model.py,sha256=6SxMsFPf0SbT365P67KspdpF3TXZSeu5kmPE3lXAhW4,2470
 docling/models/base_model.py,sha256=9xJ0VIlpR2BzqoEWMC8LYp5Y96QAEKip4b_HCwCDltY,2931
 docling/models/base_ocr_model.py,sha256=xvKMhE4ZOGkL2GAhpDvrAHLLFps3ZUfxXZ5ctL1lXUw,7226
 docling/models/code_formula_model.py,sha256=mOu5luYMzyrCCr8MRGOciNcSvULpQysDd_FXn96WPc8,11477
@@ -50,8 +51,8 @@ docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3
 docling/models/ocr_mac_model.py,sha256=2pZaUWg19go_u88mKWr5y_52PAYEN__GsbyUYLdY4zo,5353
 docling/models/page_assemble_model.py,sha256=ivkCdbZJpFcGl7CazLegcP1tLK8ZixDfVhQXqsdW_UA,6359
 docling/models/page_preprocessing_model.py,sha256=Ja7RE1K-2fWxWrxOzNm6QDSGqFf-MY6_uY5OAZ7AQSo,3078
-docling/models/picture_description_api_model.py,sha256=
-docling/models/picture_description_base_model.py,sha256=
+docling/models/picture_description_api_model.py,sha256=DowWOU93MXAjj3N1A9ex88Sa3Nic2c3dfoOYir5jZEA,2064
+docling/models/picture_description_base_model.py,sha256=khuhQZDAZemZMe4BsrBMpjEwkY3nhMFXuczjQpSQrVY,2971
 docling/models/picture_description_vlm_model.py,sha256=I2Un3vfhQVeWEyZ3Sd3Kygw9la2QSZCwDfl_7XVlMm4,4042
 docling/models/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/plugins/defaults.py,sha256=qslXGnRX07Z3GGttNriqaox0v0vXp4zs4KLurHCZjp4,858
@@ -63,11 +64,12 @@ docling/models/tesseract_ocr_model.py,sha256=UpLAgKgJtBgbKtJELmKBNMcejJJKBCyFK0q
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/pipeline/base_pipeline.py,sha256=9ABK-Cr235bxE5vweoIA5rgBZV_EF8qFxAqLI27H_Pg,8749
 docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
-docling/pipeline/standard_pdf_pipeline.py,sha256=
-docling/pipeline/vlm_pipeline.py,sha256=
+docling/pipeline/standard_pdf_pipeline.py,sha256=gPNqUparhIONG4AyMekW9OfZ7t8YMs0odhtbE6Z-Hxw,10784
+docling/pipeline/vlm_pipeline.py,sha256=dqQYAd3viW577TVSZltnB4P-f-ZUWQh0J8SSFDuQN6Q,9738
 docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExTOY5f0,2292
+docling/utils/api_image_request.py,sha256=_CgdzmPqdsyXmyYUFGLZcXcoH586qC6A1p5vsNbj1Q0,1416
 docling/utils/export.py,sha256=4W-ptI1fLdVrtoqHdHY1RF9Xn2Yescs-hunITqxJ7Is,4697
 docling/utils/glm_utils.py,sha256=W4JRoP0xQ6SJmhhIoAfcKxm5dr1CFvLHp8pqI1kdhxs,12250
 docling/utils/layout_postprocessor.py,sha256=Q36DfcIYMuMfC6LzCBIrYtHK7pBE-Xyvjepz660s9UM,24508
@@ -77,8 +79,8 @@ docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,26
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
 docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
-docling-2.
-docling-2.
-docling-2.
-docling-2.
-docling-2.
+docling-2.30.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.30.0.dist-info/METADATA,sha256=HSI154YUnSDJE8BMMjOuu-U3EXQg0ksFuyuyzv7-UdU,9982
+docling-2.30.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+docling-2.30.0.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
+docling-2.30.0.dist-info/RECORD,,
{docling-2.29.0.dist-info → docling-2.30.0.dist-info}/LICENSE
File without changes
{docling-2.29.0.dist-info → docling-2.30.0.dist-info}/WHEEL
File without changes
{docling-2.29.0.dist-info → docling-2.30.0.dist-info}/entry_points.txt
File without changes