docling 2.5.2__tar.gz → 2.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.5.2 → docling-2.6.0}/PKG-INFO +3 -2
- docling-2.6.0/docling/backend/msexcel_backend.py +374 -0
- {docling-2.5.2 → docling-2.6.0}/docling/backend/mspowerpoint_backend.py +16 -1
- {docling-2.5.2 → docling-2.6.0}/docling/backend/msword_backend.py +26 -11
- {docling-2.5.2 → docling-2.6.0}/docling/cli/main.py +35 -1
- {docling-2.5.2 → docling-2.6.0}/docling/datamodel/base_models.py +6 -0
- {docling-2.5.2 → docling-2.6.0}/docling/datamodel/pipeline_options.py +9 -1
- {docling-2.5.2 → docling-2.6.0}/docling/document_converter.py +9 -0
- {docling-2.5.2 → docling-2.6.0}/pyproject.toml +5 -2
- {docling-2.5.2 → docling-2.6.0}/LICENSE +0 -0
- {docling-2.5.2 → docling-2.6.0}/README.md +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/__init__.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/backend/__init__.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/backend/html_backend.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/backend/md_backend.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/cli/__init__.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/datamodel/document.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/datamodel/settings.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/models/__init__.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/models/base_model.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/models/ds_glm_model.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/models/layout_model.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/utils/__init__.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/utils/export.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/utils/layout_utils.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/utils/profiling.py +0 -0
- {docling-2.5.2 → docling-2.6.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.5.2
|
3
|
+
Version: 2.6.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -23,13 +23,14 @@ Provides-Extra: tesserocr
|
|
23
23
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
24
24
|
Requires-Dist: certifi (>=2024.7.4)
|
25
25
|
Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
|
26
|
-
Requires-Dist: docling-core (>=2.
|
26
|
+
Requires-Dist: docling-core (>=2.4.0,<3.0.0)
|
27
27
|
Requires-Dist: docling-ibm-models (>=2.0.3,<3.0.0)
|
28
28
|
Requires-Dist: docling-parse (>=2.0.2,<3.0.0)
|
29
29
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
30
30
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
31
31
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
32
32
|
Requires-Dist: marko (>=2.1.2,<3.0.0)
|
33
|
+
Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
|
33
34
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
34
35
|
Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
|
35
36
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
@@ -0,0 +1,374 @@
|
|
1
|
+
import logging
|
2
|
+
from io import BytesIO
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Dict, Set, Tuple, Union
|
5
|
+
|
6
|
+
from docling_core.types.doc import (
|
7
|
+
DoclingDocument,
|
8
|
+
DocumentOrigin,
|
9
|
+
GroupLabel,
|
10
|
+
ImageRef,
|
11
|
+
TableCell,
|
12
|
+
TableData,
|
13
|
+
)
|
14
|
+
|
15
|
+
# from lxml import etree
|
16
|
+
from openpyxl import Workbook, load_workbook
|
17
|
+
from openpyxl.cell.cell import Cell
|
18
|
+
from openpyxl.drawing.image import Image
|
19
|
+
from openpyxl.worksheet.worksheet import Worksheet
|
20
|
+
|
21
|
+
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
22
|
+
from docling.datamodel.base_models import InputFormat
|
23
|
+
from docling.datamodel.document import InputDocument
|
24
|
+
|
25
|
+
_log = logging.getLogger(__name__)
|
26
|
+
|
27
|
+
from typing import Any, List
|
28
|
+
|
29
|
+
from pydantic import BaseModel
|
30
|
+
|
31
|
+
|
32
|
+
class ExcelCell(BaseModel):
    # One cell of a table detected in a worksheet (filled in by
    # `MsExcelDocumentBackend._find_table_bounds`).
    row: int  # row offset relative to the first row of the table
    col: int  # column offset relative to the first column of the table
    text: str  # `str()` of the openpyxl cell value
    row_span: int  # rows covered; 1 unless the cell lies in a merged range
    col_span: int  # columns covered; 1 unless the cell lies in a merged range
|
38
|
+
|
39
|
+
|
40
|
+
class ExcelTable(BaseModel):
    # A rectangular table detected in a worksheet.
    num_rows: int  # number of rows spanned by the table
    num_cols: int  # number of columns spanned by the table
    data: List[ExcelCell]  # cells of the table, with offsets relative to its origin
|
44
|
+
|
45
|
+
|
46
|
+
class MsExcelDocumentBackend(DeclarativeDocumentBackend):
|
47
|
+
|
48
|
+
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
    """Load the XLSX workbook from a path or an in-memory stream.

    Args:
        in_doc: Input document descriptor, forwarded to the base backend.
        path_or_stream: Filesystem path or ``BytesIO`` holding the workbook.

    Raises:
        RuntimeError: If loading the workbook fails; the original exception
            is chained as the cause and ``self.valid`` is set to ``False``.
    """
    super().__init__(in_doc, path_or_stream)

    # Initialise the parents for the hierarchy (levels -1 .. max_levels - 1).
    self.max_levels = 10
    self.parents: Dict[int, Any] = {
        level: None for level in range(-1, self.max_levels)
    }

    self.workbook = None
    try:
        if isinstance(self.path_or_stream, BytesIO):
            self.workbook = load_workbook(filename=self.path_or_stream)
        elif isinstance(self.path_or_stream, Path):
            self.workbook = load_workbook(filename=str(self.path_or_stream))

        # NOTE: also reached when the input is neither BytesIO nor Path, in
        # which case `self.workbook` stays None.
        self.valid = True
    except Exception as e:
        self.valid = False

        # Name this backend in the message (it previously named the
        # PowerPoint backend, which was misleading when debugging).
        raise RuntimeError(
            f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
        ) from e
|
73
|
+
|
74
|
+
def is_valid(self) -> bool:
    """Log and return the validity flag set while loading the workbook."""
    valid = self.valid
    _log.info(f"valid: {valid}")
    return valid
|
77
|
+
|
78
|
+
@classmethod
def supports_pagination(cls) -> bool:
    """Report that this backend declares pagination support (always ``True``)."""
    paginated = True
    return paginated
|
81
|
+
|
82
|
+
def unload(self):
    """Close the input if it is an in-memory stream, then drop the reference."""
    source = self.path_or_stream
    if isinstance(source, BytesIO):
        source.close()

    self.path_or_stream = None
|
87
|
+
|
88
|
+
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
    """Return the set of input formats handled by this backend (XLSX only)."""
    formats = {InputFormat.XLSX}
    return formats
|
91
|
+
|
92
|
+
def convert(self) -> DoclingDocument:
    """Parse the XLSX workbook into a structured document model.

    Returns:
        The document produced by `_convert_workbook`.

    Raises:
        RuntimeError: If the backend is not valid (workbook loading failed).
    """
    fallback_name = "file.xlsx"

    origin = DocumentOrigin(
        filename=self.file.name or fallback_name,
        mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        binary_hash=self.document_hash,
    )
    doc = DoclingDocument(name=self.file.stem or fallback_name, origin=origin)

    # The validity check intentionally happens after the document shell is
    # built, mirroring the original evaluation order.
    if not self.is_valid():
        raise RuntimeError(
            f"Cannot convert doc with {self.document_hash} because the backend failed to init."
        )

    return self._convert_workbook(doc)
|
111
|
+
|
112
|
+
def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
    """Convert every sheet of the loaded workbook into `doc`.

    Each sheet gets its own section group, stored in ``self.parents[0]`` so
    the sheet's tables are attached under it. If no workbook is loaded, an
    error is logged and `doc` is returned unchanged.
    """
    workbook = self.workbook
    if workbook is None:
        _log.error("Workbook is not initialized.")
        return doc

    for sheet_name in workbook.sheetnames:
        _log.info(f"Processing sheet: {sheet_name}")

        sheet = workbook[sheet_name]

        # One section group per sheet; it becomes the parent of the content.
        self.parents[0] = doc.add_group(
            parent=None,
            label=GroupLabel.SECTION,
            name=f"sheet: {sheet_name}",
        )

        doc = self._convert_sheet(doc, sheet)

    return doc
|
134
|
+
|
135
|
+
def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
|
136
|
+
|
137
|
+
doc = self._find_tables_in_sheet(doc, sheet)
|
138
|
+
|
139
|
+
doc = self._find_images_in_sheet(doc, sheet)
|
140
|
+
|
141
|
+
return doc
|
142
|
+
|
143
|
+
def _find_tables_in_sheet(self, doc: DoclingDocument, sheet: Worksheet):
    """Add every table detected in `sheet` to `doc` under ``self.parents[0]``.

    Each `ExcelTable` returned by `_find_data_tables` is turned into a
    `TableData` whose cells carry their offsets and spans; no cell is marked
    as a row or column header.
    """
    for found in self._find_data_tables(sheet):
        table_data = TableData(
            num_rows=found.num_rows,
            num_cols=found.num_cols,
            table_cells=[],
        )

        table_data.table_cells.extend(
            TableCell(
                text=src.text,
                row_span=src.row_span,
                col_span=src.col_span,
                start_row_offset_idx=src.row,
                end_row_offset_idx=src.row + src.row_span,
                start_col_offset_idx=src.col,
                end_col_offset_idx=src.col + src.col_span,
                col_header=False,
                row_header=False,
            )
            for src in found.data
        )

        doc.add_table(data=table_data, parent=self.parents[0])

    return doc
|
175
|
+
|
176
|
+
def _find_data_tables(self, sheet: Worksheet):
|
177
|
+
"""
|
178
|
+
Find all compact rectangular data tables in a sheet.
|
179
|
+
"""
|
180
|
+
# _log.info("find_data_tables")
|
181
|
+
|
182
|
+
tables = [] # List to store found tables
|
183
|
+
visited: set[Tuple[int, int]] = set() # Track already visited cells
|
184
|
+
|
185
|
+
# Iterate over all cells in the sheet
|
186
|
+
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
|
187
|
+
for rj, cell in enumerate(row):
|
188
|
+
|
189
|
+
# Skip empty or already visited cells
|
190
|
+
if cell.value is None or (ri, rj) in visited:
|
191
|
+
continue
|
192
|
+
|
193
|
+
# If the cell starts a new table, find its bounds
|
194
|
+
table_bounds, visited_cells = self._find_table_bounds(
|
195
|
+
sheet, ri, rj, visited
|
196
|
+
)
|
197
|
+
|
198
|
+
visited.update(visited_cells) # Mark these cells as visited
|
199
|
+
tables.append(table_bounds)
|
200
|
+
|
201
|
+
return tables
|
202
|
+
|
203
|
+
def _find_table_bounds(
    self,
    sheet: Worksheet,
    start_row: int,
    start_col: int,
    visited: set[Tuple[int, int]],
):
    """Determine the bounds of a compact rectangular table and collect its cells.

    Args:
        sheet: Worksheet being scanned.
        start_row: 0-based row of the table's top-left cell.
        start_col: 0-based column of the table's top-left cell.
        visited: Cells covered by earlier tables (not consulted here).

    Returns:
        A tuple ``(table, covered)``: the `ExcelTable` with its cells, and
        the set of 0-based ``(row, col)`` coordinates covered by those cells.
    """
    _log.info("find_table_bounds")

    bottom = self._find_table_bottom(sheet, start_row, start_col)
    right = self._find_table_right(sheet, start_row, start_col)

    cells = []
    covered = set()
    for ri in range(start_row, bottom + 1):
        sheet_row = ri + 1  # openpyxl uses 1-based indexing
        for rj in range(start_col, right + 1):
            sheet_col = rj + 1

            cell = sheet.cell(row=sheet_row, column=sheet_col)

            # Span of the first merged range containing this position,
            # or a plain 1x1 cell when it is not merged.
            row_span, col_span = next(
                (
                    (mr.max_row - mr.min_row + 1, mr.max_col - mr.min_col + 1)
                    for mr in sheet.merged_cells.ranges
                    if mr.min_row <= sheet_row <= mr.max_row
                    and mr.min_col <= sheet_col <= mr.max_col
                ),
                (1, 1),
            )

            # Positions already covered by an earlier cell's span are skipped.
            if (ri, rj) in covered:
                continue

            cells.append(
                ExcelCell(
                    row=ri - start_row,
                    col=rj - start_col,
                    text=str(cell.value),
                    row_span=row_span,
                    col_span=col_span,
                )
            )

            # Mark all cells in the span as visited
            covered.update(
                (span_row, span_col)
                for span_row in range(ri, ri + row_span)
                for span_col in range(rj, rj + col_span)
            )

    table = ExcelTable(
        num_rows=bottom + 1 - start_row,
        num_cols=right + 1 - start_col,
        data=cells,
    )
    return table, covered
|
272
|
+
|
273
|
+
def _find_table_bottom(self, sheet: Worksheet, start_row: int, start_col: int):
|
274
|
+
"""Function to find the bottom boundary of the table"""
|
275
|
+
|
276
|
+
max_row = start_row
|
277
|
+
|
278
|
+
while max_row < sheet.max_row - 1:
|
279
|
+
# Get the cell value or check if it is part of a merged cell
|
280
|
+
cell = sheet.cell(row=max_row + 2, column=start_col + 1)
|
281
|
+
|
282
|
+
# Check if the cell is part of a merged range
|
283
|
+
merged_range = next(
|
284
|
+
(mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
|
285
|
+
None,
|
286
|
+
)
|
287
|
+
|
288
|
+
if cell.value is None and not merged_range:
|
289
|
+
break # Stop if the cell is empty and not merged
|
290
|
+
|
291
|
+
# Expand max_row to include the merged range if applicable
|
292
|
+
if merged_range:
|
293
|
+
max_row = max(max_row, merged_range.max_row - 1)
|
294
|
+
else:
|
295
|
+
max_row += 1
|
296
|
+
|
297
|
+
return max_row
|
298
|
+
|
299
|
+
def _find_table_right(self, sheet: Worksheet, start_row: int, start_col: int):
|
300
|
+
"""Function to find the right boundary of the table"""
|
301
|
+
|
302
|
+
max_col = start_col
|
303
|
+
|
304
|
+
while max_col < sheet.max_column - 1:
|
305
|
+
# Get the cell value or check if it is part of a merged cell
|
306
|
+
cell = sheet.cell(row=start_row + 1, column=max_col + 2)
|
307
|
+
|
308
|
+
# Check if the cell is part of a merged range
|
309
|
+
merged_range = next(
|
310
|
+
(mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
|
311
|
+
None,
|
312
|
+
)
|
313
|
+
|
314
|
+
if cell.value is None and not merged_range:
|
315
|
+
break # Stop if the cell is empty and not merged
|
316
|
+
|
317
|
+
# Expand max_col to include the merged range if applicable
|
318
|
+
if merged_range:
|
319
|
+
max_col = max(max_col, merged_range.max_col - 1)
|
320
|
+
else:
|
321
|
+
max_col += 1
|
322
|
+
|
323
|
+
return max_col
|
324
|
+
|
325
|
+
def _find_images_in_sheet(
|
326
|
+
self, doc: DoclingDocument, sheet: Worksheet
|
327
|
+
) -> DoclingDocument:
|
328
|
+
|
329
|
+
# FIXME: mypy does not agree with _images ...
|
330
|
+
"""
|
331
|
+
# Iterate over images in the sheet
|
332
|
+
for idx, image in enumerate(sheet._images): # Access embedded images
|
333
|
+
|
334
|
+
image_bytes = BytesIO(image.ref.blob)
|
335
|
+
pil_image = Image.open(image_bytes)
|
336
|
+
|
337
|
+
doc.add_picture(
|
338
|
+
parent=self.parents[0],
|
339
|
+
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
340
|
+
caption=None,
|
341
|
+
)
|
342
|
+
"""
|
343
|
+
|
344
|
+
# FIXME: mypy does not agree with _charts ...
|
345
|
+
"""
|
346
|
+
for idx, chart in enumerate(sheet._charts): # Access embedded charts
|
347
|
+
chart_path = f"chart_{idx + 1}.png"
|
348
|
+
_log.info(
|
349
|
+
f"Chart found, but dynamic rendering is required for: {chart_path}"
|
350
|
+
)
|
351
|
+
|
352
|
+
_log.info(f"Chart {idx + 1}:")
|
353
|
+
|
354
|
+
# Chart type
|
355
|
+
_log.info(f"Type: {type(chart).__name__}")
|
356
|
+
|
357
|
+
# Title
|
358
|
+
if chart.title:
|
359
|
+
_log.info(f"Title: {chart.title}")
|
360
|
+
else:
|
361
|
+
_log.info("No title")
|
362
|
+
|
363
|
+
# Data series
|
364
|
+
for series in chart.series:
|
365
|
+
_log.info(" => series ...")
|
366
|
+
_log.info(f"Data Series: {series.title}")
|
367
|
+
_log.info(f"Values: {series.values}")
|
368
|
+
_log.info(f"Categories: {series.categories}")
|
369
|
+
|
370
|
+
# Position
|
371
|
+
# _log.info(f"Anchor Cell: {chart.anchor}")
|
372
|
+
"""
|
373
|
+
|
374
|
+
return doc
|
@@ -10,11 +10,13 @@ from docling_core.types.doc import (
|
|
10
10
|
DoclingDocument,
|
11
11
|
DocumentOrigin,
|
12
12
|
GroupLabel,
|
13
|
+
ImageRef,
|
13
14
|
ProvenanceItem,
|
14
15
|
Size,
|
15
16
|
TableCell,
|
16
17
|
TableData,
|
17
18
|
)
|
19
|
+
from PIL import Image
|
18
20
|
from pptx import Presentation
|
19
21
|
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
20
22
|
|
@@ -268,9 +270,22 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
268
270
|
return
|
269
271
|
|
270
272
|
def handle_pictures(self, shape, parent_slide, slide_ind, doc):
|
273
|
+
# Get the image bytes
|
274
|
+
image = shape.image
|
275
|
+
image_bytes = image.blob
|
276
|
+
im_dpi, _ = image.dpi
|
277
|
+
|
278
|
+
# Open it with PIL
|
279
|
+
pil_image = Image.open(BytesIO(image_bytes))
|
280
|
+
|
271
281
|
# shape has picture
|
272
282
|
prov = self.generate_prov(shape, slide_ind, "")
|
273
|
-
doc.add_picture(
|
283
|
+
doc.add_picture(
|
284
|
+
parent=parent_slide,
|
285
|
+
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
|
286
|
+
caption=None,
|
287
|
+
prov=prov,
|
288
|
+
)
|
274
289
|
return
|
275
290
|
|
276
291
|
def handle_tables(self, shape, parent_slide, slide_ind, doc):
|
@@ -9,10 +9,12 @@ from docling_core.types.doc import (
|
|
9
9
|
DoclingDocument,
|
10
10
|
DocumentOrigin,
|
11
11
|
GroupLabel,
|
12
|
+
ImageRef,
|
12
13
|
TableCell,
|
13
14
|
TableData,
|
14
15
|
)
|
15
16
|
from lxml import etree
|
17
|
+
from PIL import Image
|
16
18
|
|
17
19
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
18
20
|
from docling.datamodel.base_models import InputFormat
|
@@ -130,13 +132,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
130
132
|
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
131
133
|
for element in body:
|
132
134
|
tag_name = etree.QName(element).localname
|
133
|
-
# Check for Inline Images (
|
134
|
-
|
135
|
-
element, ".//w:drawing", namespaces=self.xml_namespaces
|
136
|
-
)
|
137
|
-
found_pict = etree.ElementBase.xpath(
|
138
|
-
element, ".//w:pict", namespaces=self.xml_namespaces
|
139
|
-
)
|
135
|
+
# Check for Inline Images (blip elements)
|
136
|
+
drawing_blip = element.xpath(".//a:blip")
|
140
137
|
|
141
138
|
# Check for Tables
|
142
139
|
if element.tag.endswith("tbl"):
|
@@ -145,8 +142,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
145
142
|
except Exception:
|
146
143
|
_log.debug("could not parse a table, broken docx table")
|
147
144
|
|
148
|
-
elif
|
149
|
-
self.handle_pictures(element, docx_obj, doc)
|
145
|
+
elif drawing_blip:
|
146
|
+
self.handle_pictures(element, docx_obj, drawing_blip, doc)
|
150
147
|
# Check for Text
|
151
148
|
elif tag_name in ["p"]:
|
152
149
|
self.handle_text_elements(element, docx_obj, doc)
|
@@ -491,6 +488,24 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
491
488
|
doc.add_table(data=data, parent=self.parents[level - 1])
|
492
489
|
return
|
493
490
|
|
494
|
-
def handle_pictures(self, element, docx_obj, doc):
|
495
|
-
|
491
|
+
def handle_pictures(self, element, docx_obj, drawing_blip, doc):
    """Add the inline image referenced by a blip element to the document.

    Args:
        element: Body element containing the blip (not inspected here).
        docx_obj: Document object whose ``part.rels`` maps relationship ids
            to the embedded image parts.
        drawing_blip: Non-empty list of ``a:blip`` elements; only the first
            one is used.
        doc: Document that receives the picture.
    """
    # The blip's `embed` attribute holds the relationship id of the image part.
    rel_id = drawing_blip[0].get(
        "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
    )

    # Without a matching relationship there is no image data to load; skip
    # this picture instead of failing the whole conversion.
    if rel_id not in docx_obj.part.rels:
        _log.warning(f"could not resolve image relationship {rel_id}, skipping picture")
        return

    # Access the image part using the relationship ID and read its bytes.
    image_data = docx_obj.part.rels[rel_id].target_part.blob

    # Open the bytes with PIL to create an Image.
    pil_image = Image.open(BytesIO(image_data))
    doc.add_picture(
        parent=self.parents[self.level],
        image=ImageRef.from_pil(image=pil_image, dpi=72),
        caption=None,
    )
    return
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import importlib
|
2
2
|
import json
|
3
3
|
import logging
|
4
|
+
import re
|
4
5
|
import time
|
5
6
|
import warnings
|
6
7
|
from enum import Enum
|
@@ -129,6 +130,12 @@ def export_documents(
|
|
129
130
|
)
|
130
131
|
|
131
132
|
|
133
|
+
def _split_list(raw: Optional[str]) -> Optional[List[str]]:
|
134
|
+
if raw is None:
|
135
|
+
return None
|
136
|
+
return re.split(r"[;,]", raw)
|
137
|
+
|
138
|
+
|
132
139
|
@app.command(no_args_is_help=True)
|
133
140
|
def convert(
|
134
141
|
input_sources: Annotated[
|
@@ -163,6 +170,13 @@ def convert(
|
|
163
170
|
ocr_engine: Annotated[
|
164
171
|
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
165
172
|
] = OcrEngine.EASYOCR,
|
173
|
+
ocr_lang: Annotated[
|
174
|
+
Optional[str],
|
175
|
+
typer.Option(
|
176
|
+
...,
|
177
|
+
help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
|
178
|
+
),
|
179
|
+
] = None,
|
166
180
|
pdf_backend: Annotated[
|
167
181
|
PdfBackend, typer.Option(..., help="The PDF backend to use.")
|
168
182
|
] = PdfBackend.DLPARSE_V1,
|
@@ -185,6 +199,15 @@ def convert(
|
|
185
199
|
output: Annotated[
|
186
200
|
Path, typer.Option(..., help="Output directory where results are saved.")
|
187
201
|
] = Path("."),
|
202
|
+
verbose: Annotated[
|
203
|
+
int,
|
204
|
+
typer.Option(
|
205
|
+
"--verbose",
|
206
|
+
"-v",
|
207
|
+
count=True,
|
208
|
+
help="Set the verbosity level. -v for info logging, -vv for debug logging.",
|
209
|
+
),
|
210
|
+
] = 0,
|
188
211
|
version: Annotated[
|
189
212
|
Optional[bool],
|
190
213
|
typer.Option(
|
@@ -195,7 +218,12 @@ def convert(
|
|
195
218
|
),
|
196
219
|
] = None,
|
197
220
|
):
|
198
|
-
|
221
|
+
if verbose == 0:
|
222
|
+
logging.basicConfig(level=logging.WARNING)
|
223
|
+
elif verbose == 1:
|
224
|
+
logging.basicConfig(level=logging.INFO)
|
225
|
+
elif verbose == 2:
|
226
|
+
logging.basicConfig(level=logging.DEBUG)
|
199
227
|
|
200
228
|
if from_formats is None:
|
201
229
|
from_formats = [e for e in InputFormat]
|
@@ -234,6 +262,10 @@ def convert(
|
|
234
262
|
case _:
|
235
263
|
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
236
264
|
|
265
|
+
ocr_lang_list = _split_list(ocr_lang)
|
266
|
+
if ocr_lang_list is not None:
|
267
|
+
ocr_options.lang = ocr_lang_list
|
268
|
+
|
237
269
|
pipeline_options = PdfPipelineOptions(
|
238
270
|
do_ocr=ocr,
|
239
271
|
ocr_options=ocr_options,
|
@@ -287,5 +319,7 @@ def convert(
|
|
287
319
|
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
288
320
|
|
289
321
|
|
322
|
+
click_app = typer.main.get_command(app)
|
323
|
+
|
290
324
|
if __name__ == "__main__":
|
291
325
|
app()
|
@@ -32,6 +32,7 @@ class InputFormat(str, Enum):
|
|
32
32
|
PDF = "pdf"
|
33
33
|
ASCIIDOC = "asciidoc"
|
34
34
|
MD = "md"
|
35
|
+
XLSX = "xlsx"
|
35
36
|
|
36
37
|
|
37
38
|
class OutputFormat(str, Enum):
|
@@ -49,6 +50,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|
49
50
|
InputFormat.HTML: ["html", "htm", "xhtml"],
|
50
51
|
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
51
52
|
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
53
|
+
InputFormat.XLSX: ["xlsx"],
|
52
54
|
}
|
53
55
|
|
54
56
|
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
@@ -72,7 +74,11 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|
72
74
|
InputFormat.PDF: ["application/pdf"],
|
73
75
|
InputFormat.ASCIIDOC: ["text/asciidoc"],
|
74
76
|
InputFormat.MD: ["text/markdown", "text/x-markdown"],
|
77
|
+
InputFormat.XLSX: [
|
78
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
79
|
+
],
|
75
80
|
}
|
81
|
+
|
76
82
|
MimeTypeToFormat = {
|
77
83
|
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
|
78
84
|
}
|
@@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
|
|
22
22
|
|
23
23
|
class OcrOptions(BaseModel):
|
24
24
|
kind: str
|
25
|
+
lang: List[str]
|
25
26
|
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
|
26
27
|
bitmap_area_threshold: float = (
|
27
28
|
0.05 # percentage of the area for a bitmap to processed with OCR
|
@@ -81,4 +82,11 @@ class PdfPipelineOptions(PipelineOptions):
|
|
81
82
|
images_scale: float = 1.0
|
82
83
|
generate_page_images: bool = False
|
83
84
|
generate_picture_images: bool = False
|
84
|
-
generate_table_images: bool = False
|
85
|
+
generate_table_images: bool = Field(
|
86
|
+
default=False,
|
87
|
+
deprecated=(
|
88
|
+
"Field `generate_table_images` is deprecated. "
|
89
|
+
"To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
|
90
|
+
"before conversion and then use the `TableItem.get_image` function."
|
91
|
+
),
|
92
|
+
)
|
@@ -12,6 +12,7 @@ from docling.backend.asciidoc_backend import AsciiDocBackend
|
|
12
12
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
13
13
|
from docling.backend.html_backend import HTMLDocumentBackend
|
14
14
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
15
|
+
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
15
16
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
16
17
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
17
18
|
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
@@ -44,6 +45,11 @@ class FormatOption(BaseModel):
|
|
44
45
|
return self
|
45
46
|
|
46
47
|
|
48
|
+
class ExcelFormatOption(FormatOption):
    # Format option preset for XLSX input: the simple pipeline combined with
    # the Excel backend.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
|
51
|
+
|
52
|
+
|
47
53
|
class WordFormatOption(FormatOption):
|
48
54
|
pipeline_cls: Type = SimplePipeline
|
49
55
|
backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
|
@@ -80,6 +86,9 @@ class ImageFormatOption(FormatOption):
|
|
80
86
|
|
81
87
|
|
82
88
|
_format_to_default_options = {
|
89
|
+
InputFormat.XLSX: FormatOption(
|
90
|
+
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
|
91
|
+
),
|
83
92
|
InputFormat.DOCX: FormatOption(
|
84
93
|
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
|
85
94
|
),
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.5.2" # DO NOT EDIT, updated automatically
|
3
|
+
version = "2.6.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -26,7 +26,7 @@ packages = [{include = "docling"}]
|
|
26
26
|
######################
|
27
27
|
python = "^3.10"
|
28
28
|
pydantic = "^2.0.0"
|
29
|
-
docling-core = "^2.
|
29
|
+
docling-core = "^2.4.0"
|
30
30
|
docling-ibm-models = "^2.0.3"
|
31
31
|
deepsearch-glm = "^0.26.1"
|
32
32
|
filetype = "^1.2.0"
|
@@ -47,6 +47,7 @@ python-pptx = "^1.0.2"
|
|
47
47
|
beautifulsoup4 = "^4.12.3"
|
48
48
|
pandas = "^2.1.4"
|
49
49
|
marko = "^2.1.2"
|
50
|
+
openpyxl = "^3.1.5"
|
50
51
|
|
51
52
|
[tool.poetry.group.dev.dependencies]
|
52
53
|
black = {extras = ["jupyter"], version = "^24.4.2"}
|
@@ -65,10 +66,12 @@ pandas-stubs = "^2.1.4.231227"
|
|
65
66
|
ipykernel = "^6.29.5"
|
66
67
|
ipywidgets = "^8.1.5"
|
67
68
|
nbqa = "^1.9.0"
|
69
|
+
types-openpyxl = "^3.1.5.20241114"
|
68
70
|
|
69
71
|
[tool.poetry.group.docs.dependencies]
|
70
72
|
mkdocs-material = "^9.5.40"
|
71
73
|
mkdocs-jupyter = "^0.25.0"
|
74
|
+
mkdocs-click = "^0.8.1"
|
72
75
|
|
73
76
|
[tool.poetry.group.examples.dependencies]
|
74
77
|
datasets = "^2.21.0"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|