docling 2.5.2__tar.gz → 2.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. {docling-2.5.2 → docling-2.6.0}/PKG-INFO +3 -2
  2. docling-2.6.0/docling/backend/msexcel_backend.py +374 -0
  3. {docling-2.5.2 → docling-2.6.0}/docling/backend/mspowerpoint_backend.py +16 -1
  4. {docling-2.5.2 → docling-2.6.0}/docling/backend/msword_backend.py +26 -11
  5. {docling-2.5.2 → docling-2.6.0}/docling/cli/main.py +35 -1
  6. {docling-2.5.2 → docling-2.6.0}/docling/datamodel/base_models.py +6 -0
  7. {docling-2.5.2 → docling-2.6.0}/docling/datamodel/pipeline_options.py +9 -1
  8. {docling-2.5.2 → docling-2.6.0}/docling/document_converter.py +9 -0
  9. {docling-2.5.2 → docling-2.6.0}/pyproject.toml +5 -2
  10. {docling-2.5.2 → docling-2.6.0}/LICENSE +0 -0
  11. {docling-2.5.2 → docling-2.6.0}/README.md +0 -0
  12. {docling-2.5.2 → docling-2.6.0}/docling/__init__.py +0 -0
  13. {docling-2.5.2 → docling-2.6.0}/docling/backend/__init__.py +0 -0
  14. {docling-2.5.2 → docling-2.6.0}/docling/backend/abstract_backend.py +0 -0
  15. {docling-2.5.2 → docling-2.6.0}/docling/backend/asciidoc_backend.py +0 -0
  16. {docling-2.5.2 → docling-2.6.0}/docling/backend/docling_parse_backend.py +0 -0
  17. {docling-2.5.2 → docling-2.6.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  18. {docling-2.5.2 → docling-2.6.0}/docling/backend/html_backend.py +0 -0
  19. {docling-2.5.2 → docling-2.6.0}/docling/backend/md_backend.py +0 -0
  20. {docling-2.5.2 → docling-2.6.0}/docling/backend/pdf_backend.py +0 -0
  21. {docling-2.5.2 → docling-2.6.0}/docling/backend/pypdfium2_backend.py +0 -0
  22. {docling-2.5.2 → docling-2.6.0}/docling/cli/__init__.py +0 -0
  23. {docling-2.5.2 → docling-2.6.0}/docling/datamodel/__init__.py +0 -0
  24. {docling-2.5.2 → docling-2.6.0}/docling/datamodel/document.py +0 -0
  25. {docling-2.5.2 → docling-2.6.0}/docling/datamodel/settings.py +0 -0
  26. {docling-2.5.2 → docling-2.6.0}/docling/models/__init__.py +0 -0
  27. {docling-2.5.2 → docling-2.6.0}/docling/models/base_model.py +0 -0
  28. {docling-2.5.2 → docling-2.6.0}/docling/models/base_ocr_model.py +0 -0
  29. {docling-2.5.2 → docling-2.6.0}/docling/models/ds_glm_model.py +0 -0
  30. {docling-2.5.2 → docling-2.6.0}/docling/models/easyocr_model.py +0 -0
  31. {docling-2.5.2 → docling-2.6.0}/docling/models/layout_model.py +0 -0
  32. {docling-2.5.2 → docling-2.6.0}/docling/models/page_assemble_model.py +0 -0
  33. {docling-2.5.2 → docling-2.6.0}/docling/models/page_preprocessing_model.py +0 -0
  34. {docling-2.5.2 → docling-2.6.0}/docling/models/table_structure_model.py +0 -0
  35. {docling-2.5.2 → docling-2.6.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  36. {docling-2.5.2 → docling-2.6.0}/docling/models/tesseract_ocr_model.py +0 -0
  37. {docling-2.5.2 → docling-2.6.0}/docling/pipeline/__init__.py +0 -0
  38. {docling-2.5.2 → docling-2.6.0}/docling/pipeline/base_pipeline.py +0 -0
  39. {docling-2.5.2 → docling-2.6.0}/docling/pipeline/simple_pipeline.py +0 -0
  40. {docling-2.5.2 → docling-2.6.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  41. {docling-2.5.2 → docling-2.6.0}/docling/utils/__init__.py +0 -0
  42. {docling-2.5.2 → docling-2.6.0}/docling/utils/export.py +0 -0
  43. {docling-2.5.2 → docling-2.6.0}/docling/utils/layout_utils.py +0 -0
  44. {docling-2.5.2 → docling-2.6.0}/docling/utils/profiling.py +0 -0
  45. {docling-2.5.2 → docling-2.6.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.5.2
3
+ Version: 2.6.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -23,13 +23,14 @@ Provides-Extra: tesserocr
23
23
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
24
24
  Requires-Dist: certifi (>=2024.7.4)
25
25
  Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
26
- Requires-Dist: docling-core (>=2.3.0,<3.0.0)
26
+ Requires-Dist: docling-core (>=2.4.0,<3.0.0)
27
27
  Requires-Dist: docling-ibm-models (>=2.0.3,<3.0.0)
28
28
  Requires-Dist: docling-parse (>=2.0.2,<3.0.0)
29
29
  Requires-Dist: easyocr (>=1.7,<2.0)
30
30
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
31
31
  Requires-Dist: huggingface_hub (>=0.23,<1)
32
32
  Requires-Dist: marko (>=2.1.2,<3.0.0)
33
+ Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
33
34
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
34
35
  Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
35
36
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
@@ -0,0 +1,374 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Dict, Set, Tuple, Union
5
+
6
+ from docling_core.types.doc import (
7
+ DoclingDocument,
8
+ DocumentOrigin,
9
+ GroupLabel,
10
+ ImageRef,
11
+ TableCell,
12
+ TableData,
13
+ )
14
+
15
+ # from lxml import etree
16
+ from openpyxl import Workbook, load_workbook
17
+ from openpyxl.cell.cell import Cell
18
+ from openpyxl.drawing.image import Image
19
+ from openpyxl.worksheet.worksheet import Worksheet
20
+
21
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
22
+ from docling.datamodel.base_models import InputFormat
23
+ from docling.datamodel.document import InputDocument
24
+
25
+ _log = logging.getLogger(__name__)
26
+
27
+ from typing import Any, List
28
+
29
+ from pydantic import BaseModel
30
+
31
+
32
class ExcelCell(BaseModel):
    """One cell of a table detected in a worksheet."""

    # Zero-based position of the cell relative to the table's top-left cell.
    row: int
    col: int

    # Cell content converted to text.
    text: str

    # Number of rows / columns the cell covers (1 unless it is a merged cell).
    row_span: int
    col_span: int
38
+
39
+
40
class ExcelTable(BaseModel):
    """A rectangular table region detected in a worksheet."""

    # Extent of the region, in rows and columns.
    num_rows: int
    num_cols: int

    # Cells of the region; each cell carries its own position and span.
    data: List[ExcelCell]
44
+
45
+
46
class MsExcelDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend that converts an XLSX workbook into a DoclingDocument.

    Every sheet of the workbook becomes a section group. Each compact
    rectangular region of cells found in a worksheet is added to that group
    as a table. Embedded images and charts are not extracted yet.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Load the workbook from a file path or an in-memory stream.

        Args:
            in_doc: The input document this backend belongs to.
            path_or_stream: Location or content of the XLSX file.

        Raises:
            RuntimeError: If the workbook cannot be loaded.
        """
        super().__init__(in_doc, path_or_stream)

        # Initialise the parents for the hierarchy
        self.max_levels = 10
        self.parents: Dict[int, Any] = {
            level: None for level in range(-1, self.max_levels)
        }

        self.workbook: Union[Workbook, None] = None
        # Assigned before loading so the attribute exists even if loading raises.
        self.valid = False
        try:
            if isinstance(self.path_or_stream, BytesIO):
                self.workbook = load_workbook(filename=self.path_or_stream)
            elif isinstance(self.path_or_stream, Path):
                self.workbook = load_workbook(filename=str(self.path_or_stream))
        except Exception as e:
            raise RuntimeError(
                f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
            ) from e

        # A source of an unsupported type leaves the workbook unset; such a
        # backend must not report itself as valid.
        self.valid = self.workbook is not None

    def is_valid(self) -> bool:
        """Return whether the workbook was loaded successfully."""
        _log.info(f"valid: {self.valid}")
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """Report pagination support.

        NOTE(review): this returns True although the class exposes no
        page-level API in this module - confirm what callers expect.
        """
        return True

    def unload(self):
        """Release the input; an in-memory stream is closed first."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the input formats handled by this backend."""
        return {InputFormat.XLSX}

    def convert(self) -> DoclingDocument:
        """Parse the XLSX workbook into a structured document model.

        Returns:
            The document holding one section group per sheet.

        Raises:
            RuntimeError: If the backend failed to initialise.
        """
        if not self.is_valid():
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )

        origin = DocumentOrigin(
            filename=self.file.name or "file.xlsx",
            mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file.xlsx", origin=origin)

        return self._convert_workbook(doc)

    def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
        """Add one section group per sheet and convert each sheet into it."""
        if self.workbook is None:
            _log.error("Workbook is not initialized.")
            return doc

        for sheet_name in self.workbook.sheetnames:
            _log.info(f"Processing sheet: {sheet_name}")

            sheet = self.workbook[sheet_name]

            # The group of the current sheet is the parent of everything
            # found in that sheet.
            self.parents[0] = doc.add_group(
                parent=None,
                label=GroupLabel.SECTION,
                name=f"sheet: {sheet_name}",
            )

            doc = self._convert_sheet(doc, sheet)

        return doc

    def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
        """Convert the tables and images of a single sheet."""
        # `workbook[name]` can also yield a chart sheet, which has no cell
        # grid to scan; only real worksheets are searched for tables.
        if not isinstance(sheet, Worksheet):
            _log.info(f"Skipping sheet without cells: {sheet.title}")
            return doc

        doc = self._find_tables_in_sheet(doc, sheet)
        doc = self._find_images_in_sheet(doc, sheet)

        return doc

    def _find_tables_in_sheet(
        self, doc: DoclingDocument, sheet: Worksheet
    ) -> DoclingDocument:
        """Add every table detected in the sheet to the current sheet group."""
        for excel_table in self._find_data_tables(sheet):
            table_cells = [
                TableCell(
                    text=excel_cell.text,
                    row_span=excel_cell.row_span,
                    col_span=excel_cell.col_span,
                    start_row_offset_idx=excel_cell.row,
                    end_row_offset_idx=excel_cell.row + excel_cell.row_span,
                    start_col_offset_idx=excel_cell.col,
                    end_col_offset_idx=excel_cell.col + excel_cell.col_span,
                    col_header=False,
                    row_header=False,
                )
                for excel_cell in excel_table.data
            ]

            table_data = TableData(
                num_rows=excel_table.num_rows,
                num_cols=excel_table.num_cols,
                table_cells=table_cells,
            )

            doc.add_table(data=table_data, parent=self.parents[0])

        return doc

    def _find_data_tables(self, sheet: Worksheet) -> List[ExcelTable]:
        """Find all compact rectangular data tables in a sheet."""
        tables: List[ExcelTable] = []
        visited: Set[Tuple[int, int]] = set()  # Cells already assigned to a table

        for ri, row in enumerate(sheet.iter_rows(values_only=False)):
            for rj, cell in enumerate(row):
                # Skip empty or already visited cells
                if cell.value is None or (ri, rj) in visited:
                    continue

                # The cell starts a new table; find its bounds
                table, visited_cells = self._find_table_bounds(sheet, ri, rj, visited)

                visited.update(visited_cells)
                tables.append(table)

        return tables

    def _find_table_bounds(
        self,
        sheet: Worksheet,
        start_row: int,
        start_col: int,
        visited: Set[Tuple[int, int]],
    ) -> Tuple[ExcelTable, Set[Tuple[int, int]]]:
        """Determine the bounds of a compact rectangular table.

        Args:
            sheet: Worksheet being scanned.
            start_row: Zero-based row of the table's top-left cell.
            start_col: Zero-based column of the table's top-left cell.
            visited: Cells already assigned to earlier tables (currently
                not consulted here).

        Returns:
            The table with its cells, and the set of zero-based
            ``(row, col)`` coordinates covered by those cells.
        """
        _log.info("find_table_bounds")

        max_row = self._find_table_bottom(sheet, start_row, start_col)
        max_col = self._find_table_right(sheet, start_row, start_col)

        # Materialised once; the ranges are consulted for every cell below.
        merged_ranges = list(sheet.merged_cells.ranges)

        data: List[ExcelCell] = []
        visited_cells: Set[Tuple[int, int]] = set()
        for ri in range(start_row, max_row + 1):
            for rj in range(start_col, max_col + 1):
                # Cells covered by an earlier span are already represented
                # by the cell that opened the span.
                if (ri, rj) in visited_cells:
                    continue

                cell = sheet.cell(row=ri + 1, column=rj + 1)  # 1-based indexing

                # Take the span from the merged range containing the cell, if any
                row_span = 1
                col_span = 1
                for merged_range in merged_ranges:
                    if (
                        merged_range.min_row <= ri + 1 <= merged_range.max_row
                        and merged_range.min_col <= rj + 1 <= merged_range.max_col
                    ):
                        row_span = merged_range.max_row - merged_range.min_row + 1
                        col_span = merged_range.max_col - merged_range.min_col + 1
                        break

                data.append(
                    ExcelCell(
                        row=ri - start_row,
                        col=rj - start_col,
                        # An empty cell yields empty text, not the string "None".
                        text="" if cell.value is None else str(cell.value),
                        row_span=row_span,
                        col_span=col_span,
                    )
                )

                # Mark all cells in the span as visited
                for span_row in range(ri, ri + row_span):
                    for span_col in range(rj, rj + col_span):
                        visited_cells.add((span_row, span_col))

        return (
            ExcelTable(
                num_rows=max_row + 1 - start_row,
                num_cols=max_col + 1 - start_col,
                data=data,
            ),
            visited_cells,
        )

    @staticmethod
    def _merged_range_containing(sheet: Worksheet, cell):
        """Return the merged range of the sheet containing the cell, or None."""
        return next(
            (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
            None,
        )

    def _find_table_bottom(self, sheet: Worksheet, start_row: int, start_col: int):
        """Find the zero-based index of the last row of the table.

        The scan walks down the table's first column and stops in front of
        the first cell that is empty and not part of a merged range.
        """
        max_row = start_row

        while max_row < sheet.max_row - 1:
            # Cell directly below the current bottom row (1-based indexing)
            cell = sheet.cell(row=max_row + 2, column=start_col + 1)
            merged_range = self._merged_range_containing(sheet, cell)

            if cell.value is None and not merged_range:
                break  # Stop if the cell is empty and not merged

            # Expand max_row to include the merged range if applicable
            if merged_range:
                max_row = max(max_row, merged_range.max_row - 1)
            else:
                max_row += 1

        return max_row

    def _find_table_right(self, sheet: Worksheet, start_row: int, start_col: int):
        """Find the zero-based index of the last column of the table.

        The scan walks right along the table's first row and stops in front
        of the first cell that is empty and not part of a merged range.
        """
        max_col = start_col

        while max_col < sheet.max_column - 1:
            # Cell directly right of the current last column (1-based indexing)
            cell = sheet.cell(row=start_row + 1, column=max_col + 2)
            merged_range = self._merged_range_containing(sheet, cell)

            if cell.value is None and not merged_range:
                break  # Stop if the cell is empty and not merged

            # Expand max_col to include the merged range if applicable
            if merged_range:
                max_col = max(max_col, merged_range.max_col - 1)
            else:
                max_col += 1

        return max_col

    def _find_images_in_sheet(
        self, doc: DoclingDocument, sheet: Worksheet
    ) -> DoclingDocument:
        """Placeholder for image and chart extraction; returns doc unchanged.

        The disabled drafts below are kept as comments so that they are not
        mistaken for the docstring of this method.
        """
        # FIXME: mypy does not agree with _images ...
        # NOTE(review): `Image` imported in this module comes from
        # openpyxl.drawing.image, not from PIL - confirm that `Image.open`
        # is available before enabling this draft.
        #
        # # Iterate over images in the sheet
        # for idx, image in enumerate(sheet._images):  # Access embedded images
        #
        #     image_bytes = BytesIO(image.ref.blob)
        #     pil_image = Image.open(image_bytes)
        #
        #     doc.add_picture(
        #         parent=self.parents[0],
        #         image=ImageRef.from_pil(image=pil_image, dpi=72),
        #         caption=None,
        #     )

        # FIXME: mypy does not agree with _charts ...
        #
        # for idx, chart in enumerate(sheet._charts):  # Access embedded charts
        #     chart_path = f"chart_{idx + 1}.png"
        #     _log.info(
        #         f"Chart found, but dynamic rendering is required for: {chart_path}"
        #     )
        #
        #     _log.info(f"Chart {idx + 1}:")
        #
        #     # Chart type
        #     _log.info(f"Type: {type(chart).__name__}")
        #
        #     # Title
        #     if chart.title:
        #         _log.info(f"Title: {chart.title}")
        #     else:
        #         _log.info("No title")
        #
        #     # Data series
        #     for series in chart.series:
        #         _log.info(" => series ...")
        #         _log.info(f"Data Series: {series.title}")
        #         _log.info(f"Values: {series.values}")
        #         _log.info(f"Categories: {series.categories}")
        #
        #     # Position
        #     # _log.info(f"Anchor Cell: {chart.anchor}")

        return doc
@@ -10,11 +10,13 @@ from docling_core.types.doc import (
10
10
  DoclingDocument,
11
11
  DocumentOrigin,
12
12
  GroupLabel,
13
+ ImageRef,
13
14
  ProvenanceItem,
14
15
  Size,
15
16
  TableCell,
16
17
  TableData,
17
18
  )
19
+ from PIL import Image
18
20
  from pptx import Presentation
19
21
  from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
20
22
 
@@ -268,9 +270,22 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
268
270
  return
269
271
 
270
272
  def handle_pictures(self, shape, parent_slide, slide_ind, doc):
273
+ # Get the image bytes
274
+ image = shape.image
275
+ image_bytes = image.blob
276
+ im_dpi, _ = image.dpi
277
+
278
+ # Open it with PIL
279
+ pil_image = Image.open(BytesIO(image_bytes))
280
+
271
281
  # shape has picture
272
282
  prov = self.generate_prov(shape, slide_ind, "")
273
- doc.add_picture(parent=parent_slide, caption=None, prov=prov)
283
+ doc.add_picture(
284
+ parent=parent_slide,
285
+ image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
286
+ caption=None,
287
+ prov=prov,
288
+ )
274
289
  return
275
290
 
276
291
  def handle_tables(self, shape, parent_slide, slide_ind, doc):
@@ -9,10 +9,12 @@ from docling_core.types.doc import (
9
9
  DoclingDocument,
10
10
  DocumentOrigin,
11
11
  GroupLabel,
12
+ ImageRef,
12
13
  TableCell,
13
14
  TableData,
14
15
  )
15
16
  from lxml import etree
17
+ from PIL import Image
16
18
 
17
19
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
18
20
  from docling.datamodel.base_models import InputFormat
@@ -130,13 +132,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
130
132
  def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
131
133
  for element in body:
132
134
  tag_name = etree.QName(element).localname
133
- # Check for Inline Images (drawings or blip elements)
134
- found_drawing = etree.ElementBase.xpath(
135
- element, ".//w:drawing", namespaces=self.xml_namespaces
136
- )
137
- found_pict = etree.ElementBase.xpath(
138
- element, ".//w:pict", namespaces=self.xml_namespaces
139
- )
135
+ # Check for Inline Images (blip elements)
136
+ drawing_blip = element.xpath(".//a:blip")
140
137
 
141
138
  # Check for Tables
142
139
  if element.tag.endswith("tbl"):
@@ -145,8 +142,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
145
142
  except Exception:
146
143
  _log.debug("could not parse a table, broken docx table")
147
144
 
148
- elif found_drawing or found_pict:
149
- self.handle_pictures(element, docx_obj, doc)
145
+ elif drawing_blip:
146
+ self.handle_pictures(element, docx_obj, drawing_blip, doc)
150
147
  # Check for Text
151
148
  elif tag_name in ["p"]:
152
149
  self.handle_text_elements(element, docx_obj, doc)
@@ -491,6 +488,24 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
491
488
  doc.add_table(data=data, parent=self.parents[level - 1])
492
489
  return
493
490
 
494
- def handle_pictures(self, element, docx_obj, doc):
495
- doc.add_picture(parent=self.parents[self.level], caption=None)
491
def handle_pictures(self, element, docx_obj, drawing_blip, doc):
    """Add the inline picture referenced by a blip element to the document.

    The image bytes are looked up through the relationship id stored on
    the first blip element and decoded with PIL. When the relationship is
    missing or PIL cannot decode the data, the picture is still added,
    just without image data, instead of aborting the conversion.
    """

    def get_docx_image(element, drawing_blip):
        """Return the binary image data of the blip, or None if unresolved."""
        rId = drawing_blip[0].get(
            "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
        )
        if rId in docx_obj.part.rels:
            # Access the image part using the relationship ID
            image_part = docx_obj.part.rels[rId].target_part
            return image_part.blob  # Get the binary image data
        return None

    image_ref = None
    image_data = get_docx_image(element, drawing_blip)
    if image_data is not None:
        try:
            # Open the BytesIO object with PIL to create an Image
            pil_image = Image.open(BytesIO(image_data))
            image_ref = ImageRef.from_pil(image=pil_image, dpi=72)
        except OSError as e:
            # PIL raises an OSError subclass for data it cannot identify.
            _log.warning(f"Picture image could not be loaded by PIL: {e}")

    if image_ref is None:
        # No usable image data: keep the picture item without an image.
        doc.add_picture(parent=self.parents[self.level], caption=None)
    else:
        doc.add_picture(
            parent=self.parents[self.level],
            image=image_ref,
            caption=None,
        )
    return
@@ -1,6 +1,7 @@
1
1
  import importlib
2
2
  import json
3
3
  import logging
4
+ import re
4
5
  import time
5
6
  import warnings
6
7
  from enum import Enum
@@ -129,6 +130,12 @@ def export_documents(
129
130
  )
130
131
 
131
132
 
133
+ def _split_list(raw: Optional[str]) -> Optional[List[str]]:
134
+ if raw is None:
135
+ return None
136
+ return re.split(r"[;,]", raw)
137
+
138
+
132
139
  @app.command(no_args_is_help=True)
133
140
  def convert(
134
141
  input_sources: Annotated[
@@ -163,6 +170,13 @@ def convert(
163
170
  ocr_engine: Annotated[
164
171
  OcrEngine, typer.Option(..., help="The OCR engine to use.")
165
172
  ] = OcrEngine.EASYOCR,
173
+ ocr_lang: Annotated[
174
+ Optional[str],
175
+ typer.Option(
176
+ ...,
177
+ help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
178
+ ),
179
+ ] = None,
166
180
  pdf_backend: Annotated[
167
181
  PdfBackend, typer.Option(..., help="The PDF backend to use.")
168
182
  ] = PdfBackend.DLPARSE_V1,
@@ -185,6 +199,15 @@ def convert(
185
199
  output: Annotated[
186
200
  Path, typer.Option(..., help="Output directory where results are saved.")
187
201
  ] = Path("."),
202
+ verbose: Annotated[
203
+ int,
204
+ typer.Option(
205
+ "--verbose",
206
+ "-v",
207
+ count=True,
208
+ help="Set the verbosity level. -v for info logging, -vv for debug logging.",
209
+ ),
210
+ ] = 0,
188
211
  version: Annotated[
189
212
  Optional[bool],
190
213
  typer.Option(
@@ -195,7 +218,12 @@ def convert(
195
218
  ),
196
219
  ] = None,
197
220
  ):
198
- logging.basicConfig(level=logging.INFO)
221
+ if verbose == 0:
222
+ logging.basicConfig(level=logging.WARNING)
223
+ elif verbose == 1:
224
+ logging.basicConfig(level=logging.INFO)
225
+ elif verbose == 2:
226
+ logging.basicConfig(level=logging.DEBUG)
199
227
 
200
228
  if from_formats is None:
201
229
  from_formats = [e for e in InputFormat]
@@ -234,6 +262,10 @@ def convert(
234
262
  case _:
235
263
  raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
236
264
 
265
+ ocr_lang_list = _split_list(ocr_lang)
266
+ if ocr_lang_list is not None:
267
+ ocr_options.lang = ocr_lang_list
268
+
237
269
  pipeline_options = PdfPipelineOptions(
238
270
  do_ocr=ocr,
239
271
  ocr_options=ocr_options,
@@ -287,5 +319,7 @@ def convert(
287
319
  _log.info(f"All documents were converted in {end_time:.2f} seconds.")
288
320
 
289
321
 
322
+ click_app = typer.main.get_command(app)
323
+
290
324
  if __name__ == "__main__":
291
325
  app()
@@ -32,6 +32,7 @@ class InputFormat(str, Enum):
32
32
  PDF = "pdf"
33
33
  ASCIIDOC = "asciidoc"
34
34
  MD = "md"
35
+ XLSX = "xlsx"
35
36
 
36
37
 
37
38
  class OutputFormat(str, Enum):
@@ -49,6 +50,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
49
50
  InputFormat.HTML: ["html", "htm", "xhtml"],
50
51
  InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
51
52
  InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
53
+ InputFormat.XLSX: ["xlsx"],
52
54
  }
53
55
 
54
56
  FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -72,7 +74,11 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
72
74
  InputFormat.PDF: ["application/pdf"],
73
75
  InputFormat.ASCIIDOC: ["text/asciidoc"],
74
76
  InputFormat.MD: ["text/markdown", "text/x-markdown"],
77
+ InputFormat.XLSX: [
78
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
79
+ ],
75
80
  }
81
+
76
82
  MimeTypeToFormat = {
77
83
  mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
78
84
  }
@@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
22
22
 
23
23
  class OcrOptions(BaseModel):
24
24
  kind: str
25
+ lang: List[str]
25
26
  force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
26
27
  bitmap_area_threshold: float = (
27
28
  0.05 # percentage of the area for a bitmap to processed with OCR
@@ -81,4 +82,11 @@ class PdfPipelineOptions(PipelineOptions):
81
82
  images_scale: float = 1.0
82
83
  generate_page_images: bool = False
83
84
  generate_picture_images: bool = False
84
- generate_table_images: bool = False
85
+ generate_table_images: bool = Field(
86
+ default=False,
87
+ deprecated=(
88
+ "Field `generate_table_images` is deprecated. "
89
+ "To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
90
+ "before conversion and then use the `TableItem.get_image` function."
91
+ ),
92
+ )
@@ -12,6 +12,7 @@ from docling.backend.asciidoc_backend import AsciiDocBackend
12
12
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
13
13
  from docling.backend.html_backend import HTMLDocumentBackend
14
14
  from docling.backend.md_backend import MarkdownDocumentBackend
15
+ from docling.backend.msexcel_backend import MsExcelDocumentBackend
15
16
  from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
16
17
  from docling.backend.msword_backend import MsWordDocumentBackend
17
18
  from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
@@ -44,6 +45,11 @@ class FormatOption(BaseModel):
44
45
  return self
45
46
 
46
47
 
48
class ExcelFormatOption(FormatOption):
    """Format option whose defaults select the Excel backend."""

    # Pipeline class used by default for this format.
    pipeline_cls: Type = SimplePipeline
    # Backend class used by default for this format.
    backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
51
+
52
+
47
53
  class WordFormatOption(FormatOption):
48
54
  pipeline_cls: Type = SimplePipeline
49
55
  backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
@@ -80,6 +86,9 @@ class ImageFormatOption(FormatOption):
80
86
 
81
87
 
82
88
  _format_to_default_options = {
89
+ InputFormat.XLSX: FormatOption(
90
+ pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
91
+ ),
83
92
  InputFormat.DOCX: FormatOption(
84
93
  pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
85
94
  ),
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.5.2" # DO NOT EDIT, updated automatically
3
+ version = "2.6.0" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -26,7 +26,7 @@ packages = [{include = "docling"}]
26
26
  ######################
27
27
  python = "^3.10"
28
28
  pydantic = "^2.0.0"
29
- docling-core = "^2.3.0"
29
+ docling-core = "^2.4.0"
30
30
  docling-ibm-models = "^2.0.3"
31
31
  deepsearch-glm = "^0.26.1"
32
32
  filetype = "^1.2.0"
@@ -47,6 +47,7 @@ python-pptx = "^1.0.2"
47
47
  beautifulsoup4 = "^4.12.3"
48
48
  pandas = "^2.1.4"
49
49
  marko = "^2.1.2"
50
+ openpyxl = "^3.1.5"
50
51
 
51
52
  [tool.poetry.group.dev.dependencies]
52
53
  black = {extras = ["jupyter"], version = "^24.4.2"}
@@ -65,10 +66,12 @@ pandas-stubs = "^2.1.4.231227"
65
66
  ipykernel = "^6.29.5"
66
67
  ipywidgets = "^8.1.5"
67
68
  nbqa = "^1.9.0"
69
+ types-openpyxl = "^3.1.5.20241114"
68
70
 
69
71
  [tool.poetry.group.docs.dependencies]
70
72
  mkdocs-material = "^9.5.40"
71
73
  mkdocs-jupyter = "^0.25.0"
74
+ mkdocs-click = "^0.8.1"
72
75
 
73
76
  [tool.poetry.group.examples.dependencies]
74
77
  datasets = "^2.21.0"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes