docling 2.29.0__py3-none-any.whl → 2.30.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,36 +1,50 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Dict, Set, Tuple, Union
4
+ from typing import Any, Union, cast
5
5
 
6
6
  from docling_core.types.doc import (
7
+ BoundingBox,
8
+ CoordOrigin,
9
+ DocItem,
7
10
  DoclingDocument,
8
11
  DocumentOrigin,
9
12
  GroupLabel,
10
13
  ImageRef,
14
+ ProvenanceItem,
15
+ Size,
11
16
  TableCell,
12
17
  TableData,
13
18
  )
14
-
15
- # from lxml import etree
16
- from openpyxl import Workbook, load_workbook
17
- from openpyxl.cell.cell import Cell
19
+ from openpyxl import load_workbook
18
20
  from openpyxl.drawing.image import Image
21
+ from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
19
22
  from openpyxl.worksheet.worksheet import Worksheet
23
+ from PIL import Image as PILImage
24
+ from pydantic import BaseModel, NonNegativeInt, PositiveInt
25
+ from typing_extensions import override
20
26
 
21
- from docling.backend.abstract_backend import DeclarativeDocumentBackend
27
+ from docling.backend.abstract_backend import (
28
+ DeclarativeDocumentBackend,
29
+ PaginatedDocumentBackend,
30
+ )
22
31
  from docling.datamodel.base_models import InputFormat
23
32
  from docling.datamodel.document import InputDocument
24
33
 
25
34
  _log = logging.getLogger(__name__)
26
35
 
27
- from typing import Any, List
28
36
 
29
- from PIL import Image as PILImage
30
- from pydantic import BaseModel
37
+ class ExcelCell(BaseModel):
38
+ """Represents an Excel cell.
31
39
 
40
+ Attributes:
41
+ row: The row number of the cell.
42
+ col: The column number of the cell.
43
+ text: The text content of the cell.
44
+ row_span: The number of rows the cell spans.
45
+ col_span: The number of columns the cell spans.
46
+ """
32
47
 
33
- class ExcelCell(BaseModel):
34
48
  row: int
35
49
  col: int
36
50
  text: str
@@ -39,19 +53,57 @@ class ExcelCell(BaseModel):
39
53
 
40
54
 
41
55
  class ExcelTable(BaseModel):
56
+ """Represents an Excel table on a worksheet.
57
+
58
+ Attributes:
59
+ anchor: The column and row indices of the upper-left cell of the table
60
+ (0-based index).
61
+ num_rows: The number of rows in the table.
62
+ num_cols: The number of columns in the table.
63
+ data: The data in the table, represented as a list of ExcelCell objects.
64
+ """
65
+
66
+ anchor: tuple[NonNegativeInt, NonNegativeInt]
42
67
  num_rows: int
43
68
  num_cols: int
44
- data: List[ExcelCell]
69
+ data: list[ExcelCell]
45
70
 
46
71
 
47
- class MsExcelDocumentBackend(DeclarativeDocumentBackend):
48
- def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
72
+ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
73
+ """Backend for parsing Excel workbooks.
74
+
75
+ The backend converts an Excel workbook into a DoclingDocument object.
76
+ Each worksheet is converted into a separate page.
77
+ The following elements are parsed:
78
+ - Cell contents, parsed as tables. If two groups of cells are disconnected
79
+ between each other, they will be parsed as two different tables.
80
+ - Images, parsed as PictureItem objects.
81
+
82
+ The DoclingDocument tables and pictures have their provenance information, including
83
+ the position in their original Excel worksheet. The position is represented by a
84
+ bounding box object with the cell indices as units (0-based index). The size of this
85
+ bounding box is the number of columns and rows that the table or picture spans.
86
+ """
87
+
88
+ @override
89
+ def __init__(
90
+ self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
91
+ ) -> None:
92
+ """Initialize the MsExcelDocumentBackend object.
93
+
94
+ Parameters:
95
+ in_doc: The input document object.
96
+ path_or_stream: The path or stream to the Excel file.
97
+
98
+ Raises:
99
+ RuntimeError: An error occurred parsing the file.
100
+ """
49
101
  super().__init__(in_doc, path_or_stream)
50
102
 
51
103
  # Initialise the parents for the hierarchy
52
104
  self.max_levels = 10
53
105
 
54
- self.parents: Dict[int, Any] = {}
106
+ self.parents: dict[int, Any] = {}
55
107
  for i in range(-1, self.max_levels):
56
108
  self.parents[i] = None
57
109
 
@@ -63,35 +115,47 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
63
115
  elif isinstance(self.path_or_stream, Path):
64
116
  self.workbook = load_workbook(filename=str(self.path_or_stream))
65
117
 
66
- self.valid = True
118
+ self.valid = self.workbook is not None
67
119
  except Exception as e:
68
120
  self.valid = False
69
121
 
70
122
  raise RuntimeError(
71
- f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
123
+ f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
72
124
  ) from e
73
125
 
126
+ @override
74
127
  def is_valid(self) -> bool:
75
- _log.info(f"valid: {self.valid}")
128
+ _log.debug(f"valid: {self.valid}")
76
129
  return self.valid
77
130
 
78
131
  @classmethod
132
+ @override
79
133
  def supports_pagination(cls) -> bool:
80
134
  return True
81
135
 
82
- def unload(self):
83
- if isinstance(self.path_or_stream, BytesIO):
84
- self.path_or_stream.close()
85
-
86
- self.path_or_stream = None
136
+ @override
137
+ def page_count(self) -> int:
138
+ if self.is_valid() and self.workbook:
139
+ return len(self.workbook.sheetnames)
140
+ else:
141
+ return 0
87
142
 
88
143
  @classmethod
89
- def supported_formats(cls) -> Set[InputFormat]:
144
+ @override
145
+ def supported_formats(cls) -> set[InputFormat]:
90
146
  return {InputFormat.XLSX}
91
147
 
148
+ @override
92
149
  def convert(self) -> DoclingDocument:
93
- # Parses the XLSX into a structured document model.
150
+ """Parse the Excel workbook into a DoclingDocument object.
94
151
 
152
+ Raises:
153
+ RuntimeError: Unable to run the conversion since the backend object failed to
154
+ initialize.
155
+
156
+ Returns:
157
+ The DoclingDocument object representing the Excel workbook.
158
+ """
95
159
  origin = DocumentOrigin(
96
160
  filename=self.file.name or "file.xlsx",
97
161
  mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
@@ -110,6 +174,14 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
110
174
  return doc
111
175
 
112
176
  def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
177
+ """Parse the Excel workbook and attach its structure to a DoclingDocument.
178
+
179
+ Args:
180
+ doc: A DoclingDocument object.
181
+
182
+ Returns:
183
+ A DoclingDocument object with the parsed items.
184
+ """
113
185
 
114
186
  if self.workbook is not None:
115
187
 
@@ -117,22 +189,34 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
117
189
  for sheet_name in self.workbook.sheetnames:
118
190
  _log.info(f"Processing sheet: {sheet_name}")
119
191
 
120
- # Access the sheet by name
121
192
  sheet = self.workbook[sheet_name]
193
+ page_no = self.workbook.index(sheet) + 1
194
+ # do not rely on sheet.max_column, sheet.max_row if there are images
195
+ page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))
122
196
 
123
197
  self.parents[0] = doc.add_group(
124
198
  parent=None,
125
199
  label=GroupLabel.SECTION,
126
200
  name=f"sheet: {sheet_name}",
127
201
  )
128
-
129
202
  doc = self._convert_sheet(doc, sheet)
203
+ width, height = self._find_page_size(doc, page_no)
204
+ page.size = Size(width=width, height=height)
130
205
  else:
131
206
  _log.error("Workbook is not initialized.")
132
207
 
133
208
  return doc
134
209
 
135
- def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet):
210
+ def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
211
+ """Parse an Excel worksheet and attach its structure to a DoclingDocument
212
+
213
+ Args:
214
+ doc: The DoclingDocument to be updated.
215
+ sheet: The Excel worksheet to be parsed.
216
+
217
+ Returns:
218
+ The updated DoclingDocument.
219
+ """
136
220
 
137
221
  doc = self._find_tables_in_sheet(doc, sheet)
138
222
 
@@ -140,47 +224,81 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
140
224
 
141
225
  return doc
142
226
 
143
- def _find_tables_in_sheet(self, doc: DoclingDocument, sheet: Worksheet):
144
-
145
- tables = self._find_data_tables(sheet)
227
+ def _find_tables_in_sheet(
228
+ self, doc: DoclingDocument, sheet: Worksheet
229
+ ) -> DoclingDocument:
230
+ """Find all tables in an Excel sheet and attach them to a DoclingDocument.
146
231
 
147
- for excel_table in tables:
148
- num_rows = excel_table.num_rows
149
- num_cols = excel_table.num_cols
232
+ Args:
233
+ doc: The DoclingDocument to be updated.
234
+ sheet: The Excel worksheet to be parsed.
150
235
 
151
- table_data = TableData(
152
- num_rows=num_rows,
153
- num_cols=num_cols,
154
- table_cells=[],
155
- )
236
+ Returns:
237
+ The updated DoclingDocument.
238
+ """
156
239
 
157
- for excel_cell in excel_table.data:
158
-
159
- cell = TableCell(
160
- text=excel_cell.text,
161
- row_span=excel_cell.row_span,
162
- col_span=excel_cell.col_span,
163
- start_row_offset_idx=excel_cell.row,
164
- end_row_offset_idx=excel_cell.row + excel_cell.row_span,
165
- start_col_offset_idx=excel_cell.col,
166
- end_col_offset_idx=excel_cell.col + excel_cell.col_span,
167
- column_header=excel_cell.row == 0,
168
- row_header=False,
240
+ if self.workbook is not None:
241
+ tables = self._find_data_tables(sheet)
242
+
243
+ for excel_table in tables:
244
+ origin_col = excel_table.anchor[0]
245
+ origin_row = excel_table.anchor[1]
246
+ num_rows = excel_table.num_rows
247
+ num_cols = excel_table.num_cols
248
+
249
+ table_data = TableData(
250
+ num_rows=num_rows,
251
+ num_cols=num_cols,
252
+ table_cells=[],
169
253
  )
170
- table_data.table_cells.append(cell)
171
254
 
172
- doc.add_table(data=table_data, parent=self.parents[0])
255
+ for excel_cell in excel_table.data:
256
+
257
+ cell = TableCell(
258
+ text=excel_cell.text,
259
+ row_span=excel_cell.row_span,
260
+ col_span=excel_cell.col_span,
261
+ start_row_offset_idx=excel_cell.row,
262
+ end_row_offset_idx=excel_cell.row + excel_cell.row_span,
263
+ start_col_offset_idx=excel_cell.col,
264
+ end_col_offset_idx=excel_cell.col + excel_cell.col_span,
265
+ column_header=excel_cell.row == 0,
266
+ row_header=False,
267
+ )
268
+ table_data.table_cells.append(cell)
269
+
270
+ page_no = self.workbook.index(sheet) + 1
271
+ doc.add_table(
272
+ data=table_data,
273
+ parent=self.parents[0],
274
+ prov=ProvenanceItem(
275
+ page_no=page_no,
276
+ charspan=(0, 0),
277
+ bbox=BoundingBox.from_tuple(
278
+ (
279
+ origin_col,
280
+ origin_row,
281
+ origin_col + num_cols,
282
+ origin_row + num_rows,
283
+ ),
284
+ origin=CoordOrigin.TOPLEFT,
285
+ ),
286
+ ),
287
+ )
173
288
 
174
289
  return doc
175
290
 
176
- def _find_data_tables(self, sheet: Worksheet) -> List[ExcelTable]:
177
- """
178
- Find all compact rectangular data tables in a sheet.
179
- """
180
- # _log.info("find_data_tables")
291
+ def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
292
+ """Find all compact rectangular data tables in an Excel worksheet.
293
+
294
+ Args:
295
+ sheet: The Excel worksheet to be parsed.
181
296
 
182
- tables = [] # List to store found tables
183
- visited: set[Tuple[int, int]] = set() # Track already visited cells
297
+ Returns:
298
+ A list of ExcelTable objects representing the data tables.
299
+ """
300
+ tables: list[ExcelTable] = [] # List to store found tables
301
+ visited: set[tuple[int, int]] = set() # Track already visited cells
184
302
 
185
303
  # Iterate over all cells in the sheet
186
304
  for ri, row in enumerate(sheet.iter_rows(values_only=False)):
@@ -191,9 +309,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
191
309
  continue
192
310
 
193
311
  # If the cell starts a new table, find its bounds
194
- table_bounds, visited_cells = self._find_table_bounds(
195
- sheet, ri, rj, visited
196
- )
312
+ table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj)
197
313
 
198
314
  visited.update(visited_cells) # Mark these cells as visited
199
315
  tables.append(table_bounds)
@@ -205,22 +321,25 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
205
321
  sheet: Worksheet,
206
322
  start_row: int,
207
323
  start_col: int,
208
- visited: set[Tuple[int, int]],
209
- ):
210
- """
211
- Determine the bounds of a compact rectangular table.
324
+ ) -> tuple[ExcelTable, set[tuple[int, int]]]:
325
+ """Determine the bounds of a compact rectangular table.
326
+
327
+ Args:
328
+ sheet: The Excel worksheet to be parsed.
329
+ start_row: The row number of the starting cell.
330
+ start_col: The column number of the starting cell.
331
+
212
332
  Returns:
213
- - A dictionary with the bounds and data.
214
- - A set of visited cell coordinates.
333
+ A tuple with an Excel table and a set of cell coordinates.
215
334
  """
216
- _log.info("find_table_bounds")
335
+ _log.debug("find_table_bounds")
217
336
 
218
337
  max_row = self._find_table_bottom(sheet, start_row, start_col)
219
338
  max_col = self._find_table_right(sheet, start_row, start_col)
220
339
 
221
340
  # Collect the data within the bounds
222
341
  data = []
223
- visited_cells = set()
342
+ visited_cells: set[tuple[int, int]] = set()
224
343
  for ri in range(start_row, max_row + 1):
225
344
  for rj in range(start_col, max_col + 1):
226
345
 
@@ -230,7 +349,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
230
349
  row_span = 1
231
350
  col_span = 1
232
351
 
233
- # _log.info(sheet.merged_cells.ranges)
234
352
  for merged_range in sheet.merged_cells.ranges:
235
353
 
236
354
  if (
@@ -254,7 +372,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
254
372
  col_span=col_span,
255
373
  )
256
374
  )
257
- # _log.info(f"cell: {ri}, {rj} -> {ri - start_row}, {rj - start_col}, {row_span}, {col_span}: {str(cell.value)}")
258
375
 
259
376
  # Mark all cells in the span as visited
260
377
  for span_row in range(ri, ri + row_span):
@@ -263,6 +380,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
263
380
 
264
381
  return (
265
382
  ExcelTable(
383
+ anchor=(start_col, start_row),
266
384
  num_rows=max_row + 1 - start_row,
267
385
  num_cols=max_col + 1 - start_col,
268
386
  data=data,
@@ -270,10 +388,20 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
270
388
  visited_cells,
271
389
  )
272
390
 
273
- def _find_table_bottom(self, sheet: Worksheet, start_row: int, start_col: int):
274
- """Function to find the bottom boundary of the table"""
391
+ def _find_table_bottom(
392
+ self, sheet: Worksheet, start_row: int, start_col: int
393
+ ) -> int:
394
+ """Find the bottom boundary of a table.
275
395
 
276
- max_row = start_row
396
+ Args:
397
+ sheet: The Excel worksheet to be parsed.
398
+ start_row: The starting row of the table.
399
+ start_col: The starting column of the table.
400
+
401
+ Returns:
402
+ The row index representing the bottom boundary of the table.
403
+ """
404
+ max_row: int = start_row
277
405
 
278
406
  while max_row < sheet.max_row - 1:
279
407
  # Get the cell value or check if it is part of a merged cell
@@ -296,10 +424,20 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
296
424
 
297
425
  return max_row
298
426
 
299
- def _find_table_right(self, sheet: Worksheet, start_row: int, start_col: int):
300
- """Function to find the right boundary of the table"""
427
+ def _find_table_right(
428
+ self, sheet: Worksheet, start_row: int, start_col: int
429
+ ) -> int:
430
+ """Find the right boundary of a table.
431
+
432
+ Args:
433
+ sheet: The Excel worksheet to be parsed.
434
+ start_row: The starting row of the table.
435
+ start_col: The starting column of the table.
301
436
 
302
- max_col = start_col
437
+ Returns:
438
+ The column index representing the right boundary of the table."
439
+ """
440
+ max_col: int = start_col
303
441
 
304
442
  while max_col < sheet.max_column - 1:
305
443
  # Get the cell value or check if it is part of a merged cell
@@ -325,19 +463,63 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend):
325
463
  def _find_images_in_sheet(
326
464
  self, doc: DoclingDocument, sheet: Worksheet
327
465
  ) -> DoclingDocument:
466
+ """Find images in the Excel sheet and attach them to the DoclingDocument.
328
467
 
329
- # Iterate over byte images in the sheet
330
- for idx, image in enumerate(sheet._images): # type: ignore
468
+ Args:
469
+ doc: The DoclingDocument to be updated.
470
+ sheet: The Excel worksheet to be parsed.
331
471
 
332
- try:
333
- pil_image = PILImage.open(image.ref)
334
-
335
- doc.add_picture(
336
- parent=self.parents[0],
337
- image=ImageRef.from_pil(image=pil_image, dpi=72),
338
- caption=None,
339
- )
340
- except:
341
- _log.error("could not extract the image from excel sheets")
472
+ Returns:
473
+ The updated DoclingDocument.
474
+ """
475
+ if self.workbook is not None:
476
+ # Iterate over byte images in the sheet
477
+ for item in sheet._images: # type: ignore[attr-defined]
478
+ try:
479
+ image: Image = cast(Image, item)
480
+ pil_image = PILImage.open(image.ref) # type: ignore[arg-type]
481
+ page_no = self.workbook.index(sheet) + 1
482
+ anchor = (0, 0, 0, 0)
483
+ if isinstance(image.anchor, TwoCellAnchor):
484
+ anchor = (
485
+ image.anchor._from.col,
486
+ image.anchor._from.row,
487
+ image.anchor.to.col + 1,
488
+ image.anchor.to.row + 1,
489
+ )
490
+ doc.add_picture(
491
+ parent=self.parents[0],
492
+ image=ImageRef.from_pil(image=pil_image, dpi=72),
493
+ caption=None,
494
+ prov=ProvenanceItem(
495
+ page_no=page_no,
496
+ charspan=(0, 0),
497
+ bbox=BoundingBox.from_tuple(
498
+ anchor, origin=CoordOrigin.TOPLEFT
499
+ ),
500
+ ),
501
+ )
502
+ except:
503
+ _log.error("could not extract the image from excel sheets")
342
504
 
343
505
  return doc
506
+
507
+ @staticmethod
508
+ def _find_page_size(
509
+ doc: DoclingDocument, page_no: PositiveInt
510
+ ) -> tuple[float, float]:
511
+ left: float = -1.0
512
+ top: float = -1.0
513
+ right: float = -1.0
514
+ bottom: float = -1.0
515
+ for item, _ in doc.iterate_items(traverse_pictures=True, page_no=page_no):
516
+ if not isinstance(item, DocItem):
517
+ continue
518
+ for provenance in item.prov:
519
+ bbox = provenance.bbox
520
+ left = min(left, bbox.l) if left != -1 else bbox.l
521
+ right = max(right, bbox.r) if right != -1 else bbox.r
522
+ top = min(top, bbox.t) if top != -1 else bbox.t
523
+ bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
524
+
525
+ return (right - left, bottom - top)
@@ -850,7 +850,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
850
850
  def _handle_pictures(
851
851
  self, docx_obj: DocxDocument, drawing_blip: Any, doc: DoclingDocument
852
852
  ) -> None:
853
- def get_docx_image(drawing_blip):
853
+ def get_docx_image(drawing_blip: Any) -> Optional[bytes]:
854
+ image_data: Optional[bytes] = None
854
855
  rId = drawing_blip[0].get(
855
856
  "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
856
857
  )
@@ -862,19 +863,26 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
862
863
 
863
864
  level = self._get_level()
864
865
  # Open the BytesIO object with PIL to create an Image
865
- try:
866
- image_data = get_docx_image(drawing_blip)
867
- image_bytes = BytesIO(image_data)
868
- pil_image = Image.open(image_bytes)
869
- doc.add_picture(
870
- parent=self.parents[level - 1],
871
- image=ImageRef.from_pil(image=pil_image, dpi=72),
872
- caption=None,
873
- )
874
- except (UnidentifiedImageError, OSError) as e:
875
- _log.warning("Warning: image cannot be loaded by Pillow")
866
+ image_data: Optional[bytes] = get_docx_image(drawing_blip)
867
+ if image_data is None:
868
+ _log.warning("Warning: image cannot be found")
876
869
  doc.add_picture(
877
870
  parent=self.parents[level - 1],
878
871
  caption=None,
879
872
  )
873
+ else:
874
+ try:
875
+ image_bytes = BytesIO(image_data)
876
+ pil_image = Image.open(image_bytes)
877
+ doc.add_picture(
878
+ parent=self.parents[level - 1],
879
+ image=ImageRef.from_pil(image=pil_image, dpi=72),
880
+ caption=None,
881
+ )
882
+ except (UnidentifiedImageError, OSError) as e:
883
+ _log.warning("Warning: image cannot be loaded by Pillow")
884
+ doc.add_picture(
885
+ parent=self.parents[level - 1],
886
+ caption=None,
887
+ )
880
888
  return
docling/cli/main.py CHANGED
@@ -40,6 +40,7 @@ from docling.datamodel.pipeline_options import (
40
40
  VlmModelType,
41
41
  VlmPipelineOptions,
42
42
  granite_vision_vlm_conversion_options,
43
+ granite_vision_vlm_ollama_conversion_options,
43
44
  smoldocling_vlm_conversion_options,
44
45
  smoldocling_vlm_mlx_conversion_options,
45
46
  )
@@ -153,6 +154,7 @@ def export_documents(
153
154
  output_dir: Path,
154
155
  export_json: bool,
155
156
  export_html: bool,
157
+ export_html_split_page: bool,
156
158
  export_md: bool,
157
159
  export_txt: bool,
158
160
  export_doctags: bool,
@@ -180,7 +182,15 @@ def export_documents(
180
182
  fname = output_dir / f"{doc_filename}.html"
181
183
  _log.info(f"writing HTML output to {fname}")
182
184
  conv_res.document.save_as_html(
183
- filename=fname, image_mode=image_export_mode
185
+ filename=fname, image_mode=image_export_mode, split_page_view=False
186
+ )
187
+
188
+ # Export HTML format:
189
+ if export_html_split_page:
190
+ fname = output_dir / f"{doc_filename}.html"
191
+ _log.info(f"writing HTML output to {fname}")
192
+ conv_res.document.save_as_html(
193
+ filename=fname, image_mode=image_export_mode, split_page_view=True
184
194
  )
185
195
 
186
196
  # Export Text format:
@@ -471,6 +481,7 @@ def convert(
471
481
 
472
482
  export_json = OutputFormat.JSON in to_formats
473
483
  export_html = OutputFormat.HTML in to_formats
484
+ export_html_split_page = OutputFormat.HTML_SPLIT_PAGE in to_formats
474
485
  export_md = OutputFormat.MARKDOWN in to_formats
475
486
  export_txt = OutputFormat.TEXT in to_formats
476
487
  export_doctags = OutputFormat.DOCTAGS in to_formats
@@ -531,10 +542,16 @@ def convert(
531
542
  backend=backend, # pdf_backend
532
543
  )
533
544
  elif pipeline == PdfPipeline.VLM:
534
- pipeline_options = VlmPipelineOptions()
545
+ pipeline_options = VlmPipelineOptions(
546
+ enable_remote_services=enable_remote_services,
547
+ )
535
548
 
536
549
  if vlm_model == VlmModelType.GRANITE_VISION:
537
550
  pipeline_options.vlm_options = granite_vision_vlm_conversion_options
551
+ elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
552
+ pipeline_options.vlm_options = (
553
+ granite_vision_vlm_ollama_conversion_options
554
+ )
538
555
  elif vlm_model == VlmModelType.SMOLDOCLING:
539
556
  pipeline_options.vlm_options = smoldocling_vlm_conversion_options
540
557
  if sys.platform == "darwin":
@@ -578,6 +595,7 @@ def convert(
578
595
  output_dir=output,
579
596
  export_json=export_json,
580
597
  export_html=export_html,
598
+ export_html_split_page=export_html_split_page,
581
599
  export_md=export_md,
582
600
  export_txt=export_txt,
583
601
  export_doctags=export_doctags,
@@ -50,6 +50,7 @@ class OutputFormat(str, Enum):
50
50
  MARKDOWN = "md"
51
51
  JSON = "json"
52
52
  HTML = "html"
53
+ HTML_SPLIT_PAGE = "html_split_page"
53
54
  TEXT = "text"
54
55
  DOCTAGS = "doctags"
55
56
 
@@ -262,3 +263,35 @@ class Page(BaseModel):
262
263
  @property
263
264
  def image(self) -> Optional[Image]:
264
265
  return self.get_image(scale=self._default_image_scale)
266
+
267
+
268
+ ## OpenAI API Request / Response Models ##
269
+
270
+
271
+ class OpenAiChatMessage(BaseModel):
272
+ role: str
273
+ content: str
274
+
275
+
276
+ class OpenAiResponseChoice(BaseModel):
277
+ index: int
278
+ message: OpenAiChatMessage
279
+ finish_reason: str
280
+
281
+
282
+ class OpenAiResponseUsage(BaseModel):
283
+ prompt_tokens: int
284
+ completion_tokens: int
285
+ total_tokens: int
286
+
287
+
288
+ class OpenAiApiResponse(BaseModel):
289
+ model_config = ConfigDict(
290
+ protected_namespaces=(),
291
+ )
292
+
293
+ id: str
294
+ model: Optional[str] = None # returned by openai
295
+ choices: List[OpenAiResponseChoice]
296
+ created: int
297
+ usage: OpenAiResponseUsage
@@ -283,6 +283,13 @@ class _DocumentConversionInput(BaseModel):
283
283
  if mime is None: # must guess from
284
284
  with obj.open("rb") as f:
285
285
  content = f.read(1024) # Read first 1KB
286
+ if mime is not None and mime.lower() == "application/zip":
287
+ if obj.suffixes[-1].lower() == ".xlsx":
288
+ mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
289
+ elif obj.suffixes[-1].lower() == ".docx":
290
+ mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
291
+ elif obj.suffixes[-1].lower() == ".pptx":
292
+ mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
286
293
 
287
294
  elif isinstance(obj, DocumentStream):
288
295
  content = obj.stream.read(8192)
@@ -213,8 +213,8 @@ class PictureDescriptionBaseOptions(BaseOptions):
213
213
  batch_size: int = 8
214
214
  scale: float = 2
215
215
 
216
- bitmap_area_threshold: float = (
217
- 0.2 # percentage of the area for a bitmap to processed with the models
216
+ picture_area_threshold: float = (
217
+ 0.05 # percentage of the area for a picture to processed with the models
218
218
  )
219
219
 
220
220
 
@@ -266,6 +266,7 @@ class ResponseFormat(str, Enum):
266
266
  class InferenceFramework(str, Enum):
267
267
  MLX = "mlx"
268
268
  TRANSFORMERS = "transformers"
269
+ OPENAI = "openai"
269
270
 
270
271
 
271
272
  class HuggingFaceVlmOptions(BaseVlmOptions):
@@ -284,6 +285,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
284
285
  return self.repo_id.replace("/", "--")
285
286
 
286
287
 
288
+ class ApiVlmOptions(BaseVlmOptions):
289
+ kind: Literal["api_model_options"] = "api_model_options"
290
+
291
+ url: AnyUrl = AnyUrl(
292
+ "http://localhost:11434/v1/chat/completions"
293
+ ) # Default to ollama
294
+ headers: Dict[str, str] = {}
295
+ params: Dict[str, Any] = {}
296
+ scale: float = 2.0
297
+ timeout: float = 60
298
+ response_format: ResponseFormat
299
+
300
+
287
301
  smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
288
302
  repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
289
303
  prompt="Convert this page to docling.",
@@ -307,10 +321,20 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
307
321
  inference_framework=InferenceFramework.TRANSFORMERS,
308
322
  )
309
323
 
324
+ granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
325
+ url=AnyUrl("http://localhost:11434/v1/chat/completions"),
326
+ params={"model": "granite3.2-vision:2b"},
327
+ prompt="OCR the full page to markdown.",
328
+ scale=1.0,
329
+ timeout=120,
330
+ response_format=ResponseFormat.MARKDOWN,
331
+ )
332
+
310
333
 
311
334
  class VlmModelType(str, Enum):
312
335
  SMOLDOCLING = "smoldocling"
313
336
  GRANITE_VISION = "granite_vision"
337
+ GRANITE_VISION_OLLAMA = "granite_vision_ollama"
314
338
 
315
339
 
316
340
  # Define an enum for the backend options
@@ -362,7 +386,9 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
362
386
  False # (To be used with vlms, or other generative models)
363
387
  )
364
388
  # If True, text from backend will be used instead of generated text
365
- vlm_options: Union[HuggingFaceVlmOptions] = smoldocling_vlm_conversion_options
389
+ vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
390
+ smoldocling_vlm_conversion_options
391
+ )
366
392
 
367
393
 
368
394
  class PdfPipelineOptions(PaginatedPipelineOptions):
@@ -0,0 +1,67 @@
1
+ from typing import Iterable
2
+
3
+ from docling.datamodel.base_models import Page, VlmPrediction
4
+ from docling.datamodel.document import ConversionResult
5
+ from docling.datamodel.pipeline_options import ApiVlmOptions
6
+ from docling.exceptions import OperationNotAllowed
7
+ from docling.models.base_model import BasePageModel
8
+ from docling.utils.api_image_request import api_image_request
9
+ from docling.utils.profiling import TimeRecorder
10
+
11
+
12
+ class ApiVlmModel(BasePageModel):
13
+
14
+ def __init__(
15
+ self,
16
+ enabled: bool,
17
+ enable_remote_services: bool,
18
+ vlm_options: ApiVlmOptions,
19
+ ):
20
+ self.enabled = enabled
21
+ self.vlm_options = vlm_options
22
+ if self.enabled:
23
+ if not enable_remote_services:
24
+ raise OperationNotAllowed(
25
+ "Connections to remote services is only allowed when set explicitly. "
26
+ "pipeline_options.enable_remote_services=True, or using the CLI "
27
+ "--enable-remote-services."
28
+ )
29
+
30
+ self.timeout = self.vlm_options.timeout
31
+ self.prompt_content = (
32
+ f"This is a page from a document.\n{self.vlm_options.prompt}"
33
+ )
34
+ self.params = {
35
+ **self.vlm_options.params,
36
+ "temperature": 0,
37
+ }
38
+
39
+ def __call__(
40
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
41
+ ) -> Iterable[Page]:
42
+ for page in page_batch:
43
+ assert page._backend is not None
44
+ if not page._backend.is_valid():
45
+ yield page
46
+ else:
47
+ with TimeRecorder(conv_res, "vlm"):
48
+ assert page.size is not None
49
+
50
+ hi_res_image = page.get_image(scale=self.vlm_options.scale)
51
+ assert hi_res_image is not None
52
+ if hi_res_image:
53
+ if hi_res_image.mode != "RGB":
54
+ hi_res_image = hi_res_image.convert("RGB")
55
+
56
+ page_tags = api_image_request(
57
+ image=hi_res_image,
58
+ prompt=self.prompt_content,
59
+ url=self.vlm_options.url,
60
+ timeout=self.timeout,
61
+ headers=self.vlm_options.headers,
62
+ **self.params,
63
+ )
64
+
65
+ page.predictions.vlm_response = VlmPrediction(text=page_tags)
66
+
67
+ yield page
@@ -1,12 +1,7 @@
1
- import base64
2
- import io
3
- import logging
4
1
  from pathlib import Path
5
- from typing import Iterable, List, Optional, Type, Union
2
+ from typing import Iterable, Optional, Type, Union
6
3
 
7
- import requests
8
4
  from PIL import Image
9
- from pydantic import BaseModel, ConfigDict
10
5
 
11
6
  from docling.datamodel.pipeline_options import (
12
7
  AcceleratorOptions,
@@ -15,37 +10,7 @@ from docling.datamodel.pipeline_options import (
15
10
  )
16
11
  from docling.exceptions import OperationNotAllowed
17
12
  from docling.models.picture_description_base_model import PictureDescriptionBaseModel
18
-
19
- _log = logging.getLogger(__name__)
20
-
21
-
22
- class ChatMessage(BaseModel):
23
- role: str
24
- content: str
25
-
26
-
27
- class ResponseChoice(BaseModel):
28
- index: int
29
- message: ChatMessage
30
- finish_reason: str
31
-
32
-
33
- class ResponseUsage(BaseModel):
34
- prompt_tokens: int
35
- completion_tokens: int
36
- total_tokens: int
37
-
38
-
39
- class ApiResponse(BaseModel):
40
- model_config = ConfigDict(
41
- protected_namespaces=(),
42
- )
43
-
44
- id: str
45
- model: Optional[str] = None # returned by openai
46
- choices: List[ResponseChoice]
47
- created: int
48
- usage: ResponseUsage
13
+ from docling.utils.api_image_request import api_image_request
49
14
 
50
15
 
51
16
  class PictureDescriptionApiModel(PictureDescriptionBaseModel):
@@ -83,43 +48,11 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
83
48
  # Note: technically we could make a batch request here,
84
49
  # but not all APIs will allow for it. For example, vllm won't allow more than 1.
85
50
  for image in images:
86
- img_io = io.BytesIO()
87
- image.save(img_io, "PNG")
88
- image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
89
-
90
- messages = [
91
- {
92
- "role": "user",
93
- "content": [
94
- {
95
- "type": "text",
96
- "text": self.options.prompt,
97
- },
98
- {
99
- "type": "image_url",
100
- "image_url": {
101
- "url": f"data:image/png;base64,{image_base64}"
102
- },
103
- },
104
- ],
105
- }
106
- ]
107
-
108
- payload = {
109
- "messages": messages,
110
- **self.options.params,
111
- }
112
-
113
- r = requests.post(
114
- str(self.options.url),
115
- headers=self.options.headers,
116
- json=payload,
51
+ yield api_image_request(
52
+ image=image,
53
+ prompt=self.options.prompt,
54
+ url=self.options.url,
117
55
  timeout=self.options.timeout,
56
+ headers=self.options.headers,
57
+ **self.options.params,
118
58
  )
119
- if not r.ok:
120
- _log.error(f"Error calling the API. Reponse was {r.text}")
121
- r.raise_for_status()
122
-
123
- api_resp = ApiResponse.model_validate_json(r.text)
124
- generated_text = api_resp.choices[0].message.content.strip()
125
- yield generated_text
@@ -63,8 +63,20 @@ class PictureDescriptionBaseModel(
63
63
  elements: List[PictureItem] = []
64
64
  for el in element_batch:
65
65
  assert isinstance(el.item, PictureItem)
66
- elements.append(el.item)
67
- images.append(el.image)
66
+ describe_image = True
67
+ # Don't describe the image if it's smaller than the threshold
68
+ if len(el.item.prov) > 0:
69
+ prov = el.item.prov[0] # PictureItems have at most a single provenance
70
+ page = doc.pages.get(prov.page_no)
71
+ if page is not None:
72
+ page_area = page.size.width * page.size.height
73
+ if page_area > 0:
74
+ area_fraction = prov.bbox.area() / page_area
75
+ if area_fraction < self.options.picture_area_threshold:
76
+ describe_image = False
77
+ if describe_image:
78
+ elements.append(el.item)
79
+ images.append(el.image)
68
80
 
69
81
  outputs = self._annotate_images(images)
70
82
 
@@ -2,7 +2,7 @@ import logging
2
2
  import sys
3
3
  import warnings
4
4
  from pathlib import Path
5
- from typing import Optional
5
+ from typing import Optional, cast
6
6
 
7
7
  from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
8
8
 
@@ -226,7 +226,11 @@ class StandardPdfPipeline(PaginatedPipeline):
226
226
  and self.pipeline_options.generate_table_images
227
227
  ):
228
228
  page_ix = element.prov[0].page_no - 1
229
- page = conv_res.pages[page_ix]
229
+ page = next(
230
+ (p for p in conv_res.pages if p.page_no == page_ix),
231
+ cast("Page", None),
232
+ )
233
+ assert page is not None
230
234
  assert page.size is not None
231
235
  assert page.image is not None
232
236
 
@@ -15,11 +15,14 @@ from docling.backend.pdf_backend import PdfDocumentBackend
15
15
  from docling.datamodel.base_models import InputFormat, Page
16
16
  from docling.datamodel.document import ConversionResult, InputDocument
17
17
  from docling.datamodel.pipeline_options import (
18
+ ApiVlmOptions,
19
+ HuggingFaceVlmOptions,
18
20
  InferenceFramework,
19
21
  ResponseFormat,
20
22
  VlmPipelineOptions,
21
23
  )
22
24
  from docling.datamodel.settings import settings
25
+ from docling.models.api_vlm_model import ApiVlmModel
23
26
  from docling.models.hf_mlx_model import HuggingFaceMlxModel
24
27
  from docling.models.hf_vlm_model import HuggingFaceVlmModel
25
28
  from docling.pipeline.base_pipeline import PaginatedPipeline
@@ -57,27 +60,34 @@ class VlmPipeline(PaginatedPipeline):
57
60
 
58
61
  self.keep_images = self.pipeline_options.generate_page_images
59
62
 
60
- if (
61
- self.pipeline_options.vlm_options.inference_framework
62
- == InferenceFramework.MLX
63
- ):
63
+ if isinstance(pipeline_options.vlm_options, ApiVlmOptions):
64
64
  self.build_pipe = [
65
- HuggingFaceMlxModel(
65
+ ApiVlmModel(
66
66
  enabled=True, # must be always enabled for this pipeline to make sense.
67
- artifacts_path=artifacts_path,
68
- accelerator_options=pipeline_options.accelerator_options,
69
- vlm_options=self.pipeline_options.vlm_options,
70
- ),
71
- ]
72
- else:
73
- self.build_pipe = [
74
- HuggingFaceVlmModel(
75
- enabled=True, # must be always enabled for this pipeline to make sense.
76
- artifacts_path=artifacts_path,
77
- accelerator_options=pipeline_options.accelerator_options,
78
- vlm_options=self.pipeline_options.vlm_options,
67
+ enable_remote_services=self.pipeline_options.enable_remote_services,
68
+ vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
79
69
  ),
80
70
  ]
71
+ elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
72
+ vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options)
73
+ if vlm_options.inference_framework == InferenceFramework.MLX:
74
+ self.build_pipe = [
75
+ HuggingFaceMlxModel(
76
+ enabled=True, # must be always enabled for this pipeline to make sense.
77
+ artifacts_path=artifacts_path,
78
+ accelerator_options=pipeline_options.accelerator_options,
79
+ vlm_options=vlm_options,
80
+ ),
81
+ ]
82
+ else:
83
+ self.build_pipe = [
84
+ HuggingFaceVlmModel(
85
+ enabled=True, # must be always enabled for this pipeline to make sense.
86
+ artifacts_path=artifacts_path,
87
+ accelerator_options=pipeline_options.accelerator_options,
88
+ vlm_options=vlm_options,
89
+ ),
90
+ ]
81
91
 
82
92
  self.enrichment_pipe = [
83
93
  # Other models working on `NodeItem` elements in the DoclingDocument
@@ -0,0 +1,61 @@
1
+ import base64
2
+ import logging
3
+ from io import BytesIO
4
+ from typing import Dict, Optional
5
+
6
+ import requests
7
+ from PIL import Image
8
+ from pydantic import AnyUrl
9
+
10
+ from docling.datamodel.base_models import OpenAiApiResponse
11
+
12
+ _log = logging.getLogger(__name__)
13
+
14
+
15
+ def api_image_request(
16
+ image: Image.Image,
17
+ prompt: str,
18
+ url: AnyUrl,
19
+ timeout: float = 20,
20
+ headers: Optional[Dict[str, str]] = None,
21
+ **params,
22
+ ) -> str:
23
+ img_io = BytesIO()
24
+ image.save(img_io, "PNG")
25
+ image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
26
+ messages = [
27
+ {
28
+ "role": "user",
29
+ "content": [
30
+ {
31
+ "type": "image_url",
32
+ "image_url": {"url": f"data:image/png;base64,{image_base64}"},
33
+ },
34
+ {
35
+ "type": "text",
36
+ "text": prompt,
37
+ },
38
+ ],
39
+ }
40
+ ]
41
+
42
+ payload = {
43
+ "messages": messages,
44
+ **params,
45
+ }
46
+
47
+ headers = headers or {}
48
+
49
+ r = requests.post(
50
+ str(url),
51
+ headers=headers,
52
+ json=payload,
53
+ timeout=timeout,
54
+ )
55
+ if not r.ok:
56
+ _log.error(f"Error calling the API. Response was {r.text}")
57
+ r.raise_for_status()
58
+
59
+ api_resp = OpenAiApiResponse.model_validate_json(r.text)
60
+ generated_text = api_resp.choices[0].message.content.strip()
61
+ return generated_text
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.29.0
3
+ Version: 2.30.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: vlm
28
28
  Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
29
29
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
30
30
  Requires-Dist: certifi (>=2024.7.4)
31
- Requires-Dist: docling-core[chunking] (>=2.24.1,<3.0.0)
31
+ Requires-Dist: docling-core[chunking] (>=2.26.0,<3.0.0)
32
32
  Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
33
33
  Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
34
34
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -58,7 +58,7 @@ Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
58
58
  Requires-Dist: tqdm (>=4.65.0,<5.0.0)
59
59
  Requires-Dist: transformers (>=4.42.0,<4.43.0) ; (sys_platform == "darwin" and platform_machine == "x86_64") and (extra == "vlm")
60
60
  Requires-Dist: transformers (>=4.46.0,<5.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
61
- Requires-Dist: typer (>=0.12.5,<0.13.0)
61
+ Requires-Dist: typer (>=0.12.5,<0.16.0)
62
62
  Project-URL: Repository, https://github.com/docling-project/docling
63
63
  Description-Content-Type: text/markdown
64
64
 
@@ -14,9 +14,9 @@ docling/backend/html_backend.py,sha256=ghPLZfdBEPBzLIO9IWzzx0t1Os9B9r4VyGyEZtMsZ
14
14
  docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
16
16
  docling/backend/md_backend.py,sha256=lqDiKIBHGsA0u-H1n9oVpPlrcpVT4gYRuNXXcyGlftM,17219
17
- docling/backend/msexcel_backend.py,sha256=_ZVZFKRRijpg-Xz10xNxu2m-NpDaYvoiBqEZP6GbrgE,11095
17
+ docling/backend/msexcel_backend.py,sha256=KRPoHRDv-mqko9RUHGQCzdRrvDo7g7zSU2Z5zoL_Hzo,18106
18
18
  docling/backend/mspowerpoint_backend.py,sha256=X55-1anXm562wxAuYn5uwQkqKjirmgrn1KfbeaKUbXw,17273
19
- docling/backend/msword_backend.py,sha256=1Yjs8J9vRSNDsgb9IKSKYcbvnoj1hO4Kf_mqncz3Ijs,32103
19
+ docling/backend/msword_backend.py,sha256=CgNPjU8SQ7rkAYH_BGiUyv568MGhoH3R0M39WBT8gkc,32468
20
20
  docling/backend/pdf_backend.py,sha256=odWb1rxk3WCUIEJMhq-dYFNUQ1pSDuNHbU9wlTZIRAs,2211
21
21
  docling/backend/pypdfium2_backend.py,sha256=wRwhA5XHRqL7vyNhCAHM6P-ONkwtyjKG9LgC4NJ-4i8,10784
22
22
  docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -24,17 +24,18 @@ docling/backend/xml/jats_backend.py,sha256=HXailrDjiwu4swwFnXy3lNfRtLZmkBBp4yqaf
24
24
  docling/backend/xml/uspto_backend.py,sha256=H0jwIt2skOke_yEUk0wfXCtodrB-hrj2ygLtB3jMWaI,71056
25
25
  docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
26
26
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
- docling/cli/main.py,sha256=x8wmu0vb_wwpswdj8EKJyc3EnpVA1wnTJA4bjXRdi5Q,25255
27
+ docling/cli/main.py,sha256=TD-cEf4giuk1O5NPoB-heXHHteUqKoLsj4Rg4xsBUrs,26119
28
28
  docling/cli/models.py,sha256=tM_qbMM3YOPxFU7JlME96MLbtd1CX_bOAK7FS-NhJvY,3979
29
29
  docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
30
30
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
- docling/datamodel/base_models.py,sha256=MAHr8LlffZ2uIXZ3AXOsikh_-oQIEYTiwwjsz-dQW9U,7287
32
- docling/datamodel/document.py,sha256=DbJifyMgBEkAk80BMYXTuSgqH2vijDENDkU7Fmr6j_g,14567
33
- docling/datamodel/pipeline_options.py,sha256=TpRf_-7UuCjjaytFWA0nL2m-KP4no9jeAjaXRjBLMLE,12593
31
+ docling/datamodel/base_models.py,sha256=fJfFMaHXc-CUrAVfhPF8lKrdb-gaXr2tohx6dHldvRU,7926
32
+ docling/datamodel/document.py,sha256=V0iK1MYOkPIzd4eQa-G8unp-t01fktlG9wwQ1IwE6Zg,15109
33
+ docling/datamodel/pipeline_options.py,sha256=iGLijZR-YOtmg0RQs59pqoG_1uGsDYbg5wMDD0FWYx4,13351
34
34
  docling/datamodel/settings.py,sha256=bNMdowIKv7RUchabQTo4rFNEsxfB6pGg2LoZSY634zo,1869
35
35
  docling/document_converter.py,sha256=LCX92FzgmXNJLFVSQfjqH9SGe3zA7FGwARedSigFIpY,13798
36
36
  docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
37
37
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
+ docling/models/api_vlm_model.py,sha256=6SxMsFPf0SbT365P67KspdpF3TXZSeu5kmPE3lXAhW4,2470
38
39
  docling/models/base_model.py,sha256=9xJ0VIlpR2BzqoEWMC8LYp5Y96QAEKip4b_HCwCDltY,2931
39
40
  docling/models/base_ocr_model.py,sha256=xvKMhE4ZOGkL2GAhpDvrAHLLFps3ZUfxXZ5ctL1lXUw,7226
40
41
  docling/models/code_formula_model.py,sha256=mOu5luYMzyrCCr8MRGOciNcSvULpQysDd_FXn96WPc8,11477
@@ -50,8 +51,8 @@ docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3
50
51
  docling/models/ocr_mac_model.py,sha256=2pZaUWg19go_u88mKWr5y_52PAYEN__GsbyUYLdY4zo,5353
51
52
  docling/models/page_assemble_model.py,sha256=ivkCdbZJpFcGl7CazLegcP1tLK8ZixDfVhQXqsdW_UA,6359
52
53
  docling/models/page_preprocessing_model.py,sha256=Ja7RE1K-2fWxWrxOzNm6QDSGqFf-MY6_uY5OAZ7AQSo,3078
53
- docling/models/picture_description_api_model.py,sha256=SRjOkCTBYa1pTIaQffDLUPabljjYrLOQ916MywESEXk,3715
54
- docling/models/picture_description_base_model.py,sha256=uRpjBXC2qjpPyWFUt600N1GvmvF-vWwB8f-OTQ7PfDg,2305
54
+ docling/models/picture_description_api_model.py,sha256=DowWOU93MXAjj3N1A9ex88Sa3Nic2c3dfoOYir5jZEA,2064
55
+ docling/models/picture_description_base_model.py,sha256=khuhQZDAZemZMe4BsrBMpjEwkY3nhMFXuczjQpSQrVY,2971
55
56
  docling/models/picture_description_vlm_model.py,sha256=I2Un3vfhQVeWEyZ3Sd3Kygw9la2QSZCwDfl_7XVlMm4,4042
56
57
  docling/models/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
57
58
  docling/models/plugins/defaults.py,sha256=qslXGnRX07Z3GGttNriqaox0v0vXp4zs4KLurHCZjp4,858
@@ -63,11 +64,12 @@ docling/models/tesseract_ocr_model.py,sha256=UpLAgKgJtBgbKtJELmKBNMcejJJKBCyFK0q
63
64
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
65
  docling/pipeline/base_pipeline.py,sha256=9ABK-Cr235bxE5vweoIA5rgBZV_EF8qFxAqLI27H_Pg,8749
65
66
  docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
66
- docling/pipeline/standard_pdf_pipeline.py,sha256=tHOHFyJajX6IAhm4y3I27uqn5jfMTuCaSaFOKT5JM2M,10593
67
- docling/pipeline/vlm_pipeline.py,sha256=1eKt3gqWf6PxGvYZuqhKi2BFljJGJWIyHemzOAwa39Y,9065
67
+ docling/pipeline/standard_pdf_pipeline.py,sha256=gPNqUparhIONG4AyMekW9OfZ7t8YMs0odhtbE6Z-Hxw,10784
68
+ docling/pipeline/vlm_pipeline.py,sha256=dqQYAd3viW577TVSZltnB4P-f-ZUWQh0J8SSFDuQN6Q,9738
68
69
  docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
69
70
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
70
71
  docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExTOY5f0,2292
72
+ docling/utils/api_image_request.py,sha256=_CgdzmPqdsyXmyYUFGLZcXcoH586qC6A1p5vsNbj1Q0,1416
71
73
  docling/utils/export.py,sha256=4W-ptI1fLdVrtoqHdHY1RF9Xn2Yescs-hunITqxJ7Is,4697
72
74
  docling/utils/glm_utils.py,sha256=W4JRoP0xQ6SJmhhIoAfcKxm5dr1CFvLHp8pqI1kdhxs,12250
73
75
  docling/utils/layout_postprocessor.py,sha256=Q36DfcIYMuMfC6LzCBIrYtHK7pBE-Xyvjepz660s9UM,24508
@@ -77,8 +79,8 @@ docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,26
77
79
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
78
80
  docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
79
81
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
80
- docling-2.29.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
81
- docling-2.29.0.dist-info/METADATA,sha256=PPcVfE4GnjhcLLurofnugm6QLj0EKRuaIuhlPuXYRT8,9982
82
- docling-2.29.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
83
- docling-2.29.0.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
84
- docling-2.29.0.dist-info/RECORD,,
82
+ docling-2.30.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
83
+ docling-2.30.0.dist-info/METADATA,sha256=HSI154YUnSDJE8BMMjOuu-U3EXQg0ksFuyuyzv7-UdU,9982
84
+ docling-2.30.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
85
+ docling-2.30.0.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
86
+ docling-2.30.0.dist-info/RECORD,,