docling 2.5.2__py3-none-any.whl → 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,374 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Dict, Set, Tuple, Union
5
+
6
+ from docling_core.types.doc import (
7
+ DoclingDocument,
8
+ DocumentOrigin,
9
+ GroupLabel,
10
+ ImageRef,
11
+ TableCell,
12
+ TableData,
13
+ )
14
+
15
+ # from lxml import etree
16
+ from openpyxl import Workbook, load_workbook
17
+ from openpyxl.cell.cell import Cell
18
+ from openpyxl.drawing.image import Image
19
+ from openpyxl.worksheet.worksheet import Worksheet
20
+
21
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
22
+ from docling.datamodel.base_models import InputFormat
23
+ from docling.datamodel.document import InputDocument
24
+
25
+ _log = logging.getLogger(__name__)
26
+
27
+ from typing import Any, List
28
+
29
+ from pydantic import BaseModel
30
+
31
+
32
class ExcelCell(BaseModel):
    """A single cell of a table detected in a worksheet.

    Instances are created by ``MsExcelDocumentBackend._find_table_bounds`` and
    later converted into ``TableCell`` objects.
    """

    # Zero-based row index, relative to the first row of the detected table.
    row: int
    # Zero-based column index, relative to the first column of the detected table.
    col: int
    # Cell content rendered as text.
    text: str
    # Number of rows covered by the cell (1 unless it belongs to a merged range).
    row_span: int
    # Number of columns covered by the cell (1 unless it belongs to a merged range).
    col_span: int
38
+
39
+
40
class ExcelTable(BaseModel):
    """A rectangular table detected in a worksheet."""

    # Number of rows spanned by the table.
    num_rows: int
    # Number of columns spanned by the table.
    num_cols: int
    # Cells of the table; positions are relative to the table's top-left corner.
    data: List[ExcelCell]
44
+
45
+
46
class MsExcelDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend that converts an XLSX workbook into a DoclingDocument.

    Every worksheet becomes a section group named ``sheet: <name>``; every
    compact rectangular block of cells found in a sheet is added to that group
    as a table.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Load the workbook from a path or an in-memory stream.

        Raises:
            RuntimeError: if the workbook cannot be loaded; the original
                exception is attached as the cause.
        """
        super().__init__(in_doc, path_or_stream)

        # Initialise the parents for the hierarchy
        self.max_levels = 10

        self.parents: Dict[int, Any] = {}
        for i in range(-1, self.max_levels):
            self.parents[i] = None

        self.workbook = None
        try:
            if isinstance(self.path_or_stream, BytesIO):
                self.workbook = load_workbook(filename=self.path_or_stream)

            elif isinstance(self.path_or_stream, Path):
                self.workbook = load_workbook(filename=str(self.path_or_stream))

            self.valid = True
        except Exception as e:
            self.valid = False

            # The message used to name the PowerPoint backend (copy-paste slip).
            raise RuntimeError(
                f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
            ) from e

    def is_valid(self) -> bool:
        """Return whether the workbook was loaded without raising."""
        _log.info(f"valid: {self.valid}")
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        return True

    def unload(self):
        """Close the input stream (if it is one) and drop the reference to it."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        return {InputFormat.XLSX}

    def convert(self) -> DoclingDocument:
        """Parse the XLSX into a structured document model.

        Raises:
            RuntimeError: if the backend is not valid.
        """
        origin = DocumentOrigin(
            filename=self.file.name or "file.xlsx",
            mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            binary_hash=self.document_hash,
        )

        doc = DoclingDocument(name=self.file.stem or "file.xlsx", origin=origin)

        if self.is_valid():
            doc = self._convert_workbook(doc)
        else:
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )

        return doc

    def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
        """Add one section group per worksheet and convert each sheet into it."""
        if self.workbook is not None:

            # Iterate over all sheets
            for sheet_name in self.workbook.sheetnames:
                _log.info(f"Processing sheet: {sheet_name}")

                # Access the sheet by name
                sheet = self.workbook[sheet_name]

                # parents[0] is the group that receives this sheet's items.
                self.parents[0] = doc.add_group(
                    parent=None,
                    label=GroupLabel.SECTION,
                    name=f"sheet: {sheet_name}",
                )

                doc = self._convert_sheet(doc, sheet)
        else:
            _log.error("Workbook is not initialized.")

        return doc

    def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
        """Convert the tables, then the images, of one worksheet."""
        doc = self._find_tables_in_sheet(doc, sheet)

        doc = self._find_images_in_sheet(doc, sheet)

        return doc

    def _find_tables_in_sheet(
        self, doc: DoclingDocument, sheet: Worksheet
    ) -> DoclingDocument:
        """Add every table detected in ``sheet`` to ``doc`` under the sheet group."""
        tables = self._find_data_tables(sheet)

        for excel_table in tables:
            table_data = TableData(
                num_rows=excel_table.num_rows,
                num_cols=excel_table.num_cols,
                table_cells=[],
            )

            for excel_cell in excel_table.data:
                # End offsets are exclusive: start + span.
                cell = TableCell(
                    text=excel_cell.text,
                    row_span=excel_cell.row_span,
                    col_span=excel_cell.col_span,
                    start_row_offset_idx=excel_cell.row,
                    end_row_offset_idx=excel_cell.row + excel_cell.row_span,
                    start_col_offset_idx=excel_cell.col,
                    end_col_offset_idx=excel_cell.col + excel_cell.col_span,
                    col_header=False,
                    row_header=False,
                )
                table_data.table_cells.append(cell)

            doc.add_table(data=table_data, parent=self.parents[0])

        return doc

    def _find_data_tables(self, sheet: Worksheet) -> List[ExcelTable]:
        """Find all compact rectangular data tables in a sheet."""
        tables: List[ExcelTable] = []  # List to store found tables
        visited: Set[Tuple[int, int]] = set()  # Track already visited cells

        # Iterate over all cells in the sheet (ri / rj are zero-based)
        for ri, row in enumerate(sheet.iter_rows(values_only=False)):
            for rj, cell in enumerate(row):

                # Skip empty or already visited cells
                if cell.value is None or (ri, rj) in visited:
                    continue

                # The cell starts a new table: find its bounds
                table_bounds, visited_cells = self._find_table_bounds(
                    sheet, ri, rj, visited
                )

                visited.update(visited_cells)  # Mark these cells as visited
                tables.append(table_bounds)

        return tables

    def _find_table_bounds(
        self,
        sheet: Worksheet,
        start_row: int,
        start_col: int,
        visited: set[Tuple[int, int]],
    ):
        """Determine the bounds of a compact rectangular table.

        Args:
            sheet: worksheet being scanned.
            start_row: zero-based row of the table's top-left cell.
            start_col: zero-based column of the table's top-left cell.
            visited: cells already assigned to earlier tables (currently not
                consulted here; kept for interface compatibility).

        Returns:
            A tuple ``(table, visited_cells)`` with the detected ``ExcelTable``
            and the set of zero-based ``(row, col)`` coordinates it covers.
        """
        _log.info("find_table_bounds")

        max_row = self._find_table_bottom(sheet, start_row, start_col)
        max_col = self._find_table_right(sheet, start_row, start_col)

        merged_ranges = sheet.merged_cells.ranges

        # Collect the data within the bounds
        data = []
        visited_cells: Set[Tuple[int, int]] = set()
        for ri in range(start_row, max_row + 1):
            for rj in range(start_col, max_col + 1):

                cell = sheet.cell(row=ri + 1, column=rj + 1)  # 1-based indexing

                # Check if the cell belongs to a merged range
                row_span = 1
                col_span = 1

                for merged_range in merged_ranges:
                    if (
                        merged_range.min_row <= ri + 1 <= merged_range.max_row
                        and merged_range.min_col <= rj + 1 <= merged_range.max_col
                    ):
                        row_span = merged_range.max_row - merged_range.min_row + 1
                        col_span = merged_range.max_col - merged_range.min_col + 1
                        break

                if (ri, rj) not in visited_cells:
                    # A blank cell inside the rectangle yields empty text, not
                    # the literal string "None".
                    text = "" if cell.value is None else str(cell.value)
                    data.append(
                        ExcelCell(
                            row=ri - start_row,
                            col=rj - start_col,
                            text=text,
                            row_span=row_span,
                            col_span=col_span,
                        )
                    )

                    # Mark all cells in the span as visited so the other cells
                    # of a merged range are not emitted again.
                    for span_row in range(ri, ri + row_span):
                        for span_col in range(rj, rj + col_span):
                            visited_cells.add((span_row, span_col))

        return (
            ExcelTable(
                num_rows=max_row + 1 - start_row,
                num_cols=max_col + 1 - start_col,
                data=data,
            ),
            visited_cells,
        )

    @staticmethod
    def _find_merged_range(sheet: Worksheet, cell: Any):
        """Return the merged range of ``sheet`` containing ``cell``, or None."""
        return next(
            (mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
            None,
        )

    def _find_table_bottom(self, sheet: Worksheet, start_row: int, start_col: int):
        """Find the bottom boundary of the table.

        Walks down the column ``start_col`` from ``start_row`` (both zero-based)
        and returns the zero-based index of the last row before an empty,
        unmerged cell or the end of the sheet.
        """
        max_row = start_row

        while max_row < sheet.max_row - 1:
            # Look at the cell one row below the current bottom (1-based API)
            cell = sheet.cell(row=max_row + 2, column=start_col + 1)

            # Check if the cell is part of a merged range
            merged_range = self._find_merged_range(sheet, cell)

            if cell.value is None and not merged_range:
                break  # Stop if the cell is empty and not merged

            # Expand max_row to include the merged range if applicable
            if merged_range:
                max_row = max(max_row, merged_range.max_row - 1)
            else:
                max_row += 1

        return max_row

    def _find_table_right(self, sheet: Worksheet, start_row: int, start_col: int):
        """Find the right boundary of the table.

        Walks right along the row ``start_row`` from ``start_col`` (both
        zero-based) and returns the zero-based index of the last column before
        an empty, unmerged cell or the end of the sheet.
        """
        max_col = start_col

        while max_col < sheet.max_column - 1:
            # Look at the cell one column right of the current edge (1-based API)
            cell = sheet.cell(row=start_row + 1, column=max_col + 2)

            # Check if the cell is part of a merged range
            merged_range = self._find_merged_range(sheet, cell)

            if cell.value is None and not merged_range:
                break  # Stop if the cell is empty and not merged

            # Expand max_col to include the merged range if applicable
            if merged_range:
                max_col = max(max_col, merged_range.max_col - 1)
            else:
                max_col += 1

        return max_col

    def _find_images_in_sheet(
        self, doc: DoclingDocument, sheet: Worksheet
    ) -> DoclingDocument:
        """Placeholder for image/chart extraction; currently returns ``doc`` unchanged."""

        # FIXME: mypy does not agree with _images ...
        # NOTE(review): the disabled snippet calls Image.open, but the Image
        # name imported in this module comes from openpyxl.drawing.image, not
        # PIL - confirm before re-enabling.
        #
        # # Iterate over images in the sheet
        # for idx, image in enumerate(sheet._images):  # Access embedded images
        #
        #     image_bytes = BytesIO(image.ref.blob)
        #     pil_image = Image.open(image_bytes)
        #
        #     doc.add_picture(
        #         parent=self.parents[0],
        #         image=ImageRef.from_pil(image=pil_image, dpi=72),
        #         caption=None,
        #     )

        # FIXME: mypy does not agree with _charts ...
        #
        # for idx, chart in enumerate(sheet._charts):  # Access embedded charts
        #     chart_path = f"chart_{idx + 1}.png"
        #     _log.info(
        #         f"Chart found, but dynamic rendering is required for: {chart_path}"
        #     )
        #
        #     _log.info(f"Chart {idx + 1}:")
        #
        #     # Chart type
        #     _log.info(f"Type: {type(chart).__name__}")
        #
        #     # Title
        #     if chart.title:
        #         _log.info(f"Title: {chart.title}")
        #     else:
        #         _log.info("No title")
        #
        #     # Data series
        #     for series in chart.series:
        #         _log.info(" => series ...")
        #         _log.info(f"Data Series: {series.title}")
        #         _log.info(f"Values: {series.values}")
        #         _log.info(f"Categories: {series.categories}")
        #
        #     # Position
        #     # _log.info(f"Anchor Cell: {chart.anchor}")

        return doc
@@ -10,11 +10,13 @@ from docling_core.types.doc import (
10
10
  DoclingDocument,
11
11
  DocumentOrigin,
12
12
  GroupLabel,
13
+ ImageRef,
13
14
  ProvenanceItem,
14
15
  Size,
15
16
  TableCell,
16
17
  TableData,
17
18
  )
19
+ from PIL import Image
18
20
  from pptx import Presentation
19
21
  from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
20
22
 
@@ -268,9 +270,22 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
268
270
  return
269
271
 
270
272
  def handle_pictures(self, shape, parent_slide, slide_ind, doc):
273
+ # Get the image bytes
274
+ image = shape.image
275
+ image_bytes = image.blob
276
+ im_dpi, _ = image.dpi
277
+
278
+ # Open it with PIL
279
+ pil_image = Image.open(BytesIO(image_bytes))
280
+
271
281
  # shape has picture
272
282
  prov = self.generate_prov(shape, slide_ind, "")
273
- doc.add_picture(parent=parent_slide, caption=None, prov=prov)
283
+ doc.add_picture(
284
+ parent=parent_slide,
285
+ image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
286
+ caption=None,
287
+ prov=prov,
288
+ )
274
289
  return
275
290
 
276
291
  def handle_tables(self, shape, parent_slide, slide_ind, doc):
@@ -9,10 +9,12 @@ from docling_core.types.doc import (
9
9
  DoclingDocument,
10
10
  DocumentOrigin,
11
11
  GroupLabel,
12
+ ImageRef,
12
13
  TableCell,
13
14
  TableData,
14
15
  )
15
16
  from lxml import etree
17
+ from PIL import Image
16
18
 
17
19
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
18
20
  from docling.datamodel.base_models import InputFormat
@@ -130,13 +132,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
130
132
  def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
131
133
  for element in body:
132
134
  tag_name = etree.QName(element).localname
133
- # Check for Inline Images (drawings or blip elements)
134
- found_drawing = etree.ElementBase.xpath(
135
- element, ".//w:drawing", namespaces=self.xml_namespaces
136
- )
137
- found_pict = etree.ElementBase.xpath(
138
- element, ".//w:pict", namespaces=self.xml_namespaces
139
- )
135
+ # Check for Inline Images (blip elements)
136
+ drawing_blip = element.xpath(".//a:blip")
140
137
 
141
138
  # Check for Tables
142
139
  if element.tag.endswith("tbl"):
@@ -145,8 +142,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
145
142
  except Exception:
146
143
  _log.debug("could not parse a table, broken docx table")
147
144
 
148
- elif found_drawing or found_pict:
149
- self.handle_pictures(element, docx_obj, doc)
145
+ elif drawing_blip:
146
+ self.handle_pictures(element, docx_obj, drawing_blip, doc)
150
147
  # Check for Text
151
148
  elif tag_name in ["p"]:
152
149
  self.handle_text_elements(element, docx_obj, doc)
@@ -491,6 +488,24 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
491
488
  doc.add_table(data=data, parent=self.parents[level - 1])
492
489
  return
493
490
 
494
- def handle_pictures(self, element, docx_obj, doc):
495
- doc.add_picture(parent=self.parents[self.level], caption=None)
491
def handle_pictures(self, element, docx_obj, drawing_blip, doc):
    """Add the inline image referenced by ``drawing_blip`` to ``doc``.

    The first blip's ``r:embed`` relationship id is resolved against the
    document part to obtain the image bytes, which are decoded with PIL and
    attached as an ``ImageRef`` (dpi fixed at 72). When the relationship is
    missing or Pillow cannot decode the bytes, a picture item without image
    data is added instead of raising.
    """

    def get_docx_image(element, drawing_blip):
        """Return the embedded image bytes, or None if the relationship is unknown."""
        rId = drawing_blip[0].get(
            "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
        )
        if rId in docx_obj.part.rels:
            # Access the image part using the relationship ID
            image_part = docx_obj.part.rels[rId].target_part
            return image_part.blob  # Binary image data
        return None

    image_ref = None
    image_data = get_docx_image(element, drawing_blip)
    if image_data is None:
        _log.warning("Image relationship not found in document, adding picture without image data")
    else:
        try:
            # Open the bytes with PIL to create an Image
            pil_image = Image.open(BytesIO(image_data))
            image_ref = ImageRef.from_pil(image=pil_image, dpi=72)
        except OSError as e:
            # PIL.UnidentifiedImageError is a subclass of OSError.
            _log.warning("Image cannot be loaded by Pillow: %s", e)

    if image_ref is None:
        doc.add_picture(parent=self.parents[self.level], caption=None)
    else:
        doc.add_picture(
            parent=self.parents[self.level],
            image=image_ref,
            caption=None,
        )
    return
docling/cli/main.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import importlib
2
2
  import json
3
3
  import logging
4
+ import re
4
5
  import time
5
6
  import warnings
6
7
  from enum import Enum
@@ -23,6 +24,7 @@ from docling.datamodel.base_models import (
23
24
  from docling.datamodel.document import ConversionResult
24
25
  from docling.datamodel.pipeline_options import (
25
26
  EasyOcrOptions,
27
+ OcrMacOptions,
26
28
  OcrOptions,
27
29
  PdfPipelineOptions,
28
30
  TableFormerMode,
@@ -73,6 +75,7 @@ class OcrEngine(str, Enum):
73
75
  EASYOCR = "easyocr"
74
76
  TESSERACT_CLI = "tesseract_cli"
75
77
  TESSERACT = "tesseract"
78
+ OCRMAC = "ocrmac"
76
79
 
77
80
 
78
81
  def export_documents(
@@ -129,6 +132,12 @@ def export_documents(
129
132
  )
130
133
 
131
134
 
135
def _split_list(raw: Optional[str]) -> Optional[List[str]]:
    """Split a comma- or semicolon-separated option value into its items.

    Whitespace around each item is stripped and empty items (for example from
    a trailing separator) are dropped, so ``"en, fr,"`` yields ``["en", "fr"]``.

    Returns:
        ``None`` when ``raw`` is ``None`` (option not given), otherwise the
        list of non-empty items.
    """
    if raw is None:
        return None
    stripped = (item.strip() for item in re.split(r"[;,]", raw))
    return [item for item in stripped if item]
139
+
140
+
132
141
  @app.command(no_args_is_help=True)
133
142
  def convert(
134
143
  input_sources: Annotated[
@@ -163,6 +172,13 @@ def convert(
163
172
  ocr_engine: Annotated[
164
173
  OcrEngine, typer.Option(..., help="The OCR engine to use.")
165
174
  ] = OcrEngine.EASYOCR,
175
+ ocr_lang: Annotated[
176
+ Optional[str],
177
+ typer.Option(
178
+ ...,
179
+ help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
180
+ ),
181
+ ] = None,
166
182
  pdf_backend: Annotated[
167
183
  PdfBackend, typer.Option(..., help="The PDF backend to use.")
168
184
  ] = PdfBackend.DLPARSE_V1,
@@ -185,6 +201,15 @@ def convert(
185
201
  output: Annotated[
186
202
  Path, typer.Option(..., help="Output directory where results are saved.")
187
203
  ] = Path("."),
204
+ verbose: Annotated[
205
+ int,
206
+ typer.Option(
207
+ "--verbose",
208
+ "-v",
209
+ count=True,
210
+ help="Set the verbosity level. -v for info logging, -vv for debug logging.",
211
+ ),
212
+ ] = 0,
188
213
  version: Annotated[
189
214
  Optional[bool],
190
215
  typer.Option(
@@ -195,7 +220,12 @@ def convert(
195
220
  ),
196
221
  ] = None,
197
222
  ):
198
- logging.basicConfig(level=logging.INFO)
223
+ if verbose == 0:
224
+ logging.basicConfig(level=logging.WARNING)
225
+ elif verbose == 1:
226
+ logging.basicConfig(level=logging.INFO)
227
+ elif verbose == 2:
228
+ logging.basicConfig(level=logging.DEBUG)
199
229
 
200
230
  if from_formats is None:
201
231
  from_formats = [e for e in InputFormat]
@@ -224,15 +254,20 @@ def convert(
224
254
  export_txt = OutputFormat.TEXT in to_formats
225
255
  export_doctags = OutputFormat.DOCTAGS in to_formats
226
256
 
227
- match ocr_engine:
228
- case OcrEngine.EASYOCR:
229
- ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
230
- case OcrEngine.TESSERACT_CLI:
231
- ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
232
- case OcrEngine.TESSERACT:
233
- ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
234
- case _:
235
- raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
257
+ if ocr_engine == OcrEngine.EASYOCR:
258
+ ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
259
+ elif ocr_engine == OcrEngine.TESSERACT_CLI:
260
+ ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
261
+ elif ocr_engine == OcrEngine.TESSERACT:
262
+ ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
263
+ elif ocr_engine == OcrEngine.OCRMAC:
264
+ ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
265
+ else:
266
+ raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
267
+
268
+ ocr_lang_list = _split_list(ocr_lang)
269
+ if ocr_lang_list is not None:
270
+ ocr_options.lang = ocr_lang_list
236
271
 
237
272
  pipeline_options = PdfPipelineOptions(
238
273
  do_ocr=ocr,
@@ -245,15 +280,14 @@ def convert(
245
280
  if artifacts_path is not None:
246
281
  pipeline_options.artifacts_path = artifacts_path
247
282
 
248
- match pdf_backend:
249
- case PdfBackend.DLPARSE_V1:
250
- backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
251
- case PdfBackend.DLPARSE_V2:
252
- backend = DoclingParseV2DocumentBackend
253
- case PdfBackend.PYPDFIUM2:
254
- backend = PyPdfiumDocumentBackend
255
- case _:
256
- raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
283
+ if pdf_backend == PdfBackend.DLPARSE_V1:
284
+ backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
285
+ elif pdf_backend == PdfBackend.DLPARSE_V2:
286
+ backend = DoclingParseV2DocumentBackend
287
+ elif pdf_backend == PdfBackend.PYPDFIUM2:
288
+ backend = PyPdfiumDocumentBackend
289
+ else:
290
+ raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
257
291
 
258
292
  format_options: Dict[InputFormat, FormatOption] = {
259
293
  InputFormat.PDF: PdfFormatOption(
@@ -287,5 +321,7 @@ def convert(
287
321
  _log.info(f"All documents were converted in {end_time:.2f} seconds.")
288
322
 
289
323
 
324
+ click_app = typer.main.get_command(app)
325
+
290
326
  if __name__ == "__main__":
291
327
  app()
@@ -32,6 +32,7 @@ class InputFormat(str, Enum):
32
32
  PDF = "pdf"
33
33
  ASCIIDOC = "asciidoc"
34
34
  MD = "md"
35
+ XLSX = "xlsx"
35
36
 
36
37
 
37
38
  class OutputFormat(str, Enum):
@@ -49,6 +50,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
49
50
  InputFormat.HTML: ["html", "htm", "xhtml"],
50
51
  InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
51
52
  InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
53
+ InputFormat.XLSX: ["xlsx"],
52
54
  }
53
55
 
54
56
  FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -72,7 +74,11 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
72
74
  InputFormat.PDF: ["application/pdf"],
73
75
  InputFormat.ASCIIDOC: ["text/asciidoc"],
74
76
  InputFormat.MD: ["text/markdown", "text/x-markdown"],
77
+ InputFormat.XLSX: [
78
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
79
+ ],
75
80
  }
81
+
76
82
  MimeTypeToFormat = {
77
83
  mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
78
84
  }
@@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
22
22
 
23
23
  class OcrOptions(BaseModel):
24
24
  kind: str
25
+ lang: List[str]
25
26
  force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
26
27
  bitmap_area_threshold: float = (
27
28
  0.05 # percentage of the area for a bitmap to processed with OCR
@@ -62,6 +63,17 @@ class TesseractOcrOptions(OcrOptions):
62
63
  )
63
64
 
64
65
 
66
class OcrMacOptions(OcrOptions):
    """Options for the ``ocrmac`` OCR engine."""

    # Discriminator value that selects this options class.
    kind: Literal["ocrmac"] = "ocrmac"
    # Languages handed to the engine as its language preference.
    lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
    # Handed to the engine as its recognition level.
    recognition: str = "accurate"
    # Handed to the engine as its framework selection.
    framework: str = "vision"

    # Reject unknown fields instead of silently ignoring them.
    model_config = ConfigDict(
        extra="forbid",
    )
75
+
76
+
65
77
  class PipelineOptions(BaseModel):
66
78
  create_legacy_output: bool = (
67
79
  True # This defautl will be set to False on a future version of docling
@@ -74,11 +86,18 @@ class PdfPipelineOptions(PipelineOptions):
74
86
  do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
75
87
 
76
88
  table_structure_options: TableStructureOptions = TableStructureOptions()
77
- ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
78
- Field(EasyOcrOptions(), discriminator="kind")
79
- )
89
+ ocr_options: Union[
90
+ EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
91
+ ] = Field(EasyOcrOptions(), discriminator="kind")
80
92
 
81
93
  images_scale: float = 1.0
82
94
  generate_page_images: bool = False
83
95
  generate_picture_images: bool = False
84
- generate_table_images: bool = False
96
+ generate_table_images: bool = Field(
97
+ default=False,
98
+ deprecated=(
99
+ "Field `generate_table_images` is deprecated. "
100
+ "To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
101
+ "before conversion and then use the `TableItem.get_image` function."
102
+ ),
103
+ )
@@ -3,7 +3,7 @@ import sys
3
3
  import time
4
4
  from functools import partial
5
5
  from pathlib import Path
6
- from typing import Dict, Iterable, Iterator, List, Optional, Type
6
+ from typing import Dict, Iterable, Iterator, List, Optional, Type, Union
7
7
 
8
8
  from pydantic import BaseModel, ConfigDict, model_validator, validate_call
9
9
 
@@ -12,6 +12,7 @@ from docling.backend.asciidoc_backend import AsciiDocBackend
12
12
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
13
13
  from docling.backend.html_backend import HTMLDocumentBackend
14
14
  from docling.backend.md_backend import MarkdownDocumentBackend
15
+ from docling.backend.msexcel_backend import MsExcelDocumentBackend
15
16
  from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
16
17
  from docling.backend.msword_backend import MsWordDocumentBackend
17
18
  from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
@@ -44,6 +45,11 @@ class FormatOption(BaseModel):
44
45
  return self
45
46
 
46
47
 
48
class ExcelFormatOption(FormatOption):
    """Format option for XLSX input: simple pipeline with the Excel backend."""

    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
51
+
52
+
47
53
  class WordFormatOption(FormatOption):
48
54
  pipeline_cls: Type = SimplePipeline
49
55
  backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
@@ -80,6 +86,9 @@ class ImageFormatOption(FormatOption):
80
86
 
81
87
 
82
88
  _format_to_default_options = {
89
+ InputFormat.XLSX: FormatOption(
90
+ pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
91
+ ),
83
92
  InputFormat.DOCX: FormatOption(
84
93
  pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
85
94
  ),
@@ -146,7 +155,7 @@ class DocumentConverter:
146
155
  @validate_call(config=ConfigDict(strict=True))
147
156
  def convert(
148
157
  self,
149
- source: Path | str | DocumentStream, # TODO review naming
158
+ source: Union[Path, str, DocumentStream], # TODO review naming
150
159
  raises_on_error: bool = True,
151
160
  max_num_pages: int = sys.maxsize,
152
161
  max_file_size: int = sys.maxsize,
@@ -163,7 +172,7 @@ class DocumentConverter:
163
172
  @validate_call(config=ConfigDict(strict=True))
164
173
  def convert_all(
165
174
  self,
166
- source: Iterable[Path | str | DocumentStream], # TODO review naming
175
+ source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
167
176
  raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
168
177
  max_num_pages: int = sys.maxsize,
169
178
  max_file_size: int = sys.maxsize,
@@ -174,7 +183,7 @@ class DocumentConverter:
174
183
  )
175
184
  conv_input = _DocumentConversionInput(
176
185
  path_or_stream_iterator=source,
177
- limit=limits,
186
+ limits=limits,
178
187
  )
179
188
  conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
180
189
  for conv_res in conv_res_iter:
@@ -0,0 +1,118 @@
1
+ import logging
2
+ import tempfile
3
+ from typing import Iterable, Optional, Tuple
4
+
5
+ from docling_core.types.doc import BoundingBox, CoordOrigin
6
+
7
+ from docling.datamodel.base_models import OcrCell, Page
8
+ from docling.datamodel.document import ConversionResult
9
+ from docling.datamodel.pipeline_options import OcrMacOptions
10
+ from docling.datamodel.settings import settings
11
+ from docling.models.base_ocr_model import BaseOcrModel
12
+ from docling.utils.profiling import TimeRecorder
13
+
14
+ _log = logging.getLogger(__name__)
15
+
16
+
17
class OcrMacModel(BaseOcrModel):
    """OCR model that runs the ``ocrmac`` package on page regions."""

    def __init__(self, enabled: bool, options: OcrMacOptions):
        """Store the options and, when enabled, import the ``ocrmac`` package.

        Raises:
            ImportError: if the model is enabled and ``ocrmac`` cannot be imported.
        """
        super().__init__(enabled=enabled, options=options)
        self.options: OcrMacOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.

        if self.enabled:
            install_errmsg = (
                "ocrmac is not correctly installed. "
                "Please install it via `pip install ocrmac` to use this OCR engine. "
                "Alternatively, Docling has support for other OCR engines. See the documentation: "
                "https://ds4sd.github.io/docling/installation/"
            )
            # Imported lazily so the dependency is only needed when this engine is enabled.
            try:
                from ocrmac import ocrmac
            except ImportError:
                raise ImportError(install_errmsg)

            # Keep the OCR class; an instance is created per image in __call__.
            self.reader_RIL = ocrmac.OCR

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Run OCR on each page of the batch and yield the pages.

        Pages are passed through untouched when the model is disabled or the
        page backend is not valid.
        """

        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
            else:
                with TimeRecorder(conv_res, "ocr"):

                    ocr_rects = self.get_ocr_rects(page)

                    all_ocr_cells = []
                    for ocr_rect in ocr_rects:
                        # Skip zero area boxes
                        if ocr_rect.area() == 0:
                            continue
                        # Render only the region to OCR, upscaled by self.scale.
                        high_res_image = page._backend.get_page_image(
                            scale=self.scale, cropbox=ocr_rect
                        )

                        # The engine is given a file name, so the crop is saved
                        # to a temporary PNG that is removed when the block exits.
                        with tempfile.NamedTemporaryFile(
                            suffix=".png", mode="w"
                        ) as image_file:
                            fname = image_file.name
                            high_res_image.save(fname)

                            boxes = self.reader_RIL(
                                fname,
                                recognition_level=self.options.recognition,
                                framework=self.options.framework,
                                language_preference=self.options.lang,
                            ).recognize()

                        im_width, im_height = high_res_image.size
                        cells = []
                        # NOTE(review): ids restart at 0 for every ocr_rect - confirm
                        # that duplicate ids across rectangles are acceptable.
                        for ix, (text, confidence, box) in enumerate(boxes):
                            # The arithmetic below treats box as (x, y, w, h) fractions
                            # of the image size, with y measured from the bottom edge.
                            x = float(box[0])
                            y = float(box[1])
                            w = float(box[2])
                            h = float(box[3])

                            x1 = x * im_width
                            y2 = (1 - y) * im_height

                            x2 = x1 + w * im_width
                            y1 = y2 - h * im_height

                            # Undo the rendering scale.
                            # NOTE(review): these coordinates are relative to the
                            # cropped image; no ocr_rect offset is added here - confirm
                            # whether page coordinates are expected downstream.
                            left = x1 / self.scale
                            top = y1 / self.scale
                            right = x2 / self.scale
                            bottom = y2 / self.scale

                            cells.append(
                                OcrCell(
                                    id=ix,
                                    text=text,
                                    confidence=confidence,
                                    bbox=BoundingBox.from_tuple(
                                        coord=(left, top, right, bottom),
                                        origin=CoordOrigin.TOPLEFT,
                                    ),
                                )
                            )

                        # del high_res_image
                        all_ocr_cells.extend(cells)

                    # Post-process the cells
                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)

                # DEBUG code:
                if settings.debug.visualize_ocr:
                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)

                yield page
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ import sys
2
3
  from pathlib import Path
3
4
  from typing import Optional
4
5
 
@@ -10,6 +11,7 @@ from docling.datamodel.base_models import AssembledUnit, Page
10
11
  from docling.datamodel.document import ConversionResult
11
12
  from docling.datamodel.pipeline_options import (
12
13
  EasyOcrOptions,
14
+ OcrMacOptions,
13
15
  PdfPipelineOptions,
14
16
  TesseractCliOcrOptions,
15
17
  TesseractOcrOptions,
@@ -18,6 +20,7 @@ from docling.models.base_ocr_model import BaseOcrModel
18
20
  from docling.models.ds_glm_model import GlmModel, GlmOptions
19
21
  from docling.models.easyocr_model import EasyOcrModel
20
22
  from docling.models.layout_model import LayoutModel
23
+ from docling.models.ocr_mac_model import OcrMacModel
21
24
  from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
22
25
  from docling.models.page_preprocessing_model import (
23
26
  PagePreprocessingModel,
@@ -118,6 +121,15 @@ class StandardPdfPipeline(PaginatedPipeline):
118
121
  enabled=self.pipeline_options.do_ocr,
119
122
  options=self.pipeline_options.ocr_options,
120
123
  )
124
+ elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
125
+ if "darwin" != sys.platform:
126
+ raise RuntimeError(
127
+ f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}."
128
+ )
129
+ return OcrMacModel(
130
+ enabled=self.pipeline_options.do_ocr,
131
+ options=self.pipeline_options.ocr_options,
132
+ )
121
133
  return None
122
134
 
123
135
  def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.5.2
3
+ Version: 2.7.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
7
7
  Keywords: docling,convert,document,pdf,docx,html,markdown,layout model,segmentation,table structure,table former
8
8
  Author: Christoph Auer
9
9
  Author-email: cau@zurich.ibm.com
10
- Requires-Python: >=3.10,<4.0
10
+ Requires-Python: >=3.9,<4.0
11
11
  Classifier: Development Status :: 5 - Production/Stable
12
12
  Classifier: Intended Audience :: Developers
13
13
  Classifier: Intended Audience :: Science/Research
@@ -15,21 +15,25 @@ Classifier: License :: OSI Approved :: MIT License
15
15
  Classifier: Operating System :: MacOS :: MacOS X
16
16
  Classifier: Operating System :: POSIX :: Linux
17
17
  Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.9
18
19
  Classifier: Programming Language :: Python :: 3.10
19
20
  Classifier: Programming Language :: Python :: 3.11
20
21
  Classifier: Programming Language :: Python :: 3.12
21
22
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Provides-Extra: ocrmac
22
24
  Provides-Extra: tesserocr
23
25
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
24
26
  Requires-Dist: certifi (>=2024.7.4)
25
27
  Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
26
- Requires-Dist: docling-core (>=2.3.0,<3.0.0)
27
- Requires-Dist: docling-ibm-models (>=2.0.3,<3.0.0)
28
- Requires-Dist: docling-parse (>=2.0.2,<3.0.0)
28
+ Requires-Dist: docling-core (>=2.4.0,<3.0.0)
29
+ Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
30
+ Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
29
31
  Requires-Dist: easyocr (>=1.7,<2.0)
30
32
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
31
33
  Requires-Dist: huggingface_hub (>=0.23,<1)
32
34
  Requires-Dist: marko (>=2.1.2,<3.0.0)
35
+ Requires-Dist: ocrmac (>=1.0.0,<2.0.0) ; (sys_platform == "darwin") and (extra == "ocrmac")
36
+ Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
33
37
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
34
38
  Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
35
39
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
@@ -39,7 +43,7 @@ Requires-Dist: python-docx (>=1.1.2,<2.0.0)
39
43
  Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
40
44
  Requires-Dist: requests (>=2.32.3,<3.0.0)
41
45
  Requires-Dist: rtree (>=1.3.0,<2.0.0)
42
- Requires-Dist: scipy (>=1.14.1,<2.0.0)
46
+ Requires-Dist: scipy (>=1.6.0,<2.0.0)
43
47
  Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
44
48
  Requires-Dist: typer (>=0.12.5,<0.13.0)
45
49
  Project-URL: Repository, https://github.com/DS4SD/docling
@@ -60,7 +64,7 @@ Description-Content-Type: text/markdown
60
64
  [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
61
65
  [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
62
66
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
63
- ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
67
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/)
64
68
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
65
69
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
66
70
  [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
@@ -6,24 +6,26 @@ docling/backend/docling_parse_backend.py,sha256=csWy6ZGxDuZfNr0YTrUU40DXqelN_TJk
6
6
  docling/backend/docling_parse_v2_backend.py,sha256=gUr9_fwHbkj238oYQPJ9AxpjFL2jGvhjBlBQPblmSAg,8589
7
7
  docling/backend/html_backend.py,sha256=qbu1W8xoTGnXMuZPRPLq68hDbCEj6ygnpxP5gYaodAQ,15593
8
8
  docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
9
- docling/backend/mspowerpoint_backend.py,sha256=YaVJc6RXWmM1EPTp0TzAiXpGxu6K-MZdPNsmR_64LSg,15358
10
- docling/backend/msword_backend.py,sha256=IEqGz-lUrQw0tgBly_gv_mYGC0X0iNnGhkwnDWaDtBY,17341
9
+ docling/backend/msexcel_backend.py,sha256=23qUEScqr5GhY06xiqg-eBQ_JlAqO0FkPEmX6554sVA,12040
10
+ docling/backend/mspowerpoint_backend.py,sha256=QD0NaatTO8U9CIFoiipkq3X5HxLZaaahH8nlrQ6ecDA,15710
11
+ docling/backend/msword_backend.py,sha256=-cCEh4EhdGknHrxiVGFE4GDo_iYpAqP2QxRaeqrJHUE,17939
11
12
  docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
12
13
  docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
13
14
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
- docling/cli/main.py,sha256=7stF4dMjGVp5R0Gvcawm21rff5RbEQnWj8ZzoAHvV9k,9619
15
+ docling/cli/main.py,sha256=MpjbAXhOlbGnAnl5_OaKCdub61YPQBy1NOqroXQtNYE,10722
15
16
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- docling/datamodel/base_models.py,sha256=fmkS6iTxGZCTtNCo2zsgMmBC11Ogf2Ht-mNIlZ9GP-o,5375
17
+ docling/datamodel/base_models.py,sha256=6qlwPamDZ3XUsE2kTAyGKG6O2IJClVjCqaE7DZ74KHU,5533
17
18
  docling/datamodel/document.py,sha256=9dQf_J18X_MEWs-Mg3Ed6BykFPJ79ETmkkxcssY-vYo,20698
18
- docling/datamodel/pipeline_options.py,sha256=-PXwqkdwSpWjIMCxyqwB8Q453szVNR1zVM-7d0PAOWQ,2530
19
+ docling/datamodel/pipeline_options.py,sha256=aC_CmtEhNLIbn9n3JuYhL_aA8UA0vFgw7HcGMUuOI4o,3117
19
20
  docling/datamodel/settings.py,sha256=JK8lZPBjUx2kD2q-Qpg-o3vOElADMcyQbRUL0EHZ7us,1263
20
- docling/document_converter.py,sha256=U52_rZQDm2wzrnsuUrvsfX2MnmOWFFhjBzfS8tEvt6Y,10595
21
+ docling/document_converter.py,sha256=L0A3g7IQBaKIK7dWpUFC72ZqKywIPYkyh71Qd6DiNPE,10940
21
22
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
23
  docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
23
24
  docling/models/base_ocr_model.py,sha256=rGSpBF4dByITcsBaRIgvFKpiu0CrhmZS_PHIo686Dw0,6428
24
25
  docling/models/ds_glm_model.py,sha256=hBRCx6oFGhxBbKEJlRSWVndDwFtB5IpeLOowFAVqFM0,12033
25
26
  docling/models/easyocr_model.py,sha256=c2m4x9dZpSc-cMgeEdFBRVBlB78uMGlYD8Q_2gzRuMU,3734
26
27
  docling/models/layout_model.py,sha256=ZvbTSyxvXB5yLHNEti0Wv3trz0vwGuHySI5TCdApb0U,14011
28
+ docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
27
29
  docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th_eB-cLEk,7103
28
30
  docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
29
31
  docling/models/table_structure_model.py,sha256=-ANSQpiN2avt3B9sbi7dHcoULUJbMBalAR5xxlrM7To,8421
@@ -32,14 +34,14 @@ docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUs
32
34
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
35
  docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
34
36
  docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
35
- docling/pipeline/standard_pdf_pipeline.py,sha256=h59eA0CLMYuuJoH-0SyCRkYEregNs6i0pa46Ioqf8kU,7947
37
+ docling/pipeline/standard_pdf_pipeline.py,sha256=btm_y1ZsjUrtWvMbF6RA8BVM0ENrK4z_rqF0jjdeZmU,8473
36
38
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
39
  docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
38
40
  docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
39
41
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
40
42
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
41
- docling-2.5.2.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
42
- docling-2.5.2.dist-info/METADATA,sha256=oEAVaoncnXpewHqwn3rbOuszNifzG8s-TtWxhcnufzs,6530
43
- docling-2.5.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
44
- docling-2.5.2.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
45
- docling-2.5.2.dist-info/RECORD,,
43
+ docling-2.7.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
44
+ docling-2.7.0.dist-info/METADATA,sha256=6cpEQMbjK1tKCQ3kkzeOD7URm41HPx2xUSs-gxvlsM4,6761
45
+ docling-2.7.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
46
+ docling-2.7.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
47
+ docling-2.7.0.dist-info/RECORD,,