docling 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. docling/backend/asciidoc_backend.py +1 -1
  2. docling/backend/csv_backend.py +1 -1
  3. docling/backend/docling_parse_backend.py +21 -13
  4. docling/backend/docling_parse_v2_backend.py +20 -12
  5. docling/backend/docling_parse_v4_backend.py +192 -0
  6. docling/backend/docx/__init__.py +0 -0
  7. docling/backend/docx/latex/__init__.py +0 -0
  8. docling/backend/docx/latex/latex_dict.py +271 -0
  9. docling/backend/docx/latex/omml.py +453 -0
  10. docling/backend/html_backend.py +7 -7
  11. docling/backend/md_backend.py +1 -1
  12. docling/backend/msexcel_backend.py +2 -45
  13. docling/backend/mspowerpoint_backend.py +19 -1
  14. docling/backend/msword_backend.py +68 -3
  15. docling/backend/pdf_backend.py +7 -2
  16. docling/backend/pypdfium2_backend.py +52 -30
  17. docling/backend/xml/uspto_backend.py +1 -1
  18. docling/cli/main.py +135 -53
  19. docling/cli/models.py +1 -1
  20. docling/datamodel/base_models.py +8 -10
  21. docling/datamodel/pipeline_options.py +54 -32
  22. docling/document_converter.py +5 -5
  23. docling/models/base_model.py +9 -1
  24. docling/models/base_ocr_model.py +27 -16
  25. docling/models/easyocr_model.py +28 -13
  26. docling/models/factories/__init__.py +27 -0
  27. docling/models/factories/base_factory.py +122 -0
  28. docling/models/factories/ocr_factory.py +11 -0
  29. docling/models/factories/picture_description_factory.py +11 -0
  30. docling/models/hf_mlx_model.py +137 -0
  31. docling/models/ocr_mac_model.py +39 -11
  32. docling/models/page_preprocessing_model.py +4 -0
  33. docling/models/picture_description_api_model.py +20 -3
  34. docling/models/picture_description_base_model.py +19 -3
  35. docling/models/picture_description_vlm_model.py +14 -2
  36. docling/models/plugins/__init__.py +0 -0
  37. docling/models/plugins/defaults.py +28 -0
  38. docling/models/rapid_ocr_model.py +34 -13
  39. docling/models/table_structure_model.py +13 -4
  40. docling/models/tesseract_ocr_cli_model.py +40 -15
  41. docling/models/tesseract_ocr_model.py +37 -12
  42. docling/pipeline/standard_pdf_pipeline.py +25 -78
  43. docling/pipeline/vlm_pipeline.py +78 -398
  44. docling/utils/export.py +8 -6
  45. docling/utils/layout_postprocessor.py +26 -23
  46. docling/utils/visualization.py +1 -1
  47. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/METADATA +47 -23
  48. docling-2.28.0.dist-info/RECORD +84 -0
  49. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/entry_points.txt +3 -0
  50. docling-2.26.0.dist-info/RECORD +0 -72
  51. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/LICENSE +0 -0
  52. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/WHEEL +0 -0
docling/backend/msword_backend.py CHANGED
@@ -26,6 +26,7 @@ from PIL import Image, UnidentifiedImageError
26
26
  from typing_extensions import override
27
27
 
28
28
  from docling.backend.abstract_backend import DeclarativeDocumentBackend
29
+ from docling.backend.docx.latex.omml import oMath2Latex
29
30
  from docling.datamodel.base_models import InputFormat
30
31
  from docling.datamodel.document import InputDocument
31
32
 
@@ -260,6 +261,27 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
260
261
  else:
261
262
  return label, None
262
263
 
264
+ def handle_equations_in_text(self, element, text):
265
+ only_texts = []
266
+ only_equations = []
267
+ texts_and_equations = []
268
+ for subt in element.iter():
269
+ tag_name = etree.QName(subt).localname
270
+ if tag_name == "t" and "math" not in subt.tag:
271
+ only_texts.append(subt.text)
272
+ texts_and_equations.append(subt.text)
273
+ elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
274
+ latex_equation = str(oMath2Latex(subt))
275
+ only_equations.append(latex_equation)
276
+ texts_and_equations.append(latex_equation)
277
+
278
+ if "".join(only_texts).strip() != text.strip():
279
+ # If we are not able to reconstruct the initial raw text
280
+ # do not try to parse equations and return the original
281
+ return text, []
282
+
283
+ return "".join(texts_and_equations), only_equations
284
+
263
285
  def handle_text_elements(
264
286
  self,
265
287
  element: BaseOxmlElement,
@@ -268,9 +290,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
268
290
  ) -> None:
269
291
  paragraph = Paragraph(element, docx_obj)
270
292
 
271
- if paragraph.text is None:
293
+ raw_text = paragraph.text
294
+ text, equations = self.handle_equations_in_text(element=element, text=raw_text)
295
+
296
+ if text is None:
272
297
  return
273
- text = paragraph.text.strip()
298
+ text = text.strip()
274
299
 
275
300
  # Common styles for bullet and numbered lists.
276
301
  # "List Bullet", "List Number", "List Paragraph"
@@ -323,6 +348,46 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
323
348
  elif "Heading" in p_style_id:
324
349
  self.add_header(doc, p_level, text)
325
350
 
351
+ elif len(equations) > 0:
352
+ if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
353
+ # Standalone equation
354
+ level = self.get_level()
355
+ doc.add_text(
356
+ label=DocItemLabel.FORMULA,
357
+ parent=self.parents[level - 1],
358
+ text=text,
359
+ )
360
+ else:
361
+ # Inline equation
362
+ level = self.get_level()
363
+ inline_equation = doc.add_group(
364
+ label=GroupLabel.INLINE, parent=self.parents[level - 1]
365
+ )
366
+ text_tmp = text
367
+ for eq in equations:
368
+ if len(text_tmp) == 0:
369
+ break
370
+
371
+ pre_eq_text = text_tmp.split(eq, maxsplit=1)[0]
372
+ text_tmp = text_tmp.split(eq, maxsplit=1)[1]
373
+ if len(pre_eq_text) > 0:
374
+ doc.add_text(
375
+ label=DocItemLabel.PARAGRAPH,
376
+ parent=inline_equation,
377
+ text=pre_eq_text,
378
+ )
379
+ doc.add_text(
380
+ label=DocItemLabel.FORMULA,
381
+ parent=inline_equation,
382
+ text=eq,
383
+ )
384
+ if len(text_tmp) > 0:
385
+ doc.add_text(
386
+ label=DocItemLabel.PARAGRAPH,
387
+ parent=inline_equation,
388
+ text=text_tmp,
389
+ )
390
+
326
391
  elif p_style_id in [
327
392
  "Paragraph",
328
393
  "Normal",
@@ -539,7 +604,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
539
604
  end_row_offset_idx=row.grid_cols_before + spanned_idx,
540
605
  start_col_offset_idx=col_idx,
541
606
  end_col_offset_idx=col_idx + cell.grid_span,
542
- col_header=False,
607
+ column_header=row.grid_cols_before + row_idx == 0,
543
608
  row_header=False,
544
609
  )
545
610
  data.table_cells.append(table_cell)
docling/backend/pdf_backend.py CHANGED
@@ -4,10 +4,11 @@ from pathlib import Path
4
4
  from typing import Iterable, Optional, Set, Union
5
5
 
6
6
  from docling_core.types.doc import BoundingBox, Size
7
+ from docling_core.types.doc.page import SegmentedPdfPage, TextCell
7
8
  from PIL import Image
8
9
 
9
10
  from docling.backend.abstract_backend import PaginatedDocumentBackend
10
- from docling.datamodel.base_models import Cell, InputFormat
11
+ from docling.datamodel.base_models import InputFormat
11
12
  from docling.datamodel.document import InputDocument
12
13
 
13
14
 
@@ -17,7 +18,11 @@ class PdfPageBackend(ABC):
17
18
  pass
18
19
 
19
20
  @abstractmethod
20
- def get_text_cells(self) -> Iterable[Cell]:
21
+ def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
22
+ pass
23
+
24
+ @abstractmethod
25
+ def get_text_cells(self) -> Iterable[TextCell]:
21
26
  pass
22
27
 
23
28
  @abstractmethod
docling/backend/pypdfium2_backend.py CHANGED
@@ -7,12 +7,12 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
7
7
  import pypdfium2 as pdfium
8
8
  import pypdfium2.raw as pdfium_c
9
9
  from docling_core.types.doc import BoundingBox, CoordOrigin, Size
10
+ from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
10
11
  from PIL import Image, ImageDraw
11
12
  from pypdfium2 import PdfTextPage
12
13
  from pypdfium2._helpers.misc import PdfiumError
13
14
 
14
15
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
15
- from docling.datamodel.base_models import Cell
16
16
  from docling.utils.locks import pypdfium2_lock
17
17
 
18
18
  if TYPE_CHECKING:
@@ -68,7 +68,10 @@ class PyPdfiumPageBackend(PdfPageBackend):
68
68
 
69
69
  return text_piece
70
70
 
71
- def get_text_cells(self) -> Iterable[Cell]:
71
+ def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
72
+ return None
73
+
74
+ def get_text_cells(self) -> Iterable[TextCell]:
72
75
  with pypdfium2_lock:
73
76
  if not self.text_page:
74
77
  self.text_page = self._ppage.get_textpage()
@@ -84,11 +87,19 @@ class PyPdfiumPageBackend(PdfPageBackend):
84
87
  text_piece = self.text_page.get_text_bounded(*rect)
85
88
  x0, y0, x1, y1 = rect
86
89
  cells.append(
87
- Cell(
88
- id=cell_counter,
90
+ TextCell(
91
+ index=cell_counter,
89
92
  text=text_piece,
90
- bbox=BoundingBox(
91
- l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
93
+ orig=text_piece,
94
+ from_ocr=False,
95
+ rect=BoundingRectangle.from_bounding_box(
96
+ BoundingBox(
97
+ l=x0,
98
+ b=y0,
99
+ r=x1,
100
+ t=y1,
101
+ coord_origin=CoordOrigin.BOTTOMLEFT,
102
+ )
92
103
  ).to_top_left_origin(page_size.height),
93
104
  )
94
105
  )
@@ -97,51 +108,56 @@ class PyPdfiumPageBackend(PdfPageBackend):
97
108
  # PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
98
109
  # The cell merging code below is to clean this up.
99
110
  def merge_horizontal_cells(
100
- cells: List[Cell],
111
+ cells: List[TextCell],
101
112
  horizontal_threshold_factor: float = 1.0,
102
113
  vertical_threshold_factor: float = 0.5,
103
- ) -> List[Cell]:
114
+ ) -> List[TextCell]:
104
115
  if not cells:
105
116
  return []
106
117
 
107
- def group_rows(cells: List[Cell]) -> List[List[Cell]]:
118
+ def group_rows(cells: List[TextCell]) -> List[List[TextCell]]:
108
119
  rows = []
109
120
  current_row = [cells[0]]
110
- row_top = cells[0].bbox.t
111
- row_bottom = cells[0].bbox.b
112
- row_height = cells[0].bbox.height
121
+ row_top = cells[0].rect.to_bounding_box().t
122
+ row_bottom = cells[0].rect.to_bounding_box().b
123
+ row_height = cells[0].rect.to_bounding_box().height
113
124
 
114
125
  for cell in cells[1:]:
115
126
  vertical_threshold = row_height * vertical_threshold_factor
116
127
  if (
117
- abs(cell.bbox.t - row_top) <= vertical_threshold
118
- and abs(cell.bbox.b - row_bottom) <= vertical_threshold
128
+ abs(cell.rect.to_bounding_box().t - row_top)
129
+ <= vertical_threshold
130
+ and abs(cell.rect.to_bounding_box().b - row_bottom)
131
+ <= vertical_threshold
119
132
  ):
120
133
  current_row.append(cell)
121
- row_top = min(row_top, cell.bbox.t)
122
- row_bottom = max(row_bottom, cell.bbox.b)
134
+ row_top = min(row_top, cell.rect.to_bounding_box().t)
135
+ row_bottom = max(row_bottom, cell.rect.to_bounding_box().b)
123
136
  row_height = row_bottom - row_top
124
137
  else:
125
138
  rows.append(current_row)
126
139
  current_row = [cell]
127
- row_top = cell.bbox.t
128
- row_bottom = cell.bbox.b
129
- row_height = cell.bbox.height
140
+ row_top = cell.rect.to_bounding_box().t
141
+ row_bottom = cell.rect.to_bounding_box().b
142
+ row_height = cell.rect.to_bounding_box().height
130
143
 
131
144
  if current_row:
132
145
  rows.append(current_row)
133
146
 
134
147
  return rows
135
148
 
136
- def merge_row(row: List[Cell]) -> List[Cell]:
149
+ def merge_row(row: List[TextCell]) -> List[TextCell]:
137
150
  merged = []
138
151
  current_group = [row[0]]
139
152
 
140
153
  for cell in row[1:]:
141
154
  prev_cell = current_group[-1]
142
- avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
155
+ avg_height = (
156
+ prev_cell.rect.height + cell.rect.to_bounding_box().height
157
+ ) / 2
143
158
  if (
144
- cell.bbox.l - prev_cell.bbox.r
159
+ cell.rect.to_bounding_box().l
160
+ - prev_cell.rect.to_bounding_box().r
145
161
  <= avg_height * horizontal_threshold_factor
146
162
  ):
147
163
  current_group.append(cell)
@@ -154,24 +170,30 @@ class PyPdfiumPageBackend(PdfPageBackend):
154
170
 
155
171
  return merged
156
172
 
157
- def merge_group(group: List[Cell]) -> Cell:
173
+ def merge_group(group: List[TextCell]) -> TextCell:
158
174
  if len(group) == 1:
159
175
  return group[0]
160
176
 
161
177
  merged_text = "".join(cell.text for cell in group)
162
178
  merged_bbox = BoundingBox(
163
- l=min(cell.bbox.l for cell in group),
164
- t=min(cell.bbox.t for cell in group),
165
- r=max(cell.bbox.r for cell in group),
166
- b=max(cell.bbox.b for cell in group),
179
+ l=min(cell.rect.to_bounding_box().l for cell in group),
180
+ t=min(cell.rect.to_bounding_box().t for cell in group),
181
+ r=max(cell.rect.to_bounding_box().r for cell in group),
182
+ b=max(cell.rect.to_bounding_box().b for cell in group),
183
+ )
184
+ return TextCell(
185
+ index=group[0].index,
186
+ text=merged_text,
187
+ orig=merged_text,
188
+ rect=BoundingRectangle.from_bounding_box(merged_bbox),
189
+ from_ocr=False,
167
190
  )
168
- return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)
169
191
 
170
192
  rows = group_rows(cells)
171
193
  merged_cells = [cell for row in rows for cell in merge_row(row)]
172
194
 
173
195
  for i, cell in enumerate(merged_cells, 1):
174
- cell.id = i
196
+ cell.index = i
175
197
 
176
198
  return merged_cells
177
199
 
@@ -181,7 +203,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
181
203
  ) # make new image to avoid drawing on the saved ones
182
204
  draw = ImageDraw.Draw(image)
183
205
  for c in cells:
184
- x0, y0, x1, y1 = c.bbox.as_tuple()
206
+ x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
185
207
  cell_color = (
186
208
  random.randint(30, 140),
187
209
  random.randint(30, 140),
docling/backend/xml/uspto_backend.py CHANGED
@@ -999,7 +999,7 @@ class PatentUsptoGrantAps(PatentUspto):
999
999
  parent=self.parents[self.level],
1000
1000
  )
1001
1001
 
1002
- last_claim.text += f" {value}" if last_claim.text else value
1002
+ last_claim.text += f" {value.strip()}" if last_claim.text else value.strip()
1003
1003
 
1004
1004
  elif field == self.Field.CAPTION.value and section in (
1005
1005
  self.Section.SUMMARY.value,
docling/cli/main.py CHANGED
@@ -9,6 +9,7 @@ import warnings
9
9
  from pathlib import Path
10
10
  from typing import Annotated, Dict, Iterable, List, Optional, Type
11
11
 
12
+ import rich.table
12
13
  import typer
13
14
  from docling_core.types.doc import ImageRefMode
14
15
  from docling_core.utils.file import resolve_source_to_path
@@ -16,6 +17,7 @@ from pydantic import TypeAdapter
16
17
 
17
18
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
18
19
  from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
20
+ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
19
21
  from docling.backend.pdf_backend import PdfDocumentBackend
20
22
  from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
21
23
  from docling.datamodel.base_models import (
@@ -29,18 +31,22 @@ from docling.datamodel.pipeline_options import (
29
31
  AcceleratorDevice,
30
32
  AcceleratorOptions,
31
33
  EasyOcrOptions,
32
- OcrEngine,
33
- OcrMacOptions,
34
34
  OcrOptions,
35
+ PaginatedPipelineOptions,
35
36
  PdfBackend,
37
+ PdfPipeline,
36
38
  PdfPipelineOptions,
37
- RapidOcrOptions,
38
39
  TableFormerMode,
39
- TesseractCliOcrOptions,
40
- TesseractOcrOptions,
40
+ VlmModelType,
41
+ VlmPipelineOptions,
42
+ granite_vision_vlm_conversion_options,
43
+ smoldocling_vlm_conversion_options,
44
+ smoldocling_vlm_mlx_conversion_options,
41
45
  )
42
46
  from docling.datamodel.settings import settings
43
47
  from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
48
+ from docling.models.factories import get_ocr_factory
49
+ from docling.pipeline.vlm_pipeline import VlmPipeline
44
50
 
45
51
  warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
46
52
  warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -48,8 +54,11 @@ warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr
48
54
  _log = logging.getLogger(__name__)
49
55
  from rich.console import Console
50
56
 
57
+ console = Console()
51
58
  err_console = Console(stderr=True)
52
59
 
60
+ ocr_factory_internal = get_ocr_factory(allow_external_plugins=False)
61
+ ocr_engines_enum_internal = ocr_factory_internal.get_enum()
53
62
 
54
63
  app = typer.Typer(
55
64
  name="Docling",
@@ -77,6 +86,24 @@ def version_callback(value: bool):
77
86
  raise typer.Exit()
78
87
 
79
88
 
89
+ def show_external_plugins_callback(value: bool):
90
+ if value:
91
+ ocr_factory_all = get_ocr_factory(allow_external_plugins=True)
92
+ table = rich.table.Table(title="Available OCR engines")
93
+ table.add_column("Name", justify="right")
94
+ table.add_column("Plugin")
95
+ table.add_column("Package")
96
+ for meta in ocr_factory_all.registered_meta.values():
97
+ if not meta.module.startswith("docling."):
98
+ table.add_row(
99
+ f"[bold]{meta.kind}[/bold]",
100
+ meta.plugin_name,
101
+ meta.module.split(".")[0],
102
+ )
103
+ rich.print(table)
104
+ raise typer.Exit()
105
+
106
+
80
107
  def export_documents(
81
108
  conv_results: Iterable[ConversionResult],
82
109
  output_dir: Path,
@@ -181,6 +208,14 @@ def convert(
181
208
  help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
182
209
  ),
183
210
  ] = ImageRefMode.EMBEDDED,
211
+ pipeline: Annotated[
212
+ PdfPipeline,
213
+ typer.Option(..., help="Choose the pipeline to process PDF or image files."),
214
+ ] = PdfPipeline.STANDARD,
215
+ vlm_model: Annotated[
216
+ VlmModelType,
217
+ typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
218
+ ] = VlmModelType.SMOLDOCLING,
184
219
  ocr: Annotated[
185
220
  bool,
186
221
  typer.Option(
@@ -195,8 +230,16 @@ def convert(
195
230
  ),
196
231
  ] = False,
197
232
  ocr_engine: Annotated[
198
- OcrEngine, typer.Option(..., help="The OCR engine to use.")
199
- ] = OcrEngine.EASYOCR,
233
+ str,
234
+ typer.Option(
235
+ ...,
236
+ help=(
237
+ f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
238
+ f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
239
+ f"Use the option --show-external-plugins to see the options allowed with external plugins."
240
+ ),
241
+ ),
242
+ ] = EasyOcrOptions.kind,
200
243
  ocr_lang: Annotated[
201
244
  Optional[str],
202
245
  typer.Option(
@@ -240,6 +283,21 @@ def convert(
240
283
  ..., help="Must be enabled when using models connecting to remote services."
241
284
  ),
242
285
  ] = False,
286
+ allow_external_plugins: Annotated[
287
+ bool,
288
+ typer.Option(
289
+ ..., help="Must be enabled for loading modules from third-party plugins."
290
+ ),
291
+ ] = False,
292
+ show_external_plugins: Annotated[
293
+ bool,
294
+ typer.Option(
295
+ ...,
296
+ help="List the third-party plugins which are available when the option --allow-external-plugins is set.",
297
+ callback=show_external_plugins_callback,
298
+ is_eager=True,
299
+ ),
300
+ ] = False,
243
301
  abort_on_error: Annotated[
244
302
  bool,
245
303
  typer.Option(
@@ -367,64 +425,88 @@ def convert(
367
425
  export_txt = OutputFormat.TEXT in to_formats
368
426
  export_doctags = OutputFormat.DOCTAGS in to_formats
369
427
 
370
- if ocr_engine == OcrEngine.EASYOCR:
371
- ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
372
- elif ocr_engine == OcrEngine.TESSERACT_CLI:
373
- ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
374
- elif ocr_engine == OcrEngine.TESSERACT:
375
- ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
376
- elif ocr_engine == OcrEngine.OCRMAC:
377
- ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
378
- elif ocr_engine == OcrEngine.RAPIDOCR:
379
- ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
380
- else:
381
- raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
428
+ ocr_factory = get_ocr_factory(allow_external_plugins=allow_external_plugins)
429
+ ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore
430
+ kind=ocr_engine,
431
+ force_full_page_ocr=force_ocr,
432
+ )
382
433
 
383
434
  ocr_lang_list = _split_list(ocr_lang)
384
435
  if ocr_lang_list is not None:
385
436
  ocr_options.lang = ocr_lang_list
386
437
 
387
438
  accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
388
- pipeline_options = PdfPipelineOptions(
389
- enable_remote_services=enable_remote_services,
390
- accelerator_options=accelerator_options,
391
- do_ocr=ocr,
392
- ocr_options=ocr_options,
393
- do_table_structure=True,
394
- do_code_enrichment=enrich_code,
395
- do_formula_enrichment=enrich_formula,
396
- do_picture_description=enrich_picture_description,
397
- do_picture_classification=enrich_picture_classes,
398
- document_timeout=document_timeout,
399
- )
400
- pipeline_options.table_structure_options.do_cell_matching = (
401
- True # do_cell_matching
402
- )
403
- pipeline_options.table_structure_options.mode = table_mode
439
+ pipeline_options: PaginatedPipelineOptions
440
+
441
+ if pipeline == PdfPipeline.STANDARD:
442
+ pipeline_options = PdfPipelineOptions(
443
+ allow_external_plugins=allow_external_plugins,
444
+ enable_remote_services=enable_remote_services,
445
+ accelerator_options=accelerator_options,
446
+ do_ocr=ocr,
447
+ ocr_options=ocr_options,
448
+ do_table_structure=True,
449
+ do_code_enrichment=enrich_code,
450
+ do_formula_enrichment=enrich_formula,
451
+ do_picture_description=enrich_picture_description,
452
+ do_picture_classification=enrich_picture_classes,
453
+ document_timeout=document_timeout,
454
+ )
455
+ pipeline_options.table_structure_options.do_cell_matching = (
456
+ True # do_cell_matching
457
+ )
458
+ pipeline_options.table_structure_options.mode = table_mode
404
459
 
405
- if image_export_mode != ImageRefMode.PLACEHOLDER:
406
- pipeline_options.generate_page_images = True
407
- pipeline_options.generate_picture_images = (
408
- True # FIXME: to be deprecated in verson 3
460
+ if image_export_mode != ImageRefMode.PLACEHOLDER:
461
+ pipeline_options.generate_page_images = True
462
+ pipeline_options.generate_picture_images = (
463
+ True # FIXME: to be deprecated in verson 3
464
+ )
465
+ pipeline_options.images_scale = 2
466
+
467
+ backend: Type[PdfDocumentBackend]
468
+ if pdf_backend == PdfBackend.DLPARSE_V1:
469
+ backend = DoclingParseDocumentBackend
470
+ elif pdf_backend == PdfBackend.DLPARSE_V2:
471
+ backend = DoclingParseV2DocumentBackend
472
+ elif pdf_backend == PdfBackend.DLPARSE_V4:
473
+ backend = DoclingParseV4DocumentBackend # type: ignore
474
+ elif pdf_backend == PdfBackend.PYPDFIUM2:
475
+ backend = PyPdfiumDocumentBackend # type: ignore
476
+ else:
477
+ raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
478
+
479
+ pdf_format_option = PdfFormatOption(
480
+ pipeline_options=pipeline_options,
481
+ backend=backend, # pdf_backend
482
+ )
483
+ elif pipeline == PdfPipeline.VLM:
484
+ pipeline_options = VlmPipelineOptions()
485
+
486
+ if vlm_model == VlmModelType.GRANITE_VISION:
487
+ pipeline_options.vlm_options = granite_vision_vlm_conversion_options
488
+ elif vlm_model == VlmModelType.SMOLDOCLING:
489
+ pipeline_options.vlm_options = smoldocling_vlm_conversion_options
490
+ if sys.platform == "darwin":
491
+ try:
492
+ import mlx_vlm
493
+
494
+ pipeline_options.vlm_options = (
495
+ smoldocling_vlm_mlx_conversion_options
496
+ )
497
+ except ImportError:
498
+ _log.warning(
499
+ "To run SmolDocling faster, please install mlx-vlm:\n"
500
+ "pip install mlx-vlm"
501
+ )
502
+
503
+ pdf_format_option = PdfFormatOption(
504
+ pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
409
505
  )
410
- pipeline_options.images_scale = 2
411
506
 
412
507
  if artifacts_path is not None:
413
508
  pipeline_options.artifacts_path = artifacts_path
414
509
 
415
- if pdf_backend == PdfBackend.DLPARSE_V1:
416
- backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
417
- elif pdf_backend == PdfBackend.DLPARSE_V2:
418
- backend = DoclingParseV2DocumentBackend
419
- elif pdf_backend == PdfBackend.PYPDFIUM2:
420
- backend = PyPdfiumDocumentBackend
421
- else:
422
- raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
423
-
424
- pdf_format_option = PdfFormatOption(
425
- pipeline_options=pipeline_options,
426
- backend=backend, # pdf_backend
427
- )
428
510
  format_options: Dict[InputFormat, FormatOption] = {
429
511
  InputFormat.PDF: pdf_format_option,
430
512
  InputFormat.IMAGE: pdf_format_option,
docling/cli/models.py CHANGED
@@ -121,7 +121,7 @@ def download(
121
121
  "Using the CLI:",
122
122
  f"`docling --artifacts-path={output_dir} FILE`",
123
123
  "\n",
124
- "Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
124
+ "Using Python: see the documentation at <https://docling-project.github.io/docling/usage>.",
125
125
  )
126
126
 
127
127
 
docling/datamodel/base_models.py CHANGED
@@ -9,6 +9,7 @@ from docling_core.types.doc import (
9
9
  Size,
10
10
  TableCell,
11
11
  )
12
+ from docling_core.types.doc.page import SegmentedPdfPage, TextCell
12
13
  from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
13
14
  DocumentStream,
14
15
  )
@@ -123,14 +124,10 @@ class ErrorItem(BaseModel):
123
124
  error_message: str
124
125
 
125
126
 
126
- class Cell(BaseModel):
127
- id: int
128
- text: str
129
- bbox: BoundingBox
130
-
131
-
132
- class OcrCell(Cell):
133
- confidence: float
127
+ # class Cell(BaseModel):
128
+ # id: int
129
+ # text: str
130
+ # bbox: BoundingBox
134
131
 
135
132
 
136
133
  class Cluster(BaseModel):
@@ -138,7 +135,7 @@ class Cluster(BaseModel):
138
135
  label: DocItemLabel
139
136
  bbox: BoundingBox
140
137
  confidence: float = 1.0
141
- cells: List[Cell] = []
138
+ cells: List[TextCell] = []
142
139
  children: List["Cluster"] = [] # Add child cluster support
143
140
 
144
141
 
@@ -226,7 +223,8 @@ class Page(BaseModel):
226
223
  page_no: int
227
224
  # page_hash: Optional[str] = None
228
225
  size: Optional[Size] = None
229
- cells: List[Cell] = []
226
+ cells: List[TextCell] = []
227
+ parsed_page: Optional[SegmentedPdfPage] = None
230
228
  predictions: PagePredictions = PagePredictions()
231
229
  assembled: Optional[AssembledUnit] = None
232
230