docling 2.26.0__py3-none-any.whl → 2.27.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. docling/backend/asciidoc_backend.py +1 -1
  2. docling/backend/csv_backend.py +1 -1
  3. docling/backend/docling_parse_backend.py +21 -13
  4. docling/backend/docling_parse_v2_backend.py +20 -12
  5. docling/backend/docling_parse_v4_backend.py +185 -0
  6. docling/backend/docx/__init__.py +0 -0
  7. docling/backend/docx/latex/__init__.py +0 -0
  8. docling/backend/docx/latex/latex_dict.py +271 -0
  9. docling/backend/docx/latex/omml.py +453 -0
  10. docling/backend/html_backend.py +7 -7
  11. docling/backend/md_backend.py +1 -1
  12. docling/backend/msexcel_backend.py +2 -45
  13. docling/backend/mspowerpoint_backend.py +1 -1
  14. docling/backend/msword_backend.py +65 -3
  15. docling/backend/pdf_backend.py +7 -2
  16. docling/backend/pypdfium2_backend.py +52 -30
  17. docling/backend/xml/uspto_backend.py +1 -1
  18. docling/cli/main.py +60 -21
  19. docling/cli/models.py +1 -1
  20. docling/datamodel/base_models.py +8 -10
  21. docling/datamodel/pipeline_options.py +26 -30
  22. docling/document_converter.py +5 -5
  23. docling/models/base_model.py +9 -1
  24. docling/models/base_ocr_model.py +27 -16
  25. docling/models/easyocr_model.py +28 -13
  26. docling/models/factories/__init__.py +27 -0
  27. docling/models/factories/base_factory.py +122 -0
  28. docling/models/factories/ocr_factory.py +11 -0
  29. docling/models/factories/picture_description_factory.py +11 -0
  30. docling/models/ocr_mac_model.py +39 -11
  31. docling/models/page_preprocessing_model.py +4 -0
  32. docling/models/picture_description_api_model.py +20 -3
  33. docling/models/picture_description_base_model.py +19 -3
  34. docling/models/picture_description_vlm_model.py +14 -2
  35. docling/models/plugins/__init__.py +0 -0
  36. docling/models/plugins/defaults.py +28 -0
  37. docling/models/rapid_ocr_model.py +34 -13
  38. docling/models/table_structure_model.py +13 -4
  39. docling/models/tesseract_ocr_cli_model.py +40 -15
  40. docling/models/tesseract_ocr_model.py +37 -12
  41. docling/pipeline/standard_pdf_pipeline.py +25 -78
  42. docling/utils/export.py +8 -6
  43. docling/utils/layout_postprocessor.py +26 -23
  44. docling/utils/visualization.py +1 -1
  45. {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/METADATA +48 -19
  46. docling-2.27.0.dist-info/RECORD +83 -0
  47. {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/entry_points.txt +3 -0
  48. docling-2.26.0.dist-info/RECORD +0 -72
  49. {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/LICENSE +0 -0
  50. {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/WHEEL +0 -0
@@ -7,12 +7,12 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
7
7
  import pypdfium2 as pdfium
8
8
  import pypdfium2.raw as pdfium_c
9
9
  from docling_core.types.doc import BoundingBox, CoordOrigin, Size
10
+ from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
10
11
  from PIL import Image, ImageDraw
11
12
  from pypdfium2 import PdfTextPage
12
13
  from pypdfium2._helpers.misc import PdfiumError
13
14
 
14
15
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
15
- from docling.datamodel.base_models import Cell
16
16
  from docling.utils.locks import pypdfium2_lock
17
17
 
18
18
  if TYPE_CHECKING:
@@ -68,7 +68,10 @@ class PyPdfiumPageBackend(PdfPageBackend):
68
68
 
69
69
  return text_piece
70
70
 
71
- def get_text_cells(self) -> Iterable[Cell]:
71
+ def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
72
+ return None
73
+
74
+ def get_text_cells(self) -> Iterable[TextCell]:
72
75
  with pypdfium2_lock:
73
76
  if not self.text_page:
74
77
  self.text_page = self._ppage.get_textpage()
@@ -84,11 +87,19 @@ class PyPdfiumPageBackend(PdfPageBackend):
84
87
  text_piece = self.text_page.get_text_bounded(*rect)
85
88
  x0, y0, x1, y1 = rect
86
89
  cells.append(
87
- Cell(
88
- id=cell_counter,
90
+ TextCell(
91
+ index=cell_counter,
89
92
  text=text_piece,
90
- bbox=BoundingBox(
91
- l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
93
+ orig=text_piece,
94
+ from_ocr=False,
95
+ rect=BoundingRectangle.from_bounding_box(
96
+ BoundingBox(
97
+ l=x0,
98
+ b=y0,
99
+ r=x1,
100
+ t=y1,
101
+ coord_origin=CoordOrigin.BOTTOMLEFT,
102
+ )
92
103
  ).to_top_left_origin(page_size.height),
93
104
  )
94
105
  )
@@ -97,51 +108,56 @@ class PyPdfiumPageBackend(PdfPageBackend):
97
108
  # PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
98
109
  # The cell merging code below is to clean this up.
99
110
  def merge_horizontal_cells(
100
- cells: List[Cell],
111
+ cells: List[TextCell],
101
112
  horizontal_threshold_factor: float = 1.0,
102
113
  vertical_threshold_factor: float = 0.5,
103
- ) -> List[Cell]:
114
+ ) -> List[TextCell]:
104
115
  if not cells:
105
116
  return []
106
117
 
107
- def group_rows(cells: List[Cell]) -> List[List[Cell]]:
118
+ def group_rows(cells: List[TextCell]) -> List[List[TextCell]]:
108
119
  rows = []
109
120
  current_row = [cells[0]]
110
- row_top = cells[0].bbox.t
111
- row_bottom = cells[0].bbox.b
112
- row_height = cells[0].bbox.height
121
+ row_top = cells[0].rect.to_bounding_box().t
122
+ row_bottom = cells[0].rect.to_bounding_box().b
123
+ row_height = cells[0].rect.to_bounding_box().height
113
124
 
114
125
  for cell in cells[1:]:
115
126
  vertical_threshold = row_height * vertical_threshold_factor
116
127
  if (
117
- abs(cell.bbox.t - row_top) <= vertical_threshold
118
- and abs(cell.bbox.b - row_bottom) <= vertical_threshold
128
+ abs(cell.rect.to_bounding_box().t - row_top)
129
+ <= vertical_threshold
130
+ and abs(cell.rect.to_bounding_box().b - row_bottom)
131
+ <= vertical_threshold
119
132
  ):
120
133
  current_row.append(cell)
121
- row_top = min(row_top, cell.bbox.t)
122
- row_bottom = max(row_bottom, cell.bbox.b)
134
+ row_top = min(row_top, cell.rect.to_bounding_box().t)
135
+ row_bottom = max(row_bottom, cell.rect.to_bounding_box().b)
123
136
  row_height = row_bottom - row_top
124
137
  else:
125
138
  rows.append(current_row)
126
139
  current_row = [cell]
127
- row_top = cell.bbox.t
128
- row_bottom = cell.bbox.b
129
- row_height = cell.bbox.height
140
+ row_top = cell.rect.to_bounding_box().t
141
+ row_bottom = cell.rect.to_bounding_box().b
142
+ row_height = cell.rect.to_bounding_box().height
130
143
 
131
144
  if current_row:
132
145
  rows.append(current_row)
133
146
 
134
147
  return rows
135
148
 
136
- def merge_row(row: List[Cell]) -> List[Cell]:
149
+ def merge_row(row: List[TextCell]) -> List[TextCell]:
137
150
  merged = []
138
151
  current_group = [row[0]]
139
152
 
140
153
  for cell in row[1:]:
141
154
  prev_cell = current_group[-1]
142
- avg_height = (prev_cell.bbox.height + cell.bbox.height) / 2
155
+ avg_height = (
156
+ prev_cell.rect.height + cell.rect.to_bounding_box().height
157
+ ) / 2
143
158
  if (
144
- cell.bbox.l - prev_cell.bbox.r
159
+ cell.rect.to_bounding_box().l
160
+ - prev_cell.rect.to_bounding_box().r
145
161
  <= avg_height * horizontal_threshold_factor
146
162
  ):
147
163
  current_group.append(cell)
@@ -154,24 +170,30 @@ class PyPdfiumPageBackend(PdfPageBackend):
154
170
 
155
171
  return merged
156
172
 
157
- def merge_group(group: List[Cell]) -> Cell:
173
+ def merge_group(group: List[TextCell]) -> TextCell:
158
174
  if len(group) == 1:
159
175
  return group[0]
160
176
 
161
177
  merged_text = "".join(cell.text for cell in group)
162
178
  merged_bbox = BoundingBox(
163
- l=min(cell.bbox.l for cell in group),
164
- t=min(cell.bbox.t for cell in group),
165
- r=max(cell.bbox.r for cell in group),
166
- b=max(cell.bbox.b for cell in group),
179
+ l=min(cell.rect.to_bounding_box().l for cell in group),
180
+ t=min(cell.rect.to_bounding_box().t for cell in group),
181
+ r=max(cell.rect.to_bounding_box().r for cell in group),
182
+ b=max(cell.rect.to_bounding_box().b for cell in group),
183
+ )
184
+ return TextCell(
185
+ index=group[0].index,
186
+ text=merged_text,
187
+ orig=merged_text,
188
+ rect=BoundingRectangle.from_bounding_box(merged_bbox),
189
+ from_ocr=False,
167
190
  )
168
- return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)
169
191
 
170
192
  rows = group_rows(cells)
171
193
  merged_cells = [cell for row in rows for cell in merge_row(row)]
172
194
 
173
195
  for i, cell in enumerate(merged_cells, 1):
174
- cell.id = i
196
+ cell.index = i
175
197
 
176
198
  return merged_cells
177
199
 
@@ -181,7 +203,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
181
203
  ) # make new image to avoid drawing on the saved ones
182
204
  draw = ImageDraw.Draw(image)
183
205
  for c in cells:
184
- x0, y0, x1, y1 = c.bbox.as_tuple()
206
+ x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
185
207
  cell_color = (
186
208
  random.randint(30, 140),
187
209
  random.randint(30, 140),
@@ -999,7 +999,7 @@ class PatentUsptoGrantAps(PatentUspto):
999
999
  parent=self.parents[self.level],
1000
1000
  )
1001
1001
 
1002
- last_claim.text += f" {value}" if last_claim.text else value
1002
+ last_claim.text += f" {value.strip()}" if last_claim.text else value.strip()
1003
1003
 
1004
1004
  elif field == self.Field.CAPTION.value and section in (
1005
1005
  self.Section.SUMMARY.value,
docling/cli/main.py CHANGED
@@ -9,6 +9,7 @@ import warnings
9
9
  from pathlib import Path
10
10
  from typing import Annotated, Dict, Iterable, List, Optional, Type
11
11
 
12
+ import rich.table
12
13
  import typer
13
14
  from docling_core.types.doc import ImageRefMode
14
15
  from docling_core.utils.file import resolve_source_to_path
@@ -16,6 +17,7 @@ from pydantic import TypeAdapter
16
17
 
17
18
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
18
19
  from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
20
+ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
19
21
  from docling.backend.pdf_backend import PdfDocumentBackend
20
22
  from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
21
23
  from docling.datamodel.base_models import (
@@ -29,18 +31,14 @@ from docling.datamodel.pipeline_options import (
29
31
  AcceleratorDevice,
30
32
  AcceleratorOptions,
31
33
  EasyOcrOptions,
32
- OcrEngine,
33
- OcrMacOptions,
34
34
  OcrOptions,
35
35
  PdfBackend,
36
36
  PdfPipelineOptions,
37
- RapidOcrOptions,
38
37
  TableFormerMode,
39
- TesseractCliOcrOptions,
40
- TesseractOcrOptions,
41
38
  )
42
39
  from docling.datamodel.settings import settings
43
40
  from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
41
+ from docling.models.factories import get_ocr_factory
44
42
 
45
43
  warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
46
44
  warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -48,8 +46,11 @@ warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr
48
46
  _log = logging.getLogger(__name__)
49
47
  from rich.console import Console
50
48
 
49
+ console = Console()
51
50
  err_console = Console(stderr=True)
52
51
 
52
+ ocr_factory_internal = get_ocr_factory(allow_external_plugins=False)
53
+ ocr_engines_enum_internal = ocr_factory_internal.get_enum()
53
54
 
54
55
  app = typer.Typer(
55
56
  name="Docling",
@@ -77,6 +78,24 @@ def version_callback(value: bool):
77
78
  raise typer.Exit()
78
79
 
79
80
 
81
+ def show_external_plugins_callback(value: bool):
82
+ if value:
83
+ ocr_factory_all = get_ocr_factory(allow_external_plugins=True)
84
+ table = rich.table.Table(title="Available OCR engines")
85
+ table.add_column("Name", justify="right")
86
+ table.add_column("Plugin")
87
+ table.add_column("Package")
88
+ for meta in ocr_factory_all.registered_meta.values():
89
+ if not meta.module.startswith("docling."):
90
+ table.add_row(
91
+ f"[bold]{meta.kind}[/bold]",
92
+ meta.plugin_name,
93
+ meta.module.split(".")[0],
94
+ )
95
+ rich.print(table)
96
+ raise typer.Exit()
97
+
98
+
80
99
  def export_documents(
81
100
  conv_results: Iterable[ConversionResult],
82
101
  output_dir: Path,
@@ -195,8 +214,16 @@ def convert(
195
214
  ),
196
215
  ] = False,
197
216
  ocr_engine: Annotated[
198
- OcrEngine, typer.Option(..., help="The OCR engine to use.")
199
- ] = OcrEngine.EASYOCR,
217
+ str,
218
+ typer.Option(
219
+ ...,
220
+ help=(
221
+ f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
222
+ f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
223
+ f"Use the option --show-external-plugins to see the options allowed with external plugins."
224
+ ),
225
+ ),
226
+ ] = EasyOcrOptions.kind,
200
227
  ocr_lang: Annotated[
201
228
  Optional[str],
202
229
  typer.Option(
@@ -240,6 +267,21 @@ def convert(
240
267
  ..., help="Must be enabled when using models connecting to remote services."
241
268
  ),
242
269
  ] = False,
270
+ allow_external_plugins: Annotated[
271
+ bool,
272
+ typer.Option(
273
+ ..., help="Must be enabled for loading modules from third-party plugins."
274
+ ),
275
+ ] = False,
276
+ show_external_plugins: Annotated[
277
+ bool,
278
+ typer.Option(
279
+ ...,
280
+ help="List the third-party plugins which are available when the option --allow-external-plugins is set.",
281
+ callback=show_external_plugins_callback,
282
+ is_eager=True,
283
+ ),
284
+ ] = False,
243
285
  abort_on_error: Annotated[
244
286
  bool,
245
287
  typer.Option(
@@ -367,18 +409,11 @@ def convert(
367
409
  export_txt = OutputFormat.TEXT in to_formats
368
410
  export_doctags = OutputFormat.DOCTAGS in to_formats
369
411
 
370
- if ocr_engine == OcrEngine.EASYOCR:
371
- ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
372
- elif ocr_engine == OcrEngine.TESSERACT_CLI:
373
- ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
374
- elif ocr_engine == OcrEngine.TESSERACT:
375
- ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
376
- elif ocr_engine == OcrEngine.OCRMAC:
377
- ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
378
- elif ocr_engine == OcrEngine.RAPIDOCR:
379
- ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
380
- else:
381
- raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
412
+ ocr_factory = get_ocr_factory(allow_external_plugins=allow_external_plugins)
413
+ ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore
414
+ kind=ocr_engine,
415
+ force_full_page_ocr=force_ocr,
416
+ )
382
417
 
383
418
  ocr_lang_list = _split_list(ocr_lang)
384
419
  if ocr_lang_list is not None:
@@ -386,6 +421,7 @@ def convert(
386
421
 
387
422
  accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
388
423
  pipeline_options = PdfPipelineOptions(
424
+ allow_external_plugins=allow_external_plugins,
389
425
  enable_remote_services=enable_remote_services,
390
426
  accelerator_options=accelerator_options,
391
427
  do_ocr=ocr,
@@ -412,12 +448,15 @@ def convert(
412
448
  if artifacts_path is not None:
413
449
  pipeline_options.artifacts_path = artifacts_path
414
450
 
451
+ backend: Type[PdfDocumentBackend]
415
452
  if pdf_backend == PdfBackend.DLPARSE_V1:
416
- backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
453
+ backend = DoclingParseDocumentBackend
417
454
  elif pdf_backend == PdfBackend.DLPARSE_V2:
418
455
  backend = DoclingParseV2DocumentBackend
456
+ elif pdf_backend == PdfBackend.DLPARSE_V4:
457
+ backend = DoclingParseV4DocumentBackend # type: ignore
419
458
  elif pdf_backend == PdfBackend.PYPDFIUM2:
420
- backend = PyPdfiumDocumentBackend
459
+ backend = PyPdfiumDocumentBackend # type: ignore
421
460
  else:
422
461
  raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
423
462
 
docling/cli/models.py CHANGED
@@ -121,7 +121,7 @@ def download(
121
121
  "Using the CLI:",
122
122
  f"`docling --artifacts-path={output_dir} FILE`",
123
123
  "\n",
124
- "Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
124
+ "Using Python: see the documentation at <https://docling-project.github.io/docling/usage>.",
125
125
  )
126
126
 
127
127
 
@@ -9,6 +9,7 @@ from docling_core.types.doc import (
9
9
  Size,
10
10
  TableCell,
11
11
  )
12
+ from docling_core.types.doc.page import SegmentedPdfPage, TextCell
12
13
  from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
13
14
  DocumentStream,
14
15
  )
@@ -123,14 +124,10 @@ class ErrorItem(BaseModel):
123
124
  error_message: str
124
125
 
125
126
 
126
- class Cell(BaseModel):
127
- id: int
128
- text: str
129
- bbox: BoundingBox
130
-
131
-
132
- class OcrCell(Cell):
133
- confidence: float
127
+ # class Cell(BaseModel):
128
+ # id: int
129
+ # text: str
130
+ # bbox: BoundingBox
134
131
 
135
132
 
136
133
  class Cluster(BaseModel):
@@ -138,7 +135,7 @@ class Cluster(BaseModel):
138
135
  label: DocItemLabel
139
136
  bbox: BoundingBox
140
137
  confidence: float = 1.0
141
- cells: List[Cell] = []
138
+ cells: List[TextCell] = []
142
139
  children: List["Cluster"] = [] # Add child cluster support
143
140
 
144
141
 
@@ -226,7 +223,8 @@ class Page(BaseModel):
226
223
  page_no: int
227
224
  # page_hash: Optional[str] = None
228
225
  size: Optional[Size] = None
229
- cells: List[Cell] = []
226
+ cells: List[TextCell] = []
227
+ parsed_page: Optional[SegmentedPdfPage] = None
230
228
  predictions: PagePredictions = PagePredictions()
231
229
  assembled: Optional[AssembledUnit] = None
232
230
 
@@ -1,10 +1,9 @@
1
1
  import logging
2
2
  import os
3
3
  import re
4
- import warnings
5
4
  from enum import Enum
6
5
  from pathlib import Path
7
- from typing import Annotated, Any, Dict, List, Literal, Optional, Union
6
+ from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
8
7
 
9
8
  from pydantic import (
10
9
  AnyUrl,
@@ -13,13 +12,8 @@ from pydantic import (
13
12
  Field,
14
13
  field_validator,
15
14
  model_validator,
16
- validator,
17
- )
18
- from pydantic_settings import (
19
- BaseSettings,
20
- PydanticBaseSettingsSource,
21
- SettingsConfigDict,
22
15
  )
16
+ from pydantic_settings import BaseSettings, SettingsConfigDict
23
17
  from typing_extensions import deprecated
24
18
 
25
19
  _log = logging.getLogger(__name__)
@@ -83,6 +77,12 @@ class AcceleratorOptions(BaseSettings):
83
77
  return data
84
78
 
85
79
 
80
+ class BaseOptions(BaseModel):
81
+ """Base class for options."""
82
+
83
+ kind: ClassVar[str]
84
+
85
+
86
86
  class TableFormerMode(str, Enum):
87
87
  """Modes for the TableFormer model."""
88
88
 
@@ -102,10 +102,9 @@ class TableStructureOptions(BaseModel):
102
102
  mode: TableFormerMode = TableFormerMode.ACCURATE
103
103
 
104
104
 
105
- class OcrOptions(BaseModel):
105
+ class OcrOptions(BaseOptions):
106
106
  """OCR options."""
107
107
 
108
- kind: str
109
108
  lang: List[str]
110
109
  force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
111
110
  bitmap_area_threshold: float = (
@@ -116,7 +115,7 @@ class OcrOptions(BaseModel):
116
115
  class RapidOcrOptions(OcrOptions):
117
116
  """Options for the RapidOCR engine."""
118
117
 
119
- kind: Literal["rapidocr"] = "rapidocr"
118
+ kind: ClassVar[Literal["rapidocr"]] = "rapidocr"
120
119
 
121
120
  # English and chinese are the most commly used models and have been tested with RapidOCR.
122
121
  lang: List[str] = [
@@ -155,7 +154,7 @@ class RapidOcrOptions(OcrOptions):
155
154
  class EasyOcrOptions(OcrOptions):
156
155
  """Options for the EasyOCR engine."""
157
156
 
158
- kind: Literal["easyocr"] = "easyocr"
157
+ kind: ClassVar[Literal["easyocr"]] = "easyocr"
159
158
  lang: List[str] = ["fr", "de", "es", "en"]
160
159
 
161
160
  use_gpu: Optional[bool] = None
@@ -175,7 +174,7 @@ class EasyOcrOptions(OcrOptions):
175
174
  class TesseractCliOcrOptions(OcrOptions):
176
175
  """Options for the TesseractCli engine."""
177
176
 
178
- kind: Literal["tesseract"] = "tesseract"
177
+ kind: ClassVar[Literal["tesseract"]] = "tesseract"
179
178
  lang: List[str] = ["fra", "deu", "spa", "eng"]
180
179
  tesseract_cmd: str = "tesseract"
181
180
  path: Optional[str] = None
@@ -188,7 +187,7 @@ class TesseractCliOcrOptions(OcrOptions):
188
187
  class TesseractOcrOptions(OcrOptions):
189
188
  """Options for the Tesseract engine."""
190
189
 
191
- kind: Literal["tesserocr"] = "tesserocr"
190
+ kind: ClassVar[Literal["tesserocr"]] = "tesserocr"
192
191
  lang: List[str] = ["fra", "deu", "spa", "eng"]
193
192
  path: Optional[str] = None
194
193
 
@@ -200,7 +199,7 @@ class TesseractOcrOptions(OcrOptions):
200
199
  class OcrMacOptions(OcrOptions):
201
200
  """Options for the Mac OCR engine."""
202
201
 
203
- kind: Literal["ocrmac"] = "ocrmac"
202
+ kind: ClassVar[Literal["ocrmac"]] = "ocrmac"
204
203
  lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
205
204
  recognition: str = "accurate"
206
205
  framework: str = "vision"
@@ -210,8 +209,7 @@ class OcrMacOptions(OcrOptions):
210
209
  )
211
210
 
212
211
 
213
- class PictureDescriptionBaseOptions(BaseModel):
214
- kind: str
212
+ class PictureDescriptionBaseOptions(BaseOptions):
215
213
  batch_size: int = 8
216
214
  scale: float = 2
217
215
 
@@ -221,7 +219,7 @@ class PictureDescriptionBaseOptions(BaseModel):
221
219
 
222
220
 
223
221
  class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
224
- kind: Literal["api"] = "api"
222
+ kind: ClassVar[Literal["api"]] = "api"
225
223
 
226
224
  url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
227
225
  headers: Dict[str, str] = {}
@@ -233,7 +231,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
233
231
 
234
232
 
235
233
  class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
236
- kind: Literal["vlm"] = "vlm"
234
+ kind: ClassVar[Literal["vlm"]] = "vlm"
237
235
 
238
236
  repo_id: str
239
237
  prompt: str = "Describe this image in a few sentences."
@@ -301,9 +299,11 @@ class PdfBackend(str, Enum):
301
299
  PYPDFIUM2 = "pypdfium2"
302
300
  DLPARSE_V1 = "dlparse_v1"
303
301
  DLPARSE_V2 = "dlparse_v2"
302
+ DLPARSE_V4 = "dlparse_v4"
304
303
 
305
304
 
306
305
  # Define an enum for the ocr engines
306
+ @deprecated("Use ocr_factory.registered_enum")
307
307
  class OcrEngine(str, Enum):
308
308
  """Enum of valid OCR engines."""
309
309
 
@@ -323,6 +323,7 @@ class PipelineOptions(BaseModel):
323
323
  document_timeout: Optional[float] = None
324
324
  accelerator_options: AcceleratorOptions = AcceleratorOptions()
325
325
  enable_remote_services: bool = False
326
+ allow_external_plugins: bool = False
326
327
 
327
328
 
328
329
  class PaginatedPipelineOptions(PipelineOptions):
@@ -358,17 +359,10 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
358
359
  # If True, text from backend will be used instead of generated text
359
360
 
360
361
  table_structure_options: TableStructureOptions = TableStructureOptions()
361
- ocr_options: Union[
362
- EasyOcrOptions,
363
- TesseractCliOcrOptions,
364
- TesseractOcrOptions,
365
- OcrMacOptions,
366
- RapidOcrOptions,
367
- ] = Field(EasyOcrOptions(), discriminator="kind")
368
- picture_description_options: Annotated[
369
- Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
370
- Field(discriminator="kind"),
371
- ] = smolvlm_picture_description
362
+ ocr_options: OcrOptions = EasyOcrOptions()
363
+ picture_description_options: PictureDescriptionBaseOptions = (
364
+ smolvlm_picture_description
365
+ )
372
366
 
373
367
  images_scale: float = 1.0
374
368
  generate_page_images: bool = False
@@ -381,3 +375,5 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
381
375
  "before conversion and then use the `TableItem.get_image` function."
382
376
  ),
383
377
  )
378
+
379
+ generate_parsed_pages: bool = False
@@ -11,7 +11,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
11
11
  from docling.backend.abstract_backend import AbstractDocumentBackend
12
12
  from docling.backend.asciidoc_backend import AsciiDocBackend
13
13
  from docling.backend.csv_backend import CsvDocumentBackend
14
- from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
14
+ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
15
15
  from docling.backend.html_backend import HTMLDocumentBackend
16
16
  from docling.backend.json.docling_json_backend import DoclingJSONBackend
17
17
  from docling.backend.md_backend import MarkdownDocumentBackend
@@ -109,12 +109,12 @@ class XMLJatsFormatOption(FormatOption):
109
109
 
110
110
  class ImageFormatOption(FormatOption):
111
111
  pipeline_cls: Type = StandardPdfPipeline
112
- backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
112
+ backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
113
113
 
114
114
 
115
115
  class PdfFormatOption(FormatOption):
116
116
  pipeline_cls: Type = StandardPdfPipeline
117
- backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
117
+ backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
118
118
 
119
119
 
120
120
  def _get_default_option(format: InputFormat) -> FormatOption:
@@ -147,10 +147,10 @@ def _get_default_option(format: InputFormat) -> FormatOption:
147
147
  pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
148
148
  ),
149
149
  InputFormat.IMAGE: FormatOption(
150
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
150
+ pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
151
151
  ),
152
152
  InputFormat.PDF: FormatOption(
153
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
153
+ pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
154
154
  ),
155
155
  InputFormat.JSON_DOCLING: FormatOption(
156
156
  pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
@@ -1,14 +1,22 @@
1
1
  from abc import ABC, abstractmethod
2
- from typing import Any, Generic, Iterable, Optional
2
+ from typing import Any, Generic, Iterable, Optional, Protocol, Type
3
3
 
4
4
  from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
5
5
  from typing_extensions import TypeVar
6
6
 
7
7
  from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
8
8
  from docling.datamodel.document import ConversionResult
9
+ from docling.datamodel.pipeline_options import BaseOptions
9
10
  from docling.datamodel.settings import settings
10
11
 
11
12
 
13
+ class BaseModelWithOptions(Protocol):
14
+ @classmethod
15
+ def get_options_type(cls) -> Type[BaseOptions]: ...
16
+
17
+ def __init__(self, *, options: BaseOptions, **kwargs): ...
18
+
19
+
12
20
  class BasePageModel(ABC):
13
21
  @abstractmethod
14
22
  def __call__(