docling 2.57.0__py3-none-any.whl → 2.59.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (35)
  1. docling/backend/abstract_backend.py +24 -3
  2. docling/backend/asciidoc_backend.py +3 -3
  3. docling/backend/docling_parse_v4_backend.py +15 -4
  4. docling/backend/html_backend.py +130 -20
  5. docling/backend/md_backend.py +27 -5
  6. docling/backend/msexcel_backend.py +121 -29
  7. docling/backend/mspowerpoint_backend.py +2 -2
  8. docling/backend/msword_backend.py +18 -18
  9. docling/backend/pdf_backend.py +9 -2
  10. docling/backend/pypdfium2_backend.py +12 -3
  11. docling/cli/main.py +104 -38
  12. docling/datamodel/asr_model_specs.py +408 -6
  13. docling/datamodel/backend_options.py +82 -0
  14. docling/datamodel/base_models.py +19 -2
  15. docling/datamodel/document.py +81 -48
  16. docling/datamodel/pipeline_options_asr_model.py +21 -1
  17. docling/datamodel/pipeline_options_vlm_model.py +1 -0
  18. docling/document_converter.py +37 -45
  19. docling/document_extractor.py +12 -11
  20. docling/models/api_vlm_model.py +5 -3
  21. docling/models/picture_description_vlm_model.py +5 -1
  22. docling/models/readingorder_model.py +6 -7
  23. docling/models/vlm_models_inline/hf_transformers_model.py +13 -3
  24. docling/models/vlm_models_inline/mlx_model.py +9 -3
  25. docling/models/vlm_models_inline/nuextract_transformers_model.py +13 -3
  26. docling/models/vlm_models_inline/vllm_model.py +42 -8
  27. docling/pipeline/asr_pipeline.py +149 -6
  28. docling/utils/api_image_request.py +20 -9
  29. docling/utils/layout_postprocessor.py +23 -24
  30. {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/METADATA +11 -8
  31. {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/RECORD +35 -34
  32. {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/WHEEL +0 -0
  33. {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/entry_points.txt +0 -0
  34. {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/licenses/LICENSE +0 -0
  35. {docling-2.57.0.dist-info → docling-2.59.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,82 @@
1
+ from pathlib import PurePath
2
+ from typing import Annotated, Literal, Optional, Union
3
+
4
+ from pydantic import AnyUrl, BaseModel, Field, SecretStr
5
+
6
+
7
+ class BaseBackendOptions(BaseModel):
8
+ """Common options for all declarative document backends."""
9
+
10
+ enable_remote_fetch: bool = Field(
11
+ False, description="Enable remote resource fetching."
12
+ )
13
+ enable_local_fetch: bool = Field(
14
+ False, description="Enable local resource fetching."
15
+ )
16
+
17
+
18
+ class DeclarativeBackendOptions(BaseBackendOptions):
19
+ """Default backend options for a declarative document backend."""
20
+
21
+ kind: Literal["declarative"] = Field("declarative", exclude=True, repr=False)
22
+
23
+
24
+ class HTMLBackendOptions(BaseBackendOptions):
25
+ """Options specific to the HTML backend.
26
+
27
+ This class can be extended to include options specific to HTML processing.
28
+ """
29
+
30
+ kind: Literal["html"] = Field("html", exclude=True, repr=False)
31
+ fetch_images: bool = Field(
32
+ False,
33
+ description=(
34
+ "Whether the backend should access remote or local resources to parse "
35
+ "images in an HTML document."
36
+ ),
37
+ )
38
+ source_uri: Optional[Union[AnyUrl, PurePath]] = Field(
39
+ None,
40
+ description=(
41
+ "The URI that originates the HTML document. If provided, the backend "
42
+ "will use it to resolve relative paths in the HTML document."
43
+ ),
44
+ )
45
+
46
+
47
+ class MarkdownBackendOptions(BaseBackendOptions):
48
+ """Options specific to the Markdown backend."""
49
+
50
+ kind: Literal["md"] = Field("md", exclude=True, repr=False)
51
+ fetch_images: bool = Field(
52
+ False,
53
+ description=(
54
+ "Whether the backend should access remote or local resources to parse "
55
+ "images in the markdown document."
56
+ ),
57
+ )
58
+ source_uri: Optional[Union[AnyUrl, PurePath]] = Field(
59
+ None,
60
+ description=(
61
+ "The URI that originates the markdown document. If provided, the backend "
62
+ "will use it to resolve relative paths in the markdown document."
63
+ ),
64
+ )
65
+
66
+
67
+ class PdfBackendOptions(BaseBackendOptions):
68
+ """Backend options for pdf document backends."""
69
+
70
+ kind: Literal["pdf"] = Field("pdf", exclude=True, repr=False)
71
+ password: Optional[SecretStr] = None
72
+
73
+
74
+ BackendOptions = Annotated[
75
+ Union[
76
+ DeclarativeBackendOptions,
77
+ HTMLBackendOptions,
78
+ MarkdownBackendOptions,
79
+ PdfBackendOptions,
80
+ ],
81
+ Field(discriminator="kind"),
82
+ ]
@@ -94,7 +94,7 @@ FormatToExtensions: dict[InputFormat, list[str]] = {
94
94
  InputFormat.XML_USPTO: ["xml", "txt"],
95
95
  InputFormat.METS_GBS: ["tar.gz"],
96
96
  InputFormat.JSON_DOCLING: ["json"],
97
- InputFormat.AUDIO: ["wav", "mp3"],
97
+ InputFormat.AUDIO: ["wav", "mp3", "m4a", "aac", "ogg", "flac", "mp4", "avi", "mov"],
98
98
  InputFormat.VTT: ["vtt"],
99
99
  }
100
100
 
@@ -128,7 +128,22 @@ FormatToMimeType: dict[InputFormat, list[str]] = {
128
128
  InputFormat.XML_USPTO: ["application/xml", "text/plain"],
129
129
  InputFormat.METS_GBS: ["application/mets+xml"],
130
130
  InputFormat.JSON_DOCLING: ["application/json"],
131
- InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
131
+ InputFormat.AUDIO: [
132
+ "audio/x-wav",
133
+ "audio/mpeg",
134
+ "audio/wav",
135
+ "audio/mp3",
136
+ "audio/mp4",
137
+ "audio/m4a",
138
+ "audio/aac",
139
+ "audio/ogg",
140
+ "audio/flac",
141
+ "audio/x-flac",
142
+ "video/mp4",
143
+ "video/avi",
144
+ "video/x-msvideo",
145
+ "video/quicktime",
146
+ ],
132
147
  InputFormat.VTT: ["text/vtt"],
133
148
  }
134
149
 
@@ -192,6 +207,8 @@ class VlmPrediction(BaseModel):
192
207
  text: str = ""
193
208
  generated_tokens: list[VlmPredictionToken] = []
194
209
  generation_time: float = -1
210
+ num_tokens: Optional[int] = None
211
+ stop_reason: Optional[str] = None # todo define an enum for possible stop reasons
195
212
 
196
213
 
197
214
  class ContainerElement(
@@ -8,14 +8,12 @@ from io import BytesIO
8
8
  from pathlib import Path, PurePath
9
9
  from typing import (
10
10
  TYPE_CHECKING,
11
- Any,
12
- Dict,
13
- List,
11
+ Annotated,
14
12
  Literal,
15
13
  Optional,
16
- Set,
17
14
  Type,
18
15
  Union,
16
+ cast,
19
17
  )
20
18
 
21
19
  import filetype
@@ -54,8 +52,10 @@ from typing_extensions import deprecated
54
52
 
55
53
  from docling.backend.abstract_backend import (
56
54
  AbstractDocumentBackend,
55
+ DeclarativeDocumentBackend,
57
56
  PaginatedDocumentBackend,
58
57
  )
58
+ from docling.datamodel.backend_options import BackendOptions
59
59
  from docling.datamodel.base_models import (
60
60
  AssembledUnit,
61
61
  ConfidenceReport,
@@ -74,6 +74,7 @@ from docling.utils.utils import create_file_hash
74
74
 
75
75
  if TYPE_CHECKING:
76
76
  from docling.datamodel.base_models import BaseFormatOption
77
+ from docling.document_converter import FormatOption
77
78
 
78
79
  _log = logging.getLogger(__name__)
79
80
 
@@ -102,29 +103,46 @@ _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
102
103
 
103
104
 
104
105
  class InputDocument(BaseModel):
105
- file: PurePath
106
- document_hash: str # = None
107
- valid: bool = True
108
- limits: DocumentLimits = DocumentLimits()
109
- format: InputFormat # = None
110
-
111
- filesize: Optional[int] = None
112
- page_count: int = 0
113
-
114
- _backend: AbstractDocumentBackend # Internal PDF backend used
106
+ """A document as an input of a Docling conversion."""
107
+
108
+ file: Annotated[
109
+ PurePath, Field(description="A path representation the input document.")
110
+ ]
111
+ document_hash: Annotated[
112
+ str,
113
+ Field(description="A stable hash of the path or stream of the input document."),
114
+ ]
115
+ valid: bool = Field(True, description="Whether this is is a valid input document.")
116
+ backend_options: Optional[BackendOptions] = Field(
117
+ None, description="Custom options for backends."
118
+ )
119
+ limits: DocumentLimits = Field(
120
+ DocumentLimits(), description="Limits in the input document for the conversion."
121
+ )
122
+ format: Annotated[InputFormat, Field(description="The document format.")]
123
+
124
+ filesize: Optional[int] = Field(
125
+ None, description="Size of the input file, in bytes."
126
+ )
127
+ page_count: int = Field(0, description="Number of pages in the input document.")
128
+
129
+ _backend: AbstractDocumentBackend
115
130
 
116
131
  def __init__(
117
132
  self,
118
133
  path_or_stream: Union[BytesIO, Path],
119
134
  format: InputFormat,
120
135
  backend: Type[AbstractDocumentBackend],
136
+ backend_options: Optional[BackendOptions] = None,
121
137
  filename: Optional[str] = None,
122
138
  limits: Optional[DocumentLimits] = None,
123
- ):
139
+ ) -> None:
124
140
  super().__init__(
125
- file="", document_hash="", format=InputFormat.PDF
141
+ file="",
142
+ document_hash="",
143
+ format=InputFormat.PDF,
144
+ backend_options=backend_options,
126
145
  ) # initialize with dummy values
127
-
128
146
  self.limits = limits or DocumentLimits()
129
147
  self.format = format
130
148
 
@@ -140,7 +158,8 @@ class InputDocument(BaseModel):
140
158
 
141
159
  elif isinstance(path_or_stream, BytesIO):
142
160
  assert filename is not None, (
143
- "Can't construct InputDocument from stream without providing filename arg."
161
+ "Can't construct InputDocument from stream without providing "
162
+ "filename arg."
144
163
  )
145
164
  self.file = PurePath(filename)
146
165
  self.filesize = path_or_stream.getbuffer().nbytes
@@ -175,7 +194,8 @@ class InputDocument(BaseModel):
175
194
  except RuntimeError as e:
176
195
  self.valid = False
177
196
  _log.exception(
178
- f"An unexpected error occurred while opening the document {self.file.name}",
197
+ "An unexpected error occurred while opening the document "
198
+ "f{self.file.name}",
179
199
  exc_info=e,
180
200
  )
181
201
  # raise
@@ -185,7 +205,15 @@ class InputDocument(BaseModel):
185
205
  backend: Type[AbstractDocumentBackend],
186
206
  path_or_stream: Union[BytesIO, Path],
187
207
  ) -> None:
188
- self._backend = backend(self, path_or_stream=path_or_stream)
208
+ if self.backend_options:
209
+ self._backend = backend(
210
+ self,
211
+ path_or_stream=path_or_stream,
212
+ options=self.backend_options,
213
+ )
214
+ else:
215
+ self._backend = backend(self, path_or_stream=path_or_stream)
216
+
189
217
  if not self._backend.is_valid():
190
218
  self.valid = False
191
219
 
@@ -199,11 +227,11 @@ class ConversionResult(BaseModel):
199
227
  input: InputDocument
200
228
 
201
229
  status: ConversionStatus = ConversionStatus.PENDING # failure, success
202
- errors: List[ErrorItem] = [] # structure to keep errors
230
+ errors: list[ErrorItem] = [] # structure to keep errors
203
231
 
204
- pages: List[Page] = []
232
+ pages: list[Page] = []
205
233
  assembled: AssembledUnit = AssembledUnit()
206
- timings: Dict[str, ProfilingItem] = {}
234
+ timings: dict[str, ProfilingItem] = {}
207
235
  confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
208
236
 
209
237
  document: DoclingDocument = _EMPTY_DOCLING_DOC
@@ -222,7 +250,7 @@ class _DummyBackend(AbstractDocumentBackend):
222
250
  return False
223
251
 
224
252
  @classmethod
225
- def supported_formats(cls) -> Set[InputFormat]:
253
+ def supported_formats(cls) -> set[InputFormat]:
226
254
  return set()
227
255
 
228
256
  @classmethod
@@ -235,7 +263,7 @@ class _DummyBackend(AbstractDocumentBackend):
235
263
 
236
264
  class _DocumentConversionInput(BaseModel):
237
265
  path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
238
- headers: Optional[Dict[str, str]] = None
266
+ headers: Optional[dict[str, str]] = None
239
267
  limits: Optional[DocumentLimits] = DocumentLimits()
240
268
 
241
269
  def docs(
@@ -250,33 +278,36 @@ class _DocumentConversionInput(BaseModel):
250
278
  )
251
279
  format = self._guess_format(obj)
252
280
  backend: Type[AbstractDocumentBackend]
253
- if format not in format_options.keys():
281
+ backend_options: Optional[BackendOptions] = None
282
+ if not format or format not in format_options:
254
283
  _log.error(
255
- f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
284
+ f"Input document {obj.name} with format {format} does not match "
285
+ f"any allowed format: ({format_options.keys()})"
256
286
  )
257
287
  backend = _DummyBackend
258
288
  else:
259
- backend = format_options[format].backend
289
+ options = format_options[format]
290
+ backend = options.backend
291
+ if "backend_options" in options.model_fields_set:
292
+ backend_options = cast("FormatOption", options).backend_options
260
293
 
294
+ path_or_stream: Union[BytesIO, Path]
261
295
  if isinstance(obj, Path):
262
- yield InputDocument(
263
- path_or_stream=obj,
264
- format=format, # type: ignore[arg-type]
265
- filename=obj.name,
266
- limits=self.limits,
267
- backend=backend,
268
- )
296
+ path_or_stream = obj
269
297
  elif isinstance(obj, DocumentStream):
270
- yield InputDocument(
271
- path_or_stream=obj.stream,
272
- format=format, # type: ignore[arg-type]
273
- filename=obj.name,
274
- limits=self.limits,
275
- backend=backend,
276
- )
298
+ path_or_stream = obj.stream
277
299
  else:
278
300
  raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
279
301
 
302
+ yield InputDocument(
303
+ path_or_stream=path_or_stream,
304
+ format=format, # type: ignore[arg-type]
305
+ filename=obj.name,
306
+ limits=self.limits,
307
+ backend=backend,
308
+ backend_options=backend_options,
309
+ )
310
+
280
311
  def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
281
312
  content = b"" # empty binary blob
282
313
  formats: list[InputFormat] = []
@@ -290,12 +321,13 @@ class _DocumentConversionInput(BaseModel):
290
321
  with obj.open("rb") as f:
291
322
  content = f.read(1024) # Read first 1KB
292
323
  if mime is not None and mime.lower() == "application/zip":
324
+ mime_root = "application/vnd.openxmlformats-officedocument"
293
325
  if obj.suffixes[-1].lower() == ".xlsx":
294
- mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
326
+ mime = mime_root + ".spreadsheetml.sheet"
295
327
  elif obj.suffixes[-1].lower() == ".docx":
296
- mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
328
+ mime = mime_root + ".wordprocessingml.document"
297
329
  elif obj.suffixes[-1].lower() == ".pptx":
298
- mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
330
+ mime = mime_root + ".presentationml.presentation"
299
331
 
300
332
  elif isinstance(obj, DocumentStream):
301
333
  content = obj.stream.read(8192)
@@ -310,12 +342,13 @@ class _DocumentConversionInput(BaseModel):
310
342
  mime = _DocumentConversionInput._mime_from_extension(ext.lower())
311
343
  if mime is not None and mime.lower() == "application/zip":
312
344
  objname = obj.name.lower()
345
+ mime_root = "application/vnd.openxmlformats-officedocument"
313
346
  if objname.endswith(".xlsx"):
314
- mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
347
+ mime = mime_root + ".spreadsheetml.sheet"
315
348
  elif objname.endswith(".docx"):
316
- mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
349
+ mime = mime_root + ".wordprocessingml.document"
317
350
  elif objname.endswith(".pptx"):
318
- mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
351
+ mime = mime_root + ".presentationml.presentation"
319
352
 
320
353
  if mime is not None and mime.lower() == "application/gzip":
321
354
  if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj):
@@ -17,7 +17,7 @@ class BaseAsrOptions(BaseModel):
17
17
 
18
18
 
19
19
  class InferenceAsrFramework(str, Enum):
20
- # MLX = "mlx" # disabled for now
20
+ MLX = "mlx"
21
21
  # TRANSFORMERS = "transformers" # disabled for now
22
22
  WHISPER = "whisper"
23
23
 
@@ -55,3 +55,23 @@ class InlineAsrNativeWhisperOptions(InlineAsrOptions):
55
55
  AcceleratorDevice.CUDA,
56
56
  ]
57
57
  word_timestamps: bool = True
58
+
59
+
60
+ class InlineAsrMlxWhisperOptions(InlineAsrOptions):
61
+ """
62
+ MLX Whisper options for Apple Silicon optimization.
63
+
64
+ Uses mlx-whisper library for efficient inference on Apple Silicon devices.
65
+ """
66
+
67
+ inference_framework: InferenceAsrFramework = InferenceAsrFramework.MLX
68
+
69
+ language: str = "en"
70
+ task: str = "transcribe" # "transcribe" or "translate"
71
+ supported_devices: List[AcceleratorDevice] = [
72
+ AcceleratorDevice.MPS, # MLX is optimized for Apple Silicon
73
+ ]
74
+ word_timestamps: bool = True
75
+ no_speech_threshold: float = 0.6 # Threshold for detecting speech
76
+ logprob_threshold: float = -1.0 # Log probability threshold
77
+ compression_ratio_threshold: float = 2.4 # Compression ratio threshold
@@ -82,6 +82,7 @@ class InlineVlmOptions(BaseVlmOptions):
82
82
 
83
83
  use_kv_cache: bool = True
84
84
  max_new_tokens: int = 4096
85
+ track_generated_tokens: bool = False
85
86
 
86
87
  @property
87
88
  def repo_cache_folder(self) -> str:
@@ -9,11 +9,14 @@ from datetime import datetime
9
9
  from functools import partial
10
10
  from io import BytesIO
11
11
  from pathlib import Path
12
- from typing import Dict, List, Optional, Tuple, Type, Union
12
+ from typing import Optional, Type, Union
13
13
 
14
- from pydantic import BaseModel, ConfigDict, model_validator, validate_call
14
+ from pydantic import ConfigDict, model_validator, validate_call
15
+ from typing_extensions import Self
15
16
 
16
- from docling.backend.abstract_backend import AbstractDocumentBackend
17
+ from docling.backend.abstract_backend import (
18
+ AbstractDocumentBackend,
19
+ )
17
20
  from docling.backend.asciidoc_backend import AsciiDocBackend
18
21
  from docling.backend.csv_backend import CsvDocumentBackend
19
22
  from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
@@ -28,6 +31,12 @@ from docling.backend.noop_backend import NoOpBackend
28
31
  from docling.backend.webvtt_backend import WebVTTDocumentBackend
29
32
  from docling.backend.xml.jats_backend import JatsDocumentBackend
30
33
  from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
34
+ from docling.datamodel.backend_options import (
35
+ BackendOptions,
36
+ HTMLBackendOptions,
37
+ MarkdownBackendOptions,
38
+ PdfBackendOptions,
39
+ )
31
40
  from docling.datamodel.base_models import (
32
41
  BaseFormatOption,
33
42
  ConversionStatus,
@@ -61,11 +70,13 @@ _PIPELINE_CACHE_LOCK = threading.Lock()
61
70
 
62
71
  class FormatOption(BaseFormatOption):
63
72
  pipeline_cls: Type[BasePipeline]
73
+ backend_options: Optional[BackendOptions] = None
64
74
 
65
75
  @model_validator(mode="after")
66
- def set_optional_field_default(self) -> "FormatOption":
76
+ def set_optional_field_default(self) -> Self:
67
77
  if self.pipeline_options is None:
68
78
  self.pipeline_options = self.pipeline_cls.get_default_options()
79
+
69
80
  return self
70
81
 
71
82
 
@@ -92,6 +103,7 @@ class PowerpointFormatOption(FormatOption):
92
103
  class MarkdownFormatOption(FormatOption):
93
104
  pipeline_cls: Type = SimplePipeline
94
105
  backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
106
+ backend_options: Optional[MarkdownBackendOptions] = None
95
107
 
96
108
 
97
109
  class AsciiDocFormatOption(FormatOption):
@@ -102,6 +114,7 @@ class AsciiDocFormatOption(FormatOption):
102
114
  class HTMLFormatOption(FormatOption):
103
115
  pipeline_cls: Type = SimplePipeline
104
116
  backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
117
+ backend_options: Optional[HTMLBackendOptions] = None
105
118
 
106
119
 
107
120
  class PatentUsptoFormatOption(FormatOption):
@@ -122,6 +135,7 @@ class ImageFormatOption(FormatOption):
122
135
  class PdfFormatOption(FormatOption):
123
136
  pipeline_cls: Type = StandardPdfPipeline
124
137
  backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
138
+ backend_options: Optional[PdfBackendOptions] = None
125
139
 
126
140
 
127
141
  class AudioFormatOption(FormatOption):
@@ -131,46 +145,24 @@ class AudioFormatOption(FormatOption):
131
145
 
132
146
  def _get_default_option(format: InputFormat) -> FormatOption:
133
147
  format_to_default_options = {
134
- InputFormat.CSV: FormatOption(
135
- pipeline_cls=SimplePipeline, backend=CsvDocumentBackend
136
- ),
137
- InputFormat.XLSX: FormatOption(
138
- pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
139
- ),
140
- InputFormat.DOCX: FormatOption(
141
- pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
142
- ),
143
- InputFormat.PPTX: FormatOption(
144
- pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
145
- ),
146
- InputFormat.MD: FormatOption(
147
- pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
148
- ),
149
- InputFormat.ASCIIDOC: FormatOption(
150
- pipeline_cls=SimplePipeline, backend=AsciiDocBackend
151
- ),
152
- InputFormat.HTML: FormatOption(
153
- pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
154
- ),
155
- InputFormat.XML_USPTO: FormatOption(
156
- pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
157
- ),
158
- InputFormat.XML_JATS: FormatOption(
159
- pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
160
- ),
148
+ InputFormat.CSV: CsvFormatOption(),
149
+ InputFormat.XLSX: ExcelFormatOption(),
150
+ InputFormat.DOCX: WordFormatOption(),
151
+ InputFormat.PPTX: PowerpointFormatOption(),
152
+ InputFormat.MD: MarkdownFormatOption(),
153
+ InputFormat.ASCIIDOC: AsciiDocFormatOption(),
154
+ InputFormat.HTML: HTMLFormatOption(),
155
+ InputFormat.XML_USPTO: PatentUsptoFormatOption(),
156
+ InputFormat.XML_JATS: XMLJatsFormatOption(),
161
157
  InputFormat.METS_GBS: FormatOption(
162
158
  pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend
163
159
  ),
164
- InputFormat.IMAGE: FormatOption(
165
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
166
- ),
167
- InputFormat.PDF: FormatOption(
168
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
169
- ),
160
+ InputFormat.IMAGE: ImageFormatOption(),
161
+ InputFormat.PDF: PdfFormatOption(),
170
162
  InputFormat.JSON_DOCLING: FormatOption(
171
163
  pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
172
164
  ),
173
- InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
165
+ InputFormat.AUDIO: AudioFormatOption(),
174
166
  InputFormat.VTT: FormatOption(
175
167
  pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend
176
168
  ),
@@ -186,13 +178,13 @@ class DocumentConverter:
186
178
 
187
179
  def __init__(
188
180
  self,
189
- allowed_formats: Optional[List[InputFormat]] = None,
190
- format_options: Optional[Dict[InputFormat, FormatOption]] = None,
181
+ allowed_formats: Optional[list[InputFormat]] = None,
182
+ format_options: Optional[dict[InputFormat, FormatOption]] = None,
191
183
  ):
192
184
  self.allowed_formats = (
193
185
  allowed_formats if allowed_formats is not None else list(InputFormat)
194
186
  )
195
- self.format_to_options: Dict[InputFormat, FormatOption] = {
187
+ self.format_to_options: dict[InputFormat, FormatOption] = {
196
188
  format: (
197
189
  _get_default_option(format=format)
198
190
  if (custom_option := (format_options or {}).get(format)) is None
@@ -200,8 +192,8 @@ class DocumentConverter:
200
192
  )
201
193
  for format in self.allowed_formats
202
194
  }
203
- self.initialized_pipelines: Dict[
204
- Tuple[Type[BasePipeline], str], BasePipeline
195
+ self.initialized_pipelines: dict[
196
+ tuple[Type[BasePipeline], str], BasePipeline
205
197
  ] = {}
206
198
 
207
199
  def _get_initialized_pipelines(
@@ -228,7 +220,7 @@ class DocumentConverter:
228
220
  def convert(
229
221
  self,
230
222
  source: Union[Path, str, DocumentStream], # TODO review naming
231
- headers: Optional[Dict[str, str]] = None,
223
+ headers: Optional[dict[str, str]] = None,
232
224
  raises_on_error: bool = True,
233
225
  max_num_pages: int = sys.maxsize,
234
226
  max_file_size: int = sys.maxsize,
@@ -248,7 +240,7 @@ class DocumentConverter:
248
240
  def convert_all(
249
241
  self,
250
242
  source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
251
- headers: Optional[Dict[str, str]] = None,
243
+ headers: Optional[dict[str, str]] = None,
252
244
  raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
253
245
  max_num_pages: int = sys.maxsize,
254
246
  max_file_size: int = sys.maxsize,
@@ -8,9 +8,10 @@ from collections.abc import Iterable, Iterator
8
8
  from concurrent.futures import ThreadPoolExecutor
9
9
  from functools import partial
10
10
  from pathlib import Path
11
- from typing import Dict, List, Optional, Tuple, Type, Union
11
+ from typing import Optional, Type, Union
12
12
 
13
13
  from pydantic import ConfigDict, model_validator, validate_call
14
+ from typing_extensions import Self
14
15
 
15
16
  from docling.backend.abstract_backend import AbstractDocumentBackend
16
17
  from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
@@ -56,7 +57,7 @@ class ExtractionFormatOption(BaseFormatOption):
56
57
  pipeline_cls: Type[BaseExtractionPipeline]
57
58
 
58
59
  @model_validator(mode="after")
59
- def set_optional_field_default(self) -> "ExtractionFormatOption":
60
+ def set_optional_field_default(self) -> Self:
60
61
  if self.pipeline_options is None:
61
62
  # `get_default_options` comes from BaseExtractionPipeline
62
63
  self.pipeline_options = self.pipeline_cls.get_default_options() # type: ignore[assignment]
@@ -70,7 +71,7 @@ def _get_default_extraction_option(fmt: InputFormat) -> ExtractionFormatOption:
70
71
  the VLM extractor. This duplication will be removed when we deduplicate
71
72
  the format registry between convert/extract.
72
73
  """
73
- format_to_default_backend: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
74
+ format_to_default_backend: dict[InputFormat, Type[AbstractDocumentBackend]] = {
74
75
  InputFormat.IMAGE: PyPdfiumDocumentBackend,
75
76
  InputFormat.PDF: PyPdfiumDocumentBackend,
76
77
  }
@@ -98,24 +99,24 @@ class DocumentExtractor:
98
99
 
99
100
  def __init__(
100
101
  self,
101
- allowed_formats: Optional[List[InputFormat]] = None,
102
+ allowed_formats: Optional[list[InputFormat]] = None,
102
103
  extraction_format_options: Optional[
103
- Dict[InputFormat, ExtractionFormatOption]
104
+ dict[InputFormat, ExtractionFormatOption]
104
105
  ] = None,
105
106
  ) -> None:
106
- self.allowed_formats: List[InputFormat] = (
107
+ self.allowed_formats: list[InputFormat] = (
107
108
  allowed_formats if allowed_formats is not None else list(InputFormat)
108
109
  )
109
110
  # Build per-format options with defaults, then apply any user overrides
110
111
  overrides = extraction_format_options or {}
111
- self.extraction_format_to_options: Dict[InputFormat, ExtractionFormatOption] = {
112
+ self.extraction_format_to_options: dict[InputFormat, ExtractionFormatOption] = {
112
113
  fmt: overrides.get(fmt, _get_default_extraction_option(fmt))
113
114
  for fmt in self.allowed_formats
114
115
  }
115
116
 
116
117
  # Cache pipelines by (class, options-hash)
117
- self._initialized_pipelines: Dict[
118
- Tuple[Type[BaseExtractionPipeline], str], BaseExtractionPipeline
118
+ self._initialized_pipelines: dict[
119
+ tuple[Type[BaseExtractionPipeline], str], BaseExtractionPipeline
119
120
  ] = {}
120
121
 
121
122
  # ---------------------------- Public API ---------------------------------
@@ -125,7 +126,7 @@ class DocumentExtractor:
125
126
  self,
126
127
  source: Union[Path, str, DocumentStream],
127
128
  template: ExtractionTemplateType,
128
- headers: Optional[Dict[str, str]] = None,
129
+ headers: Optional[dict[str, str]] = None,
129
130
  raises_on_error: bool = True,
130
131
  max_num_pages: int = sys.maxsize,
131
132
  max_file_size: int = sys.maxsize,
@@ -147,7 +148,7 @@ class DocumentExtractor:
147
148
  self,
148
149
  source: Iterable[Union[Path, str, DocumentStream]],
149
150
  template: ExtractionTemplateType,
150
- headers: Optional[Dict[str, str]] = None,
151
+ headers: Optional[dict[str, str]] = None,
151
152
  raises_on_error: bool = True,
152
153
  max_num_pages: int = sys.maxsize,
153
154
  max_file_size: int = sys.maxsize,