docling 2.56.1__py3-none-any.whl → 2.58.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of docling might be problematic.
- docling/backend/abstract_backend.py +24 -3
- docling/backend/asciidoc_backend.py +3 -3
- docling/backend/docling_parse_v4_backend.py +15 -4
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/html_backend.py +130 -20
- docling/backend/md_backend.py +27 -5
- docling/backend/msexcel_backend.py +115 -27
- docling/backend/mspowerpoint_backend.py +2 -2
- docling/backend/msword_backend.py +104 -29
- docling/backend/pdf_backend.py +9 -2
- docling/backend/pypdfium2_backend.py +12 -3
- docling/cli/main.py +85 -30
- docling/datamodel/asr_model_specs.py +408 -6
- docling/datamodel/backend_options.py +82 -0
- docling/datamodel/base_models.py +17 -2
- docling/datamodel/document.py +81 -48
- docling/datamodel/pipeline_options_asr_model.py +21 -1
- docling/document_converter.py +37 -45
- docling/document_extractor.py +12 -11
- docling/models/readingorder_model.py +6 -7
- docling/pipeline/asr_pipeline.py +139 -3
- docling/pipeline/vlm_pipeline.py +53 -33
- docling/utils/api_image_request.py +4 -4
- docling/utils/layout_postprocessor.py +23 -24
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/METADATA +4 -2
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/RECORD +30 -28
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/WHEEL +0 -0
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/entry_points.txt +0 -0
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.56.1.dist-info → docling-2.58.0.dist-info}/top_level.txt +0 -0
docling/datamodel/backend_options.py
ADDED
@@ -0,0 +1,82 @@
+from pathlib import PurePath
+from typing import Annotated, Literal, Optional, Union
+
+from pydantic import AnyUrl, BaseModel, Field, SecretStr
+
+
+class BaseBackendOptions(BaseModel):
+    """Common options for all declarative document backends."""
+
+    enable_remote_fetch: bool = Field(
+        False, description="Enable remote resource fetching."
+    )
+    enable_local_fetch: bool = Field(
+        False, description="Enable local resource fetching."
+    )
+
+
+class DeclarativeBackendOptions(BaseBackendOptions):
+    """Default backend options for a declarative document backend."""
+
+    kind: Literal["declarative"] = Field("declarative", exclude=True, repr=False)
+
+
+class HTMLBackendOptions(BaseBackendOptions):
+    """Options specific to the HTML backend.
+
+    This class can be extended to include options specific to HTML processing.
+    """
+
+    kind: Literal["html"] = Field("html", exclude=True, repr=False)
+    fetch_images: bool = Field(
+        False,
+        description=(
+            "Whether the backend should access remote or local resources to parse "
+            "images in an HTML document."
+        ),
+    )
+    source_uri: Optional[Union[AnyUrl, PurePath]] = Field(
+        None,
+        description=(
+            "The URI that originates the HTML document. If provided, the backend "
+            "will use it to resolve relative paths in the HTML document."
+        ),
+    )
+
+
+class MarkdownBackendOptions(BaseBackendOptions):
+    """Options specific to the Markdown backend."""
+
+    kind: Literal["md"] = Field("md", exclude=True, repr=False)
+    fetch_images: bool = Field(
+        False,
+        description=(
+            "Whether the backend should access remote or local resources to parse "
+            "images in the markdown document."
+        ),
+    )
+    source_uri: Optional[Union[AnyUrl, PurePath]] = Field(
+        None,
+        description=(
+            "The URI that originates the markdown document. If provided, the backend "
+            "will use it to resolve relative paths in the markdown document."
+        ),
+    )
+
+
+class PdfBackendOptions(BaseBackendOptions):
+    """Backend options for pdf document backends."""
+
+    kind: Literal["pdf"] = Field("pdf", exclude=True, repr=False)
+    password: Optional[SecretStr] = None
+
+
+BackendOptions = Annotated[
+    Union[
+        DeclarativeBackendOptions,
+        HTMLBackendOptions,
+        MarkdownBackendOptions,
+        PdfBackendOptions,
+    ],
+    Field(discriminator="kind"),
+]
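The new module introduces per-backend option models joined in a BackendOptions union discriminated on the excluded kind field. A minimal sketch of using these models directly, assuming nothing beyond the file added above (pydantic v2 API):

from docling.datamodel.backend_options import HTMLBackendOptions, PdfBackendOptions

# The discriminator "kind" is fixed per class and excluded from dumps and reprs.
html_opts = HTMLBackendOptions(fetch_images=True, enable_remote_fetch=True)
print(html_opts.kind)          # "html"
print(html_opts.model_dump())  # no "kind" key; only fetch flags and source_uri

# The PDF password is stored as a SecretStr, so it is masked when printed or logged.
pdf_opts = PdfBackendOptions(password="my-pdf-password")
print(pdf_opts.password)       # **********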
docling/datamodel/base_models.py
CHANGED
@@ -94,7 +94,7 @@ FormatToExtensions: dict[InputFormat, list[str]] = {
     InputFormat.XML_USPTO: ["xml", "txt"],
     InputFormat.METS_GBS: ["tar.gz"],
     InputFormat.JSON_DOCLING: ["json"],
-    InputFormat.AUDIO: ["wav", "mp3"],
+    InputFormat.AUDIO: ["wav", "mp3", "m4a", "aac", "ogg", "flac", "mp4", "avi", "mov"],
     InputFormat.VTT: ["vtt"],
 }

@@ -128,7 +128,22 @@ FormatToMimeType: dict[InputFormat, list[str]] = {
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
     InputFormat.METS_GBS: ["application/mets+xml"],
     InputFormat.JSON_DOCLING: ["application/json"],
-    InputFormat.AUDIO: [
+    InputFormat.AUDIO: [
+        "audio/x-wav",
+        "audio/mpeg",
+        "audio/wav",
+        "audio/mp3",
+        "audio/mp4",
+        "audio/m4a",
+        "audio/aac",
+        "audio/ogg",
+        "audio/flac",
+        "audio/x-flac",
+        "video/mp4",
+        "video/avi",
+        "video/x-msvideo",
+        "video/quicktime",
+    ],
     InputFormat.VTT: ["text/vtt"],
 }

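The audio entry now covers common compressed audio formats and video containers. These two maps are what docling consults when guessing an input format from a filename or MIME type; a small illustrative lookup (the helper below is not a docling function):

from typing import Optional

from docling.datamodel.base_models import FormatToExtensions, InputFormat


def guess_format_from_extension(filename: str) -> Optional[InputFormat]:
    # Illustrative only: map a file extension back to an InputFormat.
    ext = filename.rsplit(".", 1)[-1].lower()
    for fmt, extensions in FormatToExtensions.items():
        if ext in extensions:
            return fmt
    return None


assert guess_format_from_extension("meeting.m4a") == InputFormat.AUDIO  # accepted as of 2.58.0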
docling/datamodel/document.py
CHANGED
@@ -8,14 +8,12 @@ from io import BytesIO
 from pathlib import Path, PurePath
 from typing import (
     TYPE_CHECKING,
-
-    Dict,
-    List,
+    Annotated,
     Literal,
     Optional,
-    Set,
     Type,
     Union,
+    cast,
 )

 import filetype

@@ -54,8 +52,10 @@ from typing_extensions import deprecated

 from docling.backend.abstract_backend import (
     AbstractDocumentBackend,
+    DeclarativeDocumentBackend,
     PaginatedDocumentBackend,
 )
+from docling.datamodel.backend_options import BackendOptions
 from docling.datamodel.base_models import (
     AssembledUnit,
     ConfidenceReport,

@@ -74,6 +74,7 @@ from docling.utils.utils import create_file_hash

 if TYPE_CHECKING:
     from docling.datamodel.base_models import BaseFormatOption
+    from docling.document_converter import FormatOption

 _log = logging.getLogger(__name__)

@@ -102,29 +103,46 @@ _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")


 class InputDocument(BaseModel):
-
-
-
-
-
-
-
-
-
-
+    """A document as an input of a Docling conversion."""
+
+    file: Annotated[
+        PurePath, Field(description="A path representation the input document.")
+    ]
+    document_hash: Annotated[
+        str,
+        Field(description="A stable hash of the path or stream of the input document."),
+    ]
+    valid: bool = Field(True, description="Whether this is is a valid input document.")
+    backend_options: Optional[BackendOptions] = Field(
+        None, description="Custom options for backends."
+    )
+    limits: DocumentLimits = Field(
+        DocumentLimits(), description="Limits in the input document for the conversion."
+    )
+    format: Annotated[InputFormat, Field(description="The document format.")]
+
+    filesize: Optional[int] = Field(
+        None, description="Size of the input file, in bytes."
+    )
+    page_count: int = Field(0, description="Number of pages in the input document.")
+
+    _backend: AbstractDocumentBackend

     def __init__(
         self,
         path_or_stream: Union[BytesIO, Path],
         format: InputFormat,
         backend: Type[AbstractDocumentBackend],
+        backend_options: Optional[BackendOptions] = None,
         filename: Optional[str] = None,
         limits: Optional[DocumentLimits] = None,
-    ):
+    ) -> None:
         super().__init__(
-            file="",
+            file="",
+            document_hash="",
+            format=InputFormat.PDF,
+            backend_options=backend_options,
         )  # initialize with dummy values
-
         self.limits = limits or DocumentLimits()
         self.format = format

@@ -140,7 +158,8 @@ class InputDocument(BaseModel):

         elif isinstance(path_or_stream, BytesIO):
             assert filename is not None, (
-                "Can't construct InputDocument from stream without providing
+                "Can't construct InputDocument from stream without providing "
+                "filename arg."
             )
             self.file = PurePath(filename)
             self.filesize = path_or_stream.getbuffer().nbytes

@@ -175,7 +194,8 @@ class InputDocument(BaseModel):
         except RuntimeError as e:
             self.valid = False
             _log.exception(
-
+                "An unexpected error occurred while opening the document "
+                "f{self.file.name}",
                 exc_info=e,
             )
             # raise

@@ -185,7 +205,15 @@ class InputDocument(BaseModel):
         backend: Type[AbstractDocumentBackend],
         path_or_stream: Union[BytesIO, Path],
     ) -> None:
-        self.
+        if self.backend_options:
+            self._backend = backend(
+                self,
+                path_or_stream=path_or_stream,
+                options=self.backend_options,
+            )
+        else:
+            self._backend = backend(self, path_or_stream=path_or_stream)
+
         if not self._backend.is_valid():
             self.valid = False

@@ -199,11 +227,11 @@ class ConversionResult(BaseModel):
     input: InputDocument

     status: ConversionStatus = ConversionStatus.PENDING  # failure, success
-    errors:
+    errors: list[ErrorItem] = []  # structure to keep errors

-    pages:
+    pages: list[Page] = []
     assembled: AssembledUnit = AssembledUnit()
-    timings:
+    timings: dict[str, ProfilingItem] = {}
     confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)

     document: DoclingDocument = _EMPTY_DOCLING_DOC

@@ -222,7 +250,7 @@ class _DummyBackend(AbstractDocumentBackend):
         return False

     @classmethod
-    def supported_formats(cls) ->
+    def supported_formats(cls) -> set[InputFormat]:
         return set()

     @classmethod

@@ -235,7 +263,7 @@ class _DummyBackend(AbstractDocumentBackend):

 class _DocumentConversionInput(BaseModel):
     path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
-    headers: Optional[
+    headers: Optional[dict[str, str]] = None
     limits: Optional[DocumentLimits] = DocumentLimits()

     def docs(

@@ -250,33 +278,36 @@ class _DocumentConversionInput(BaseModel):
             )
             format = self._guess_format(obj)
             backend: Type[AbstractDocumentBackend]
-
+            backend_options: Optional[BackendOptions] = None
+            if not format or format not in format_options:
                 _log.error(
-                    f"Input document {obj.name} with format {format} does not match
+                    f"Input document {obj.name} with format {format} does not match "
+                    f"any allowed format: ({format_options.keys()})"
                 )
                 backend = _DummyBackend
             else:
-
+                options = format_options[format]
+                backend = options.backend
+                if "backend_options" in options.model_fields_set:
+                    backend_options = cast("FormatOption", options).backend_options

+            path_or_stream: Union[BytesIO, Path]
             if isinstance(obj, Path):
-
-                path_or_stream=obj,
-                format=format,  # type: ignore[arg-type]
-                filename=obj.name,
-                limits=self.limits,
-                backend=backend,
-                )
+                path_or_stream = obj
             elif isinstance(obj, DocumentStream):
-
-                path_or_stream=obj.stream,
-                format=format,  # type: ignore[arg-type]
-                filename=obj.name,
-                limits=self.limits,
-                backend=backend,
-                )
+                path_or_stream = obj.stream
             else:
                 raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")

+            yield InputDocument(
+                path_or_stream=path_or_stream,
+                format=format,  # type: ignore[arg-type]
+                filename=obj.name,
+                limits=self.limits,
+                backend=backend,
+                backend_options=backend_options,
+            )
+
     def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
         content = b""  # empty binary blob
         formats: list[InputFormat] = []

@@ -290,12 +321,13 @@ class _DocumentConversionInput(BaseModel):
             with obj.open("rb") as f:
                 content = f.read(1024)  # Read first 1KB
             if mime is not None and mime.lower() == "application/zip":
+                mime_root = "application/vnd.openxmlformats-officedocument"
                 if obj.suffixes[-1].lower() == ".xlsx":
-                    mime = "
+                    mime = mime_root + ".spreadsheetml.sheet"
                 elif obj.suffixes[-1].lower() == ".docx":
-                    mime = "
+                    mime = mime_root + ".wordprocessingml.document"
                 elif obj.suffixes[-1].lower() == ".pptx":
-                    mime = "
+                    mime = mime_root + ".presentationml.presentation"

         elif isinstance(obj, DocumentStream):
             content = obj.stream.read(8192)

@@ -310,12 +342,13 @@ class _DocumentConversionInput(BaseModel):
             mime = _DocumentConversionInput._mime_from_extension(ext.lower())
             if mime is not None and mime.lower() == "application/zip":
                 objname = obj.name.lower()
+                mime_root = "application/vnd.openxmlformats-officedocument"
                 if objname.endswith(".xlsx"):
-                    mime = "
+                    mime = mime_root + ".spreadsheetml.sheet"
                 elif objname.endswith(".docx"):
-                    mime = "
+                    mime = mime_root + ".wordprocessingml.document"
                 elif objname.endswith(".pptx"):
-                    mime = "
+                    mime = mime_root + ".presentationml.presentation"

         if mime is not None and mime.lower() == "application/gzip":
             if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj):
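InputDocument is normally built internally by the converter, but its constructor signature is fully visible above. A sketch of constructing one directly with the new backend_options argument; treat it as illustrative only, since it assumes the HTML backend in this release accepts the options keyword that _init_doc forwards:

from io import BytesIO

from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.backend_options import HTMLBackendOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument

in_doc = InputDocument(
    path_or_stream=BytesIO(b"<html><body><p>hello</p></body></html>"),
    format=InputFormat.HTML,
    backend=HTMLDocumentBackend,
    backend_options=HTMLBackendOptions(fetch_images=False),
    filename="hello.html",
)
print(in_doc.valid)          # True if the backend could open the stream
print(in_doc.document_hash)  # stable hash derived from the stream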
docling/datamodel/pipeline_options_asr_model.py
CHANGED
@@ -17,7 +17,7 @@ class BaseAsrOptions(BaseModel):


 class InferenceAsrFramework(str, Enum):
-
+    MLX = "mlx"
     # TRANSFORMERS = "transformers" # disabled for now
     WHISPER = "whisper"


@@ -55,3 +55,23 @@ class InlineAsrNativeWhisperOptions(InlineAsrOptions):
         AcceleratorDevice.CUDA,
     ]
     word_timestamps: bool = True
+
+
+class InlineAsrMlxWhisperOptions(InlineAsrOptions):
+    """
+    MLX Whisper options for Apple Silicon optimization.
+
+    Uses mlx-whisper library for efficient inference on Apple Silicon devices.
+    """
+
+    inference_framework: InferenceAsrFramework = InferenceAsrFramework.MLX
+
+    language: str = "en"
+    task: str = "transcribe"  # "transcribe" or "translate"
+    supported_devices: List[AcceleratorDevice] = [
+        AcceleratorDevice.MPS,  # MLX is optimized for Apple Silicon
+    ]
+    word_timestamps: bool = True
+    no_speech_threshold: float = 0.6  # Threshold for detecting speech
+    logprob_threshold: float = -1.0  # Log probability threshold
+    compression_ratio_threshold: float = 2.4  # Compression ratio threshold
docling/document_converter.py
CHANGED
@@ -9,11 +9,14 @@ from datetime import datetime
 from functools import partial
 from io import BytesIO
 from pathlib import Path
-from typing import
+from typing import Optional, Type, Union

-from pydantic import
+from pydantic import ConfigDict, model_validator, validate_call
+from typing_extensions import Self

-from docling.backend.abstract_backend import
+from docling.backend.abstract_backend import (
+    AbstractDocumentBackend,
+)
 from docling.backend.asciidoc_backend import AsciiDocBackend
 from docling.backend.csv_backend import CsvDocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend

@@ -28,6 +31,12 @@ from docling.backend.noop_backend import NoOpBackend
 from docling.backend.webvtt_backend import WebVTTDocumentBackend
 from docling.backend.xml.jats_backend import JatsDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
+from docling.datamodel.backend_options import (
+    BackendOptions,
+    HTMLBackendOptions,
+    MarkdownBackendOptions,
+    PdfBackendOptions,
+)
 from docling.datamodel.base_models import (
     BaseFormatOption,
     ConversionStatus,

@@ -61,11 +70,13 @@ _PIPELINE_CACHE_LOCK = threading.Lock()

 class FormatOption(BaseFormatOption):
     pipeline_cls: Type[BasePipeline]
+    backend_options: Optional[BackendOptions] = None

     @model_validator(mode="after")
-    def set_optional_field_default(self) ->
+    def set_optional_field_default(self) -> Self:
         if self.pipeline_options is None:
             self.pipeline_options = self.pipeline_cls.get_default_options()
+
         return self


@@ -92,6 +103,7 @@ class PowerpointFormatOption(FormatOption):
 class MarkdownFormatOption(FormatOption):
     pipeline_cls: Type = SimplePipeline
     backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
+    backend_options: Optional[MarkdownBackendOptions] = None


 class AsciiDocFormatOption(FormatOption):

@@ -102,6 +114,7 @@ class AsciiDocFormatOption(FormatOption):
 class HTMLFormatOption(FormatOption):
     pipeline_cls: Type = SimplePipeline
     backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
+    backend_options: Optional[HTMLBackendOptions] = None


 class PatentUsptoFormatOption(FormatOption):

@@ -122,6 +135,7 @@ class ImageFormatOption(FormatOption):
 class PdfFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
     backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
+    backend_options: Optional[PdfBackendOptions] = None


 class AudioFormatOption(FormatOption):

@@ -131,46 +145,24 @@ class AudioFormatOption(FormatOption):

 def _get_default_option(format: InputFormat) -> FormatOption:
     format_to_default_options = {
-        InputFormat.CSV:
-
-        ),
-        InputFormat.
-
-        ),
-        InputFormat.
-
-        ),
-        InputFormat.PPTX: FormatOption(
-            pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
-        ),
-        InputFormat.MD: FormatOption(
-            pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
-        ),
-        InputFormat.ASCIIDOC: FormatOption(
-            pipeline_cls=SimplePipeline, backend=AsciiDocBackend
-        ),
-        InputFormat.HTML: FormatOption(
-            pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
-        ),
-        InputFormat.XML_USPTO: FormatOption(
-            pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
-        ),
-        InputFormat.XML_JATS: FormatOption(
-            pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
-        ),
+        InputFormat.CSV: CsvFormatOption(),
+        InputFormat.XLSX: ExcelFormatOption(),
+        InputFormat.DOCX: WordFormatOption(),
+        InputFormat.PPTX: PowerpointFormatOption(),
+        InputFormat.MD: MarkdownFormatOption(),
+        InputFormat.ASCIIDOC: AsciiDocFormatOption(),
+        InputFormat.HTML: HTMLFormatOption(),
+        InputFormat.XML_USPTO: PatentUsptoFormatOption(),
+        InputFormat.XML_JATS: XMLJatsFormatOption(),
         InputFormat.METS_GBS: FormatOption(
             pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend
         ),
-        InputFormat.IMAGE:
-
-        ),
-        InputFormat.PDF: FormatOption(
-            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
-        ),
+        InputFormat.IMAGE: ImageFormatOption(),
+        InputFormat.PDF: PdfFormatOption(),
         InputFormat.JSON_DOCLING: FormatOption(
             pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
         ),
-        InputFormat.AUDIO:
+        InputFormat.AUDIO: AudioFormatOption(),
         InputFormat.VTT: FormatOption(
             pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend
         ),

@@ -186,13 +178,13 @@ class DocumentConverter:

     def __init__(
         self,
-        allowed_formats: Optional[
-        format_options: Optional[
+        allowed_formats: Optional[list[InputFormat]] = None,
+        format_options: Optional[dict[InputFormat, FormatOption]] = None,
     ):
         self.allowed_formats = (
             allowed_formats if allowed_formats is not None else list(InputFormat)
         )
-        self.format_to_options:
+        self.format_to_options: dict[InputFormat, FormatOption] = {
             format: (
                 _get_default_option(format=format)
                 if (custom_option := (format_options or {}).get(format)) is None

@@ -200,8 +192,8 @@ class DocumentConverter:
             )
             for format in self.allowed_formats
         }
-        self.initialized_pipelines:
-
+        self.initialized_pipelines: dict[
+            tuple[Type[BasePipeline], str], BasePipeline
         ] = {}

     def _get_initialized_pipelines(

@@ -228,7 +220,7 @@ class DocumentConverter:
     def convert(
         self,
         source: Union[Path, str, DocumentStream],  # TODO review naming
-        headers: Optional[
+        headers: Optional[dict[str, str]] = None,
         raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,

@@ -248,7 +240,7 @@ class DocumentConverter:
     def convert_all(
         self,
         source: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
-        headers: Optional[
+        headers: Optional[dict[str, str]] = None,
         raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
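With the format options above, backend-specific settings can now be passed through the converter. A short sketch, assuming the PDF backend honors the new password option and the HTML backend the fetch_images flag (plausible given this release's backend changes, but not demonstrated in this diff):

from docling.datamodel.backend_options import HTMLBackendOptions, PdfBackendOptions
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    HTMLFormatOption,
    PdfFormatOption,
)

converter = DocumentConverter(
    format_options={
        # Open password-protected PDFs; the password is stored as a SecretStr.
        InputFormat.PDF: PdfFormatOption(
            backend_options=PdfBackendOptions(password="my-pdf-password")
        ),
        # Allow the HTML backend to fetch referenced images from remote URLs.
        InputFormat.HTML: HTMLFormatOption(
            backend_options=HTMLBackendOptions(
                fetch_images=True, enable_remote_fetch=True
            )
        ),
    }
)

result = converter.convert("protected.pdf")
print(result.status)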
docling/document_extractor.py
CHANGED
@@ -8,9 +8,10 @@ from collections.abc import Iterable, Iterator
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from pathlib import Path
-from typing import
+from typing import Optional, Type, Union

 from pydantic import ConfigDict, model_validator, validate_call
+from typing_extensions import Self

 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

@@ -56,7 +57,7 @@ class ExtractionFormatOption(BaseFormatOption):
     pipeline_cls: Type[BaseExtractionPipeline]

     @model_validator(mode="after")
-    def set_optional_field_default(self) ->
+    def set_optional_field_default(self) -> Self:
         if self.pipeline_options is None:
             # `get_default_options` comes from BaseExtractionPipeline
             self.pipeline_options = self.pipeline_cls.get_default_options()  # type: ignore[assignment]

@@ -70,7 +71,7 @@ def _get_default_extraction_option(fmt: InputFormat) -> ExtractionFormatOption:
     the VLM extractor. This duplication will be removed when we deduplicate
     the format registry between convert/extract.
     """
-    format_to_default_backend:
+    format_to_default_backend: dict[InputFormat, Type[AbstractDocumentBackend]] = {
         InputFormat.IMAGE: PyPdfiumDocumentBackend,
         InputFormat.PDF: PyPdfiumDocumentBackend,
     }

@@ -98,24 +99,24 @@ class DocumentExtractor:

     def __init__(
         self,
-        allowed_formats: Optional[
+        allowed_formats: Optional[list[InputFormat]] = None,
         extraction_format_options: Optional[
-
+            dict[InputFormat, ExtractionFormatOption]
         ] = None,
     ) -> None:
-        self.allowed_formats:
+        self.allowed_formats: list[InputFormat] = (
             allowed_formats if allowed_formats is not None else list(InputFormat)
         )
         # Build per-format options with defaults, then apply any user overrides
         overrides = extraction_format_options or {}
-        self.extraction_format_to_options:
+        self.extraction_format_to_options: dict[InputFormat, ExtractionFormatOption] = {
             fmt: overrides.get(fmt, _get_default_extraction_option(fmt))
             for fmt in self.allowed_formats
         }

         # Cache pipelines by (class, options-hash)
-        self._initialized_pipelines:
-
+        self._initialized_pipelines: dict[
+            tuple[Type[BaseExtractionPipeline], str], BaseExtractionPipeline
         ] = {}

    # ---------------------------- Public API ---------------------------------

@@ -125,7 +126,7 @@ class DocumentExtractor:
         self,
         source: Union[Path, str, DocumentStream],
         template: ExtractionTemplateType,
-        headers: Optional[
+        headers: Optional[dict[str, str]] = None,
         raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,

@@ -147,7 +148,7 @@ class DocumentExtractor:
         self,
         source: Iterable[Union[Path, str, DocumentStream]],
         template: ExtractionTemplateType,
-        headers: Optional[
+        headers: Optional[dict[str, str]] = None,
         raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
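The extractor's public surface (constructor plus extract/extract_all taking a template) is visible above. A minimal usage sketch, assuming a pydantic model class is an acceptable ExtractionTemplateType; the Invoice template and its fields are made up for illustration:

from pydantic import BaseModel

from docling.datamodel.base_models import InputFormat
from docling.document_extractor import DocumentExtractor


class Invoice(BaseModel):
    # Hypothetical extraction template; the fields are illustrative only.
    invoice_number: str
    total_amount: float


extractor = DocumentExtractor(allowed_formats=[InputFormat.PDF, InputFormat.IMAGE])
result = extractor.extract(source="invoice.pdf", template=Invoice)
print(result)  # inspect the extraction result for the populated template fields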
docling/models/readingorder_model.py
CHANGED
@@ -1,5 +1,4 @@
 from pathlib import Path
-from typing import Dict, List

 from docling_core.types.doc import (
     DocItemLabel,

@@ -48,8 +47,8 @@ class ReadingOrderModel:

     def _assembled_to_readingorder_elements(
         self, conv_res: ConversionResult
-    ) ->
-        elements:
+    ) -> list[ReadingOrderPageElement]:
+        elements: list[ReadingOrderPageElement] = []
         page_no_to_pages = {p.page_no: p for p in conv_res.pages}

         for element in conv_res.assembled.elements:

@@ -123,10 +122,10 @@ class ReadingOrderModel:
     def _readingorder_elements_to_docling_doc(
         self,
         conv_res: ConversionResult,
-        ro_elements:
-        el_to_captions_mapping:
-        el_to_footnotes_mapping:
-        el_merges_mapping:
+        ro_elements: list[ReadingOrderPageElement],
+        el_to_captions_mapping: dict[int, list[int]],
+        el_to_footnotes_mapping: dict[int, list[int]],
+        el_merges_mapping: dict[int, list[int]],
     ) -> DoclingDocument:
         id_to_elem = {
             RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem