docling 2.8.2__tar.gz → 2.8.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.8.2 → docling-2.8.3}/PKG-INFO +1 -1
- {docling-2.8.2 → docling-2.8.3}/docling/datamodel/base_models.py +2 -0
- {docling-2.8.2 → docling-2.8.3}/docling/datamodel/document.py +24 -10
- {docling-2.8.2 → docling-2.8.3}/docling/document_converter.py +103 -83
- docling-2.8.3/docling/exceptions.py +6 -0
- {docling-2.8.2 → docling-2.8.3}/pyproject.toml +1 -1
- {docling-2.8.2 → docling-2.8.3}/LICENSE +0 -0
- {docling-2.8.2 → docling-2.8.3}/README.md +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/__init__.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/backend/__init__.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/backend/abstract_backend.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/backend/html_backend.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/backend/md_backend.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/backend/msword_backend.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/backend/pdf_backend.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/cli/__init__.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/cli/main.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/datamodel/__init__.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/datamodel/settings.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/models/__init__.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/models/base_model.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/models/base_ocr_model.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/models/ds_glm_model.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/models/easyocr_model.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/models/layout_model.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/models/page_assemble_model.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/models/table_structure_model.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/pipeline/__init__.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/utils/__init__.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/utils/export.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/utils/layout_utils.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/utils/profiling.py +0 -0
- {docling-2.8.2 → docling-2.8.3}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.8.2
|
3
|
+
Version: 2.8.3
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -24,6 +24,7 @@ class ConversionStatus(str, Enum):
|
|
24
24
|
FAILURE = auto()
|
25
25
|
SUCCESS = auto()
|
26
26
|
PARTIAL_SUCCESS = auto()
|
27
|
+
SKIPPED = auto()
|
27
28
|
|
28
29
|
|
29
30
|
class InputFormat(str, Enum):
|
@@ -95,6 +96,7 @@ class DoclingComponentType(str, Enum):
|
|
95
96
|
DOCUMENT_BACKEND = auto()
|
96
97
|
MODEL = auto()
|
97
98
|
DOC_ASSEMBLER = auto()
|
99
|
+
USER_INPUT = auto()
|
98
100
|
|
99
101
|
|
100
102
|
class ErrorItem(BaseModel):
|
@@ -3,7 +3,7 @@ import re
|
|
3
3
|
from enum import Enum
|
4
4
|
from io import BytesIO
|
5
5
|
from pathlib import Path, PurePath
|
6
|
-
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
|
6
|
+
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
|
7
7
|
|
8
8
|
import filetype
|
9
9
|
from docling_core.types.doc import (
|
@@ -164,12 +164,6 @@ class InputDocument(BaseModel):
|
|
164
164
|
backend: Type[AbstractDocumentBackend],
|
165
165
|
path_or_stream: Union[BytesIO, Path],
|
166
166
|
) -> None:
|
167
|
-
if backend is None:
|
168
|
-
raise RuntimeError(
|
169
|
-
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
|
170
|
-
f"Please check your format configuration on DocumentConverter."
|
171
|
-
)
|
172
|
-
|
173
167
|
self._backend = backend(self, path_or_stream=path_or_stream)
|
174
168
|
if not self._backend.is_valid():
|
175
169
|
self.valid = False
|
@@ -450,6 +444,25 @@ class ConversionResult(BaseModel):
|
|
450
444
|
return ds_doc
|
451
445
|
|
452
446
|
|
447
|
+
class _DummyBackend(AbstractDocumentBackend):
|
448
|
+
def __init__(self, *args, **kwargs):
|
449
|
+
super().__init__(*args, **kwargs)
|
450
|
+
|
451
|
+
def is_valid(self) -> bool:
|
452
|
+
return False
|
453
|
+
|
454
|
+
@classmethod
|
455
|
+
def supported_formats(cls) -> Set[InputFormat]:
|
456
|
+
return set()
|
457
|
+
|
458
|
+
@classmethod
|
459
|
+
def supports_pagination(cls) -> bool:
|
460
|
+
return False
|
461
|
+
|
462
|
+
def unload(self):
|
463
|
+
return super().unload()
|
464
|
+
|
465
|
+
|
453
466
|
class _DocumentConversionInput(BaseModel):
|
454
467
|
|
455
468
|
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
@@ -461,11 +474,12 @@ class _DocumentConversionInput(BaseModel):
|
|
461
474
|
for item in self.path_or_stream_iterator:
|
462
475
|
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
|
463
476
|
format = self._guess_format(obj)
|
477
|
+
backend: Type[AbstractDocumentBackend]
|
464
478
|
if format not in format_options.keys():
|
465
|
-
_log.
|
466
|
-
f"
|
479
|
+
_log.error(
|
480
|
+
f"Input document {obj.name} does not match any allowed format."
|
467
481
|
)
|
468
|
-
|
482
|
+
backend = _DummyBackend
|
469
483
|
else:
|
470
484
|
backend = format_options[format].backend
|
471
485
|
|
@@ -15,7 +15,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
|
15
15
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
16
16
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
17
17
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
18
|
-
from docling.datamodel.base_models import
|
18
|
+
from docling.datamodel.base_models import (
|
19
|
+
ConversionStatus,
|
20
|
+
DoclingComponentType,
|
21
|
+
DocumentStream,
|
22
|
+
ErrorItem,
|
23
|
+
InputFormat,
|
24
|
+
)
|
19
25
|
from docling.datamodel.document import (
|
20
26
|
ConversionResult,
|
21
27
|
InputDocument,
|
@@ -23,6 +29,7 @@ from docling.datamodel.document import (
|
|
23
29
|
)
|
24
30
|
from docling.datamodel.pipeline_options import PipelineOptions
|
25
31
|
from docling.datamodel.settings import DocumentLimits, settings
|
32
|
+
from docling.exceptions import ConversionError
|
26
33
|
from docling.pipeline.base_pipeline import BasePipeline
|
27
34
|
from docling.pipeline.simple_pipeline import SimplePipeline
|
28
35
|
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
@@ -85,32 +92,37 @@ class ImageFormatOption(FormatOption):
|
|
85
92
|
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
86
93
|
|
87
94
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
95
|
+
def _get_default_option(format: InputFormat) -> FormatOption:
|
96
|
+
format_to_default_options = {
|
97
|
+
InputFormat.XLSX: FormatOption(
|
98
|
+
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
|
99
|
+
),
|
100
|
+
InputFormat.DOCX: FormatOption(
|
101
|
+
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
|
102
|
+
),
|
103
|
+
InputFormat.PPTX: FormatOption(
|
104
|
+
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
|
105
|
+
),
|
106
|
+
InputFormat.MD: FormatOption(
|
107
|
+
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
|
108
|
+
),
|
109
|
+
InputFormat.ASCIIDOC: FormatOption(
|
110
|
+
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
|
111
|
+
),
|
112
|
+
InputFormat.HTML: FormatOption(
|
113
|
+
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
114
|
+
),
|
115
|
+
InputFormat.IMAGE: FormatOption(
|
116
|
+
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
117
|
+
),
|
118
|
+
InputFormat.PDF: FormatOption(
|
119
|
+
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
120
|
+
),
|
121
|
+
}
|
122
|
+
if (options := format_to_default_options.get(format)) is not None:
|
123
|
+
return options
|
124
|
+
else:
|
125
|
+
raise RuntimeError(f"No default options configured for {format}")
|
114
126
|
|
115
127
|
|
116
128
|
class DocumentConverter:
|
@@ -121,36 +133,26 @@ class DocumentConverter:
|
|
121
133
|
allowed_formats: Optional[List[InputFormat]] = None,
|
122
134
|
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
123
135
|
):
|
124
|
-
self.allowed_formats =
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
else:
|
136
|
-
for f in self.allowed_formats:
|
137
|
-
if f not in self.format_to_options.keys():
|
138
|
-
_log.debug(f"Requested format {f} will use default options.")
|
139
|
-
self.format_to_options[f] = _format_to_default_options[f]
|
140
|
-
|
141
|
-
remove_keys = []
|
142
|
-
for f in self.format_to_options.keys():
|
143
|
-
if f not in self.allowed_formats:
|
144
|
-
remove_keys.append(f)
|
145
|
-
|
146
|
-
for f in remove_keys:
|
147
|
-
self.format_to_options.pop(f)
|
148
|
-
|
136
|
+
self.allowed_formats = (
|
137
|
+
allowed_formats if allowed_formats is not None else [e for e in InputFormat]
|
138
|
+
)
|
139
|
+
self.format_to_options = {
|
140
|
+
format: (
|
141
|
+
_get_default_option(format=format)
|
142
|
+
if (custom_option := (format_options or {}).get(format)) is None
|
143
|
+
else custom_option
|
144
|
+
)
|
145
|
+
for format in self.allowed_formats
|
146
|
+
}
|
149
147
|
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
|
150
148
|
|
151
149
|
def initialize_pipeline(self, format: InputFormat):
|
152
150
|
"""Initialize the conversion pipeline for the selected format."""
|
153
|
-
self._get_pipeline(doc_format=format)
|
151
|
+
pipeline = self._get_pipeline(doc_format=format)
|
152
|
+
if pipeline is None:
|
153
|
+
raise ConversionError(
|
154
|
+
f"No pipeline could be initialized for format {format}"
|
155
|
+
)
|
154
156
|
|
155
157
|
@validate_call(config=ConfigDict(strict=True))
|
156
158
|
def convert(
|
@@ -186,22 +188,28 @@ class DocumentConverter:
|
|
186
188
|
limits=limits,
|
187
189
|
)
|
188
190
|
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
191
|
+
|
192
|
+
had_result = False
|
189
193
|
for conv_res in conv_res_iter:
|
194
|
+
had_result = True
|
190
195
|
if raises_on_error and conv_res.status not in {
|
191
196
|
ConversionStatus.SUCCESS,
|
192
197
|
ConversionStatus.PARTIAL_SUCCESS,
|
193
198
|
}:
|
194
|
-
raise RuntimeError(
|
199
|
+
raise ConversionError(
|
195
200
|
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
|
196
201
|
)
|
197
202
|
else:
|
198
203
|
yield conv_res
|
199
204
|
|
205
|
+
if not had_result and raises_on_error:
|
206
|
+
raise ConversionError(
|
207
|
+
f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
208
|
+
)
|
209
|
+
|
200
210
|
def _convert(
|
201
211
|
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
202
212
|
) -> Iterator[ConversionResult]:
|
203
|
-
assert self.format_to_options is not None
|
204
|
-
|
205
213
|
start_time = time.monotonic()
|
206
214
|
|
207
215
|
for input_batch in chunkify(
|
@@ -223,27 +231,22 @@ class DocumentConverter:
|
|
223
231
|
):
|
224
232
|
elapsed = time.monotonic() - start_time
|
225
233
|
start_time = time.monotonic()
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
)
|
231
|
-
yield item
|
232
|
-
else:
|
233
|
-
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
|
234
|
+
_log.info(
|
235
|
+
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
236
|
+
)
|
237
|
+
yield item
|
234
238
|
|
235
239
|
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
236
|
-
assert self.format_to_options is not None
|
237
|
-
|
238
240
|
fopt = self.format_to_options.get(doc_format)
|
239
241
|
|
240
242
|
if fopt is None:
|
241
|
-
|
243
|
+
return None
|
242
244
|
else:
|
243
245
|
pipeline_class = fopt.pipeline_cls
|
244
246
|
pipeline_options = fopt.pipeline_options
|
245
247
|
|
246
|
-
|
248
|
+
if pipeline_options is None:
|
249
|
+
return None
|
247
250
|
# TODO this will ignore if different options have been defined for the same pipeline class.
|
248
251
|
if (
|
249
252
|
pipeline_class not in self.initialized_pipelines
|
@@ -257,11 +260,26 @@ class DocumentConverter:
|
|
257
260
|
|
258
261
|
def _process_document(
|
259
262
|
self, in_doc: InputDocument, raises_on_error: bool
|
260
|
-
) -> Optional[ConversionResult]:
|
261
|
-
assert self.allowed_formats is not None
|
262
|
-
assert in_doc.format in self.allowed_formats
|
263
|
+
) -> ConversionResult:
|
263
264
|
|
264
|
-
|
265
|
+
valid = (
|
266
|
+
self.allowed_formats is not None and in_doc.format in self.allowed_formats
|
267
|
+
)
|
268
|
+
if valid:
|
269
|
+
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
270
|
+
else:
|
271
|
+
error_message = f"File format not allowed: {in_doc.file}"
|
272
|
+
if raises_on_error:
|
273
|
+
raise ConversionError(error_message)
|
274
|
+
else:
|
275
|
+
error_item = ErrorItem(
|
276
|
+
component_type=DoclingComponentType.USER_INPUT,
|
277
|
+
module_name="",
|
278
|
+
error_message=error_message,
|
279
|
+
)
|
280
|
+
conv_res = ConversionResult(
|
281
|
+
input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
|
282
|
+
)
|
265
283
|
|
266
284
|
return conv_res
|
267
285
|
|
@@ -270,26 +288,28 @@ class DocumentConverter:
|
|
270
288
|
) -> ConversionResult:
|
271
289
|
if in_doc.valid:
|
272
290
|
pipeline = self._get_pipeline(in_doc.format)
|
273
|
-
if pipeline is None:
|
291
|
+
if pipeline is not None:
|
292
|
+
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
293
|
+
else:
|
274
294
|
if raises_on_error:
|
275
|
-
raise RuntimeError(
|
295
|
+
raise ConversionError(
|
276
296
|
f"No pipeline could be initialized for {in_doc.file}."
|
277
297
|
)
|
278
298
|
else:
|
279
|
-
conv_res = ConversionResult(
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
284
|
-
|
299
|
+
conv_res = ConversionResult(
|
300
|
+
input=in_doc,
|
301
|
+
status=ConversionStatus.FAILURE,
|
302
|
+
)
|
285
303
|
else:
|
286
304
|
if raises_on_error:
|
287
|
-
raise RuntimeError(f"Input document {in_doc.file} is not valid.")
|
305
|
+
raise ConversionError(f"Input document {in_doc.file} is not valid.")
|
288
306
|
|
289
307
|
else:
|
290
308
|
# invalid doc or not of desired format
|
291
|
-
conv_res = ConversionResult(
|
292
|
-
|
309
|
+
conv_res = ConversionResult(
|
310
|
+
input=in_doc,
|
311
|
+
status=ConversionStatus.FAILURE,
|
312
|
+
)
|
293
313
|
# TODO add error log why it failed.
|
294
314
|
|
295
315
|
return conv_res
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.8.2" # DO NOT EDIT, updated automatically
|
3
|
+
version = "2.8.3" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|