docling 2.8.2__tar.gz → 2.8.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {docling-2.8.2 → docling-2.8.3}/PKG-INFO +1 -1
  2. {docling-2.8.2 → docling-2.8.3}/docling/datamodel/base_models.py +2 -0
  3. {docling-2.8.2 → docling-2.8.3}/docling/datamodel/document.py +24 -10
  4. {docling-2.8.2 → docling-2.8.3}/docling/document_converter.py +103 -83
  5. docling-2.8.3/docling/exceptions.py +6 -0
  6. {docling-2.8.2 → docling-2.8.3}/pyproject.toml +1 -1
  7. {docling-2.8.2 → docling-2.8.3}/LICENSE +0 -0
  8. {docling-2.8.2 → docling-2.8.3}/README.md +0 -0
  9. {docling-2.8.2 → docling-2.8.3}/docling/__init__.py +0 -0
  10. {docling-2.8.2 → docling-2.8.3}/docling/backend/__init__.py +0 -0
  11. {docling-2.8.2 → docling-2.8.3}/docling/backend/abstract_backend.py +0 -0
  12. {docling-2.8.2 → docling-2.8.3}/docling/backend/asciidoc_backend.py +0 -0
  13. {docling-2.8.2 → docling-2.8.3}/docling/backend/docling_parse_backend.py +0 -0
  14. {docling-2.8.2 → docling-2.8.3}/docling/backend/docling_parse_v2_backend.py +0 -0
  15. {docling-2.8.2 → docling-2.8.3}/docling/backend/html_backend.py +0 -0
  16. {docling-2.8.2 → docling-2.8.3}/docling/backend/md_backend.py +0 -0
  17. {docling-2.8.2 → docling-2.8.3}/docling/backend/msexcel_backend.py +0 -0
  18. {docling-2.8.2 → docling-2.8.3}/docling/backend/mspowerpoint_backend.py +0 -0
  19. {docling-2.8.2 → docling-2.8.3}/docling/backend/msword_backend.py +0 -0
  20. {docling-2.8.2 → docling-2.8.3}/docling/backend/pdf_backend.py +0 -0
  21. {docling-2.8.2 → docling-2.8.3}/docling/backend/pypdfium2_backend.py +0 -0
  22. {docling-2.8.2 → docling-2.8.3}/docling/cli/__init__.py +0 -0
  23. {docling-2.8.2 → docling-2.8.3}/docling/cli/main.py +0 -0
  24. {docling-2.8.2 → docling-2.8.3}/docling/datamodel/__init__.py +0 -0
  25. {docling-2.8.2 → docling-2.8.3}/docling/datamodel/pipeline_options.py +0 -0
  26. {docling-2.8.2 → docling-2.8.3}/docling/datamodel/settings.py +0 -0
  27. {docling-2.8.2 → docling-2.8.3}/docling/models/__init__.py +0 -0
  28. {docling-2.8.2 → docling-2.8.3}/docling/models/base_model.py +0 -0
  29. {docling-2.8.2 → docling-2.8.3}/docling/models/base_ocr_model.py +0 -0
  30. {docling-2.8.2 → docling-2.8.3}/docling/models/ds_glm_model.py +0 -0
  31. {docling-2.8.2 → docling-2.8.3}/docling/models/easyocr_model.py +0 -0
  32. {docling-2.8.2 → docling-2.8.3}/docling/models/layout_model.py +0 -0
  33. {docling-2.8.2 → docling-2.8.3}/docling/models/ocr_mac_model.py +0 -0
  34. {docling-2.8.2 → docling-2.8.3}/docling/models/page_assemble_model.py +0 -0
  35. {docling-2.8.2 → docling-2.8.3}/docling/models/page_preprocessing_model.py +0 -0
  36. {docling-2.8.2 → docling-2.8.3}/docling/models/rapid_ocr_model.py +0 -0
  37. {docling-2.8.2 → docling-2.8.3}/docling/models/table_structure_model.py +0 -0
  38. {docling-2.8.2 → docling-2.8.3}/docling/models/tesseract_ocr_cli_model.py +0 -0
  39. {docling-2.8.2 → docling-2.8.3}/docling/models/tesseract_ocr_model.py +0 -0
  40. {docling-2.8.2 → docling-2.8.3}/docling/pipeline/__init__.py +0 -0
  41. {docling-2.8.2 → docling-2.8.3}/docling/pipeline/base_pipeline.py +0 -0
  42. {docling-2.8.2 → docling-2.8.3}/docling/pipeline/simple_pipeline.py +0 -0
  43. {docling-2.8.2 → docling-2.8.3}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  44. {docling-2.8.2 → docling-2.8.3}/docling/utils/__init__.py +0 -0
  45. {docling-2.8.2 → docling-2.8.3}/docling/utils/export.py +0 -0
  46. {docling-2.8.2 → docling-2.8.3}/docling/utils/layout_utils.py +0 -0
  47. {docling-2.8.2 → docling-2.8.3}/docling/utils/profiling.py +0 -0
  48. {docling-2.8.2 → docling-2.8.3}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.8.2
3
+ Version: 2.8.3
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -24,6 +24,7 @@ class ConversionStatus(str, Enum):
24
24
  FAILURE = auto()
25
25
  SUCCESS = auto()
26
26
  PARTIAL_SUCCESS = auto()
27
+ SKIPPED = auto()
27
28
 
28
29
 
29
30
  class InputFormat(str, Enum):
@@ -95,6 +96,7 @@ class DoclingComponentType(str, Enum):
95
96
  DOCUMENT_BACKEND = auto()
96
97
  MODEL = auto()
97
98
  DOC_ASSEMBLER = auto()
99
+ USER_INPUT = auto()
98
100
 
99
101
 
100
102
  class ErrorItem(BaseModel):
@@ -3,7 +3,7 @@ import re
3
3
  from enum import Enum
4
4
  from io import BytesIO
5
5
  from pathlib import Path, PurePath
6
- from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
6
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
7
7
 
8
8
  import filetype
9
9
  from docling_core.types.doc import (
@@ -164,12 +164,6 @@ class InputDocument(BaseModel):
164
164
  backend: Type[AbstractDocumentBackend],
165
165
  path_or_stream: Union[BytesIO, Path],
166
166
  ) -> None:
167
- if backend is None:
168
- raise RuntimeError(
169
- f"No backend configuration provided for file {self.file.name} with format {self.format}. "
170
- f"Please check your format configuration on DocumentConverter."
171
- )
172
-
173
167
  self._backend = backend(self, path_or_stream=path_or_stream)
174
168
  if not self._backend.is_valid():
175
169
  self.valid = False
@@ -450,6 +444,25 @@ class ConversionResult(BaseModel):
450
444
  return ds_doc
451
445
 
452
446
 
447
+ class _DummyBackend(AbstractDocumentBackend):
448
+ def __init__(self, *args, **kwargs):
449
+ super().__init__(*args, **kwargs)
450
+
451
+ def is_valid(self) -> bool:
452
+ return False
453
+
454
+ @classmethod
455
+ def supported_formats(cls) -> Set[InputFormat]:
456
+ return set()
457
+
458
+ @classmethod
459
+ def supports_pagination(cls) -> bool:
460
+ return False
461
+
462
+ def unload(self):
463
+ return super().unload()
464
+
465
+
453
466
  class _DocumentConversionInput(BaseModel):
454
467
 
455
468
  path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
@@ -461,11 +474,12 @@ class _DocumentConversionInput(BaseModel):
461
474
  for item in self.path_or_stream_iterator:
462
475
  obj = resolve_source_to_stream(item) if isinstance(item, str) else item
463
476
  format = self._guess_format(obj)
477
+ backend: Type[AbstractDocumentBackend]
464
478
  if format not in format_options.keys():
465
- _log.info(
466
- f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
479
+ _log.error(
480
+ f"Input document {obj.name} does not match any allowed format."
467
481
  )
468
- continue
482
+ backend = _DummyBackend
469
483
  else:
470
484
  backend = format_options[format].backend
471
485
 
@@ -15,7 +15,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
15
15
  from docling.backend.msexcel_backend import MsExcelDocumentBackend
16
16
  from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
17
17
  from docling.backend.msword_backend import MsWordDocumentBackend
18
- from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
18
+ from docling.datamodel.base_models import (
19
+ ConversionStatus,
20
+ DoclingComponentType,
21
+ DocumentStream,
22
+ ErrorItem,
23
+ InputFormat,
24
+ )
19
25
  from docling.datamodel.document import (
20
26
  ConversionResult,
21
27
  InputDocument,
@@ -23,6 +29,7 @@ from docling.datamodel.document import (
23
29
  )
24
30
  from docling.datamodel.pipeline_options import PipelineOptions
25
31
  from docling.datamodel.settings import DocumentLimits, settings
32
+ from docling.exceptions import ConversionError
26
33
  from docling.pipeline.base_pipeline import BasePipeline
27
34
  from docling.pipeline.simple_pipeline import SimplePipeline
28
35
  from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@@ -85,32 +92,37 @@ class ImageFormatOption(FormatOption):
85
92
  backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
86
93
 
87
94
 
88
- _format_to_default_options = {
89
- InputFormat.XLSX: FormatOption(
90
- pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
91
- ),
92
- InputFormat.DOCX: FormatOption(
93
- pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
94
- ),
95
- InputFormat.PPTX: FormatOption(
96
- pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
97
- ),
98
- InputFormat.MD: FormatOption(
99
- pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
100
- ),
101
- InputFormat.ASCIIDOC: FormatOption(
102
- pipeline_cls=SimplePipeline, backend=AsciiDocBackend
103
- ),
104
- InputFormat.HTML: FormatOption(
105
- pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
106
- ),
107
- InputFormat.IMAGE: FormatOption(
108
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
109
- ),
110
- InputFormat.PDF: FormatOption(
111
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
112
- ),
113
- }
95
+ def _get_default_option(format: InputFormat) -> FormatOption:
96
+ format_to_default_options = {
97
+ InputFormat.XLSX: FormatOption(
98
+ pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
99
+ ),
100
+ InputFormat.DOCX: FormatOption(
101
+ pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
102
+ ),
103
+ InputFormat.PPTX: FormatOption(
104
+ pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
105
+ ),
106
+ InputFormat.MD: FormatOption(
107
+ pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
108
+ ),
109
+ InputFormat.ASCIIDOC: FormatOption(
110
+ pipeline_cls=SimplePipeline, backend=AsciiDocBackend
111
+ ),
112
+ InputFormat.HTML: FormatOption(
113
+ pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
114
+ ),
115
+ InputFormat.IMAGE: FormatOption(
116
+ pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
117
+ ),
118
+ InputFormat.PDF: FormatOption(
119
+ pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
120
+ ),
121
+ }
122
+ if (options := format_to_default_options.get(format)) is not None:
123
+ return options
124
+ else:
125
+ raise RuntimeError(f"No default options configured for {format}")
114
126
 
115
127
 
116
128
  class DocumentConverter:
@@ -121,36 +133,26 @@ class DocumentConverter:
121
133
  allowed_formats: Optional[List[InputFormat]] = None,
122
134
  format_options: Optional[Dict[InputFormat, FormatOption]] = None,
123
135
  ):
124
- self.allowed_formats = allowed_formats
125
- self.format_to_options = format_options
126
-
127
- if self.allowed_formats is None:
128
- # if self.format_to_options is not None:
129
- # self.allowed_formats = self.format_to_options.keys()
130
- # else:
131
- self.allowed_formats = [e for e in InputFormat] # all formats
132
-
133
- if self.format_to_options is None:
134
- self.format_to_options = _format_to_default_options
135
- else:
136
- for f in self.allowed_formats:
137
- if f not in self.format_to_options.keys():
138
- _log.debug(f"Requested format {f} will use default options.")
139
- self.format_to_options[f] = _format_to_default_options[f]
140
-
141
- remove_keys = []
142
- for f in self.format_to_options.keys():
143
- if f not in self.allowed_formats:
144
- remove_keys.append(f)
145
-
146
- for f in remove_keys:
147
- self.format_to_options.pop(f)
148
-
136
+ self.allowed_formats = (
137
+ allowed_formats if allowed_formats is not None else [e for e in InputFormat]
138
+ )
139
+ self.format_to_options = {
140
+ format: (
141
+ _get_default_option(format=format)
142
+ if (custom_option := (format_options or {}).get(format)) is None
143
+ else custom_option
144
+ )
145
+ for format in self.allowed_formats
146
+ }
149
147
  self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
150
148
 
151
149
  def initialize_pipeline(self, format: InputFormat):
152
150
  """Initialize the conversion pipeline for the selected format."""
153
- self._get_pipeline(doc_format=format)
151
+ pipeline = self._get_pipeline(doc_format=format)
152
+ if pipeline is None:
153
+ raise ConversionError(
154
+ f"No pipeline could be initialized for format {format}"
155
+ )
154
156
 
155
157
  @validate_call(config=ConfigDict(strict=True))
156
158
  def convert(
@@ -186,22 +188,28 @@ class DocumentConverter:
186
188
  limits=limits,
187
189
  )
188
190
  conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
191
+
192
+ had_result = False
189
193
  for conv_res in conv_res_iter:
194
+ had_result = True
190
195
  if raises_on_error and conv_res.status not in {
191
196
  ConversionStatus.SUCCESS,
192
197
  ConversionStatus.PARTIAL_SUCCESS,
193
198
  }:
194
- raise RuntimeError(
199
+ raise ConversionError(
195
200
  f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
196
201
  )
197
202
  else:
198
203
  yield conv_res
199
204
 
205
+ if not had_result and raises_on_error:
206
+ raise ConversionError(
207
+ f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
208
+ )
209
+
200
210
  def _convert(
201
211
  self, conv_input: _DocumentConversionInput, raises_on_error: bool
202
212
  ) -> Iterator[ConversionResult]:
203
- assert self.format_to_options is not None
204
-
205
213
  start_time = time.monotonic()
206
214
 
207
215
  for input_batch in chunkify(
@@ -223,27 +231,22 @@ class DocumentConverter:
223
231
  ):
224
232
  elapsed = time.monotonic() - start_time
225
233
  start_time = time.monotonic()
226
-
227
- if item is not None:
228
- _log.info(
229
- f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
230
- )
231
- yield item
232
- else:
233
- _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
234
+ _log.info(
235
+ f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
236
+ )
237
+ yield item
234
238
 
235
239
  def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
236
- assert self.format_to_options is not None
237
-
238
240
  fopt = self.format_to_options.get(doc_format)
239
241
 
240
242
  if fopt is None:
241
- raise RuntimeError(f"Could not get pipeline for {doc_format}")
243
+ return None
242
244
  else:
243
245
  pipeline_class = fopt.pipeline_cls
244
246
  pipeline_options = fopt.pipeline_options
245
247
 
246
- assert pipeline_options is not None
248
+ if pipeline_options is None:
249
+ return None
247
250
  # TODO this will ignore if different options have been defined for the same pipeline class.
248
251
  if (
249
252
  pipeline_class not in self.initialized_pipelines
@@ -257,11 +260,26 @@ class DocumentConverter:
257
260
 
258
261
  def _process_document(
259
262
  self, in_doc: InputDocument, raises_on_error: bool
260
- ) -> Optional[ConversionResult]:
261
- assert self.allowed_formats is not None
262
- assert in_doc.format in self.allowed_formats
263
+ ) -> ConversionResult:
263
264
 
264
- conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
265
+ valid = (
266
+ self.allowed_formats is not None and in_doc.format in self.allowed_formats
267
+ )
268
+ if valid:
269
+ conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
270
+ else:
271
+ error_message = f"File format not allowed: {in_doc.file}"
272
+ if raises_on_error:
273
+ raise ConversionError(error_message)
274
+ else:
275
+ error_item = ErrorItem(
276
+ component_type=DoclingComponentType.USER_INPUT,
277
+ module_name="",
278
+ error_message=error_message,
279
+ )
280
+ conv_res = ConversionResult(
281
+ input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
282
+ )
265
283
 
266
284
  return conv_res
267
285
 
@@ -270,26 +288,28 @@ class DocumentConverter:
270
288
  ) -> ConversionResult:
271
289
  if in_doc.valid:
272
290
  pipeline = self._get_pipeline(in_doc.format)
273
- if pipeline is None: # Can't find a default pipeline. Should this raise?
291
+ if pipeline is not None:
292
+ conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
293
+ else:
274
294
  if raises_on_error:
275
- raise RuntimeError(
295
+ raise ConversionError(
276
296
  f"No pipeline could be initialized for {in_doc.file}."
277
297
  )
278
298
  else:
279
- conv_res = ConversionResult(input=in_doc)
280
- conv_res.status = ConversionStatus.FAILURE
281
- return conv_res
282
-
283
- conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
284
-
299
+ conv_res = ConversionResult(
300
+ input=in_doc,
301
+ status=ConversionStatus.FAILURE,
302
+ )
285
303
  else:
286
304
  if raises_on_error:
287
- raise RuntimeError(f"Input document {in_doc.file} is not valid.")
305
+ raise ConversionError(f"Input document {in_doc.file} is not valid.")
288
306
 
289
307
  else:
290
308
  # invalid doc or not of desired format
291
- conv_res = ConversionResult(input=in_doc)
292
- conv_res.status = ConversionStatus.FAILURE
309
+ conv_res = ConversionResult(
310
+ input=in_doc,
311
+ status=ConversionStatus.FAILURE,
312
+ )
293
313
  # TODO add error log why it failed.
294
314
 
295
315
  return conv_res
@@ -0,0 +1,6 @@
1
+ class BaseError(RuntimeError):
2
+ pass
3
+
4
+
5
+ class ConversionError(BaseError):
6
+ pass
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.8.2" # DO NOT EDIT, updated automatically
3
+ version = "2.8.3" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes