docling 2.8.1__tar.gz → 2.8.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {docling-2.8.1 → docling-2.8.3}/PKG-INFO +2 -2
  2. {docling-2.8.1 → docling-2.8.3}/docling/cli/main.py +88 -84
  3. {docling-2.8.1 → docling-2.8.3}/docling/datamodel/base_models.py +5 -8
  4. {docling-2.8.1 → docling-2.8.3}/docling/datamodel/document.py +26 -12
  5. {docling-2.8.1 → docling-2.8.3}/docling/datamodel/pipeline_options.py +20 -0
  6. {docling-2.8.1 → docling-2.8.3}/docling/document_converter.py +103 -83
  7. docling-2.8.3/docling/exceptions.py +6 -0
  8. {docling-2.8.1 → docling-2.8.3}/docling/models/tesseract_ocr_cli_model.py +12 -7
  9. {docling-2.8.1 → docling-2.8.3}/pyproject.toml +9 -4
  10. {docling-2.8.1 → docling-2.8.3}/LICENSE +0 -0
  11. {docling-2.8.1 → docling-2.8.3}/README.md +0 -0
  12. {docling-2.8.1 → docling-2.8.3}/docling/__init__.py +0 -0
  13. {docling-2.8.1 → docling-2.8.3}/docling/backend/__init__.py +0 -0
  14. {docling-2.8.1 → docling-2.8.3}/docling/backend/abstract_backend.py +0 -0
  15. {docling-2.8.1 → docling-2.8.3}/docling/backend/asciidoc_backend.py +0 -0
  16. {docling-2.8.1 → docling-2.8.3}/docling/backend/docling_parse_backend.py +0 -0
  17. {docling-2.8.1 → docling-2.8.3}/docling/backend/docling_parse_v2_backend.py +0 -0
  18. {docling-2.8.1 → docling-2.8.3}/docling/backend/html_backend.py +0 -0
  19. {docling-2.8.1 → docling-2.8.3}/docling/backend/md_backend.py +0 -0
  20. {docling-2.8.1 → docling-2.8.3}/docling/backend/msexcel_backend.py +0 -0
  21. {docling-2.8.1 → docling-2.8.3}/docling/backend/mspowerpoint_backend.py +0 -0
  22. {docling-2.8.1 → docling-2.8.3}/docling/backend/msword_backend.py +0 -0
  23. {docling-2.8.1 → docling-2.8.3}/docling/backend/pdf_backend.py +0 -0
  24. {docling-2.8.1 → docling-2.8.3}/docling/backend/pypdfium2_backend.py +0 -0
  25. {docling-2.8.1 → docling-2.8.3}/docling/cli/__init__.py +0 -0
  26. {docling-2.8.1 → docling-2.8.3}/docling/datamodel/__init__.py +0 -0
  27. {docling-2.8.1 → docling-2.8.3}/docling/datamodel/settings.py +0 -0
  28. {docling-2.8.1 → docling-2.8.3}/docling/models/__init__.py +0 -0
  29. {docling-2.8.1 → docling-2.8.3}/docling/models/base_model.py +0 -0
  30. {docling-2.8.1 → docling-2.8.3}/docling/models/base_ocr_model.py +0 -0
  31. {docling-2.8.1 → docling-2.8.3}/docling/models/ds_glm_model.py +0 -0
  32. {docling-2.8.1 → docling-2.8.3}/docling/models/easyocr_model.py +0 -0
  33. {docling-2.8.1 → docling-2.8.3}/docling/models/layout_model.py +0 -0
  34. {docling-2.8.1 → docling-2.8.3}/docling/models/ocr_mac_model.py +0 -0
  35. {docling-2.8.1 → docling-2.8.3}/docling/models/page_assemble_model.py +0 -0
  36. {docling-2.8.1 → docling-2.8.3}/docling/models/page_preprocessing_model.py +0 -0
  37. {docling-2.8.1 → docling-2.8.3}/docling/models/rapid_ocr_model.py +0 -0
  38. {docling-2.8.1 → docling-2.8.3}/docling/models/table_structure_model.py +0 -0
  39. {docling-2.8.1 → docling-2.8.3}/docling/models/tesseract_ocr_model.py +0 -0
  40. {docling-2.8.1 → docling-2.8.3}/docling/pipeline/__init__.py +0 -0
  41. {docling-2.8.1 → docling-2.8.3}/docling/pipeline/base_pipeline.py +0 -0
  42. {docling-2.8.1 → docling-2.8.3}/docling/pipeline/simple_pipeline.py +0 -0
  43. {docling-2.8.1 → docling-2.8.3}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  44. {docling-2.8.1 → docling-2.8.3}/docling/utils/__init__.py +0 -0
  45. {docling-2.8.1 → docling-2.8.3}/docling/utils/export.py +0 -0
  46. {docling-2.8.1 → docling-2.8.3}/docling/utils/layout_utils.py +0 -0
  47. {docling-2.8.1 → docling-2.8.3}/docling/utils/profiling.py +0 -0
  48. {docling-2.8.1 → docling-2.8.3}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.8.1
3
+ Version: 2.8.3
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -26,7 +26,7 @@ Provides-Extra: tesserocr
26
26
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
27
27
  Requires-Dist: certifi (>=2024.7.4)
28
28
  Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
29
- Requires-Dist: docling-core (>=2.5.1,<3.0.0)
29
+ Requires-Dist: docling-core (>=2.6.1,<3.0.0)
30
30
  Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
31
31
  Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
32
32
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -2,6 +2,7 @@ import importlib
2
2
  import json
3
3
  import logging
4
4
  import re
5
+ import tempfile
5
6
  import time
6
7
  import warnings
7
8
  from enum import Enum
@@ -9,7 +10,7 @@ from pathlib import Path
9
10
  from typing import Annotated, Dict, Iterable, List, Optional, Type
10
11
 
11
12
  import typer
12
- from docling_core.utils.file import resolve_file_source
13
+ from docling_core.utils.file import resolve_source_to_path
13
14
 
14
15
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
15
16
  from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -256,95 +257,98 @@ def convert(
256
257
  if from_formats is None:
257
258
  from_formats = [e for e in InputFormat]
258
259
 
259
- input_doc_paths: List[Path] = []
260
- for src in input_sources:
261
- source = resolve_file_source(source=src)
262
- if not source.exists():
263
- err_console.print(
264
- f"[red]Error: The input file {source} does not exist.[/red]"
265
- )
266
- raise typer.Abort()
267
- elif source.is_dir():
268
- for fmt in from_formats:
269
- for ext in FormatToExtensions[fmt]:
270
- input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
271
- input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
260
+ with tempfile.TemporaryDirectory() as tempdir:
261
+ input_doc_paths: List[Path] = []
262
+ for src in input_sources:
263
+ source = resolve_source_to_path(source=src, workdir=Path(tempdir))
264
+ if not source.exists():
265
+ err_console.print(
266
+ f"[red]Error: The input file {source} does not exist.[/red]"
267
+ )
268
+ raise typer.Abort()
269
+ elif source.is_dir():
270
+ for fmt in from_formats:
271
+ for ext in FormatToExtensions[fmt]:
272
+ input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
273
+ input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
274
+ else:
275
+ input_doc_paths.append(source)
276
+
277
+ if to_formats is None:
278
+ to_formats = [OutputFormat.MARKDOWN]
279
+
280
+ export_json = OutputFormat.JSON in to_formats
281
+ export_md = OutputFormat.MARKDOWN in to_formats
282
+ export_txt = OutputFormat.TEXT in to_formats
283
+ export_doctags = OutputFormat.DOCTAGS in to_formats
284
+
285
+ if ocr_engine == OcrEngine.EASYOCR:
286
+ ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
287
+ elif ocr_engine == OcrEngine.TESSERACT_CLI:
288
+ ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
289
+ elif ocr_engine == OcrEngine.TESSERACT:
290
+ ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
291
+ elif ocr_engine == OcrEngine.OCRMAC:
292
+ ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
293
+ elif ocr_engine == OcrEngine.RAPIDOCR:
294
+ ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
272
295
  else:
273
- input_doc_paths.append(source)
274
-
275
- if to_formats is None:
276
- to_formats = [OutputFormat.MARKDOWN]
277
-
278
- export_json = OutputFormat.JSON in to_formats
279
- export_md = OutputFormat.MARKDOWN in to_formats
280
- export_txt = OutputFormat.TEXT in to_formats
281
- export_doctags = OutputFormat.DOCTAGS in to_formats
282
-
283
- if ocr_engine == OcrEngine.EASYOCR:
284
- ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
285
- elif ocr_engine == OcrEngine.TESSERACT_CLI:
286
- ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
287
- elif ocr_engine == OcrEngine.TESSERACT:
288
- ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
289
- elif ocr_engine == OcrEngine.OCRMAC:
290
- ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
291
- elif ocr_engine == OcrEngine.RAPIDOCR:
292
- ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
293
- else:
294
- raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
295
-
296
- ocr_lang_list = _split_list(ocr_lang)
297
- if ocr_lang_list is not None:
298
- ocr_options.lang = ocr_lang_list
299
-
300
- pipeline_options = PdfPipelineOptions(
301
- do_ocr=ocr,
302
- ocr_options=ocr_options,
303
- do_table_structure=True,
304
- )
305
- pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
306
- pipeline_options.table_structure_options.mode = table_mode
307
-
308
- if artifacts_path is not None:
309
- pipeline_options.artifacts_path = artifacts_path
310
-
311
- if pdf_backend == PdfBackend.DLPARSE_V1:
312
- backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
313
- elif pdf_backend == PdfBackend.DLPARSE_V2:
314
- backend = DoclingParseV2DocumentBackend
315
- elif pdf_backend == PdfBackend.PYPDFIUM2:
316
- backend = PyPdfiumDocumentBackend
317
- else:
318
- raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
319
-
320
- format_options: Dict[InputFormat, FormatOption] = {
321
- InputFormat.PDF: PdfFormatOption(
322
- pipeline_options=pipeline_options,
323
- backend=backend, # pdf_backend
296
+ raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
297
+
298
+ ocr_lang_list = _split_list(ocr_lang)
299
+ if ocr_lang_list is not None:
300
+ ocr_options.lang = ocr_lang_list
301
+
302
+ pipeline_options = PdfPipelineOptions(
303
+ do_ocr=ocr,
304
+ ocr_options=ocr_options,
305
+ do_table_structure=True,
324
306
  )
325
- }
326
- doc_converter = DocumentConverter(
327
- allowed_formats=from_formats,
328
- format_options=format_options,
329
- )
307
+ pipeline_options.table_structure_options.do_cell_matching = (
308
+ True # do_cell_matching
309
+ )
310
+ pipeline_options.table_structure_options.mode = table_mode
330
311
 
331
- start_time = time.time()
312
+ if artifacts_path is not None:
313
+ pipeline_options.artifacts_path = artifacts_path
332
314
 
333
- conv_results = doc_converter.convert_all(
334
- input_doc_paths, raises_on_error=abort_on_error
335
- )
315
+ if pdf_backend == PdfBackend.DLPARSE_V1:
316
+ backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
317
+ elif pdf_backend == PdfBackend.DLPARSE_V2:
318
+ backend = DoclingParseV2DocumentBackend
319
+ elif pdf_backend == PdfBackend.PYPDFIUM2:
320
+ backend = PyPdfiumDocumentBackend
321
+ else:
322
+ raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
336
323
 
337
- output.mkdir(parents=True, exist_ok=True)
338
- export_documents(
339
- conv_results,
340
- output_dir=output,
341
- export_json=export_json,
342
- export_md=export_md,
343
- export_txt=export_txt,
344
- export_doctags=export_doctags,
345
- )
324
+ format_options: Dict[InputFormat, FormatOption] = {
325
+ InputFormat.PDF: PdfFormatOption(
326
+ pipeline_options=pipeline_options,
327
+ backend=backend, # pdf_backend
328
+ )
329
+ }
330
+ doc_converter = DocumentConverter(
331
+ allowed_formats=from_formats,
332
+ format_options=format_options,
333
+ )
334
+
335
+ start_time = time.time()
336
+
337
+ conv_results = doc_converter.convert_all(
338
+ input_doc_paths, raises_on_error=abort_on_error
339
+ )
340
+
341
+ output.mkdir(parents=True, exist_ok=True)
342
+ export_documents(
343
+ conv_results,
344
+ output_dir=output,
345
+ export_json=export_json,
346
+ export_md=export_md,
347
+ export_txt=export_txt,
348
+ export_doctags=export_doctags,
349
+ )
346
350
 
347
- end_time = time.time() - start_time
351
+ end_time = time.time() - start_time
348
352
 
349
353
  _log.info(f"All documents were converted in {end_time:.2f} seconds.")
350
354
 
@@ -1,5 +1,4 @@
1
1
  from enum import Enum, auto
2
- from io import BytesIO
3
2
  from typing import TYPE_CHECKING, Dict, List, Optional, Union
4
3
 
5
4
  from docling_core.types.doc import (
@@ -9,6 +8,9 @@ from docling_core.types.doc import (
9
8
  Size,
10
9
  TableCell,
11
10
  )
11
+ from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
12
+ DocumentStream,
13
+ )
12
14
  from PIL.Image import Image
13
15
  from pydantic import BaseModel, ConfigDict
14
16
 
@@ -22,6 +24,7 @@ class ConversionStatus(str, Enum):
22
24
  FAILURE = auto()
23
25
  SUCCESS = auto()
24
26
  PARTIAL_SUCCESS = auto()
27
+ SKIPPED = auto()
25
28
 
26
29
 
27
30
  class InputFormat(str, Enum):
@@ -93,6 +96,7 @@ class DoclingComponentType(str, Enum):
93
96
  DOCUMENT_BACKEND = auto()
94
97
  MODEL = auto()
95
98
  DOC_ASSEMBLER = auto()
99
+ USER_INPUT = auto()
96
100
 
97
101
 
98
102
  class ErrorItem(BaseModel):
@@ -207,10 +211,3 @@ class Page(BaseModel):
207
211
  @property
208
212
  def image(self) -> Optional[Image]:
209
213
  return self.get_image(scale=self._default_image_scale)
210
-
211
-
212
- class DocumentStream(BaseModel):
213
- model_config = ConfigDict(arbitrary_types_allowed=True)
214
-
215
- name: str
216
- stream: BytesIO
@@ -3,7 +3,7 @@ import re
3
3
  from enum import Enum
4
4
  from io import BytesIO
5
5
  from pathlib import Path, PurePath
6
- from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
6
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
7
7
 
8
8
  import filetype
9
9
  from docling_core.types.doc import (
@@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
32
32
  )
33
33
  from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
34
34
  from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
35
- from docling_core.utils.file import resolve_file_source
35
+ from docling_core.utils.file import resolve_source_to_stream
36
36
  from pydantic import BaseModel
37
37
  from typing_extensions import deprecated
38
38
 
@@ -164,12 +164,6 @@ class InputDocument(BaseModel):
164
164
  backend: Type[AbstractDocumentBackend],
165
165
  path_or_stream: Union[BytesIO, Path],
166
166
  ) -> None:
167
- if backend is None:
168
- raise RuntimeError(
169
- f"No backend configuration provided for file {self.file.name} with format {self.format}. "
170
- f"Please check your format configuration on DocumentConverter."
171
- )
172
-
173
167
  self._backend = backend(self, path_or_stream=path_or_stream)
174
168
  if not self._backend.is_valid():
175
169
  self.valid = False
@@ -450,6 +444,25 @@ class ConversionResult(BaseModel):
450
444
  return ds_doc
451
445
 
452
446
 
447
+ class _DummyBackend(AbstractDocumentBackend):
448
+ def __init__(self, *args, **kwargs):
449
+ super().__init__(*args, **kwargs)
450
+
451
+ def is_valid(self) -> bool:
452
+ return False
453
+
454
+ @classmethod
455
+ def supported_formats(cls) -> Set[InputFormat]:
456
+ return set()
457
+
458
+ @classmethod
459
+ def supports_pagination(cls) -> bool:
460
+ return False
461
+
462
+ def unload(self):
463
+ return super().unload()
464
+
465
+
453
466
  class _DocumentConversionInput(BaseModel):
454
467
 
455
468
  path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
@@ -459,13 +472,14 @@ class _DocumentConversionInput(BaseModel):
459
472
  self, format_options: Dict[InputFormat, "FormatOption"]
460
473
  ) -> Iterable[InputDocument]:
461
474
  for item in self.path_or_stream_iterator:
462
- obj = resolve_file_source(item) if isinstance(item, str) else item
475
+ obj = resolve_source_to_stream(item) if isinstance(item, str) else item
463
476
  format = self._guess_format(obj)
477
+ backend: Type[AbstractDocumentBackend]
464
478
  if format not in format_options.keys():
465
- _log.info(
466
- f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
479
+ _log.error(
480
+ f"Input document {obj.name} does not match any allowed format."
467
481
  )
468
- continue
482
+ backend = _DummyBackend
469
483
  else:
470
484
  backend = format_options[format].backend
471
485
 
@@ -6,11 +6,15 @@ from pydantic import BaseModel, ConfigDict, Field
6
6
 
7
7
 
8
8
  class TableFormerMode(str, Enum):
9
+ """Modes for the TableFormer model."""
10
+
9
11
  FAST = "fast"
10
12
  ACCURATE = "accurate"
11
13
 
12
14
 
13
15
  class TableStructureOptions(BaseModel):
16
+ """Options for the table structure."""
17
+
14
18
  do_cell_matching: bool = (
15
19
  True
16
20
  # True: Matches predictions back to PDF cells. Can break table output if PDF cells
@@ -21,6 +25,8 @@ class TableStructureOptions(BaseModel):
21
25
 
22
26
 
23
27
  class OcrOptions(BaseModel):
28
+ """OCR options."""
29
+
24
30
  kind: str
25
31
  lang: List[str]
26
32
  force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
@@ -30,6 +36,8 @@ class OcrOptions(BaseModel):
30
36
 
31
37
 
32
38
  class RapidOcrOptions(OcrOptions):
39
+ """Options for the RapidOCR engine."""
40
+
33
41
  kind: Literal["rapidocr"] = "rapidocr"
34
42
 
35
43
  # English and chinese are the most commly used models and have been tested with RapidOCR.
@@ -66,6 +74,8 @@ class RapidOcrOptions(OcrOptions):
66
74
 
67
75
 
68
76
  class EasyOcrOptions(OcrOptions):
77
+ """Options for the EasyOCR engine."""
78
+
69
79
  kind: Literal["easyocr"] = "easyocr"
70
80
  lang: List[str] = ["fr", "de", "es", "en"]
71
81
  use_gpu: bool = True # same default as easyocr.Reader
@@ -79,6 +89,8 @@ class EasyOcrOptions(OcrOptions):
79
89
 
80
90
 
81
91
  class TesseractCliOcrOptions(OcrOptions):
92
+ """Options for the TesseractCli engine."""
93
+
82
94
  kind: Literal["tesseract"] = "tesseract"
83
95
  lang: List[str] = ["fra", "deu", "spa", "eng"]
84
96
  tesseract_cmd: str = "tesseract"
@@ -90,6 +102,8 @@ class TesseractCliOcrOptions(OcrOptions):
90
102
 
91
103
 
92
104
  class TesseractOcrOptions(OcrOptions):
105
+ """Options for the Tesseract engine."""
106
+
93
107
  kind: Literal["tesserocr"] = "tesserocr"
94
108
  lang: List[str] = ["fra", "deu", "spa", "eng"]
95
109
  path: Optional[str] = None
@@ -100,6 +114,8 @@ class TesseractOcrOptions(OcrOptions):
100
114
 
101
115
 
102
116
  class OcrMacOptions(OcrOptions):
117
+ """Options for the Mac OCR engine."""
118
+
103
119
  kind: Literal["ocrmac"] = "ocrmac"
104
120
  lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
105
121
  recognition: str = "accurate"
@@ -111,12 +127,16 @@ class OcrMacOptions(OcrOptions):
111
127
 
112
128
 
113
129
  class PipelineOptions(BaseModel):
130
+ """Base pipeline options."""
131
+
114
132
  create_legacy_output: bool = (
115
133
  True # This defautl will be set to False on a future version of docling
116
134
  )
117
135
 
118
136
 
119
137
  class PdfPipelineOptions(PipelineOptions):
138
+ """Options for the PDF pipeline."""
139
+
120
140
  artifacts_path: Optional[Union[Path, str]] = None
121
141
  do_table_structure: bool = True # True: perform table structure extraction
122
142
  do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
@@ -15,7 +15,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
15
15
  from docling.backend.msexcel_backend import MsExcelDocumentBackend
16
16
  from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
17
17
  from docling.backend.msword_backend import MsWordDocumentBackend
18
- from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
18
+ from docling.datamodel.base_models import (
19
+ ConversionStatus,
20
+ DoclingComponentType,
21
+ DocumentStream,
22
+ ErrorItem,
23
+ InputFormat,
24
+ )
19
25
  from docling.datamodel.document import (
20
26
  ConversionResult,
21
27
  InputDocument,
@@ -23,6 +29,7 @@ from docling.datamodel.document import (
23
29
  )
24
30
  from docling.datamodel.pipeline_options import PipelineOptions
25
31
  from docling.datamodel.settings import DocumentLimits, settings
32
+ from docling.exceptions import ConversionError
26
33
  from docling.pipeline.base_pipeline import BasePipeline
27
34
  from docling.pipeline.simple_pipeline import SimplePipeline
28
35
  from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@@ -85,32 +92,37 @@ class ImageFormatOption(FormatOption):
85
92
  backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
86
93
 
87
94
 
88
- _format_to_default_options = {
89
- InputFormat.XLSX: FormatOption(
90
- pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
91
- ),
92
- InputFormat.DOCX: FormatOption(
93
- pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
94
- ),
95
- InputFormat.PPTX: FormatOption(
96
- pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
97
- ),
98
- InputFormat.MD: FormatOption(
99
- pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
100
- ),
101
- InputFormat.ASCIIDOC: FormatOption(
102
- pipeline_cls=SimplePipeline, backend=AsciiDocBackend
103
- ),
104
- InputFormat.HTML: FormatOption(
105
- pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
106
- ),
107
- InputFormat.IMAGE: FormatOption(
108
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
109
- ),
110
- InputFormat.PDF: FormatOption(
111
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
112
- ),
113
- }
95
+ def _get_default_option(format: InputFormat) -> FormatOption:
96
+ format_to_default_options = {
97
+ InputFormat.XLSX: FormatOption(
98
+ pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
99
+ ),
100
+ InputFormat.DOCX: FormatOption(
101
+ pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
102
+ ),
103
+ InputFormat.PPTX: FormatOption(
104
+ pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
105
+ ),
106
+ InputFormat.MD: FormatOption(
107
+ pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
108
+ ),
109
+ InputFormat.ASCIIDOC: FormatOption(
110
+ pipeline_cls=SimplePipeline, backend=AsciiDocBackend
111
+ ),
112
+ InputFormat.HTML: FormatOption(
113
+ pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
114
+ ),
115
+ InputFormat.IMAGE: FormatOption(
116
+ pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
117
+ ),
118
+ InputFormat.PDF: FormatOption(
119
+ pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
120
+ ),
121
+ }
122
+ if (options := format_to_default_options.get(format)) is not None:
123
+ return options
124
+ else:
125
+ raise RuntimeError(f"No default options configured for {format}")
114
126
 
115
127
 
116
128
  class DocumentConverter:
@@ -121,36 +133,26 @@ class DocumentConverter:
121
133
  allowed_formats: Optional[List[InputFormat]] = None,
122
134
  format_options: Optional[Dict[InputFormat, FormatOption]] = None,
123
135
  ):
124
- self.allowed_formats = allowed_formats
125
- self.format_to_options = format_options
126
-
127
- if self.allowed_formats is None:
128
- # if self.format_to_options is not None:
129
- # self.allowed_formats = self.format_to_options.keys()
130
- # else:
131
- self.allowed_formats = [e for e in InputFormat] # all formats
132
-
133
- if self.format_to_options is None:
134
- self.format_to_options = _format_to_default_options
135
- else:
136
- for f in self.allowed_formats:
137
- if f not in self.format_to_options.keys():
138
- _log.debug(f"Requested format {f} will use default options.")
139
- self.format_to_options[f] = _format_to_default_options[f]
140
-
141
- remove_keys = []
142
- for f in self.format_to_options.keys():
143
- if f not in self.allowed_formats:
144
- remove_keys.append(f)
145
-
146
- for f in remove_keys:
147
- self.format_to_options.pop(f)
148
-
136
+ self.allowed_formats = (
137
+ allowed_formats if allowed_formats is not None else [e for e in InputFormat]
138
+ )
139
+ self.format_to_options = {
140
+ format: (
141
+ _get_default_option(format=format)
142
+ if (custom_option := (format_options or {}).get(format)) is None
143
+ else custom_option
144
+ )
145
+ for format in self.allowed_formats
146
+ }
149
147
  self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
150
148
 
151
149
  def initialize_pipeline(self, format: InputFormat):
152
150
  """Initialize the conversion pipeline for the selected format."""
153
- self._get_pipeline(doc_format=format)
151
+ pipeline = self._get_pipeline(doc_format=format)
152
+ if pipeline is None:
153
+ raise ConversionError(
154
+ f"No pipeline could be initialized for format {format}"
155
+ )
154
156
 
155
157
  @validate_call(config=ConfigDict(strict=True))
156
158
  def convert(
@@ -186,22 +188,28 @@ class DocumentConverter:
186
188
  limits=limits,
187
189
  )
188
190
  conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
191
+
192
+ had_result = False
189
193
  for conv_res in conv_res_iter:
194
+ had_result = True
190
195
  if raises_on_error and conv_res.status not in {
191
196
  ConversionStatus.SUCCESS,
192
197
  ConversionStatus.PARTIAL_SUCCESS,
193
198
  }:
194
- raise RuntimeError(
199
+ raise ConversionError(
195
200
  f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
196
201
  )
197
202
  else:
198
203
  yield conv_res
199
204
 
205
+ if not had_result and raises_on_error:
206
+ raise ConversionError(
207
+ f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
208
+ )
209
+
200
210
  def _convert(
201
211
  self, conv_input: _DocumentConversionInput, raises_on_error: bool
202
212
  ) -> Iterator[ConversionResult]:
203
- assert self.format_to_options is not None
204
-
205
213
  start_time = time.monotonic()
206
214
 
207
215
  for input_batch in chunkify(
@@ -223,27 +231,22 @@ class DocumentConverter:
223
231
  ):
224
232
  elapsed = time.monotonic() - start_time
225
233
  start_time = time.monotonic()
226
-
227
- if item is not None:
228
- _log.info(
229
- f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
230
- )
231
- yield item
232
- else:
233
- _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
234
+ _log.info(
235
+ f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
236
+ )
237
+ yield item
234
238
 
235
239
  def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
236
- assert self.format_to_options is not None
237
-
238
240
  fopt = self.format_to_options.get(doc_format)
239
241
 
240
242
  if fopt is None:
241
- raise RuntimeError(f"Could not get pipeline for {doc_format}")
243
+ return None
242
244
  else:
243
245
  pipeline_class = fopt.pipeline_cls
244
246
  pipeline_options = fopt.pipeline_options
245
247
 
246
- assert pipeline_options is not None
248
+ if pipeline_options is None:
249
+ return None
247
250
  # TODO this will ignore if different options have been defined for the same pipeline class.
248
251
  if (
249
252
  pipeline_class not in self.initialized_pipelines
@@ -257,11 +260,26 @@ class DocumentConverter:
257
260
 
258
261
  def _process_document(
259
262
  self, in_doc: InputDocument, raises_on_error: bool
260
- ) -> Optional[ConversionResult]:
261
- assert self.allowed_formats is not None
262
- assert in_doc.format in self.allowed_formats
263
+ ) -> ConversionResult:
263
264
 
264
- conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
265
+ valid = (
266
+ self.allowed_formats is not None and in_doc.format in self.allowed_formats
267
+ )
268
+ if valid:
269
+ conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
270
+ else:
271
+ error_message = f"File format not allowed: {in_doc.file}"
272
+ if raises_on_error:
273
+ raise ConversionError(error_message)
274
+ else:
275
+ error_item = ErrorItem(
276
+ component_type=DoclingComponentType.USER_INPUT,
277
+ module_name="",
278
+ error_message=error_message,
279
+ )
280
+ conv_res = ConversionResult(
281
+ input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
282
+ )
265
283
 
266
284
  return conv_res
267
285
 
@@ -270,26 +288,28 @@ class DocumentConverter:
270
288
  ) -> ConversionResult:
271
289
  if in_doc.valid:
272
290
  pipeline = self._get_pipeline(in_doc.format)
273
- if pipeline is None: # Can't find a default pipeline. Should this raise?
291
+ if pipeline is not None:
292
+ conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
293
+ else:
274
294
  if raises_on_error:
275
- raise RuntimeError(
295
+ raise ConversionError(
276
296
  f"No pipeline could be initialized for {in_doc.file}."
277
297
  )
278
298
  else:
279
- conv_res = ConversionResult(input=in_doc)
280
- conv_res.status = ConversionStatus.FAILURE
281
- return conv_res
282
-
283
- conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
284
-
299
+ conv_res = ConversionResult(
300
+ input=in_doc,
301
+ status=ConversionStatus.FAILURE,
302
+ )
285
303
  else:
286
304
  if raises_on_error:
287
- raise RuntimeError(f"Input document {in_doc.file} is not valid.")
305
+ raise ConversionError(f"Input document {in_doc.file} is not valid.")
288
306
 
289
307
  else:
290
308
  # invalid doc or not of desired format
291
- conv_res = ConversionResult(input=in_doc)
292
- conv_res.status = ConversionStatus.FAILURE
309
+ conv_res = ConversionResult(
310
+ input=in_doc,
311
+ status=ConversionStatus.FAILURE,
312
+ )
293
313
  # TODO add error log why it failed.
294
314
 
295
315
  return conv_res
@@ -0,0 +1,6 @@
1
+ class BaseError(RuntimeError):
2
+ pass
3
+
4
+
5
+ class ConversionError(BaseError):
6
+ pass
@@ -1,5 +1,7 @@
1
+ import csv
1
2
  import io
2
3
  import logging
4
+ import os
3
5
  import tempfile
4
6
  from subprocess import DEVNULL, PIPE, Popen
5
7
  from typing import Iterable, Optional, Tuple
@@ -95,7 +97,7 @@ class TesseractOcrCliModel(BaseOcrModel):
95
97
  # _log.info(decoded_data)
96
98
 
97
99
  # Read the TSV file generated by Tesseract
98
- df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
100
+ df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
99
101
 
100
102
  # Display the dataframe (optional)
101
103
  # _log.info("df: ", df.head())
@@ -130,14 +132,17 @@ class TesseractOcrCliModel(BaseOcrModel):
130
132
  high_res_image = page._backend.get_page_image(
131
133
  scale=self.scale, cropbox=ocr_rect
132
134
  )
133
-
134
- with tempfile.NamedTemporaryFile(
135
- suffix=".png", mode="w"
136
- ) as image_file:
137
- fname = image_file.name
138
- high_res_image.save(fname)
135
+ try:
136
+ with tempfile.NamedTemporaryFile(
137
+ suffix=".png", mode="w+b", delete=False
138
+ ) as image_file:
139
+ fname = image_file.name
140
+ high_res_image.save(image_file)
139
141
 
140
142
  df = self._run_tesseract(fname)
143
+ finally:
144
+ if os.path.exists(fname):
145
+ os.remove(fname)
141
146
 
142
147
  # _log.info(df)
143
148
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.8.1" # DO NOT EDIT, updated automatically
3
+ version = "2.8.3" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -26,7 +26,7 @@ packages = [{include = "docling"}]
26
26
  ######################
27
27
  python = "^3.9"
28
28
  pydantic = ">=2.0.0,<2.10"
29
- docling-core = "^2.5.1"
29
+ docling-core = "^2.6.1"
30
30
  docling-ibm-models = "^2.0.6"
31
31
  deepsearch-glm = "^0.26.1"
32
32
  filetype = "^1.2.0"
@@ -80,6 +80,8 @@ types-openpyxl = "^3.1.5.20241114"
80
80
  mkdocs-material = "^9.5.40"
81
81
  mkdocs-jupyter = "^0.25.0"
82
82
  mkdocs-click = "^0.8.1"
83
+ mkdocstrings = {extras = ["python"], version = "^0.27.0"}
84
+ griffe-pydantic = "^1.1.0"
83
85
 
84
86
  [tool.poetry.group.examples.dependencies]
85
87
  datasets = "^2.21.0"
@@ -88,10 +90,13 @@ langchain-huggingface = "^0.0.3"
88
90
  langchain-milvus = "^0.1.4"
89
91
  langchain-text-splitters = "^0.2.4"
90
92
 
93
+ [tool.poetry.group.constraints]
94
+ optional = true
95
+
91
96
  [tool.poetry.group.constraints.dependencies]
92
97
  numpy = [
93
- { version = "^2.1.0", markers = 'python_version >= "3.13"' },
94
- { version = "^1.24.4", markers = 'python_version < "3.13"' },
98
+ { version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
99
+ { version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
95
100
  ]
96
101
 
97
102
  [tool.poetry.group.mac_intel]
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes