PyPI - docling - Versions diffs - 2.8.1__py3-none-any.whl → 2.8.3__py3-none-any.whl - Mend

docling 2.8.1__py3-none-any.whl → 2.8.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

docling/cli/main.py +88 -84
docling/datamodel/base_models.py +5 -8
docling/datamodel/document.py +26 -12
docling/datamodel/pipeline_options.py +20 -0
docling/document_converter.py +103 -83
docling/exceptions.py +6 -0
docling/models/tesseract_ocr_cli_model.py +12 -7
{docling-2.8.1.dist-info → docling-2.8.3.dist-info}/METADATA +2 -2
{docling-2.8.1.dist-info → docling-2.8.3.dist-info}/RECORD +12 -11
{docling-2.8.1.dist-info → docling-2.8.3.dist-info}/LICENSE +0 -0
{docling-2.8.1.dist-info → docling-2.8.3.dist-info}/WHEEL +0 -0
{docling-2.8.1.dist-info → docling-2.8.3.dist-info}/entry_points.txt +0 -0

docling/cli/main.py CHANGED Viewed

@@ -2,6 +2,7 @@ import importlib
 import json
 import logging
 import re
+import tempfile
 import time
 import warnings
 from enum import Enum
@@ -9,7 +10,7 @@ from pathlib import Path
 from typing import Annotated, Dict, Iterable, List, Optional, Type
 import typer
-from docling_core.utils.file import resolve_file_source
+from docling_core.utils.file import resolve_source_to_path
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -256,95 +257,98 @@ def convert(
     if from_formats is None:
         from_formats = [e for e in InputFormat]
-    input_doc_paths: List[Path] = []
-    for src in input_sources:
-        source = resolve_file_source(source=src)
-        if not source.exists():
-            err_console.print(
-                f"[red]Error: The input file {source} does not exist.[/red]"
-            )
-            raise typer.Abort()
-        elif source.is_dir():
-            for fmt in from_formats:
-                for ext in FormatToExtensions[fmt]:
-                    input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
-                    input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
+    with tempfile.TemporaryDirectory() as tempdir:
+        input_doc_paths: List[Path] = []
+        for src in input_sources:
+            source = resolve_source_to_path(source=src, workdir=Path(tempdir))
+            if not source.exists():
+                err_console.print(
+                    f"[red]Error: The input file {source} does not exist.[/red]"
+                )
+                raise typer.Abort()
+            elif source.is_dir():
+                for fmt in from_formats:
+                    for ext in FormatToExtensions[fmt]:
+                        input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
+                        input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
+            else:
+                input_doc_paths.append(source)
+        if to_formats is None:
+            to_formats = [OutputFormat.MARKDOWN]
+        export_json = OutputFormat.JSON in to_formats
+        export_md = OutputFormat.MARKDOWN in to_formats
+        export_txt = OutputFormat.TEXT in to_formats
+        export_doctags = OutputFormat.DOCTAGS in to_formats
+        if ocr_engine == OcrEngine.EASYOCR:
+            ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.TESSERACT_CLI:
+            ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.TESSERACT:
+            ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.OCRMAC:
+            ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.RAPIDOCR:
+            ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
         else:
-            input_doc_paths.append(source)
-    if to_formats is None:
-        to_formats = [OutputFormat.MARKDOWN]
-    export_json = OutputFormat.JSON in to_formats
-    export_md = OutputFormat.MARKDOWN in to_formats
-    export_txt = OutputFormat.TEXT in to_formats
-    export_doctags = OutputFormat.DOCTAGS in to_formats
-    if ocr_engine == OcrEngine.EASYOCR:
-        ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.TESSERACT_CLI:
-        ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.TESSERACT:
-        ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.OCRMAC:
-        ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.RAPIDOCR:
-        ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
-    else:
-        raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
-    ocr_lang_list = _split_list(ocr_lang)
-    if ocr_lang_list is not None:
-        ocr_options.lang = ocr_lang_list
-    pipeline_options = PdfPipelineOptions(
-        do_ocr=ocr,
-        ocr_options=ocr_options,
-        do_table_structure=True,
-    )
-    pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
-    pipeline_options.table_structure_options.mode = table_mode
-    if artifacts_path is not None:
-        pipeline_options.artifacts_path = artifacts_path
-    if pdf_backend == PdfBackend.DLPARSE_V1:
-        backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
-    elif pdf_backend == PdfBackend.DLPARSE_V2:
-        backend = DoclingParseV2DocumentBackend
-    elif pdf_backend == PdfBackend.PYPDFIUM2:
-        backend = PyPdfiumDocumentBackend
-    else:
-        raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
-    format_options: Dict[InputFormat, FormatOption] = {
-        InputFormat.PDF: PdfFormatOption(
-            pipeline_options=pipeline_options,
-            backend=backend,  # pdf_backend
+            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
+        ocr_lang_list = _split_list(ocr_lang)
+        if ocr_lang_list is not None:
+            ocr_options.lang = ocr_lang_list
+        pipeline_options = PdfPipelineOptions(
+            do_ocr=ocr,
+            ocr_options=ocr_options,
+            do_table_structure=True,
         )
-    }
-    doc_converter = DocumentConverter(
-        allowed_formats=from_formats,
-        format_options=format_options,
-    )
+        pipeline_options.table_structure_options.do_cell_matching = (
+            True  # do_cell_matching
+        )
+        pipeline_options.table_structure_options.mode = table_mode
-    start_time = time.time()
+        if artifacts_path is not None:
+            pipeline_options.artifacts_path = artifacts_path
-    conv_results = doc_converter.convert_all(
-        input_doc_paths, raises_on_error=abort_on_error
-    )
+        if pdf_backend == PdfBackend.DLPARSE_V1:
+            backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
+        elif pdf_backend == PdfBackend.DLPARSE_V2:
+            backend = DoclingParseV2DocumentBackend
+        elif pdf_backend == PdfBackend.PYPDFIUM2:
+            backend = PyPdfiumDocumentBackend
+        else:
+            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
-    output.mkdir(parents=True, exist_ok=True)
-    export_documents(
-        conv_results,
-        output_dir=output,
-        export_json=export_json,
-        export_md=export_md,
-        export_txt=export_txt,
-        export_doctags=export_doctags,
-    )
+        format_options: Dict[InputFormat, FormatOption] = {
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+                backend=backend,  # pdf_backend
+            )
+        }
+        doc_converter = DocumentConverter(
+            allowed_formats=from_formats,
+            format_options=format_options,
+        )
+        start_time = time.time()
+        conv_results = doc_converter.convert_all(
+            input_doc_paths, raises_on_error=abort_on_error
+        )
+        output.mkdir(parents=True, exist_ok=True)
+        export_documents(
+            conv_results,
+            output_dir=output,
+            export_json=export_json,
+            export_md=export_md,
+            export_txt=export_txt,
+            export_doctags=export_doctags,
+        )
-    end_time = time.time() - start_time
+        end_time = time.time() - start_time
     _log.info(f"All documents were converted in {end_time:.2f} seconds.")

docling/datamodel/base_models.py CHANGED Viewed

@@ -1,5 +1,4 @@
 from enum import Enum, auto
-from io import BytesIO
 from typing import TYPE_CHECKING, Dict, List, Optional, Union
 from docling_core.types.doc import (
@@ -9,6 +8,9 @@ from docling_core.types.doc import (
     Size,
     TableCell,
 )
+from docling_core.types.io import (  # DO ΝΟΤ REMOVE; explicitly exposed from this location
+    DocumentStream,
+)
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict
@@ -22,6 +24,7 @@ class ConversionStatus(str, Enum):
     FAILURE = auto()
     SUCCESS = auto()
     PARTIAL_SUCCESS = auto()
+    SKIPPED = auto()
 class InputFormat(str, Enum):
@@ -93,6 +96,7 @@ class DoclingComponentType(str, Enum):
     DOCUMENT_BACKEND = auto()
     MODEL = auto()
     DOC_ASSEMBLER = auto()
+    USER_INPUT = auto()
 class ErrorItem(BaseModel):
@@ -207,10 +211,3 @@ class Page(BaseModel):
     @property
     def image(self) -> Optional[Image]:
         return self.get_image(scale=self._default_image_scale)
-class DocumentStream(BaseModel):
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-    name: str
-    stream: BytesIO

docling/datamodel/document.py CHANGED Viewed

@@ -3,7 +3,7 @@ import re
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
 import filetype
 from docling_core.types.doc import (
@@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
 )
 from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
 from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
-from docling_core.utils.file import resolve_file_source
+from docling_core.utils.file import resolve_source_to_stream
 from pydantic import BaseModel
 from typing_extensions import deprecated
@@ -164,12 +164,6 @@ class InputDocument(BaseModel):
         backend: Type[AbstractDocumentBackend],
         path_or_stream: Union[BytesIO, Path],
     ) -> None:
-        if backend is None:
-            raise RuntimeError(
-                f"No backend configuration provided for file {self.file.name} with format {self.format}. "
-                f"Please check your format configuration on DocumentConverter."
-            )
         self._backend = backend(self, path_or_stream=path_or_stream)
         if not self._backend.is_valid():
             self.valid = False
@@ -450,6 +444,25 @@ class ConversionResult(BaseModel):
         return ds_doc
+class _DummyBackend(AbstractDocumentBackend):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+    def is_valid(self) -> bool:
+        return False
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return set()
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return False
+    def unload(self):
+        return super().unload()
 class _DocumentConversionInput(BaseModel):
     path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
@@ -459,13 +472,14 @@ class _DocumentConversionInput(BaseModel):
         self, format_options: Dict[InputFormat, "FormatOption"]
     ) -> Iterable[InputDocument]:
         for item in self.path_or_stream_iterator:
-            obj = resolve_file_source(item) if isinstance(item, str) else item
+            obj = resolve_source_to_stream(item) if isinstance(item, str) else item
             format = self._guess_format(obj)
+            backend: Type[AbstractDocumentBackend]
             if format not in format_options.keys():
-                _log.info(
-                    f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
+                _log.error(
+                    f"Input document {obj.name} does not match any allowed format."
                 )
-                continue
+                backend = _DummyBackend
             else:
                 backend = format_options[format].backend

docling/datamodel/pipeline_options.py CHANGED Viewed

@@ -6,11 +6,15 @@ from pydantic import BaseModel, ConfigDict, Field
 class TableFormerMode(str, Enum):
+    """Modes for the TableFormer model."""
     FAST = "fast"
     ACCURATE = "accurate"
 class TableStructureOptions(BaseModel):
+    """Options for the table structure."""
     do_cell_matching: bool = (
         True
         # True:  Matches predictions back to PDF cells. Can break table output if PDF cells
@@ -21,6 +25,8 @@ class TableStructureOptions(BaseModel):
 class OcrOptions(BaseModel):
+    """OCR options."""
     kind: str
     lang: List[str]
     force_full_page_ocr: bool = False  # If enabled a full page OCR is always applied
@@ -30,6 +36,8 @@ class OcrOptions(BaseModel):
 class RapidOcrOptions(OcrOptions):
+    """Options for the RapidOCR engine."""
     kind: Literal["rapidocr"] = "rapidocr"
     # English and chinese are the most commly used models and have been tested with RapidOCR.
@@ -66,6 +74,8 @@ class RapidOcrOptions(OcrOptions):
 class EasyOcrOptions(OcrOptions):
+    """Options for the EasyOCR engine."""
     kind: Literal["easyocr"] = "easyocr"
     lang: List[str] = ["fr", "de", "es", "en"]
     use_gpu: bool = True  # same default as easyocr.Reader
@@ -79,6 +89,8 @@ class EasyOcrOptions(OcrOptions):
 class TesseractCliOcrOptions(OcrOptions):
+    """Options for the TesseractCli engine."""
     kind: Literal["tesseract"] = "tesseract"
     lang: List[str] = ["fra", "deu", "spa", "eng"]
     tesseract_cmd: str = "tesseract"
@@ -90,6 +102,8 @@ class TesseractCliOcrOptions(OcrOptions):
 class TesseractOcrOptions(OcrOptions):
+    """Options for the Tesseract engine."""
     kind: Literal["tesserocr"] = "tesserocr"
     lang: List[str] = ["fra", "deu", "spa", "eng"]
     path: Optional[str] = None
@@ -100,6 +114,8 @@ class TesseractOcrOptions(OcrOptions):
 class OcrMacOptions(OcrOptions):
+    """Options for the Mac OCR engine."""
     kind: Literal["ocrmac"] = "ocrmac"
     lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
     recognition: str = "accurate"
@@ -111,12 +127,16 @@ class OcrMacOptions(OcrOptions):
 class PipelineOptions(BaseModel):
+    """Base pipeline options."""
     create_legacy_output: bool = (
         True  # This defautl will be set to False on a future version of docling
     )
 class PdfPipelineOptions(PipelineOptions):
+    """Options for the PDF pipeline."""
     artifacts_path: Optional[Union[Path, str]] = None
     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

docling/document_converter.py CHANGED Viewed

@@ -15,7 +15,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
+from docling.datamodel.base_models import (
+    ConversionStatus,
+    DoclingComponentType,
+    DocumentStream,
+    ErrorItem,
+    InputFormat,
+)
 from docling.datamodel.document import (
     ConversionResult,
     InputDocument,
@@ -23,6 +29,7 @@ from docling.datamodel.document import (
 )
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import DocumentLimits, settings
+from docling.exceptions import ConversionError
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.pipeline.simple_pipeline import SimplePipeline
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@@ -85,32 +92,37 @@ class ImageFormatOption(FormatOption):
     backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
-_format_to_default_options = {
-    InputFormat.XLSX: FormatOption(
-        pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
-    ),
-    InputFormat.DOCX: FormatOption(
-        pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
-    ),
-    InputFormat.PPTX: FormatOption(
-        pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
-    ),
-    InputFormat.MD: FormatOption(
-        pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
-    ),
-    InputFormat.ASCIIDOC: FormatOption(
-        pipeline_cls=SimplePipeline, backend=AsciiDocBackend
-    ),
-    InputFormat.HTML: FormatOption(
-        pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
-    ),
-    InputFormat.IMAGE: FormatOption(
-        pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
-    ),
-    InputFormat.PDF: FormatOption(
-        pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
-    ),
-}
+def _get_default_option(format: InputFormat) -> FormatOption:
+    format_to_default_options = {
+        InputFormat.XLSX: FormatOption(
+            pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
+        ),
+        InputFormat.DOCX: FormatOption(
+            pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
+        ),
+        InputFormat.PPTX: FormatOption(
+            pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
+        ),
+        InputFormat.MD: FormatOption(
+            pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
+        ),
+        InputFormat.ASCIIDOC: FormatOption(
+            pipeline_cls=SimplePipeline, backend=AsciiDocBackend
+        ),
+        InputFormat.HTML: FormatOption(
+            pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
+        ),
+        InputFormat.IMAGE: FormatOption(
+            pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+        ),
+        InputFormat.PDF: FormatOption(
+            pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+        ),
+    }
+    if (options := format_to_default_options.get(format)) is not None:
+        return options
+    else:
+        raise RuntimeError(f"No default options configured for {format}")
 class DocumentConverter:
@@ -121,36 +133,26 @@ class DocumentConverter:
         allowed_formats: Optional[List[InputFormat]] = None,
         format_options: Optional[Dict[InputFormat, FormatOption]] = None,
     ):
-        self.allowed_formats = allowed_formats
-        self.format_to_options = format_options
-        if self.allowed_formats is None:
-            # if self.format_to_options is not None:
-            #    self.allowed_formats = self.format_to_options.keys()
-            # else:
-            self.allowed_formats = [e for e in InputFormat]  # all formats
-        if self.format_to_options is None:
-            self.format_to_options = _format_to_default_options
-        else:
-            for f in self.allowed_formats:
-                if f not in self.format_to_options.keys():
-                    _log.debug(f"Requested format {f} will use default options.")
-                    self.format_to_options[f] = _format_to_default_options[f]
-            remove_keys = []
-            for f in self.format_to_options.keys():
-                if f not in self.allowed_formats:
-                    remove_keys.append(f)
-            for f in remove_keys:
-                self.format_to_options.pop(f)
+        self.allowed_formats = (
+            allowed_formats if allowed_formats is not None else [e for e in InputFormat]
+        )
+        self.format_to_options = {
+            format: (
+                _get_default_option(format=format)
+                if (custom_option := (format_options or {}).get(format)) is None
+                else custom_option
+            )
+            for format in self.allowed_formats
+        }
         self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
     def initialize_pipeline(self, format: InputFormat):
         """Initialize the conversion pipeline for the selected format."""
-        self._get_pipeline(doc_format=format)
+        pipeline = self._get_pipeline(doc_format=format)
+        if pipeline is None:
+            raise ConversionError(
+                f"No pipeline could be initialized for format {format}"
+            )
     @validate_call(config=ConfigDict(strict=True))
     def convert(
@@ -186,22 +188,28 @@ class DocumentConverter:
             limits=limits,
         )
         conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
+        had_result = False
         for conv_res in conv_res_iter:
+            had_result = True
             if raises_on_error and conv_res.status not in {
                 ConversionStatus.SUCCESS,
                 ConversionStatus.PARTIAL_SUCCESS,
             }:
-                raise RuntimeError(
+                raise ConversionError(
                     f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
                 )
             else:
                 yield conv_res
+        if not had_result and raises_on_error:
+            raise ConversionError(
+                f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
+            )
     def _convert(
         self, conv_input: _DocumentConversionInput, raises_on_error: bool
     ) -> Iterator[ConversionResult]:
-        assert self.format_to_options is not None
         start_time = time.monotonic()
         for input_batch in chunkify(
@@ -223,27 +231,22 @@ class DocumentConverter:
             ):
                 elapsed = time.monotonic() - start_time
                 start_time = time.monotonic()
-                if item is not None:
-                    _log.info(
-                        f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
-                    )
-                    yield item
-                else:
-                    _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
+                _log.info(
+                    f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
+                )
+                yield item
     def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
-        assert self.format_to_options is not None
         fopt = self.format_to_options.get(doc_format)
         if fopt is None:
-            raise RuntimeError(f"Could not get pipeline for {doc_format}")
+            return None
         else:
             pipeline_class = fopt.pipeline_cls
             pipeline_options = fopt.pipeline_options
-        assert pipeline_options is not None
+        if pipeline_options is None:
+            return None
         # TODO this will ignore if different options have been defined for the same pipeline class.
         if (
             pipeline_class not in self.initialized_pipelines
@@ -257,11 +260,26 @@ class DocumentConverter:
     def _process_document(
         self, in_doc: InputDocument, raises_on_error: bool
-    ) -> Optional[ConversionResult]:
-        assert self.allowed_formats is not None
-        assert in_doc.format in self.allowed_formats
+    ) -> ConversionResult:
-        conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
+        valid = (
+            self.allowed_formats is not None and in_doc.format in self.allowed_formats
+        )
+        if valid:
+            conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
+        else:
+            error_message = f"File format not allowed: {in_doc.file}"
+            if raises_on_error:
+                raise ConversionError(error_message)
+            else:
+                error_item = ErrorItem(
+                    component_type=DoclingComponentType.USER_INPUT,
+                    module_name="",
+                    error_message=error_message,
+                )
+                conv_res = ConversionResult(
+                    input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
+                )
         return conv_res
@@ -270,26 +288,28 @@ class DocumentConverter:
     ) -> ConversionResult:
         if in_doc.valid:
             pipeline = self._get_pipeline(in_doc.format)
-            if pipeline is None:  # Can't find a default pipeline. Should this raise?
+            if pipeline is not None:
+                conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
+            else:
                 if raises_on_error:
-                    raise RuntimeError(
+                    raise ConversionError(
                         f"No pipeline could be initialized for {in_doc.file}."
                     )
                 else:
-                    conv_res = ConversionResult(input=in_doc)
-                    conv_res.status = ConversionStatus.FAILURE
-                    return conv_res
-            conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
+                    conv_res = ConversionResult(
+                        input=in_doc,
+                        status=ConversionStatus.FAILURE,
+                    )
         else:
             if raises_on_error:
-                raise RuntimeError(f"Input document {in_doc.file} is not valid.")
+                raise ConversionError(f"Input document {in_doc.file} is not valid.")
             else:
                 # invalid doc or not of desired format
-                conv_res = ConversionResult(input=in_doc)
-                conv_res.status = ConversionStatus.FAILURE
+                conv_res = ConversionResult(
+                    input=in_doc,
+                    status=ConversionStatus.FAILURE,
+                )
                 # TODO add error log why it failed.
         return conv_res

docling/exceptions.py ADDED Viewed

@@ -0,0 +1,6 @@
+class BaseError(RuntimeError):
+    pass
+class ConversionError(BaseError):
+    pass

docling/models/tesseract_ocr_cli_model.py CHANGED Viewed

@@ -1,5 +1,7 @@
+import csv
 import io
 import logging
+import os
 import tempfile
 from subprocess import DEVNULL, PIPE, Popen
 from typing import Iterable, Optional, Tuple
@@ -95,7 +97,7 @@ class TesseractOcrCliModel(BaseOcrModel):
         # _log.info(decoded_data)
         # Read the TSV file generated by Tesseract
-        df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
+        df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
         # Display the dataframe (optional)
         # _log.info("df: ", df.head())
@@ -130,14 +132,17 @@ class TesseractOcrCliModel(BaseOcrModel):
                         high_res_image = page._backend.get_page_image(
                             scale=self.scale, cropbox=ocr_rect
                         )
-                        with tempfile.NamedTemporaryFile(
-                            suffix=".png", mode="w"
-                        ) as image_file:
-                            fname = image_file.name
-                            high_res_image.save(fname)
+                        try:
+                            with tempfile.NamedTemporaryFile(
+                                suffix=".png", mode="w+b", delete=False
+                            ) as image_file:
+                                fname = image_file.name
+                                high_res_image.save(image_file)
                             df = self._run_tesseract(fname)
+                        finally:
+                            if os.path.exists(fname):
+                                os.remove(fname)
                         # _log.info(df)

{docling-2.8.1.dist-info → docling-2.8.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.8.1
+Version: 2.8.3
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -26,7 +26,7 @@ Provides-Extra: tesserocr
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
-Requires-Dist: docling-core (>=2.5.1,<3.0.0)
+Requires-Dist: docling-core (>=2.6.1,<3.0.0)
 Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
 Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)

{docling-2.8.1.dist-info → docling-2.8.3.dist-info}/RECORD RENAMED Viewed

@@ -12,13 +12,14 @@ docling/backend/msword_backend.py,sha256=VFHPr-gCak7w3NJToc5Cs-JaTb4Vm3a1JnnRIfJ
 docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
 docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=AgPD32NfM0_bmHeKjx5-fqk57ahX5tN3AeoDOerhTuE,11808
+docling/cli/main.py,sha256=R9ao2zCv1GZQIATOqg9b64O7AOUCWLwjJ-2FIpW8m0I,12236
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/datamodel/base_models.py,sha256=6qlwPamDZ3XUsE2kTAyGKG6O2IJClVjCqaE7DZ74KHU,5533
-docling/datamodel/document.py,sha256=9dQf_J18X_MEWs-Mg3Ed6BykFPJ79ETmkkxcssY-vYo,20698
-docling/datamodel/pipeline_options.py,sha256=J-6kWugUrxahymKzgaEgiqPuyle1fbInPXV2wNos6Vc,4550
+docling/datamodel/base_models.py,sha256=mJ4h2haE0cOYz_eLd7QlRKU1y7u4yccMGk0tiZNICkQ,5542
+docling/datamodel/document.py,sha256=Y0NEFphwz44VxIaRaDRhtmw6rifzSC7MqyaDBzaR0lM,20902
+docling/datamodel/pipeline_options.py,sha256=K65nEZ52aRfF8hWIzl0zVvRQj-3XVwoBbxTacGS6jEg,4960
 docling/datamodel/settings.py,sha256=JK8lZPBjUx2kD2q-Qpg-o3vOElADMcyQbRUL0EHZ7us,1263
-docling/document_converter.py,sha256=L0A3g7IQBaKIK7dWpUFC72ZqKywIPYkyh71Qd6DiNPE,10940
+docling/document_converter.py,sha256=bsXGQCUrbL2LmaqaaEmlkfSANl2XwBBx8HDLwFrqhFY,11570
+docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
 docling/models/base_ocr_model.py,sha256=rGSpBF4dByITcsBaRIgvFKpiu0CrhmZS_PHIo686Dw0,6428
@@ -30,7 +31,7 @@ docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th
 docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
 docling/models/rapid_ocr_model.py,sha256=VQ0jaFmOzB9f-1JaqZ6d0o_El55Lr-nsFHfTNubMAuc,6005
 docling/models/table_structure_model.py,sha256=-ANSQpiN2avt3B9sbi7dHcoULUJbMBalAR5xxlrM7To,8421
-docling/models/tesseract_ocr_cli_model.py,sha256=OfopQnt2FGwtLJTMtW9jbJZ9EN2G2QFkA_aACjuUuDs,6372
+docling/models/tesseract_ocr_cli_model.py,sha256=aKQBaty4cYu6zG_C5uy6Zm3eeRQo5fxIierbKixa2kc,6622
 docling/models/tesseract_ocr_model.py,sha256=RDf6iV1q-oXaGfZXv0bW6SqjHNKQvBUDlUsOkuz0neY,6095
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
@@ -41,8 +42,8 @@ docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
 docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
-docling-2.8.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.8.1.dist-info/METADATA,sha256=auj5PtDj-UBB72sW8jk1CSVSwQpd9q0nYzoAYIItl8o,7682
-docling-2.8.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling-2.8.1.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
-docling-2.8.1.dist-info/RECORD,,
+docling-2.8.3.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.8.3.dist-info/METADATA,sha256=TKraAUApw0vLlToJ37cBQPNyJwoPmdWMIn73hYwq4Y8,7682
+docling-2.8.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-2.8.3.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
+docling-2.8.3.dist-info/RECORD,,

{docling-2.8.1.dist-info → docling-2.8.3.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling-2.8.1.dist-info → docling-2.8.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{docling-2.8.1.dist-info → docling-2.8.3.dist-info}/entry_points.txt RENAMED Viewed

File without changes

docling 2.8.1__py3-none-any.whl → 2.8.3__py3-none-any.whl

docling 2.8.1__py3-none-any.whl → 2.8.3__py3-none-any.whl