PyPI - docling - Versions diffs - 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl - Mend

docling 1.19.1py3-none-any.whl → 2.4.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

docling/backend/abstract_backend.py +33 -37
docling/backend/asciidoc_backend.py +431 -0
docling/backend/docling_parse_backend.py +20 -16
docling/backend/docling_parse_v2_backend.py +248 -0
docling/backend/html_backend.py +429 -0
docling/backend/md_backend.py +346 -0
docling/backend/mspowerpoint_backend.py +398 -0
docling/backend/msword_backend.py +496 -0
docling/backend/pdf_backend.py +78 -0
docling/backend/pypdfium2_backend.py +16 -11
docling/cli/main.py +96 -65
docling/datamodel/base_models.py +79 -193
docling/datamodel/document.py +405 -320
docling/datamodel/pipeline_options.py +19 -3
docling/datamodel/settings.py +16 -1
docling/document_converter.py +240 -251
docling/models/base_model.py +28 -0
docling/models/base_ocr_model.py +40 -10
docling/models/ds_glm_model.py +244 -30
docling/models/easyocr_model.py +57 -42
docling/models/layout_model.py +158 -116
docling/models/page_assemble_model.py +127 -101
docling/models/page_preprocessing_model.py +79 -0
docling/models/table_structure_model.py +162 -116
docling/models/tesseract_ocr_cli_model.py +76 -59
docling/models/tesseract_ocr_model.py +90 -58
docling/pipeline/base_pipeline.py +189 -0
docling/pipeline/simple_pipeline.py +56 -0
docling/pipeline/standard_pdf_pipeline.py +201 -0
docling/utils/export.py +4 -3
docling/utils/layout_utils.py +17 -11
docling/utils/profiling.py +62 -0
docling-2.4.1.dist-info/METADATA +154 -0
docling-2.4.1.dist-info/RECORD +45 -0
docling/pipeline/base_model_pipeline.py +0 -18
docling/pipeline/standard_model_pipeline.py +0 -66
docling-1.19.1.dist-info/METADATA +0 -380
docling-1.19.1.dist-info/RECORD +0 -34
{docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
{docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
{docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0

docling/datamodel/pipeline_options.py CHANGED Viewed

@@ -1,12 +1,13 @@
-from enum import Enum, auto
+from enum import Enum
+from pathlib import Path
 from typing import List, Literal, Optional, Union
 from pydantic import BaseModel, ConfigDict, Field
 class TableFormerMode(str, Enum):
-    FAST = auto()
-    ACCURATE = auto()
+    FAST = "fast"
+    ACCURATE = "accurate"
 class TableStructureOptions(BaseModel):
@@ -21,6 +22,9 @@ class TableStructureOptions(BaseModel):
 class OcrOptions(BaseModel):
     kind: str
+    bitmap_area_threshold: float = (
+        0.05  # percentage of the area for a bitmap to be processed with OCR
+    )
 class EasyOcrOptions(OcrOptions):
@@ -58,6 +62,13 @@ class TesseractOcrOptions(OcrOptions):
 class PipelineOptions(BaseModel):
+    create_legacy_output: bool = (
+        True  # This default will be set to False in a future version of docling
+    )
+class PdfPipelineOptions(PipelineOptions):
+    artifacts_path: Optional[Union[Path, str]] = None
     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
@@ -65,3 +76,8 @@ class PipelineOptions(BaseModel):
     ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
         Field(EasyOcrOptions(), discriminator="kind")
     )
+    images_scale: float = 1.0
+    generate_page_images: bool = False
+    generate_picture_images: bool = False
+    generate_table_images: bool = False

docling/datamodel/settings.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import sys
+from pathlib import Path
 from pydantic import BaseModel
 from pydantic_settings import BaseSettings
@@ -14,6 +15,7 @@ class BatchConcurrencySettings(BaseModel):
     doc_batch_concurrency: int = 2
     page_batch_size: int = 4
     page_batch_concurrency: int = 2
+    elements_batch_size: int = 16
     # doc_batch_size: int = 1
     # doc_batch_concurrency: int = 1
@@ -25,8 +27,21 @@ class BatchConcurrencySettings(BaseModel):
     # To force models into single core: export OMP_NUM_THREADS=1
+class DebugSettings(BaseModel):
+    visualize_cells: bool = False
+    visualize_ocr: bool = False
+    visualize_layout: bool = False
+    visualize_tables: bool = False
+    profile_pipeline_timings: bool = False
+    # Path used to output debug information.
+    debug_output_path: str = str(Path.cwd() / "debug")
 class AppSettings(BaseSettings):
     perf: BatchConcurrencySettings
+    debug: DebugSettings
-settings = AppSettings(perf=BatchConcurrencySettings())
+settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())

docling/document_converter.py CHANGED Viewed

@@ -1,297 +1,286 @@
-import functools
 import logging
-import tempfile
+import sys
 import time
-import traceback
+from functools import partial
 from pathlib import Path
-from typing import Iterable, Optional, Type, Union
-import requests
-from PIL import ImageDraw
-from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
-from docling.backend.abstract_backend import PdfDocumentBackend
-from docling.datamodel.base_models import (
-    AssembledUnit,
-    AssembleOptions,
-    ConversionStatus,
-    DoclingComponentType,
-    ErrorItem,
-    Page,
-)
+from typing import Dict, Iterable, Iterator, List, Optional, Type
+from pydantic import BaseModel, ConfigDict, model_validator, validate_call
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.asciidoc_backend import AsciiDocBackend
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
+from docling.backend.md_backend import MarkdownDocumentBackend
+from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
+from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
 from docling.datamodel.document import (
     ConversionResult,
-    DocumentConversionInput,
     InputDocument,
+    _DocumentConversionInput,
 )
 from docling.datamodel.pipeline_options import PipelineOptions
-from docling.datamodel.settings import settings
-from docling.models.ds_glm_model import GlmModel
-from docling.models.page_assemble_model import PageAssembleModel
-from docling.pipeline.base_model_pipeline import BaseModelPipeline
-from docling.pipeline.standard_model_pipeline import StandardModelPipeline
-from docling.utils.utils import chunkify, create_hash
+from docling.datamodel.settings import DocumentLimits, settings
+from docling.pipeline.base_pipeline import BasePipeline
+from docling.pipeline.simple_pipeline import SimplePipeline
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+from docling.utils.utils import chunkify
 _log = logging.getLogger(__name__)
-class DocumentConverter:
-    _default_download_filename = "file.pdf"
+class FormatOption(BaseModel):
+    pipeline_cls: Type[BasePipeline]
+    pipeline_options: Optional[PipelineOptions] = None
+    backend: Type[AbstractDocumentBackend]
-    def __init__(
-        self,
-        artifacts_path: Optional[Union[Path, str]] = None,
-        pipeline_options: PipelineOptions = PipelineOptions(),
-        pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
-        pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
-        assemble_options: AssembleOptions = AssembleOptions(),
-    ):
-        if not artifacts_path:
-            artifacts_path = self.download_models_hf()
+    model_config = ConfigDict(arbitrary_types_allowed=True)
-        artifacts_path = Path(artifacts_path)
+    @model_validator(mode="after")
+    def set_optional_field_default(self) -> "FormatOption":
+        if self.pipeline_options is None:
+            self.pipeline_options = self.pipeline_cls.get_default_options()
+        return self
-        self.model_pipeline = pipeline_cls(
-            artifacts_path=artifacts_path, pipeline_options=pipeline_options
-        )
-        self.page_assemble_model = PageAssembleModel(config={})
-        self.glm_model = GlmModel(config={})
-        self.pdf_backend = pdf_backend
-        self.assemble_options = assemble_options
-    @staticmethod
-    def download_models_hf(
-        local_dir: Optional[Path] = None, force: bool = False
-    ) -> Path:
-        from huggingface_hub import snapshot_download
-        download_path = snapshot_download(
-            repo_id="ds4sd/docling-models",
-            force_download=force,
-            local_dir=local_dir,
-            revision="v2.0.0",
-        )
+class WordFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
-        return Path(download_path)
-    def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
+class PowerpointFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
-        for input_batch in chunkify(
-            input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
-        ):
-            _log.info(f"Going to convert document batch...")
-            # parallel processing only within input_batch
-            # with ThreadPoolExecutor(
-            #    max_workers=settings.perf.doc_batch_concurrency
-            # ) as pool:
-            #   yield from pool.map(self.process_document, input_batch)
-            # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
-            yield from map(self._process_document, input_batch)
-    def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
-        """Convert a single document.
-        Args:
-            source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
-        Raises:
-            ValueError: If source is of unexpected type.
-            RuntimeError: If conversion fails.
-        Returns:
-            ConversionResult: The conversion result object.
-        """
-        with tempfile.TemporaryDirectory() as temp_dir:
-            try:
-                http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
-                res = requests.get(http_url, stream=True)
-                res.raise_for_status()
-                fname = None
-                # try to get filename from response header
-                if cont_disp := res.headers.get("Content-Disposition"):
-                    for par in cont_disp.strip().split(";"):
-                        # currently only handling directive "filename" (not "*filename")
-                        if (split := par.split("=")) and split[0].strip() == "filename":
-                            fname = "=".join(split[1:]).strip().strip("'\"") or None
-                            break
-                # otherwise, use name from URL:
-                if fname is None:
-                    fname = Path(http_url.path).name or self._default_download_filename
-                local_path = Path(temp_dir) / fname
-                with open(local_path, "wb") as f:
-                    for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
-                        f.write(chunk)
-            except ValidationError:
-                try:
-                    local_path = TypeAdapter(Path).validate_python(source)
-                except ValidationError:
-                    raise ValueError(
-                        f"Unexpected file path type encountered: {type(source)}"
-                    )
-            conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
-            conv_res_iter = self.convert(conv_inp)
-            conv_res: ConversionResult = next(conv_res_iter)
-        if conv_res.status not in {
-            ConversionStatus.SUCCESS,
-            ConversionStatus.PARTIAL_SUCCESS,
-        }:
-            raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
-        return conv_res
+class MarkdownFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
-    def _process_document(self, in_doc: InputDocument) -> ConversionResult:
-        start_doc_time = time.time()
-        conv_res = ConversionResult(input=in_doc)
-        _log.info(f"Processing document {in_doc.file.name}")
+class AsciiDocFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = AsciiDocBackend
-        if not in_doc.valid:
-            conv_res.status = ConversionStatus.FAILURE
-            return conv_res
-        for i in range(0, in_doc.page_count):
-            conv_res.pages.append(Page(page_no=i))
+class HTMLFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
-        all_assembled_pages = []
-        try:
-            # Iterate batches of pages (page_batch_size) in the doc
-            for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
-                start_pb_time = time.time()
-                # Pipeline
+class PdfFormatOption(FormatOption):
+    pipeline_cls: Type = StandardPdfPipeline
+    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
-                # 1. Initialise the page resources
-                init_pages = map(
-                    functools.partial(self._initialize_page, in_doc), page_batch
-                )
-                # 2. Populate page image
-                pages_with_images = map(
-                    functools.partial(self._populate_page_images, in_doc), init_pages
-                )
+class ImageFormatOption(FormatOption):
+    pipeline_cls: Type = StandardPdfPipeline
+    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
-                # 3. Populate programmatic page cells
-                pages_with_cells = map(
-                    functools.partial(self._parse_page_cells, in_doc),
-                    pages_with_images,
-                )
-                # 4. Run pipeline stages
-                pipeline_pages = self.model_pipeline.apply(pages_with_cells)
+_format_to_default_options = {
+    InputFormat.DOCX: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
+    ),
+    InputFormat.PPTX: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
+    ),
+    InputFormat.MD: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
+    ),
+    InputFormat.ASCIIDOC: FormatOption(
+        pipeline_cls=SimplePipeline, backend=AsciiDocBackend
+    ),
+    InputFormat.HTML: FormatOption(
+        pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
+    ),
+    InputFormat.IMAGE: FormatOption(
+        pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+    ),
+    InputFormat.PDF: FormatOption(
+        pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+    ),
+}
-                # 5. Assemble page elements (per page)
-                assembled_pages = self.page_assemble_model(pipeline_pages)
-                # exhaust assembled_pages
-                for assembled_page in assembled_pages:
-                    # Free up mem resources before moving on with next batch
+class DocumentConverter:
+    _default_download_filename = "file"
-                    # Remove page images (can be disabled)
-                    if self.assemble_options.images_scale is None:
-                        assembled_page._image_cache = {}
+    def __init__(
+        self,
+        allowed_formats: Optional[List[InputFormat]] = None,
+        format_options: Optional[Dict[InputFormat, FormatOption]] = None,
+    ):
+        self.allowed_formats = allowed_formats
+        self.format_to_options = format_options
+        if self.allowed_formats is None:
+            # if self.format_to_options is not None:
+            #    self.allowed_formats = self.format_to_options.keys()
+            # else:
+            self.allowed_formats = [e for e in InputFormat]  # all formats
+        if self.format_to_options is None:
+            self.format_to_options = _format_to_default_options
+        else:
+            for f in self.allowed_formats:
+                if f not in self.format_to_options.keys():
+                    _log.debug(f"Requested format {f} will use default options.")
+                    self.format_to_options[f] = _format_to_default_options[f]
+            remove_keys = []
+            for f in self.format_to_options.keys():
+                if f not in self.allowed_formats:
+                    remove_keys.append(f)
+            for f in remove_keys:
+                self.format_to_options.pop(f)
+        self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
+    def initialize_pipeline(self, format: InputFormat):
+        """Initialize the conversion pipeline for the selected format."""
+        self._get_pipeline(doc_format=format)
+    @validate_call(config=ConfigDict(strict=True))
+    def convert(
+        self,
+        source: Path | str | DocumentStream,  # TODO review naming
+        raises_on_error: bool = True,
+        max_num_pages: int = sys.maxsize,
+        max_file_size: int = sys.maxsize,
+    ) -> ConversionResult:
+        all_res = self.convert_all(
+            source=[source],
+            raises_on_error=raises_on_error,
+            max_num_pages=max_num_pages,
+            max_file_size=max_file_size,
+        )
+        return next(all_res)
-                    # Unload backend
-                    assembled_page._backend.unload()
+    @validate_call(config=ConfigDict(strict=True))
+    def convert_all(
+        self,
+        source: Iterable[Path | str | DocumentStream],  # TODO review naming
+        raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
+        max_num_pages: int = sys.maxsize,
+        max_file_size: int = sys.maxsize,
+    ) -> Iterator[ConversionResult]:
+        limits = DocumentLimits(
+            max_num_pages=max_num_pages,
+            max_file_size=max_file_size,
+        )
+        conv_input = _DocumentConversionInput(
+            path_or_stream_iterator=source,
+            limit=limits,
+        )
+        conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
+        for conv_res in conv_res_iter:
+            if raises_on_error and conv_res.status not in {
+                ConversionStatus.SUCCESS,
+                ConversionStatus.PARTIAL_SUCCESS,
+            }:
+                raise RuntimeError(
+                    f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
+                )
+            else:
+                yield conv_res
-                    all_assembled_pages.append(assembled_page)
+    def _convert(
+        self, conv_input: _DocumentConversionInput, raises_on_error: bool
+    ) -> Iterator[ConversionResult]:
+        assert self.format_to_options is not None
-                end_pb_time = time.time() - start_pb_time
-                _log.info(f"Finished converting page batch time={end_pb_time:.3f}")
+        start_time = time.monotonic()
-            conv_res.pages = all_assembled_pages
-            self._assemble_doc(conv_res)
+        for input_batch in chunkify(
+            conv_input.docs(self.format_to_options),
+            settings.perf.doc_batch_size,  # pass format_options
+        ):
+            _log.info(f"Going to convert document batch...")
-            status = ConversionStatus.SUCCESS
-            for page in conv_res.pages:
-                if not page._backend.is_valid():
-                    conv_res.errors.append(
-                        ErrorItem(
-                            component_type=DoclingComponentType.PDF_BACKEND,
-                            module_name=type(page._backend).__name__,
-                            error_message=f"Page {page.page_no} failed to parse.",
-                        )
+            # parallel processing only within input_batch
+            # with ThreadPoolExecutor(
+            #    max_workers=settings.perf.doc_batch_concurrency
+            # ) as pool:
+            #   yield from pool.map(self.process_document, input_batch)
+            # Note: PDF backends are not thread-safe, thread pool usage was disabled.
+            for item in map(
+                partial(self._process_document, raises_on_error=raises_on_error),
+                input_batch,
+            ):
+                elapsed = time.monotonic() - start_time
+                start_time = time.monotonic()
+                if item is not None:
+                    _log.info(
+                        f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
                     )
-                    status = ConversionStatus.PARTIAL_SUCCESS
-            conv_res.status = status
-        except Exception as e:
-            conv_res.status = ConversionStatus.FAILURE
-            trace = "\n".join(traceback.format_exception(e))
-            _log.info(
-                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
-                f"{trace}"
+                    yield item
+                else:
+                    _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
+    def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
+        assert self.format_to_options is not None
+        fopt = self.format_to_options.get(doc_format)
+        if fopt is None:
+            raise RuntimeError(f"Could not get pipeline for {doc_format}")
+        else:
+            pipeline_class = fopt.pipeline_cls
+            pipeline_options = fopt.pipeline_options
+        assert pipeline_options is not None
+        # TODO this will ignore if different options have been defined for the same pipeline class.
+        if (
+            pipeline_class not in self.initialized_pipelines
+            or self.initialized_pipelines[pipeline_class].pipeline_options
+            != pipeline_options
+        ):
+            self.initialized_pipelines[pipeline_class] = pipeline_class(
+                pipeline_options=pipeline_options
             )
+        return self.initialized_pipelines[pipeline_class]
-        finally:
-            # Always unload the PDF backend, even in case of failure
-            if in_doc._backend:
-                in_doc._backend.unload()
+    def _process_document(
+        self, in_doc: InputDocument, raises_on_error: bool
+    ) -> Optional[ConversionResult]:
+        assert self.allowed_formats is not None
+        assert in_doc.format in self.allowed_formats
-        end_doc_time = time.time() - start_doc_time
-        _log.info(
-            f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
-        )
+        conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
         return conv_res
-    # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
-    def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
-        page._backend = doc._backend.load_page(page.page_no)
-        page.size = page._backend.get_size()
-        page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
-        return page
-    # Generate the page image and store it in the page object
-    def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
-        # default scale
-        page.get_image(
-            scale=1.0
-        )  # puts the page image on the image cache at default scale
-        # user requested scales
-        if self.assemble_options.images_scale is not None:
-            page._default_image_scale = self.assemble_options.images_scale
-            page.get_image(
-                scale=self.assemble_options.images_scale
-            )  # this will trigger storing the image in the internal cache
-        return page
-    # Extract and populate the page cells and store it in the page object
-    def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
-        page.cells = page._backend.get_text_cells()
-        # DEBUG code:
-        def draw_text_boxes(image, cells):
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
-                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
-            image.show()
-        # draw_text_boxes(page.get_image(scale=1.0), cells)
-        return page
-    def _assemble_doc(self, conv_res: ConversionResult):
-        all_elements = []
-        all_headers = []
-        all_body = []
-        for p in conv_res.pages:
-            for el in p.assembled.body:
-                all_body.append(el)
-            for el in p.assembled.headers:
-                all_headers.append(el)
-            for el in p.assembled.elements:
-                all_elements.append(el)
-        conv_res.assembled = AssembledUnit(
-            elements=all_elements, headers=all_headers, body=all_body
-        )
+    def _execute_pipeline(
+        self, in_doc: InputDocument, raises_on_error: bool
+    ) -> ConversionResult:
+        if in_doc.valid:
+            pipeline = self._get_pipeline(in_doc.format)
+            if pipeline is None:  # Can't find a default pipeline. Should this raise?
+                if raises_on_error:
+                    raise RuntimeError(
+                        f"No pipeline could be initialized for {in_doc.file}."
+                    )
+                else:
+                    conv_res = ConversionResult(input=in_doc)
+                    conv_res.status = ConversionStatus.FAILURE
+                    return conv_res
+            conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
-        conv_res.output = self.glm_model(conv_res)
+        else:
+            if raises_on_error:
+                raise RuntimeError(f"Input document {in_doc.file} is not valid.")
+            else:
+                # invalid doc or not of desired format
+                conv_res = ConversionResult(input=in_doc)
+                conv_res.status = ConversionStatus.FAILURE
+                # TODO add error log why it failed.
+        return conv_res

docling/models/base_model.py ADDED Viewed

@@ -0,0 +1,28 @@
+from abc import ABC, abstractmethod
+from typing import Any, Iterable
+from docling_core.types.doc import DoclingDocument, NodeItem
+from docling.datamodel.base_models import Page
+from docling.datamodel.document import ConversionResult
+class BasePageModel(ABC):
+    @abstractmethod
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        pass
+class BaseEnrichmentModel(ABC):
+    @abstractmethod
+    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
+        pass
+    @abstractmethod
+    def __call__(
+        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
+    ) -> Iterable[Any]:
+        pass

docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl

docling 1.19.1py3-none-any.whl → 2.4.1py3-none-any.whl