PyPI - docling - Versions diffs - 1.6.2__py3-none-any.whl → 1.19.0__py3-none-any.whl - Mend

docling 1.6.2__py3-none-any.whl → 1.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

docling/backend/abstract_backend.py +17 -8
docling/backend/docling_parse_backend.py +42 -26
docling/backend/pypdfium2_backend.py +33 -11
docling/cli/__init__.py +0 -0
docling/cli/main.py +253 -0
docling/datamodel/base_models.py +39 -27
docling/datamodel/document.py +115 -17
docling/datamodel/pipeline_options.py +67 -0
docling/document_converter.py +65 -44
docling/models/base_ocr_model.py +4 -4
docling/models/ds_glm_model.py +11 -7
docling/models/easyocr_model.py +19 -4
docling/models/layout_model.py +3 -3
docling/models/table_structure_model.py +18 -2
docling/models/tesseract_ocr_cli_model.py +167 -0
docling/models/tesseract_ocr_model.py +122 -0
docling/pipeline/base_model_pipeline.py +4 -3
docling/pipeline/standard_model_pipeline.py +36 -8
docling/utils/export.py +145 -0
{docling-1.6.2.dist-info → docling-1.19.0.dist-info}/LICENSE +1 -1
docling-1.19.0.dist-info/METADATA +380 -0
docling-1.19.0.dist-info/RECORD +34 -0
docling-1.19.0.dist-info/entry_points.txt +3 -0
docling-1.6.2.dist-info/METADATA +0 -192
docling-1.6.2.dist-info/RECORD +0 -27
{docling-1.6.2.dist-info → docling-1.19.0.dist-info}/WHEEL +0 -0

docling/datamodel/document.py CHANGED Viewed

@@ -4,14 +4,16 @@ from pathlib import Path, PurePath
 from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
 from docling_core.types import BaseCell, BaseText
-from docling_core.types import BoundingBox as DsBoundingBox
 from docling_core.types import Document as DsDocument
 from docling_core.types import DocumentDescription as DsDocumentDescription
 from docling_core.types import FileInfoObject as DsFileInfoObject
 from docling_core.types import PageDimensions, PageReference, Prov, Ref
 from docling_core.types import Table as DsSchemaTable
 from docling_core.types import TableCell
+from docling_core.types.doc.base import BoundingBox as DsBoundingBox
+from docling_core.types.doc.base import Figure
 from pydantic import BaseModel
+from typing_extensions import deprecated
 from docling.backend.abstract_backend import PdfDocumentBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
@@ -19,6 +21,7 @@ from docling.datamodel.base_models import (
     AssembledUnit,
     ConversionStatus,
     DocumentStream,
+    ErrorItem,
     FigureElement,
     Page,
     PageElement,
@@ -48,6 +51,15 @@ layout_label_to_ds_type = {
     "Text": "paragraph",
 }
+_EMPTY_DOC = DsDocument(
+    _name="",
+    description=DsDocumentDescription(logs=[]),
+    file_info=DsFileInfoObject(
+        filename="",
+        document_hash="",
+    ),
+)
 class InputDocument(BaseModel):
     file: PurePath = None
@@ -79,7 +91,9 @@ class InputDocument(BaseModel):
                     self.valid = False
                 else:
                     self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = pdf_backend(path_or_stream=path_or_stream)
+                    self._backend = pdf_backend(
+                        path_or_stream=path_or_stream, document_hash=self.document_hash
+                    )
             elif isinstance(path_or_stream, BytesIO):
                 self.file = PurePath(filename)
@@ -89,7 +103,9 @@ class InputDocument(BaseModel):
                     self.valid = False
                 else:
                     self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = pdf_backend(path_or_stream=path_or_stream)
+                    self._backend = pdf_backend(
+                        path_or_stream=path_or_stream, document_hash=self.document_hash
+                    )
             if self.document_hash and self._backend.page_count() > 0:
                 self.page_count = self._backend.page_count()
@@ -110,18 +126,19 @@ class InputDocument(BaseModel):
             # raise
+@deprecated("Use `ConversionResult` instead.")
 class ConvertedDocument(BaseModel):
     input: InputDocument
     status: ConversionStatus = ConversionStatus.PENDING  # failure, success
-    errors: List[Dict] = []  # structure to keep errors
+    errors: List[ErrorItem] = []  # structure to keep errors
     pages: List[Page] = []
-    assembled: Optional[AssembledUnit] = None
+    assembled: AssembledUnit = AssembledUnit()
-    output: Optional[DsDocument] = None
+    output: DsDocument = _EMPTY_DOC
-    def to_ds_document(self) -> DsDocument:
+    def _to_ds_document(self) -> DsDocument:
         title = ""
         desc = DsDocumentDescription(logs=[])
@@ -206,6 +223,8 @@ class ConvertedDocument(BaseModel):
                                 celltype = "col_header"
                             elif cell.row_header:
                                 celltype = "row_header"
+                            elif cell.row_section:
+                                celltype = "row_section"
                             def make_spans(cell):
                                 for rspan in range(
@@ -261,7 +280,7 @@ class ConvertedDocument(BaseModel):
                     ),
                 )
                 figures.append(
-                    BaseCell(
+                    Figure(
                         prov=[
                             Prov(
                                 bbox=target_bbox,
@@ -292,16 +311,91 @@ class ConvertedDocument(BaseModel):
         return ds_doc
     def render_as_dict(self):
-        if self.output:
-            return self.output.model_dump(by_alias=True, exclude_none=True)
-        else:
-            return {}
+        return self.output.model_dump(by_alias=True, exclude_none=True)
+    def render_as_markdown(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+            "figure",
+        ],
+        strict_text: bool = False,
+        image_placeholder: str = "<!-- image -->",
+    ):
+        return self.output.export_to_markdown(
+            delim=delim,
+            main_text_start=main_text_start,
+            main_text_stop=main_text_stop,
+            main_text_labels=main_text_labels,
+            strict_text=strict_text,
+            image_placeholder=image_placeholder,
+        )
-    def render_as_markdown(self):
-        if self.output:
-            return self.output.export_to_markdown()
-        else:
-            return ""
+    def render_as_text(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+        ],
+    ):
+        return self.output.export_to_markdown(
+            delim=delim,
+            main_text_start=main_text_start,
+            main_text_stop=main_text_stop,
+            main_text_labels=main_text_labels,
+            strict_text=True,
+        )
+    def render_as_doctags(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+            "figure",
+        ],
+        xsize: int = 100,
+        ysize: int = 100,
+        add_location: bool = True,
+        add_content: bool = True,
+        add_page_index: bool = True,
+        # table specific flags
+        add_table_cell_location: bool = False,
+        add_table_cell_label: bool = True,
+        add_table_cell_text: bool = True,
+    ) -> str:
+        return self.output.export_to_document_tokens(
+            delim=delim,
+            main_text_start=main_text_start,
+            main_text_stop=main_text_stop,
+            main_text_labels=main_text_labels,
+            xsize=xsize,
+            ysize=ysize,
+            add_location=add_location,
+            add_content=add_content,
+            add_page_index=add_page_index,
+            # table specific flags
+            add_table_cell_location=add_table_cell_location,
+            add_table_cell_label=add_table_cell_label,
+            add_table_cell_text=add_table_cell_text,
+        )
     def render_element_images(
         self, element_types: Tuple[PageElement] = (FigureElement,)
@@ -318,6 +412,10 @@ class ConvertedDocument(BaseModel):
                 yield element, cropped_im
+class ConversionResult(ConvertedDocument):
+    pass
 class DocumentConversionInput(BaseModel):
     _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None

docling/datamodel/pipeline_options.py ADDED Viewed

@@ -0,0 +1,67 @@
+from enum import Enum, auto
+from typing import List, Literal, Optional, Union
+from pydantic import BaseModel, ConfigDict, Field
+class TableFormerMode(str, Enum):
+    FAST = auto()
+    ACCURATE = auto()
+class TableStructureOptions(BaseModel):
+    do_cell_matching: bool = (
+        True
+        # True:  Matches predictions back to PDF cells. Can break table output if PDF cells
+        #        are merged across table columns.
+        # False: Let table structure model define the text cells, ignore PDF cells.
+    )
+    mode: TableFormerMode = TableFormerMode.FAST
+class OcrOptions(BaseModel):
+    kind: str
+class EasyOcrOptions(OcrOptions):
+    kind: Literal["easyocr"] = "easyocr"
+    lang: List[str] = ["fr", "de", "es", "en"]
+    use_gpu: bool = True  # same default as easyocr.Reader
+    model_storage_directory: Optional[str] = None
+    download_enabled: bool = True  # same default as easyocr.Reader
+    model_config = ConfigDict(
+        extra="forbid",
+        protected_namespaces=(),
+    )
+class TesseractCliOcrOptions(OcrOptions):
+    kind: Literal["tesseract"] = "tesseract"
+    lang: List[str] = ["fra", "deu", "spa", "eng"]
+    tesseract_cmd: str = "tesseract"
+    path: Optional[str] = None
+    model_config = ConfigDict(
+        extra="forbid",
+    )
+class TesseractOcrOptions(OcrOptions):
+    kind: Literal["tesserocr"] = "tesserocr"
+    lang: List[str] = ["fra", "deu", "spa", "eng"]
+    path: Optional[str] = None
+    model_config = ConfigDict(
+        extra="forbid",
+    )
+class PipelineOptions(BaseModel):
+    do_table_structure: bool = True  # True: perform table structure extraction
+    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
+    table_structure_options: TableStructureOptions = TableStructureOptions()
+    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
+        Field(EasyOcrOptions(), discriminator="kind")
+    )

docling/document_converter.py CHANGED Viewed

@@ -7,7 +7,6 @@ from pathlib import Path
 from typing import Iterable, Optional, Type, Union
 import requests
-from docling_core.types import Document
 from PIL import ImageDraw
 from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
@@ -16,14 +15,16 @@ from docling.datamodel.base_models import (
     AssembledUnit,
     AssembleOptions,
     ConversionStatus,
+    DoclingComponentType,
+    ErrorItem,
     Page,
-    PipelineOptions,
 )
 from docling.datamodel.document import (
-    ConvertedDocument,
+    ConversionResult,
     DocumentConversionInput,
     InputDocument,
 )
+from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import settings
 from docling.models.ds_glm_model import GlmModel
 from docling.models.page_assemble_model import PageAssembleModel
@@ -66,12 +67,15 @@ class DocumentConverter:
         from huggingface_hub import snapshot_download
         download_path = snapshot_download(
-            repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
+            repo_id="ds4sd/docling-models",
+            force_download=force,
+            local_dir=local_dir,
+            revision="v2.0.0",
         )
         return Path(download_path)
-    def convert(self, input: DocumentConversionInput) -> Iterable[ConvertedDocument]:
+    def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
         for input_batch in chunkify(
             input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
@@ -84,9 +88,9 @@ class DocumentConverter:
             #   yield from pool.map(self.process_document, input_batch)
             # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
-            yield from map(self.process_document, input_batch)
+            yield from map(self._process_document, input_batch)
-    def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
+    def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
         """Convert a single document.
         Args:
@@ -97,7 +101,7 @@ class DocumentConverter:
             RuntimeError: If conversion fails.
         Returns:
-            Document: The converted document object.
+            ConversionResult: The conversion result object.
         """
         with tempfile.TemporaryDirectory() as temp_dir:
             try:
@@ -127,51 +131,49 @@ class DocumentConverter:
                         f"Unexpected file path type encountered: {type(source)}"
                     )
             conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
-            converted_docs_iter = self.convert(conv_inp)
-            converted_doc: ConvertedDocument = next(converted_docs_iter)
-        if converted_doc.status not in {
+            conv_res_iter = self.convert(conv_inp)
+            conv_res: ConversionResult = next(conv_res_iter)
+        if conv_res.status not in {
             ConversionStatus.SUCCESS,
-            ConversionStatus.SUCCESS_WITH_ERRORS,
+            ConversionStatus.PARTIAL_SUCCESS,
         }:
-            raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
-        doc = converted_doc.to_ds_document()
-        return doc
+            raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
+        return conv_res
-    def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
+    def _process_document(self, in_doc: InputDocument) -> ConversionResult:
         start_doc_time = time.time()
-        converted_doc = ConvertedDocument(input=in_doc)
+        conv_res = ConversionResult(input=in_doc)
+        _log.info(f"Processing document {in_doc.file.name}")
         if not in_doc.valid:
-            converted_doc.status = ConversionStatus.FAILURE
-            return converted_doc
+            conv_res.status = ConversionStatus.FAILURE
+            return conv_res
         for i in range(0, in_doc.page_count):
-            converted_doc.pages.append(Page(page_no=i))
+            conv_res.pages.append(Page(page_no=i))
         all_assembled_pages = []
         try:
             # Iterate batches of pages (page_batch_size) in the doc
-            for page_batch in chunkify(
-                converted_doc.pages, settings.perf.page_batch_size
-            ):
+            for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
                 start_pb_time = time.time()
                 # Pipeline
                 # 1. Initialise the page resources
                 init_pages = map(
-                    functools.partial(self.initialize_page, in_doc), page_batch
+                    functools.partial(self._initialize_page, in_doc), page_batch
                 )
                 # 2. Populate page image
                 pages_with_images = map(
-                    functools.partial(self.populate_page_images, in_doc), init_pages
+                    functools.partial(self._populate_page_images, in_doc), init_pages
                 )
                 # 3. Populate programmatic page cells
                 pages_with_cells = map(
-                    functools.partial(self.parse_page_cells, in_doc),
+                    functools.partial(self._parse_page_cells, in_doc),
                     pages_with_images,
                 )
@@ -197,28 +199,45 @@ class DocumentConverter:
                 end_pb_time = time.time() - start_pb_time
                 _log.info(f"Finished converting page batch time={end_pb_time:.3f}")
-            # Free up mem resources of PDF backend
-            in_doc._backend.unload()
-            converted_doc.pages = all_assembled_pages
-            self.assemble_doc(converted_doc)
+            conv_res.pages = all_assembled_pages
+            self._assemble_doc(conv_res)
+            status = ConversionStatus.SUCCESS
+            for page in conv_res.pages:
+                if not page._backend.is_valid():
+                    conv_res.errors.append(
+                        ErrorItem(
+                            component_type=DoclingComponentType.PDF_BACKEND,
+                            module_name=type(page._backend).__name__,
+                            error_message=f"Page {page.page_no} failed to parse.",
+                        )
+                    )
+                    status = ConversionStatus.PARTIAL_SUCCESS
-            converted_doc.status = ConversionStatus.SUCCESS
+            conv_res.status = status
         except Exception as e:
-            converted_doc.status = ConversionStatus.FAILURE
+            conv_res.status = ConversionStatus.FAILURE
             trace = "\n".join(traceback.format_exception(e))
-            _log.info(f"Encountered an error during conversion: {trace}")
+            _log.info(
+                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
+                f"{trace}"
+            )
+        finally:
+            # Always unload the PDF backend, even in case of failure
+            if in_doc._backend:
+                in_doc._backend.unload()
         end_doc_time = time.time() - start_doc_time
         _log.info(
             f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
         )
-        return converted_doc
+        return conv_res
     # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
-    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
+    def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
         page._backend = doc._backend.load_page(page.page_no)
         page.size = page._backend.get_size()
         page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
@@ -226,9 +245,11 @@ class DocumentConverter:
         return page
     # Generate the page image and store it in the page object
-    def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
+    def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
         # default scale
-        page.get_image(scale=1.0)
+        page.get_image(
+            scale=1.0
+        )  # puts the page image on the image cache at default scale
         # user requested scales
         if self.assemble_options.images_scale is not None:
@@ -240,7 +261,7 @@ class DocumentConverter:
         return page
     # Extract and populate the page cells and store it in the page object
-    def parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
+    def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
         page.cells = page._backend.get_text_cells()
         # DEBUG code:
@@ -255,12 +276,12 @@ class DocumentConverter:
         return page
-    def assemble_doc(self, converted_doc: ConvertedDocument):
+    def _assemble_doc(self, conv_res: ConversionResult):
         all_elements = []
         all_headers = []
         all_body = []
-        for p in converted_doc.pages:
+        for p in conv_res.pages:
             for el in p.assembled.body:
                 all_body.append(el)
@@ -269,8 +290,8 @@ class DocumentConverter:
             for el in p.assembled.elements:
                 all_elements.append(el)
-        converted_doc.assembled = AssembledUnit(
+        conv_res.assembled = AssembledUnit(
             elements=all_elements, headers=all_headers, body=all_body
         )
-        converted_doc.output = self.glm_model(converted_doc)
+        conv_res.output = self.glm_model(conv_res)

docling/models/base_ocr_model.py CHANGED Viewed

@@ -3,21 +3,21 @@ import logging
 from abc import abstractmethod
 from typing import Iterable, List, Tuple
-import numpy
 import numpy as np
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import find_objects, label
 from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.pipeline_options import OcrOptions
 _log = logging.getLogger(__name__)
 class BaseOcrModel:
-    def __init__(self, config):
-        self.config = config
-        self.enabled = config["enabled"]
+    def __init__(self, enabled: bool, options: OcrOptions):
+        self.enabled = enabled
+        self.options = options
     # Computes the optimum amount and coordinates of rectangles to OCR on a given page
     def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:

docling/models/ds_glm_model.py CHANGED Viewed

@@ -2,7 +2,7 @@ import copy
 import random
 from deepsearch_glm.nlp_utils import init_nlp_model
-from deepsearch_glm.utils.ds_utils import to_legacy_document_format
+from deepsearch_glm.utils.doc_utils import to_legacy_document_format
 from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
 from docling_core.types import BaseText
 from docling_core.types import Document as DsDocument
@@ -10,18 +10,22 @@ from docling_core.types import Ref
 from PIL import ImageDraw
 from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
-from docling.datamodel.document import ConvertedDocument
+from docling.datamodel.document import ConversionResult
 class GlmModel:
     def __init__(self, config):
         self.config = config
+        self.model_names = self.config.get(
+            "model_names", ""
+        )  # "language;term;reference"
         load_pretrained_nlp_models()
-        model = init_nlp_model(model_names="language;term;reference")
+        # model = init_nlp_model(model_names="language;term;reference")
+        model = init_nlp_model(model_names=self.model_names)
         self.model = model
-    def __call__(self, document: ConvertedDocument) -> DsDocument:
-        ds_doc = document.to_ds_document()
+    def __call__(self, conv_res: ConversionResult) -> DsDocument:
+        ds_doc = conv_res._to_ds_document()
         ds_doc_dict = ds_doc.model_dump(by_alias=True)
         glm_doc = self.model.apply_on_doc(ds_doc_dict)
@@ -34,7 +38,7 @@ class GlmModel:
         # DEBUG code:
         def draw_clusters_and_cells(ds_document, page_no):
             clusters_to_draw = []
-            image = copy.deepcopy(document.pages[page_no].image)
+            image = copy.deepcopy(conv_res.pages[page_no].image)
             for ix, elem in enumerate(ds_document.main_text):
                 if isinstance(elem, BaseText):
                     prov = elem.prov[0]
@@ -56,7 +60,7 @@ class GlmModel:
                             bbox=BoundingBox.from_tuple(
                                 coord=prov.bbox,
                                 origin=CoordOrigin.BOTTOMLEFT,
-                            ).to_top_left_origin(document.pages[page_no].size.height),
+                            ).to_top_left_origin(conv_res.pages[page_no].size.height),
                         )
                     )

docling/models/easyocr_model.py CHANGED Viewed

@@ -4,21 +4,33 @@ from typing import Iterable
 import numpy
 from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.pipeline_options import EasyOcrOptions
 from docling.models.base_ocr_model import BaseOcrModel
 _log = logging.getLogger(__name__)
 class EasyOcrModel(BaseOcrModel):
-    def __init__(self, config):
-        super().__init__(config)
+    def __init__(self, enabled: bool, options: EasyOcrOptions):
+        super().__init__(enabled=enabled, options=options)
+        self.options: EasyOcrOptions
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
         if self.enabled:
-            import easyocr
+            try:
+                import easyocr
+            except ImportError:
+                raise ImportError(
+                    "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
+                    "Alternatively, Docling has support for other OCR engines. See the documentation."
+                )
-            self.reader = easyocr.Reader(config["lang"])
+            self.reader = easyocr.Reader(
+                lang_list=self.options.lang,
+                model_storage_directory=self.options.model_storage_directory,
+                download_enabled=self.options.download_enabled,
+            )
     def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
@@ -31,6 +43,9 @@ class EasyOcrModel(BaseOcrModel):
             all_ocr_cells = []
             for ocr_rect in ocr_rects:
+                # Skip zero area boxes
+                if ocr_rect.area() == 0:
+                    continue
                 high_res_image = page._backend.get_page_image(
                     scale=self.scale, cropbox=ocr_rect
                 )

docling/models/layout_model.py CHANGED Viewed

@@ -33,6 +33,7 @@ class LayoutModel:
         "Page-footer",
         "Code",
         "List-item",
+        # "Title"
         # "Formula",
     ]
     PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]
@@ -69,9 +70,7 @@ class LayoutModel:
             "Key-Value Region": 0.45,
         }
-        CLASS_REMAPPINGS = {
-            "Document Index": "Table",
-        }
+        CLASS_REMAPPINGS = {"Document Index": "Table", "Title": "Section-header"}
         _log.debug("================= Start postprocess function ====================")
         start_time = time.time()
@@ -277,6 +276,7 @@ class LayoutModel:
                     bbox=BoundingBox.model_validate(pred_item),
                     cells=[],
                 )
                 clusters.append(cluster)
             # Map cells to clusters

docling 1.6.2__py3-none-any.whl → 1.19.0__py3-none-any.whl

docling 1.6.2__py3-none-any.whl → 1.19.0__py3-none-any.whl