PyPI - docling - Versions diffs - 2.42.2__py3-none-any.whl → 2.44.0__py3-none-any.whl - Mend

docling 2.42.2__py3-none-any.whl → 2.44.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

docling/backend/html_backend.py +78 -18
docling/backend/md_backend.py +43 -11
docling/cli/main.py +6 -0
docling/datamodel/pipeline_options.py +15 -0
docling/datamodel/settings.py +7 -12
docling/document_converter.py +57 -17
docling/models/layout_model.py +84 -66
docling/models/vlm_models_inline/mlx_model.py +2 -2
docling/pipeline/threaded_standard_pdf_pipeline.py +605 -0
{docling-2.42.2.dist-info → docling-2.44.0.dist-info}/METADATA +4 -4
{docling-2.42.2.dist-info → docling-2.44.0.dist-info}/RECORD +15 -14
{docling-2.42.2.dist-info → docling-2.44.0.dist-info}/WHEEL +0 -0
{docling-2.42.2.dist-info → docling-2.44.0.dist-info}/entry_points.txt +0 -0
{docling-2.42.2.dist-info → docling-2.44.0.dist-info}/licenses/LICENSE +0 -0
{docling-2.42.2.dist-info → docling-2.44.0.dist-info}/top_level.txt +0 -0

docling/backend/html_backend.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import logging
 import re
-import traceback
 from io import BytesIO
 from pathlib import Path
 from typing import Final, Optional, Union, cast
@@ -126,8 +125,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         # set the title as furniture, since it is part of the document metadata
         title = self.soup.title
         if title:
+            title_text = title.get_text(separator=" ", strip=True)
+            title_clean = HTMLDocumentBackend._clean_unicode(title_text)
             doc.add_title(
-                text=title.get_text(separator=" ", strip=True),
+                text=title_clean,
+                orig=title_text,
                 content_layer=ContentLayer.FURNITURE,
             )
         # remove scripts/styles
@@ -144,11 +146,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         )
         # reset context
         self.ctx = _Context()
-        try:
-            self._walk(content, doc)
-        except Exception:
-            print(traceback.format_exc())
+        self._walk(content, doc)
         return doc
@@ -173,10 +171,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 return
             for part in text.split("\n"):
                 seg = part.strip()
+                seg_clean = HTMLDocumentBackend._clean_unicode(seg)
                 if seg:
                     doc.add_text(
-                        DocItemLabel.TEXT,
-                        seg,
+                        label=DocItemLabel.TEXT,
+                        text=seg_clean,
+                        orig=seg,
                         parent=self.parents[self.level],
                         content_layer=self.content_layer,
                     )
@@ -208,13 +208,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.content_layer = ContentLayer.BODY
         level = int(tag_name[1])
         text = tag.get_text(strip=True, separator=" ")
+        text_clean = HTMLDocumentBackend._clean_unicode(text)
         # the first level is for the title item
         if level == 1:
             for key in self.parents.keys():
                 self.parents[key] = None
             self.level = 0
             self.parents[self.level + 1] = doc.add_title(
-                text, content_layer=self.content_layer
+                text=text_clean, orig=text, content_layer=self.content_layer
             )
         # the other levels need to be lowered by 1 if a title was set
         else:
@@ -239,7 +240,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 self.level = level
             self.parents[self.level + 1] = doc.add_heading(
                 parent=self.parents[self.level],
-                text=text,
+                text=text_clean,
+                orig=text,
                 level=self.level,
                 content_layer=self.content_layer,
             )
@@ -301,13 +303,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                         if text_part:
                             parts.append(text_part)
                 li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
+                li_clean = HTMLDocumentBackend._clean_unicode(li_text)
                 # 3) add the list item
                 if li_text:
                     self.parents[self.level + 1] = doc.add_list_item(
-                        text=li_text,
+                        text=li_clean,
                         enumerated=is_ordered,
                         marker=marker,
+                        orig=li_text,
                         parent=list_group,
                         content_layer=self.content_layer,
                     )
@@ -349,11 +353,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         elif tag_name in {"p", "address", "summary"}:
             for part in tag.text.split("\n"):
                 seg = part.strip()
+                seg_clean = HTMLDocumentBackend._clean_unicode(seg)
                 if seg:
                     doc.add_text(
-                        parent=self.parents[self.level],
                         label=DocItemLabel.TEXT,
-                        text=seg,
+                        text=seg_clean,
+                        orig=seg,
+                        parent=self.parents[self.level],
                         content_layer=self.content_layer,
                     )
             for img_tag in tag("img"):
@@ -375,10 +381,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         elif tag_name in {"pre", "code"}:
             # handle monospace code snippets (pre).
             text = tag.get_text(strip=True)
+            text_clean = HTMLDocumentBackend._clean_unicode(text)
             if text:
                 doc.add_code(
                     parent=self.parents[self.level],
-                    text=text,
+                    text=text_clean,
+                    orig=text,
                     content_layer=self.content_layer,
                 )
@@ -407,8 +415,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         caption_item: Optional[TextItem] = None
         if caption:
+            caption_clean = HTMLDocumentBackend._clean_unicode(caption)
             caption_item = doc.add_text(
-                DocItemLabel.CAPTION, text=caption, content_layer=self.content_layer
+                label=DocItemLabel.CAPTION,
+                text=caption_clean,
+                orig=caption,
+                content_layer=self.content_layer,
             )
         doc.add_picture(
@@ -447,6 +459,46 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return "".join(parts)
+    @staticmethod
+    def _clean_unicode(text: str) -> str:
+        """Replace typical Unicode characters in HTML for text processing.
+        Several Unicode characters (e.g., non-printable or formatting) are typically
+        found in HTML but are worth replacing to sanitize text and ensure consistency
+        in text processing tasks.
+        Args:
+            text: The original text.
+        Returns:
+            The sanitized text without typical Unicode characters.
+        """
+        replacements = {
+            "\u00a0": " ",  # non-breaking space
+            "\u200b": "",  # zero-width space
+            "\u200c": "",  # zero-width non-joiner
+            "\u200d": "",  # zero-width joiner
+            "\u2010": "-",  # hyphen
+            "\u2011": "-",  # non-breaking hyphen
+            "\u2012": "-",  # dash
+            "\u2013": "-",  # dash
+            "\u2014": "-",  # dash
+            "\u2015": "-",  # horizontal bar
+            "\u2018": "'",  # left single quotation mark
+            "\u2019": "'",  # right single quotation mark
+            "\u201c": '"',  # left double quotation mark
+            "\u201d": '"',  # right double quotation mark
+            "\u2026": "...",  # ellipsis
+            "\u00ad": "",  # soft hyphen
+            "\ufeff": "",  # zero width non-break space
+            "\u202f": " ",  # narrow non-break space
+            "\u2060": "",  # word joiner
+        }
+        for raw, clean in replacements.items():
+            text = text.replace(raw, clean)
+        return text
     @staticmethod
     def _get_cell_spans(cell: Tag) -> tuple[int, int]:
         """Extract colspan and rowspan values from a table cell tag.
@@ -459,9 +511,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             str(cell.get("colspan", "1")),
             str(cell.get("rowspan", "1")),
         )
+        def _extract_num(s: str) -> int:
+            if s and s[0].isnumeric():
+                match = re.search(r"\d+", s)
+                if match:
+                    return int(match.group())
+            return 1
         int_spans: tuple[int, int] = (
-            int(raw_spans[0]) if raw_spans[0].isnumeric() else 1,
-            int(raw_spans[1]) if raw_spans[0].isnumeric() else 1,
+            _extract_num(raw_spans[0]),
+            _extract_num(raw_spans[1]),
         )
         return int_spans

docling/backend/md_backend.py CHANGED Viewed

@@ -5,7 +5,7 @@ from copy import deepcopy
 from enum import Enum
 from io import BytesIO
 from pathlib import Path
-from typing import List, Literal, Optional, Set, Union
+from typing import Literal, Optional, Union, cast
 import marko
 import marko.element
@@ -14,6 +14,7 @@ from docling_core.types.doc import (
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
+    ListItem,
     NodeItem,
     TableCell,
     TableData,
@@ -89,7 +90,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
-        _log.debug("MD INIT!!!")
+        _log.debug("Starting MarkdownDocumentBackend...")
         # Markdown file:
         self.path_or_stream = path_or_stream
@@ -131,7 +132,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             for md_table_row in self.md_table_buffer:
                 _log.debug(md_table_row)
             _log.debug("=== TABLE END ===")
-            tcells: List[TableCell] = []
+            tcells: list[TableCell] = []
             result_table = []
             for n, md_table_row in enumerate(self.md_table_buffer):
                 data = []
@@ -232,11 +233,12 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         element: marko.element.Element,
         depth: int,
         doc: DoclingDocument,
-        visited: Set[marko.element.Element],
+        visited: set[marko.element.Element],
         creation_stack: list[
             _CreationPayload
         ],  # stack for lazy item creation triggered deep in marko's AST (on RawText)
         list_ordered_flag_by_ref: dict[str, bool],
+        list_last_item_by_ref: dict[str, ListItem],
         parent_item: Optional[NodeItem] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -279,7 +281,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         elif (
             isinstance(element, marko.block.ListItem)
-            and len(element.children) == 1
+            and len(element.children) > 0
             and isinstance((child := element.children[0]), marko.block.Paragraph)
             and len(child.children) > 0
         ):
@@ -291,7 +293,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 if parent_item
                 else False
             )
-            if len(child.children) > 1:  # inline group will be created further down
+            non_list_children: list[marko.element.Element] = [
+                item
+                for item in child.children
+                if not isinstance(item, marko.block.ListItem)
+            ]
+            if len(non_list_children) > 1:  # inline group will be created further down
+                parent_ref: Optional[str] = (
+                    parent_item.self_ref if parent_item else None
+                )
                 parent_item = self._create_list_item(
                     doc=doc,
                     parent_item=parent_item,
@@ -300,6 +310,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     formatting=formatting,
                     hyperlink=hyperlink,
                 )
+                if parent_ref:
+                    list_last_item_by_ref[parent_ref] = cast(ListItem, parent_item)
             else:
                 creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
@@ -334,9 +346,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 element.dest
             )
-        elif isinstance(element, marko.inline.RawText):
-            _log.debug(f" - Paragraph (raw text): {element.children}")
-            snippet_text = element.children.strip()
+        elif isinstance(element, (marko.inline.RawText, marko.inline.Literal)):
+            _log.debug(f" - RawText/Literal: {element.children}")
+            snippet_text = (
+                element.children.strip() if isinstance(element.children, str) else ""
+            )
             # Detect start of the table:
             if "|" in snippet_text or self.in_table:
                 # most likely part of the markdown table
@@ -359,6 +373,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                                 if parent_item
                                 else False
                             )
+                            parent_ref = parent_item.self_ref if parent_item else None
                             parent_item = self._create_list_item(
                                 doc=doc,
                                 parent_item=parent_item,
@@ -367,6 +382,11 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                                 formatting=formatting,
                                 hyperlink=hyperlink,
                             )
+                            if parent_ref:
+                                list_last_item_by_ref[parent_ref] = cast(
+                                    ListItem, parent_item
+                                )
                         elif isinstance(to_create, _HeadingCreationPayload):
                             # not keeping as parent_item as logic for correctly tracking
                             # that not implemented yet (section components not captured
@@ -458,6 +478,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             element, processed_block_types
         ):
             for child in element.children:
+                if (
+                    isinstance(element, marko.block.ListItem)
+                    and isinstance(child, marko.block.List)
+                    and parent_item
+                    and list_last_item_by_ref.get(parent_item.self_ref, None)
+                ):
+                    _log.debug(
+                        f"walking into new List hanging from item of parent list {parent_item.self_ref}"
+                    )
+                    parent_item = list_last_item_by_ref[parent_item.self_ref]
                 self._iterate_elements(
                     element=child,
                     depth=depth + 1,
@@ -465,6 +496,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     visited=visited,
                     creation_stack=creation_stack,
                     list_ordered_flag_by_ref=list_ordered_flag_by_ref,
+                    list_last_item_by_ref=list_last_item_by_ref,
                     parent_item=parent_item,
                     formatting=formatting,
                     hyperlink=hyperlink,
@@ -483,7 +515,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         return False
     @classmethod
-    def supported_formats(cls) -> Set[InputFormat]:
+    def supported_formats(cls) -> set[InputFormat]:
         return {InputFormat.MD}
     def convert(self) -> DoclingDocument:
@@ -510,6 +542,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 visited=set(),
                 creation_stack=[],
                 list_ordered_flag_by_ref={},
+                list_last_item_by_ref={},
             )
             self._close_table(doc=doc)  # handle any last hanging table
@@ -534,7 +567,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 ]:
                     html_str = _restore_original_html(txt=html_str, regex=regex)
                 self._html_blocks = 0
                 # delegate to HTML backend
                 stream = BytesIO(bytes(html_str, encoding="utf-8"))
                 in_doc = InputDocument(

docling/cli/main.py CHANGED Viewed

@@ -262,6 +262,12 @@ def export_documents(
         else:
             _log.warning(f"Document {conv_res.input.file} failed to convert.")
+            if _log.isEnabledFor(logging.INFO):
+                for err in conv_res.errors:
+                    _log.info(
+                        f"  [Failure Detail] Component: {err.component_type}, "
+                        f"Module: {err.module_name}, Message: {err.error_message}"
+                    )
             failure_count += 1
     _log.info(

docling/datamodel/pipeline_options.py CHANGED Viewed

@@ -332,3 +332,18 @@ class ProcessingPipeline(str, Enum):
     STANDARD = "standard"
     VLM = "vlm"
     ASR = "asr"
+class ThreadedPdfPipelineOptions(PdfPipelineOptions):
+    """Pipeline options for the threaded PDF pipeline with batching and backpressure control"""
+    # Batch sizes for different stages
+    ocr_batch_size: int = 4
+    layout_batch_size: int = 4
+    table_batch_size: int = 4
+    # Timing control
+    batch_timeout_seconds: float = 2.0
+    # Backpressure and queue control
+    queue_max_size: int = 100

docling/datamodel/settings.py CHANGED Viewed

@@ -26,18 +26,13 @@ class DocumentLimits(BaseModel):
 class BatchConcurrencySettings(BaseModel):
-    doc_batch_size: int = 2
-    doc_batch_concurrency: int = 2
-    page_batch_size: int = 4
-    page_batch_concurrency: int = 2
-    elements_batch_size: int = 16
-    # doc_batch_size: int = 1
-    # doc_batch_concurrency: int = 1
-    # page_batch_size: int = 1
-    # page_batch_concurrency: int = 1
-    # model_concurrency: int = 2
+    doc_batch_size: int = 1  # Number of documents processed in one batch. Should be >= doc_batch_concurrency
+    doc_batch_concurrency: int = 1  # Number of parallel threads processing documents. Warning: Experimental! No benefit expected without free-threaded python.
+    page_batch_size: int = 4  # Number of pages processed in one batch.
+    page_batch_concurrency: int = 1  # Currently unused.
+    elements_batch_size: int = (
+        16  # Number of elements processed in one batch, in enrichment models.
+    )
     # To force models into single core: export OMP_NUM_THREADS=1

docling/document_converter.py CHANGED Viewed

@@ -4,7 +4,10 @@ import sys
 import threading
 import time
 from collections.abc import Iterable, Iterator
+from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
 from functools import partial
+from io import BytesIO
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Type, Union
@@ -274,6 +277,34 @@ class DocumentConverter:
                 "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
             )
+    @validate_call(config=ConfigDict(strict=True))
+    def convert_string(
+        self,
+        content: str,
+        format: InputFormat,
+        name: Optional[str],
+    ) -> ConversionResult:
+        name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        if format == InputFormat.MD:
+            if not name.endswith(".md"):
+                name += ".md"
+            buff = BytesIO(content.encode("utf-8"))
+            doc_stream = DocumentStream(name=name, stream=buff)
+            return self.convert(doc_stream)
+        elif format == InputFormat.HTML:
+            if not name.endswith(".html"):
+                name += ".html"
+            buff = BytesIO(content.encode("utf-8"))
+            doc_stream = DocumentStream(name=name, stream=buff)
+            return self.convert(doc_stream)
+        else:
+            raise ValueError(f"format {format} is not supported in `convert_string`")
     def _convert(
         self, conv_input: _DocumentConversionInput, raises_on_error: bool
     ) -> Iterator[ConversionResult]:
@@ -284,24 +315,33 @@ class DocumentConverter:
             settings.perf.doc_batch_size,  # pass format_options
         ):
             _log.info("Going to convert document batch...")
+            process_func = partial(
+                self._process_document, raises_on_error=raises_on_error
+            )
-            # parallel processing only within input_batch
-            # with ThreadPoolExecutor(
-            #    max_workers=settings.perf.doc_batch_concurrency
-            # ) as pool:
-            #   yield from pool.map(self.process_document, input_batch)
-            # Note: PDF backends are not thread-safe, thread pool usage was disabled.
-            for item in map(
-                partial(self._process_document, raises_on_error=raises_on_error),
-                input_batch,
+            if (
+                settings.perf.doc_batch_concurrency > 1
+                and settings.perf.doc_batch_size > 1
             ):
-                elapsed = time.monotonic() - start_time
-                start_time = time.monotonic()
-                _log.info(
-                    f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
-                )
-                yield item
+                with ThreadPoolExecutor(
+                    max_workers=settings.perf.doc_batch_concurrency
+                ) as pool:
+                    for item in pool.map(
+                        process_func,
+                        input_batch,
+                    ):
+                        yield item
+            else:
+                for item in map(
+                    process_func,
+                    input_batch,
+                ):
+                    elapsed = time.monotonic() - start_time
+                    start_time = time.monotonic()
+                    _log.info(
+                        f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
+                    )
+                    yield item
     def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
         """Retrieve or initialize a pipeline, reusing instances based on class and options."""
@@ -330,7 +370,7 @@ class DocumentConverter:
                     f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
                 )
-        return self.initialized_pipelines[cache_key]
+            return self.initialized_pipelines[cache_key]
     def _process_document(
         self, in_doc: InputDocument, raises_on_error: bool

docling 2.42.2__py3-none-any.whl → 2.44.0__py3-none-any.whl

docling 2.42.2__py3-none-any.whl → 2.44.0__py3-none-any.whl