PyPI - docling - Versions diffs - 2.46.0__tar.gz → 2.47.0__tar.gz - Mend

docling 2.46.0__tar.gz → 2.47.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (142) hide show

{docling-2.46.0 → docling-2.47.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.46.0
+Version: 2.47.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -59,6 +59,7 @@ Provides-Extra: vlm
 Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
 Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
 Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
+Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux") and extra == "vlm"
 Provides-Extra: rapidocr
 Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
 Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"

{docling-2.46.0 → docling-2.47.0}/docling/backend/html_backend.py RENAMED Viewed

@@ -20,7 +20,7 @@ from docling_core.types.doc import (
     TableData,
     TextItem,
 )
-from docling_core.types.doc.document import ContentLayer
+from docling_core.types.doc.document import ContentLayer, Formatting, Script
 from pydantic import AnyUrl, BaseModel, ValidationError
 from typing_extensions import override
@@ -54,6 +54,21 @@ _BLOCK_TAGS: Final = {
     "table",
 }
+_FORMAT_TAG_MAP: Final = {
+    "b": {"bold": True},
+    "strong": {"bold": True},
+    "i": {"italic": True},
+    "em": {"italic": True},
+    # "mark",
+    # "small",
+    "s": {"strikethrough": True},
+    "del": {"strikethrough": True},
+    "u": {"underline": True},
+    "ins": {"underline": True},
+    "sub": {"script": Script.SUB},
+    "sup": {"script": Script.SUPER},
+}
 class _Context(BaseModel):
     list_ordered_flag_by_ref: dict[str, bool] = {}
@@ -63,23 +78,34 @@ class _Context(BaseModel):
 class AnnotatedText(BaseModel):
     text: str
     hyperlink: Union[AnyUrl, Path, None] = None
+    formatting: Union[Formatting, None] = None
 class AnnotatedTextList(list):
     def to_single_text_element(self) -> AnnotatedText:
         current_h = None
         current_text = ""
+        current_f = None
         for at in self:
             t = at.text
             h = at.hyperlink
+            f = at.formatting
             current_text += t.strip() + " "
+            if f is not None and current_f is None:
+                current_f = f
+            elif f is not None and current_f is not None and f != current_f:
+                _log.warning(
+                    f"Clashing formatting: '{f}' and '{current_f}'! Chose '{current_f}'"
+                )
             if h is not None and current_h is None:
                 current_h = h
             elif h is not None and current_h is not None and h != current_h:
                 _log.warning(
                     f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
                 )
-        return AnnotatedText(text=current_text.strip(), hyperlink=current_h)
+        return AnnotatedText(
+            text=current_text.strip(), hyperlink=current_h, formatting=current_f
+        )
     def simplify_text_elements(self) -> "AnnotatedTextList":
         simplified = AnnotatedTextList()
@@ -87,21 +113,27 @@ class AnnotatedTextList(list):
             return self
         text = self[0].text
         hyperlink = self[0].hyperlink
+        formatting = self[0].formatting
         last_elm = text
         for i in range(1, len(self)):
-            if hyperlink == self[i].hyperlink:
+            if hyperlink == self[i].hyperlink and formatting == self[i].formatting:
                 sep = " "
                 if not self[i].text.strip() or not last_elm.strip():
                     sep = ""
                 text += sep + self[i].text
                 last_elm = self[i].text
             else:
-                simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
+                simplified.append(
+                    AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
+                )
                 text = self[i].text
                 last_elm = text
                 hyperlink = self[i].hyperlink
+                formatting = self[i].formatting
         if text:
-            simplified.append(AnnotatedText(text=text, hyperlink=hyperlink))
+            simplified.append(
+                AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
+            )
         return simplified
     def split_by_newline(self):
@@ -144,6 +176,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self.parents[i] = None
         self.hyperlink = None
         self.original_url = original_url
+        self.format_tags: list[str] = []
         try:
             raw = (
@@ -254,6 +287,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                                 label=DocItemLabel.TEXT,
                                 text=seg_clean,
                                 content_layer=self.content_layer,
+                                formatting=annotated_text.formatting,
                                 hyperlink=annotated_text.hyperlink,
                             )
@@ -263,6 +297,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 if name == "img":
                     flush_buffer()
                     self._emit_image(node, doc)
+                elif name in _FORMAT_TAG_MAP:
+                    with self.use_format([name]):
+                        self._walk(node, doc)
                 elif name == "a":
                     with self.use_hyperlink(node):
                         self._walk(node, doc)
@@ -292,6 +329,27 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         flush_buffer()
+    @staticmethod
+    def _collect_parent_format_tags(item: PageElement) -> list[str]:
+        tags = []
+        for format_tag in _FORMAT_TAG_MAP:
+            this_parent = item.parent
+            while this_parent is not None:
+                if this_parent.name == format_tag:
+                    tags.append(format_tag)
+                    break
+                this_parent = this_parent.parent
+        return tags
+    @property
+    def _formatting(self):
+        kwargs = {}
+        for t in self.format_tags:
+            kwargs.update(_FORMAT_TAG_MAP[t])
+        if not kwargs:
+            return None
+        return Formatting(**kwargs)
     def _extract_text_and_hyperlink_recursively(
         self,
         item: PageElement,
@@ -302,15 +360,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         result: AnnotatedTextList = AnnotatedTextList()
         # If find_parent_annotation, make sure that we keep track of
-        # any a-tag that has been present in the DOM-parents already.
+        # any a- or formatting-tag that has been present in the
+        # DOM-parents already.
         if find_parent_annotation:
+            format_tags = self._collect_parent_format_tags(item)
             this_parent = item.parent
             while this_parent is not None:
                 if this_parent.name == "a" and this_parent.get("href"):
-                    with self.use_hyperlink(this_parent):
-                        return self._extract_text_and_hyperlink_recursively(
-                            item, ignore_list
-                        )
+                    with self.use_format(format_tags):
+                        with self.use_hyperlink(this_parent):
+                            return self._extract_text_and_hyperlink_recursively(
+                                item, ignore_list
+                            )
                 this_parent = this_parent.parent
         if isinstance(item, PreformattedString):
@@ -320,18 +381,37 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             text = item.strip()
             if text:
                 return AnnotatedTextList(
-                    [AnnotatedText(text=text, hyperlink=self.hyperlink)]
+                    [
+                        AnnotatedText(
+                            text=text,
+                            hyperlink=self.hyperlink,
+                            formatting=self._formatting,
+                        )
+                    ]
                 )
             if keep_newlines and item.strip("\n\r") == "":
                 return AnnotatedTextList(
-                    [AnnotatedText(text="\n", hyperlink=self.hyperlink)]
+                    [
+                        AnnotatedText(
+                            text="\n",
+                            hyperlink=self.hyperlink,
+                            formatting=self._formatting,
+                        )
+                    ]
                 )
             return AnnotatedTextList()
         tag = cast(Tag, item)
         if not ignore_list or (tag.name not in ["ul", "ol"]):
             for child in tag:
-                if isinstance(child, Tag) and child.name == "a":
+                if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
+                    with self.use_format([child.name]):
+                        result.extend(
+                            self._extract_text_and_hyperlink_recursively(
+                                child, ignore_list, keep_newlines=keep_newlines
+                            )
+                        )
+                elif isinstance(child, Tag) and child.name == "a":
                     with self.use_hyperlink(child):
                         result.extend(
                             self._extract_text_and_hyperlink_recursively(
@@ -369,6 +449,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 if this_href:
                     self.hyperlink = old_hyperlink
+    @contextmanager
+    def use_format(self, tags: list[str]):
+        if not tags:
+            yield None
+        else:
+            self.format_tags.extend(tags)
+            try:
+                yield None
+            finally:
+                self.format_tags = self.format_tags[: -len(tags)]
     @contextmanager
     def use_inline_group(
         self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
@@ -420,6 +511,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self.parents[self.level + 1] = doc.add_title(
                 text_clean,
                 content_layer=self.content_layer,
+                formatting=annotated_text.formatting,
                 hyperlink=annotated_text.hyperlink,
             )
         # the other levels need to be lowered by 1 if a title was set
@@ -449,6 +541,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 orig=annotated_text.text,
                 level=self.level,
                 content_layer=self.content_layer,
+                formatting=annotated_text.formatting,
                 hyperlink=annotated_text.hyperlink,
             )
         self.level += 1
@@ -529,6 +622,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                                     label=DocItemLabel.TEXT,
                                     text=li_clean,
                                     content_layer=self.content_layer,
+                                    formatting=annotated_text.formatting,
                                     hyperlink=annotated_text.hyperlink,
                                 )
@@ -551,6 +645,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                             orig=li_text,
                             parent=list_group,
                             content_layer=self.content_layer,
+                            formatting=annotated_text.formatting,
                             hyperlink=annotated_text.hyperlink,
                         )
@@ -603,6 +698,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                                 label=DocItemLabel.TEXT,
                                 text=seg_clean,
                                 content_layer=self.content_layer,
+                                formatting=annotated_text.formatting,
                                 hyperlink=annotated_text.hyperlink,
                             )
@@ -637,6 +733,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                         parent=self.parents[self.level],
                         text=text_clean,
                         content_layer=self.content_layer,
+                        formatting=annotated_text.formatting,
                         hyperlink=annotated_text.hyperlink,
                     )
@@ -696,6 +793,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 text=text_clean,
                 orig=caption_anno_text.text,
                 content_layer=self.content_layer,
+                formatting=caption_anno_text.formatting,
                 hyperlink=caption_anno_text.hyperlink,
             )

{docling-2.46.0 → docling-2.47.0}/docling/backend/msword_backend.py RENAMED Viewed

@@ -67,6 +67,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         self.level = 0
         self.listIter = 0
+        # Track list counters per numId and ilvl
+        self.list_counters: dict[tuple[int, int], int] = {}
         self.history: dict[str, Any] = {
             "names": [None],
@@ -315,6 +317,108 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         return None, None  # If the paragraph is not part of a list
+    def _get_list_counter(self, numid: int, ilvl: int) -> int:
+        """Get and increment the counter for a specific numId and ilvl combination."""
+        key = (numid, ilvl)
+        if key not in self.list_counters:
+            self.list_counters[key] = 0
+        self.list_counters[key] += 1
+        return self.list_counters[key]
+    def _reset_list_counters_for_new_sequence(self, numid: int):
+        """Reset counters when starting a new numbering sequence."""
+        # Reset all counters for this numid
+        keys_to_reset = [key for key in self.list_counters.keys() if key[0] == numid]
+        for key in keys_to_reset:
+            self.list_counters[key] = 0
+    def _is_numbered_list(self, docx_obj: DocxDocument, numId: int, ilvl: int) -> bool:
+        """Check if a list is numbered based on its numFmt value."""
+        try:
+            # Access the numbering part of the document
+            if not hasattr(docx_obj, "part") or not hasattr(docx_obj.part, "package"):
+                return False
+            numbering_part = None
+            # Find the numbering part
+            for part in docx_obj.part.package.parts:
+                if "numbering" in part.partname:
+                    numbering_part = part
+                    break
+            if numbering_part is None:
+                return False
+            # Parse the numbering XML
+            numbering_root = numbering_part.element
+            namespaces = {
+                "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+            }
+            # Find the numbering definition with the given numId
+            num_xpath = f".//w:num[@w:numId='{numId}']"
+            num_element = numbering_root.find(num_xpath, namespaces=namespaces)
+            if num_element is None:
+                return False
+            # Get the abstractNumId from the num element
+            abstract_num_id_elem = num_element.find(
+                ".//w:abstractNumId", namespaces=namespaces
+            )
+            if abstract_num_id_elem is None:
+                return False
+            abstract_num_id = abstract_num_id_elem.get(
+                "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
+            )
+            if abstract_num_id is None:
+                return False
+            # Find the abstract numbering definition
+            abstract_num_xpath = (
+                f".//w:abstractNum[@w:abstractNumId='{abstract_num_id}']"
+            )
+            abstract_num_element = numbering_root.find(
+                abstract_num_xpath, namespaces=namespaces
+            )
+            if abstract_num_element is None:
+                return False
+            # Find the level definition for the given ilvl
+            lvl_xpath = f".//w:lvl[@w:ilvl='{ilvl}']"
+            lvl_element = abstract_num_element.find(lvl_xpath, namespaces=namespaces)
+            if lvl_element is None:
+                return False
+            # Get the numFmt element
+            num_fmt_element = lvl_element.find(".//w:numFmt", namespaces=namespaces)
+            if num_fmt_element is None:
+                return False
+            num_fmt = num_fmt_element.get(
+                "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
+            )
+            # Numbered formats include: decimal, lowerRoman, upperRoman, lowerLetter, upperLetter
+            # Bullet formats include: bullet
+            numbered_formats = {
+                "decimal",
+                "lowerRoman",
+                "upperRoman",
+                "lowerLetter",
+                "upperLetter",
+                "decimalZero",
+            }
+            return num_fmt in numbered_formats
+        except Exception as e:
+            _log.debug(f"Error determining if list is numbered: {e}")
+            return False
     def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
         parts = self._split_text_and_number(style_label)
@@ -713,8 +817,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         # Common styles for bullet and numbered lists.
         # "List Bullet", "List Number", "List Paragraph"
         # Identify whether list is a numbered list or not
-        # is_numbered = "List Bullet" not in paragraph.style.name
-        is_numbered = False
         p_style_id, p_level = self._get_label_and_level(paragraph)
         numid, ilevel = self._get_numId_and_ilvl(paragraph)
@@ -727,6 +829,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             and ilevel is not None
             and p_style_id not in ["Title", "Heading"]
         ):
+            # Check if this is actually a numbered list by examining the numFmt
+            is_numbered = self._is_numbered_list(docx_obj, numid, ilevel)
             self._add_list_item(
                 doc=doc,
                 numid=numid,
@@ -983,15 +1088,19 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         if self._prev_numid() is None:  # Open new list
             self.level_at_new_list = level
+            # Reset counters for the new numbering sequence
+            self._reset_list_counters_for_new_sequence(numid)
             self.parents[level] = doc.add_list_group(
                 name="list", parent=self.parents[level - 1]
             )
             # Set marker and enumerated arguments if this is an enumeration element.
-            self.listIter += 1
             if is_numbered:
-                enum_marker = str(self.listIter) + "."
-                is_numbered = True
+                counter = self._get_list_counter(numid, ilevel)
+                enum_marker = str(counter) + "."
+            else:
+                enum_marker = ""
             self._add_formatted_list_item(
                 doc, elements, enum_marker, is_numbered, level
             )
@@ -1005,16 +1114,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 self.level_at_new_list + prev_indent + 1,
                 self.level_at_new_list + ilevel + 1,
             ):
-                self.listIter = 0
                 self.parents[i] = doc.add_list_group(
                     name="list", parent=self.parents[i - 1]
                 )
             # TODO: Set marker and enumerated arguments if this is an enumeration element.
-            self.listIter += 1
             if is_numbered:
-                enum_marker = str(self.listIter) + "."
-                is_numbered = True
+                counter = self._get_list_counter(numid, ilevel)
+                enum_marker = str(counter) + "."
+            else:
+                enum_marker = ""
             self._add_formatted_list_item(
                 doc,
                 elements,
@@ -1033,10 +1142,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     self.parents[k] = None
             # TODO: Set marker and enumerated arguments if this is an enumeration element.
-            self.listIter += 1
             if is_numbered:
-                enum_marker = str(self.listIter) + "."
-                is_numbered = True
+                counter = self._get_list_counter(numid, ilevel)
+                enum_marker = str(counter) + "."
+            else:
+                enum_marker = ""
             self._add_formatted_list_item(
                 doc,
                 elements,
@@ -1044,14 +1154,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 is_numbered,
                 self.level_at_new_list + ilevel,
             )
-            self.listIter = 0
         elif self._prev_numid() == numid or prev_indent == ilevel:
             # TODO: Set marker and enumerated arguments if this is an enumeration element.
-            self.listIter += 1
             if is_numbered:
-                enum_marker = str(self.listIter) + "."
-                is_numbered = True
+                counter = self._get_list_counter(numid, ilevel)
+                enum_marker = str(counter) + "."
+            else:
+                enum_marker = ""
             self._add_formatted_list_item(
                 doc, elements, enum_marker, is_numbered, level - 1
             )

{docling-2.46.0 → docling-2.47.0}/docling/cli/main.py RENAMED Viewed

@@ -60,10 +60,12 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.datamodel.vlm_model_specs import (
+    GOT2_TRANSFORMERS,
     GRANITE_VISION_OLLAMA,
     GRANITE_VISION_TRANSFORMERS,
     SMOLDOCLING_MLX,
     SMOLDOCLING_TRANSFORMERS,
+    SMOLDOCLING_VLLM,
     VlmModelType,
 )
 from docling.document_converter import (
@@ -477,6 +479,13 @@ def convert(  # noqa: C901
             "--logo", callback=logo_callback, is_eager=True, help="Docling logo"
         ),
     ] = None,
+    page_batch_size: Annotated[
+        int,
+        typer.Option(
+            "--page-batch-size",
+            help=f"Number of pages processed in one batch. Default: {settings.perf.page_batch_size}",
+        ),
+    ] = settings.perf.page_batch_size,
 ):
     log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"
@@ -491,6 +500,7 @@ def convert(  # noqa: C901
     settings.debug.visualize_layout = debug_visualize_layout
     settings.debug.visualize_tables = debug_visualize_tables
     settings.debug.visualize_ocr = debug_visualize_ocr
+    settings.perf.page_batch_size = page_batch_size
     if from_formats is None:
         from_formats = list(InputFormat)
@@ -631,6 +641,8 @@ def convert(  # noqa: C901
                 pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
             elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
                 pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
+            elif vlm_model == VlmModelType.GOT_OCR_2:
+                pipeline_options.vlm_options = GOT2_TRANSFORMERS
             elif vlm_model == VlmModelType.SMOLDOCLING:
                 pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
                 if sys.platform == "darwin":
@@ -643,6 +655,8 @@ def convert(  # noqa: C901
                             "To run SmolDocling faster, please install mlx-vlm:\n"
                             "pip install mlx-vlm"
                         )
+            elif vlm_model == VlmModelType.SMOLDOCLING_VLLM:
+                pipeline_options.vlm_options = SMOLDOCLING_VLLM
             pdf_format_option = PdfFormatOption(
                 pipeline_cls=VlmPipeline, pipeline_options=pipeline_options

{docling-2.46.0 → docling-2.47.0}/docling/cli/models.py RENAMED Viewed

@@ -9,6 +9,7 @@ from rich.console import Console
 from rich.logging import RichHandler
 from docling.datamodel.settings import settings
+from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.model_downloader import download_models
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -128,6 +129,61 @@ def download(
         )
+@app.command("download-hf-repo")
+def download_hf_repo(
+    models: Annotated[
+        list[str],
+        typer.Argument(
+            help="Specific models to download from HuggingFace identified by their repo id. For example: ds4sd/docling-models .",
+        ),
+    ],
+    output_dir: Annotated[
+        Path,
+        typer.Option(
+            ...,
+            "-o",
+            "--output-dir",
+            help="The directory where to download the models.",
+        ),
+    ] = (settings.cache_dir / "models"),
+    force: Annotated[
+        bool, typer.Option(..., help="If true, the download will be forced.")
+    ] = False,
+    quiet: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "-q",
+            "--quiet",
+            help="No extra output is generated, the CLI prints only the directory with the cached models.",
+        ),
+    ] = False,
+):
+    if not quiet:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="[blue]%(message)s[/blue]",
+            datefmt="[%X]",
+            handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
+        )
+    for item in models:
+        typer.secho(f"\nDownloading {item} model from HuggingFace...")
+        download_hf_model(
+            repo_id=item,
+            # would be better to reuse "repo_cache_folder" property: https://github.com/docling-project/docling/blob/main/docling/datamodel/pipeline_options_vlm_model.py#L76
+            # but creating options objects seams like an overkill
+            local_dir=output_dir / item.replace("/", "--"),
+            force=force,
+            progress=(not quiet),
+        )
+    if quiet:
+        typer.echo(output_dir)
+    else:
+        typer.secho(f"\nModels downloaded into: {output_dir}.", fg="green")
 click_app = typer.main.get_command(app)
 if __name__ == "__main__":

{docling-2.46.0 → docling-2.47.0}/docling/datamodel/base_models.py RENAMED Viewed

@@ -1,7 +1,7 @@
 import math
 from collections import defaultdict
 from enum import Enum
-from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Union
 import numpy as np
 from docling_core.types.doc import (

{docling-2.46.0 → docling-2.47.0}/docling/datamodel/pipeline_options.py RENAMED Viewed

@@ -282,6 +282,9 @@ class LayoutOptions(BaseModel):
     keep_empty_clusters: bool = (
         False  # Whether to keep clusters that contain no text cells
     )
+    skip_cell_assignment: bool = (
+        False  # Skip cell-to-cluster assignment for VLM-only processing
+    )
     model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2

{docling-2.46.0 → docling-2.47.0}/docling/datamodel/pipeline_options_vlm_model.py RENAMED Viewed

@@ -26,11 +26,14 @@ class ResponseFormat(str, Enum):
     DOCTAGS = "doctags"
     MARKDOWN = "markdown"
     HTML = "html"
+    OTSL = "otsl"
+    PLAINTEXT = "plaintext"
 class InferenceFramework(str, Enum):
     MLX = "mlx"
     TRANSFORMERS = "transformers"
+    VLLM = "vllm"
 class TransformersModelType(str, Enum):
@@ -43,6 +46,7 @@ class TransformersModelType(str, Enum):
 class TransformersPromptStyle(str, Enum):
     CHAT = "chat"
     RAW = "raw"
+    NONE = "none"
 class InlineVlmOptions(BaseVlmOptions):
@@ -68,6 +72,7 @@ class InlineVlmOptions(BaseVlmOptions):
     stop_strings: List[str] = []
     extra_generation_config: Dict[str, Any] = {}
+    extra_processor_kwargs: Dict[str, Any] = {}
     use_kv_cache: bool = True
     max_new_tokens: int = 4096

docling 2.46.0__tar.gz → 2.47.0__tar.gz

docling 2.46.0__tar.gz → 2.47.0__tar.gz