PyPI - docling - Versions diffs - 2.47.0__tar.gz → 2.48.0__tar.gz - Mend

docling 2.47.0__tar.gz → 2.48.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (138) hide show

{docling-2.47.0 → docling-2.48.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.47.0
+Version: 2.48.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -59,10 +59,11 @@ Provides-Extra: vlm
 Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
 Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
 Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
-Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux") and extra == "vlm"
+Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64") and extra == "vlm"
 Provides-Extra: rapidocr
-Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
+Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
 Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
+Requires-Dist: modelscope>=1.29.0; extra == "rapidocr"
 Provides-Extra: asr
 Requires-Dist: openai-whisper>=20250625; extra == "asr"
 Dynamic: license-file

{docling-2.47.0 → docling-2.48.0}/docling/backend/html_backend.py RENAMED Viewed

@@ -1,5 +1,6 @@
 import logging
 import re
+import traceback
 from contextlib import contextmanager
 from copy import deepcopy
 from io import BytesIO
@@ -45,20 +46,22 @@ _BLOCK_TAGS: Final = {
     "h4",
     "h5",
     "h6",
+    "ol",
     "p",
     "pre",
-    "code",
-    "ul",
-    "ol",
     "summary",
     "table",
+    "ul",
 }
+_CODE_TAG_SET: Final = {"code", "kbd", "samp"}
 _FORMAT_TAG_MAP: Final = {
     "b": {"bold": True},
     "strong": {"bold": True},
     "i": {"italic": True},
     "em": {"italic": True},
+    "var": {"italic": True},
     # "mark",
     # "small",
     "s": {"strikethrough": True},
@@ -67,6 +70,7 @@ _FORMAT_TAG_MAP: Final = {
     "ins": {"underline": True},
     "sub": {"script": Script.SUB},
     "sup": {"script": Script.SUPER},
+    **{k: {} for k in _CODE_TAG_SET},
 }
@@ -79,6 +83,7 @@ class AnnotatedText(BaseModel):
     text: str
     hyperlink: Union[AnyUrl, Path, None] = None
     formatting: Union[Formatting, None] = None
+    code: bool = False
 class AnnotatedTextList(list):
@@ -86,10 +91,12 @@ class AnnotatedTextList(list):
         current_h = None
         current_text = ""
         current_f = None
+        current_code = False
         for at in self:
             t = at.text
             h = at.hyperlink
             f = at.formatting
+            c = at.code
             current_text += t.strip() + " "
             if f is not None and current_f is None:
                 current_f = f
@@ -103,8 +110,13 @@ class AnnotatedTextList(list):
                 _log.warning(
                     f"Clashing hyperlinks: '{h}' and '{current_h}'! Chose '{current_h}'"
                 )
+            current_code = c if c else current_code
         return AnnotatedText(
-            text=current_text.strip(), hyperlink=current_h, formatting=current_f
+            text=current_text.strip(),
+            hyperlink=current_h,
+            formatting=current_f,
+            code=current_code,
         )
     def simplify_text_elements(self) -> "AnnotatedTextList":
@@ -114,9 +126,14 @@ class AnnotatedTextList(list):
         text = self[0].text
         hyperlink = self[0].hyperlink
         formatting = self[0].formatting
+        code = self[0].code
         last_elm = text
         for i in range(1, len(self)):
-            if hyperlink == self[i].hyperlink and formatting == self[i].formatting:
+            if (
+                hyperlink == self[i].hyperlink
+                and formatting == self[i].formatting
+                and code == self[i].code
+            ):
                 sep = " "
                 if not self[i].text.strip() or not last_elm.strip():
                     sep = ""
@@ -124,15 +141,20 @@ class AnnotatedTextList(list):
                 last_elm = self[i].text
             else:
                 simplified.append(
-                    AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
+                    AnnotatedText(
+                        text=text, hyperlink=hyperlink, formatting=formatting, code=code
+                    )
                 )
                 text = self[i].text
                 last_elm = text
                 hyperlink = self[i].hyperlink
                 formatting = self[i].formatting
+                code = self[i].code
         if text:
             simplified.append(
-                AnnotatedText(text=text, hyperlink=hyperlink, formatting=formatting)
+                AnnotatedText(
+                    text=text, hyperlink=hyperlink, formatting=formatting, code=code
+                )
             )
         return simplified
@@ -174,7 +196,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.ctx = _Context()
         for i in range(self.max_levels):
             self.parents[i] = None
-        self.hyperlink = None
+        self.hyperlink: Union[AnyUrl, Path, None] = None
         self.original_url = original_url
         self.format_tags: list[str] = []
@@ -235,9 +257,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 orig=title_text,
                 content_layer=ContentLayer.FURNITURE,
             )
-        # remove scripts/styles
+        # remove script and style tags
         for tag in self.soup(["script", "style"]):
             tag.decompose()
+        # remove any hidden tag
+        for tag in self.soup(hidden=True):
+            tag.decompose()
         content = self.soup.body or self.soup
         # normalize <br> tags
         for br in content("br"):
@@ -268,7 +294,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         def flush_buffer():
             if not buffer:
                 return
-            annotated_text_list = buffer.simplify_text_elements()
+            annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements()
             parts = annotated_text_list.split_by_newline()
             buffer.clear()
@@ -276,20 +302,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 return
             for annotated_text_list in parts:
-                with self.use_inline_group(annotated_text_list, doc):
+                with self._use_inline_group(annotated_text_list, doc):
                     for annotated_text in annotated_text_list:
                         if annotated_text.text.strip():
                             seg_clean = HTMLDocumentBackend._clean_unicode(
                                 annotated_text.text.strip()
                             )
-                            doc.add_text(
-                                parent=self.parents[self.level],
-                                label=DocItemLabel.TEXT,
-                                text=seg_clean,
-                                content_layer=self.content_layer,
-                                formatting=annotated_text.formatting,
-                                hyperlink=annotated_text.hyperlink,
-                            )
+                            if annotated_text.code:
+                                doc.add_code(
+                                    parent=self.parents[self.level],
+                                    text=seg_clean,
+                                    content_layer=self.content_layer,
+                                    formatting=annotated_text.formatting,
+                                    hyperlink=annotated_text.hyperlink,
+                                )
+                            else:
+                                doc.add_text(
+                                    parent=self.parents[self.level],
+                                    label=DocItemLabel.TEXT,
+                                    text=seg_clean,
+                                    content_layer=self.content_layer,
+                                    formatting=annotated_text.formatting,
+                                    hyperlink=annotated_text.hyperlink,
+                                )
         for node in element.contents:
             if isinstance(node, Tag):
@@ -298,10 +333,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     flush_buffer()
                     self._emit_image(node, doc)
                 elif name in _FORMAT_TAG_MAP:
-                    with self.use_format([name]):
+                    with self._use_format([name]):
                         self._walk(node, doc)
                 elif name == "a":
-                    with self.use_hyperlink(node):
+                    with self._use_hyperlink(node):
                         self._walk(node, doc)
                 elif name in _BLOCK_TAGS:
                     flush_buffer()
@@ -367,8 +402,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             this_parent = item.parent
             while this_parent is not None:
                 if this_parent.name == "a" and this_parent.get("href"):
-                    with self.use_format(format_tags):
-                        with self.use_hyperlink(this_parent):
+                    with self._use_format(format_tags):
+                        with self._use_hyperlink(this_parent):
                             return self._extract_text_and_hyperlink_recursively(
                                 item, ignore_list
                             )
@@ -379,6 +414,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         if isinstance(item, NavigableString):
             text = item.strip()
+            code = any(code_tag in self.format_tags for code_tag in _CODE_TAG_SET)
             if text:
                 return AnnotatedTextList(
                     [
@@ -386,6 +422,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                             text=text,
                             hyperlink=self.hyperlink,
                             formatting=self._formatting,
+                            code=code,
                         )
                     ]
                 )
@@ -396,6 +433,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                             text="\n",
                             hyperlink=self.hyperlink,
                             formatting=self._formatting,
+                            code=code,
                         )
                     ]
                 )
@@ -405,14 +443,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         if not ignore_list or (tag.name not in ["ul", "ol"]):
             for child in tag:
                 if isinstance(child, Tag) and child.name in _FORMAT_TAG_MAP:
-                    with self.use_format([child.name]):
+                    with self._use_format([child.name]):
                         result.extend(
                             self._extract_text_and_hyperlink_recursively(
                                 child, ignore_list, keep_newlines=keep_newlines
                             )
                         )
                 elif isinstance(child, Tag) and child.name == "a":
-                    with self.use_hyperlink(child):
+                    with self._use_hyperlink(child):
                         result.extend(
                             self._extract_text_and_hyperlink_recursively(
                                 child, ignore_list, keep_newlines=keep_newlines
@@ -428,29 +466,30 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return result
     @contextmanager
-    def use_hyperlink(self, tag):
+    def _use_hyperlink(self, tag: Tag):
         this_href = tag.get("href")
         if this_href is None:
             yield None
         else:
-            if this_href:
-                old_hyperlink = self.hyperlink
+            if isinstance(this_href, str) and this_href:
+                old_hyperlink: Union[AnyUrl, Path, None] = self.hyperlink
+                new_hyperlink: Union[AnyUrl, Path, None] = None
                 if self.original_url is not None:
-                    this_href = urljoin(self.original_url, this_href)
+                    this_href = urljoin(str(self.original_url), str(this_href))
                 # ugly fix for relative links since pydantic does not support them.
                 try:
-                    AnyUrl(this_href)
+                    new_hyperlink = AnyUrl(this_href)
                 except ValidationError:
-                    this_href = Path(this_href)
-                self.hyperlink = this_href
+                    new_hyperlink = Path(this_href)
+                self.hyperlink = new_hyperlink
             try:
                 yield None
             finally:
-                if this_href:
+                if new_hyperlink:
                     self.hyperlink = old_hyperlink
     @contextmanager
-    def use_format(self, tags: list[str]):
+    def _use_format(self, tags: list[str]):
         if not tags:
             yield None
         else:
@@ -461,7 +500,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 self.format_tags = self.format_tags[: -len(tags)]
     @contextmanager
-    def use_inline_group(
+    def _use_inline_group(
         self, annotated_text_list: AnnotatedTextList, doc: DoclingDocument
     ):
         """Create an inline group for annotated texts.
@@ -473,9 +512,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         Args:
             annotated_text_list (AnnotatedTextList): Annotated text
             doc (DoclingDocument): Currently used document
-        Yields:
-            None: _description_
         """
         if len(annotated_text_list) > 1:
             inline_fmt = doc.add_group(
@@ -493,6 +529,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         else:
             yield None
+    @contextmanager
+    def _use_details(self, tag: Tag, doc: DoclingDocument):
+        """Create a group with the content of a details tag.
+        While the context manager is active, the hierarchy level is set one
+        level higher as the current parent.
+        Args:
+            tag: The details tag.
+            doc: Currently used document.
+        """
+        self.parents[self.level + 1] = doc.add_group(
+            name=tag.name,
+            label=GroupLabel.SECTION,
+            parent=self.parents[self.level],
+            content_layer=self.content_layer,
+        )
+        self.level += 1
+        try:
+            yield None
+        finally:
+            self.parents[self.level + 1] = None
+            self.level -= 1
+    @contextmanager
+    def _use_footer(self, tag: Tag, doc: DoclingDocument):
+        """Create a group with a footer.
+        Create a group with the content of a footer tag. While the context manager
+        is active, the hierarchy level is set one level higher as the current parent.
+        Args:
+            tag: The footer tag.
+            doc: Currently used document.
+        """
+        current_layer = self.content_layer
+        self.content_layer = ContentLayer.FURNITURE
+        self.parents[self.level + 1] = doc.add_group(
+            name=tag.name,
+            label=GroupLabel.SECTION,
+            parent=self.parents[self.level],
+            content_layer=self.content_layer,
+        )
+        self.level += 1
+        try:
+            yield None
+        finally:
+            self.parents[self.level + 1] = None
+            self.level -= 1
+            self.content_layer = current_layer
     def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
         tag_name = tag.name.lower()
         # set default content layer to BODY as soon as we encounter a heading
@@ -611,20 +698,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                             content_layer=self.content_layer,
                         )
                         self.level += 1
-                        with self.use_inline_group(min_parts, doc):
+                        with self._use_inline_group(min_parts, doc):
                             for annotated_text in min_parts:
                                 li_text = re.sub(
                                     r"\s+|\n+", " ", annotated_text.text
                                 ).strip()
                                 li_clean = HTMLDocumentBackend._clean_unicode(li_text)
-                                doc.add_text(
-                                    parent=self.parents[self.level],
-                                    label=DocItemLabel.TEXT,
-                                    text=li_clean,
-                                    content_layer=self.content_layer,
-                                    formatting=annotated_text.formatting,
-                                    hyperlink=annotated_text.hyperlink,
-                                )
+                                if annotated_text.code:
+                                    doc.add_code(
+                                        parent=self.parents[self.level],
+                                        text=li_clean,
+                                        content_layer=self.content_layer,
+                                        formatting=annotated_text.formatting,
+                                        hyperlink=annotated_text.hyperlink,
+                                    )
+                                else:
+                                    doc.add_text(
+                                        parent=self.parents[self.level],
+                                        label=DocItemLabel.TEXT,
+                                        text=li_clean,
+                                        content_layer=self.content_layer,
+                                        formatting=annotated_text.formatting,
+                                        hyperlink=annotated_text.hyperlink,
+                                    )
                         # 4) recurse into any nested lists, attaching them to this <li> item
                         for sublist in li({"ul", "ol"}, recursive=False):
@@ -687,20 +783,29 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             text_list = self._extract_text_and_hyperlink_recursively(
                 tag, find_parent_annotation=True
             )
-            annotated_texts = text_list.simplify_text_elements()
+            annotated_texts: AnnotatedTextList = text_list.simplify_text_elements()
             for part in annotated_texts.split_by_newline():
-                with self.use_inline_group(part, doc):
+                with self._use_inline_group(part, doc):
                     for annotated_text in part:
                         if seg := annotated_text.text.strip():
                             seg_clean = HTMLDocumentBackend._clean_unicode(seg)
-                            doc.add_text(
-                                parent=self.parents[self.level],
-                                label=DocItemLabel.TEXT,
-                                text=seg_clean,
-                                content_layer=self.content_layer,
-                                formatting=annotated_text.formatting,
-                                hyperlink=annotated_text.hyperlink,
-                            )
+                            if annotated_text.code:
+                                doc.add_code(
+                                    parent=self.parents[self.level],
+                                    text=seg_clean,
+                                    content_layer=self.content_layer,
+                                    formatting=annotated_text.formatting,
+                                    hyperlink=annotated_text.hyperlink,
+                                )
+                            else:
+                                doc.add_text(
+                                    parent=self.parents[self.level],
+                                    label=DocItemLabel.TEXT,
+                                    text=seg_clean,
+                                    content_layer=self.content_layer,
+                                    formatting=annotated_text.formatting,
+                                    hyperlink=annotated_text.hyperlink,
+                                )
             for img_tag in tag("img"):
                 if isinstance(img_tag, Tag):
@@ -718,13 +823,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     content_layer=self.content_layer,
                 )
-        elif tag_name in {"pre", "code"}:
+        elif tag_name in {"pre"}:
             # handle monospace code snippets (pre).
             text_list = self._extract_text_and_hyperlink_recursively(
-                tag, find_parent_annotation=True
+                tag, find_parent_annotation=True, keep_newlines=True
             )
             annotated_texts = text_list.simplify_text_elements()
-            with self.use_inline_group(annotated_texts, doc):
+            with self._use_inline_group(annotated_texts, doc):
                 for annotated_text in annotated_texts:
                     text_clean = HTMLDocumentBackend._clean_unicode(
                         annotated_text.text.strip()
@@ -737,22 +842,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                         hyperlink=annotated_text.hyperlink,
                     )
-        elif tag_name in {"details", "footer"}:
-            if tag_name == "footer":
-                current_layer = self.content_layer
-                self.content_layer = ContentLayer.FURNITURE
-            self.parents[self.level + 1] = doc.add_group(
-                name=tag_name,
-                label=GroupLabel.SECTION,
-                parent=self.parents[self.level],
-                content_layer=self.content_layer,
-            )
-            self.level += 1
-            self._walk(tag, doc)
-            self.parents[self.level + 1] = None
-            self.level -= 1
-            if tag_name == "footer":
-                self.content_layer = current_layer
+        elif tag_name == "footer":
+            with self._use_footer(tag, doc):
+                self._walk(tag, doc)
+        elif tag_name == "details":
+            with self._use_details(tag, doc):
+                self._walk(tag, doc)
     def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
         figure = img_tag.find_parent("figure")

{docling-2.47.0 → docling-2.48.0}/docling/datamodel/pipeline_options.py RENAMED Viewed

@@ -99,6 +99,8 @@ class RapidOcrOptions(OcrOptions):
     # For more details on the following options visit
     # https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
+    # https://rapidai.github.io/RapidOCRDocs/main/install_usage/rapidocr/usage/#__tabbed_3_4
+    backend: Literal["onnxruntime", "openvino", "paddle", "torch"] = "onnxruntime"
     text_score: float = 0.5  # same default as rapidocr
     use_det: Optional[bool] = None  # same default as rapidocr

{docling-2.47.0 → docling-2.48.0}/docling/models/rapid_ocr_model.py RENAMED Viewed

@@ -42,10 +42,10 @@ class RapidOcrModel(BaseOcrModel):
         if self.enabled:
             try:
-                from rapidocr_onnxruntime import RapidOCR  # type: ignore
+                from rapidocr import EngineType, RapidOCR  # type: ignore
             except ImportError:
                 raise ImportError(
-                    "RapidOCR is not installed. Please install it via `pip install rapidocr_onnxruntime` to use this OCR engine. "
+                    "RapidOCR is not installed. Please install it via `pip install rapidocr onnxruntime` to use this OCR engine. "
                     "Alternatively, Docling has support for other OCR engines. See the documentation."
                 )
@@ -54,21 +54,39 @@ class RapidOcrModel(BaseOcrModel):
             use_cuda = str(AcceleratorDevice.CUDA.value).lower() in device
             use_dml = accelerator_options.device == AcceleratorDevice.AUTO
             intra_op_num_threads = accelerator_options.num_threads
+            _ALIASES = {
+                "onnxruntime": EngineType.ONNXRUNTIME,
+                "openvino": EngineType.OPENVINO,
+                "paddle": EngineType.PADDLE,
+                "torch": EngineType.TORCH,
+            }
+            backend_enum = _ALIASES.get(self.options.backend, EngineType.ONNXRUNTIME)
             self.reader = RapidOCR(
-                text_score=self.options.text_score,
-                cls_use_cuda=use_cuda,
-                rec_use_cuda=use_cuda,
-                det_use_cuda=use_cuda,
-                det_use_dml=use_dml,
-                cls_use_dml=use_dml,
-                rec_use_dml=use_dml,
-                intra_op_num_threads=intra_op_num_threads,
-                print_verbose=self.options.print_verbose,
-                det_model_path=self.options.det_model_path,
-                cls_model_path=self.options.cls_model_path,
-                rec_model_path=self.options.rec_model_path,
-                rec_keys_path=self.options.rec_keys_path,
+                params={
+                    # Global settings (these are still correct)
+                    "Global.text_score": self.options.text_score,
+                    # "Global.verbose": self.options.print_verbose,
+                    # Detection model settings
+                    "Det.model_path": self.options.det_model_path,
+                    "Det.use_cuda": use_cuda,
+                    "Det.use_dml": use_dml,
+                    "Det.intra_op_num_threads": intra_op_num_threads,
+                    # Classification model settings
+                    "Cls.model_path": self.options.cls_model_path,
+                    "Cls.use_cuda": use_cuda,
+                    "Cls.use_dml": use_dml,
+                    "Cls.intra_op_num_threads": intra_op_num_threads,
+                    # Recognition model settings
+                    "Rec.model_path": self.options.rec_model_path,
+                    "Rec.keys_path": self.options.rec_keys_path,
+                    "Rec.use_cuda": use_cuda,
+                    "Rec.use_dml": use_dml,
+                    "Rec.intra_op_num_threads": intra_op_num_threads,
+                    "Det.engine_type": backend_enum,
+                    "Cls.engine_type": backend_enum,
+                    "Rec.engine_type": backend_enum,
+                }
             )
     def __call__(
@@ -95,12 +113,15 @@ class RapidOcrModel(BaseOcrModel):
                             scale=self.scale, cropbox=ocr_rect
                         )
                         im = numpy.array(high_res_image)
-                        result, _ = self.reader(
+                        result = self.reader(
                             im,
                             use_det=self.options.use_det,
                             use_cls=self.options.use_cls,
                             use_rec=self.options.use_rec,
                         )
+                        result = list(
+                            zip(result.boxes.tolist(), result.txts, result.scores)
+                        )
                         del high_res_image
                         del im

{docling-2.47.0 → docling-2.48.0}/docling/pipeline/base_pipeline.py RENAMED Viewed

@@ -146,6 +146,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                     conv_res.pages.append(Page(page_no=i))
             try:
+                total_pages_processed = 0
                 # Iterate batches of pages (page_batch_size) in the doc
                 for page_batch in chunkify(
                     conv_res.pages, settings.perf.page_batch_size
@@ -186,9 +187,9 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                         )
                         conv_res.status = ConversionStatus.PARTIAL_SUCCESS
                         break
+                    total_pages_processed += len(page_batch)
                     _log.debug(
-                        f"Finished converting page batch time={end_batch_time:.3f}"
+                        f"Finished converting pages {total_pages_processed}/{len(conv_res.pages)} time={end_batch_time:.3f}"
                     )
             except Exception as e:

{docling-2.47.0 → docling-2.48.0}/docling.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.47.0
+Version: 2.48.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -59,10 +59,11 @@ Provides-Extra: vlm
 Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
 Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
 Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
-Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux") and extra == "vlm"
+Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64") and extra == "vlm"
 Provides-Extra: rapidocr
-Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
+Requires-Dist: rapidocr<4.0.0,>=3.3; python_version < "3.14" and extra == "rapidocr"
 Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
+Requires-Dist: modelscope>=1.29.0; extra == "rapidocr"
 Provides-Extra: asr
 Requires-Dist: openai-whisper>=20250625; extra == "asr"
 Dynamic: license-file

{docling-2.47.0 → docling-2.48.0}/docling.egg-info/requires.txt RENAMED Viewed

@@ -35,9 +35,10 @@ ocrmac<2.0.0,>=1.0.0
 [rapidocr]
 onnxruntime<2.0.0,>=1.7.0
+modelscope>=1.29.0
-[rapidocr:python_version < "3.13"]
-rapidocr-onnxruntime<2.0.0,>=1.4.0
+[rapidocr:python_version < "3.14"]
+rapidocr<4.0.0,>=3.3
 [tesserocr]
 tesserocr<3.0.0,>=2.7.1
@@ -49,5 +50,5 @@ accelerate<2.0.0,>=1.2.1
 [vlm:python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"]
 mlx-vlm<1.0.0,>=0.3.0
-[vlm:python_version >= "3.10" and sys_platform == "linux"]
+[vlm:python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64"]
 vllm<1.0.0,>=0.10.0

{docling-2.47.0 → docling-2.48.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "docling"
-version = "2.47.0"  # DO NOT EDIT, updated automatically
+version = "2.48.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 license = "MIT"
 keywords = [
@@ -93,11 +93,12 @@ vlm = [
   'transformers (>=4.46.0,<5.0.0)',
   'accelerate (>=1.2.1,<2.0.0)',
   'mlx-vlm (>=0.3.0,<1.0.0) ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
-  'vllm (>=0.10.0,<1.0.0) ; python_version >= "3.10" and sys_platform == "linux"',
+  'vllm (>=0.10.0,<1.0.0) ; python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64"',
 ]
 rapidocr = [
-  'rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; python_version < "3.13"',
+  'rapidocr (>=3.3,<4.0.0) ; python_version < "3.14"',
   'onnxruntime (>=1.7.0,<2.0.0)',
+    "modelscope>=1.29.0",
   # 'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
   # 'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
 ]

{docling-2.47.0 → docling-2.48.0}/tests/test_backend_webp.py RENAMED Viewed

@@ -55,8 +55,8 @@ def test_e2e_webp_conversions():
         TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]),
     ]
-    # rapidocr is only available for Python >=3.6,<3.13
-    if sys.version_info < (3, 13):
+    # rapidocr is only available for Python >=3.6,<3.14
+    if sys.version_info < (3, 14):
         engines.append(RapidOcrOptions())
         engines.append(RapidOcrOptions(force_full_page_ocr=True))

{docling-2.47.0 → docling-2.48.0}/LICENSE RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/README.md RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/__init__.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/__init__.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/abstract_backend.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/asciidoc_backend.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/csv_backend.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/docling_parse_backend.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/docling_parse_v2_backend.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/docling_parse_v4_backend.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/docx/__init__.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/docx/latex/__init__.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/docx/latex/latex_dict.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/docx/latex/omml.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/json/__init__.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/json/docling_json_backend.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/md_backend.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/mets_gbs_backend.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/msexcel_backend.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/mspowerpoint_backend.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/msword_backend.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/noop_backend.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/pdf_backend.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/pypdfium2_backend.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/xml/__init__.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/xml/jats_backend.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/backend/xml/uspto_backend.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/chunking/__init__.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/cli/__init__.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/cli/main.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/cli/models.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/cli/tools.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/datamodel/__init__.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/datamodel/accelerator_options.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/datamodel/asr_model_specs.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/datamodel/base_models.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/datamodel/document.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/datamodel/layout_model_specs.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/datamodel/pipeline_options_asr_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/datamodel/pipeline_options_vlm_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/datamodel/settings.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/datamodel/vlm_model_specs.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/document_converter.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/exceptions.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/__init__.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/api_vlm_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/base_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/base_ocr_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/code_formula_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/document_picture_classifier.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/easyocr_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/factories/__init__.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/factories/base_factory.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/factories/ocr_factory.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/factories/picture_description_factory.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/layout_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/ocr_mac_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/page_assemble_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/page_preprocessing_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/picture_description_api_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/picture_description_base_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/picture_description_vlm_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/plugins/__init__.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/plugins/defaults.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/readingorder_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/table_structure_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/tesseract_ocr_cli_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/tesseract_ocr_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/utils/__init__.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/utils/hf_model_download.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/vlm_models_inline/__init__.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/vlm_models_inline/hf_transformers_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/vlm_models_inline/mlx_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/models/vlm_models_inline/vllm_model.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/pipeline/__init__.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/pipeline/asr_pipeline.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/pipeline/simple_pipeline.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/pipeline/standard_pdf_pipeline.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/pipeline/threaded_standard_pdf_pipeline.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/pipeline/vlm_pipeline.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/py.typed RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/utils/__init__.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/utils/accelerator_utils.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/utils/api_image_request.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/utils/export.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/utils/glm_utils.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/utils/layout_postprocessor.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/utils/locks.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/utils/model_downloader.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/utils/ocr_utils.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/utils/orientation.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/utils/profiling.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/utils/utils.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling/utils/visualization.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling.egg-info/entry_points.txt RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/docling.egg-info/top_level.txt RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/setup.cfg RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_asr_pipeline.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_backend_asciidoc.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_backend_csv.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_backend_docling_json.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_backend_docling_parse.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_backend_docling_parse_v2.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_backend_docling_parse_v4.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_backend_html.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_backend_jats.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_backend_markdown.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_backend_mets_gbs.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_backend_msexcel.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_backend_msword.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_backend_patent_uspto.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_backend_pdfium.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_backend_pptx.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_cli.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_code_formula.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_data_gen_flag.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_document_picture_classifier.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_e2e_conversion.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_e2e_ocr_conversion.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_input_doc.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_interfaces.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_invalid_input.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_legacy_format_transform.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_ocr_utils.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_options.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_settings_load.py RENAMED Viewed

File without changes

{docling-2.47.0 → docling-2.48.0}/tests/test_threaded_pipeline.py RENAMED Viewed

File without changes

docling 2.47.0__tar.gz → 2.48.0__tar.gz

docling 2.47.0tar.gz → 2.48.0tar.gz