PyPI - docling - Versions diffs - 2.31.2__tar.gz → 2.33.0__tar.gz - Mend

docling 2.31.2__tar.gz → 2.33.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85) hide show

{docling-2.31.2 → docling-2.33.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.31.2
+Version: 2.33.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/docling-project/docling
 License: MIT
@@ -29,7 +29,7 @@ Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platfo
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: click (<8.2.0)
-Requires-Dist: docling-core[chunking] (>=2.26.0,<3.0.0)
+Requires-Dist: docling-core[chunking] (>=2.29.0,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
 Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)

{docling-2.31.2 → docling-2.33.0}/docling/backend/msword_backend.py RENAMED Viewed

@@ -2,7 +2,7 @@ import logging
 import re
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import Any, List, Optional, Union
 from docling_core.types.doc import (
     DocItemLabel,
@@ -24,7 +24,6 @@ from docx.text.hyperlink import Hyperlink
 from docx.text.paragraph import Paragraph
 from docx.text.run import Run
 from lxml import etree
-from lxml.etree import XPath
 from PIL import Image, UnidentifiedImageError
 from pydantic import AnyUrl
 from typing_extensions import override
@@ -59,6 +58,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         self.parents: dict[int, Optional[NodeItem]] = {}
         self.numbered_headers: dict[int, int] = {}
         self.equation_bookends: str = "<eq>{EQ}</eq>"
+        # Track processed textbox elements to avoid duplication
+        self.processed_textbox_elements: List[int] = []
+        # Track content hash of processed paragraphs to avoid duplicate content
+        self.processed_paragraph_content: List[str] = []
         for i in range(-1, self.max_levels):
             self.parents[i] = None
@@ -175,10 +179,74 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
                 "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
                 "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
+                "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
+                "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
+                "v": "urn:schemas-microsoft-com:vml",
+                "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
+                "w10": "urn:schemas-microsoft-com:office:word",
+                "a14": "http://schemas.microsoft.com/office/drawing/2010/main",
             }
-            xpath_expr = XPath(".//a:blip", namespaces=namespaces)
+            xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
             drawing_blip = xpath_expr(element)
+            # Check for textbox content - check multiple textbox formats
+            # Only process if the element hasn't been processed before
+            element_id = id(element)
+            if element_id not in self.processed_textbox_elements:
+                # Modern Word textboxes
+                txbx_xpath = etree.XPath(
+                    ".//w:txbxContent|.//v:textbox//w:p", namespaces=namespaces
+                )
+                textbox_elements = txbx_xpath(element)
+                # No modern textboxes found, check for alternate/legacy textbox formats
+                if not textbox_elements and tag_name in ["drawing", "pict"]:
+                    # Additional checks for textboxes in DrawingML and VML formats
+                    alt_txbx_xpath = etree.XPath(
+                        ".//wps:txbx//w:p|.//w10:wrap//w:p|.//a:p//a:t",
+                        namespaces=namespaces,
+                    )
+                    textbox_elements = alt_txbx_xpath(element)
+                    # Check for shape text that's not in a standard textbox
+                    if not textbox_elements:
+                        shape_text_xpath = etree.XPath(
+                            ".//a:bodyPr/ancestor::*//a:t|.//a:txBody//a:t",
+                            namespaces=namespaces,
+                        )
+                        shape_text_elements = shape_text_xpath(element)
+                        if shape_text_elements:
+                            # Create custom text elements from shape text
+                            text_content = " ".join(
+                                [t.text for t in shape_text_elements if t.text]
+                            )
+                            if text_content.strip():
+                                _log.debug(f"Found shape text: {text_content[:50]}...")
+                                # Create a paragraph-like element to process with standard handler
+                                level = self._get_level()
+                                shape_group = doc.add_group(
+                                    label=GroupLabel.SECTION,
+                                    parent=self.parents[level - 1],
+                                    name="shape-text",
+                                )
+                                doc.add_text(
+                                    label=DocItemLabel.PARAGRAPH,
+                                    parent=shape_group,
+                                    text=text_content,
+                                )
+                if textbox_elements:
+                    # Mark the parent element as processed
+                    self.processed_textbox_elements.append(element_id)
+                    # Also mark all found textbox elements as processed
+                    for tb_element in textbox_elements:
+                        self.processed_textbox_elements.append(id(tb_element))
+                    _log.debug(
+                        f"Found textbox content with {len(textbox_elements)} elements"
+                    )
+                    self._handle_textbox_content(textbox_elements, docx_obj, doc)
             # Check for Tables
             if element.tag.endswith("tbl"):
                 try:
@@ -291,15 +359,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     @classmethod
     def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
-        has_any_formatting = run.bold or run.italic or run.underline
-        return (
-            Formatting(
-                bold=run.bold or False,
-                italic=run.italic or False,
-                underline=run.underline or False,
-            )
-            if has_any_formatting
-            else None
+        # The .bold and .italic properties are booleans, but .underline can be an enum
+        # like WD_UNDERLINE.THICK (value 6), so we need to convert it to a boolean
+        has_bold = run.bold or False
+        has_italic = run.italic or False
+        # Convert any non-None underline value to True
+        has_underline = bool(run.underline is not None and run.underline)
+        return Formatting(
+            bold=has_bold,
+            italic=has_italic,
+            underline=has_underline,
         )
     def _get_paragraph_elements(self, paragraph: Paragraph):
@@ -355,6 +425,182 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         return paragraph_elements
+    def _get_paragraph_position(self, paragraph_element):
+        """Extract vertical position information from paragraph element."""
+        # First try to directly get the index from w:p element that has an order-related attribute
+        if (
+            hasattr(paragraph_element, "getparent")
+            and paragraph_element.getparent() is not None
+        ):
+            parent = paragraph_element.getparent()
+            # Get all paragraph siblings
+            paragraphs = [
+                p for p in parent.getchildren() if etree.QName(p).localname == "p"
+            ]
+            # Find index of current paragraph within its siblings
+            try:
+                paragraph_index = paragraphs.index(paragraph_element)
+                return paragraph_index  # Use index as position for consistent ordering
+            except ValueError:
+                pass
+        # Look for position hints in element attributes and ancestor elements
+        for elem in (*[paragraph_element], *paragraph_element.iterancestors()):
+            # Check for direct position attributes
+            for attr_name in ["y", "top", "positionY", "y-position", "position"]:
+                value = elem.get(attr_name)
+                if value:
+                    try:
+                        # Remove any non-numeric characters (like 'pt', 'px', etc.)
+                        clean_value = re.sub(r"[^0-9.]", "", value)
+                        if clean_value:
+                            return float(clean_value)
+                    except (ValueError, TypeError):
+                        pass
+            # Check for position in transform attribute
+            transform = elem.get("transform")
+            if transform:
+                # Extract translation component from transform matrix
+                match = re.search(r"translate\([^,]+,\s*([0-9.]+)", transform)
+                if match:
+                    try:
+                        return float(match.group(1))
+                    except ValueError:
+                        pass
+            # Check for anchors or relative position indicators in Word format
+            # 'dist' attributes can indicate relative positioning
+            for attr_name in ["distT", "distB", "anchor", "relativeFrom"]:
+                if elem.get(attr_name) is not None:
+                    return elem.sourceline  # Use the XML source line number as fallback
+        # For VML shapes, look for specific attributes
+        for ns_uri in paragraph_element.nsmap.values():
+            if "vml" in ns_uri:
+                # Try to extract position from style attribute
+                style = paragraph_element.get("style")
+                if style:
+                    match = re.search(r"top:([0-9.]+)pt", style)
+                    if match:
+                        try:
+                            return float(match.group(1))
+                        except ValueError:
+                            pass
+        # If no better position indicator found, use XML source line number as proxy for order
+        return (
+            paragraph_element.sourceline
+            if hasattr(paragraph_element, "sourceline")
+            else None
+        )
+    def _collect_textbox_paragraphs(self, textbox_elements):
+        """Collect and organize paragraphs from textbox elements."""
+        processed_paragraphs = []
+        container_paragraphs = {}
+        for element in textbox_elements:
+            element_id = id(element)
+            # Skip if we've already processed this exact element
+            if element_id in processed_paragraphs:
+                continue
+            tag_name = etree.QName(element).localname
+            processed_paragraphs.append(element_id)
+            # Handle paragraphs directly found (VML textboxes)
+            if tag_name == "p":
+                # Find the containing textbox or shape element
+                container_id = None
+                for ancestor in element.iterancestors():
+                    if any(ns in ancestor.tag for ns in ["textbox", "shape", "txbx"]):
+                        container_id = id(ancestor)
+                        break
+                if container_id not in container_paragraphs:
+                    container_paragraphs[container_id] = []
+                container_paragraphs[container_id].append(
+                    (element, self._get_paragraph_position(element))
+                )
+            # Handle txbxContent elements (Word DrawingML textboxes)
+            elif tag_name == "txbxContent":
+                paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
+                container_id = id(element)
+                if container_id not in container_paragraphs:
+                    container_paragraphs[container_id] = []
+                for p in paragraphs:
+                    p_id = id(p)
+                    if p_id not in processed_paragraphs:
+                        processed_paragraphs.append(p_id)
+                        container_paragraphs[container_id].append(
+                            (p, self._get_paragraph_position(p))
+                        )
+            else:
+                # Try to extract any paragraphs from unknown elements
+                paragraphs = element.findall(".//w:p", namespaces=element.nsmap)
+                container_id = id(element)
+                if container_id not in container_paragraphs:
+                    container_paragraphs[container_id] = []
+                for p in paragraphs:
+                    p_id = id(p)
+                    if p_id not in processed_paragraphs:
+                        processed_paragraphs.append(p_id)
+                        container_paragraphs[container_id].append(
+                            (p, self._get_paragraph_position(p))
+                        )
+        return container_paragraphs
+    def _handle_textbox_content(
+        self,
+        textbox_elements: list,
+        docx_obj: DocxDocument,
+        doc: DoclingDocument,
+    ) -> None:
+        """Process textbox content and add it to the document structure."""
+        level = self._get_level()
+        # Create a textbox group to contain all text from the textbox
+        textbox_group = doc.add_group(
+            label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox"
+        )
+        # Set this as the current parent to ensure textbox content
+        # is properly nested in document structure
+        original_parent = self.parents[level]
+        self.parents[level] = textbox_group
+        # Collect and organize paragraphs
+        container_paragraphs = self._collect_textbox_paragraphs(textbox_elements)
+        # Process all paragraphs
+        all_paragraphs = []
+        # Sort paragraphs within each container, then process containers
+        for container_id, paragraphs in container_paragraphs.items():
+            # Sort by vertical position within each container
+            sorted_container_paragraphs = sorted(
+                paragraphs,
+                key=lambda x: (
+                    x[1] is None,
+                    x[1] if x[1] is not None else float("inf"),
+                ),
+            )
+            # Add the sorted paragraphs to our processing list
+            all_paragraphs.extend(sorted_container_paragraphs)
+        # Process all the paragraphs
+        for p, _ in all_paragraphs:
+            self._handle_text_elements(p, docx_obj, doc, is_from_textbox=True)
+        # Restore original parent
+        self.parents[level] = original_parent
+        return
     def _handle_equations_in_text(self, element, text):
         only_texts = []
         only_equations = []
@@ -423,10 +669,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         element: BaseOxmlElement,
         docx_obj: DocxDocument,
         doc: DoclingDocument,
+        is_from_textbox: bool = False,
     ) -> None:
         paragraph = Paragraph(element, docx_obj)
+        # Skip if from a textbox and this exact paragraph content was already processed
+        # Skip if from a textbox and this exact paragraph content was already processed
         raw_text = paragraph.text
+        if is_from_textbox and raw_text:
+            # Create a simple hash of content to detect duplicates
+            content_hash = f"{len(raw_text)}:{raw_text[:50]}"
+            if content_hash in self.processed_paragraph_content:
+                _log.debug(f"Skipping duplicate paragraph content: {content_hash}")
+                return
+            self.processed_paragraph_content.append(content_hash)
         text, equations = self._handle_equations_in_text(element=element, text=raw_text)
         if text is None:

{docling-2.31.2 → docling-2.33.0}/docling/backend/pypdfium2_backend.py RENAMED Viewed

@@ -175,13 +175,18 @@ class PyPdfiumPageBackend(PdfPageBackend):
                 if len(group) == 1:
                     return group[0]
-                merged_text = "".join(cell.text for cell in group)
                 merged_bbox = BoundingBox(
                     l=min(cell.rect.to_bounding_box().l for cell in group),
                     t=min(cell.rect.to_bounding_box().t for cell in group),
                     r=max(cell.rect.to_bounding_box().r for cell in group),
                     b=max(cell.rect.to_bounding_box().b for cell in group),
                 )
+                assert self._ppage is not None
+                self.text_page = self._ppage.get_textpage()
+                bbox = merged_bbox.to_bottom_left_origin(page_size.height)
+                merged_text = self.text_page.get_text_bounded(*bbox.as_tuple())
                 return TextCell(
                     index=group[0].index,
                     text=merged_text,

{docling-2.31.2 → docling-2.33.0}/docling/datamodel/base_models.py RENAMED Viewed

@@ -90,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
         "image/tiff",
         "image/gif",
         "image/bmp",
+        "image/webp",
     ],
     InputFormat.PDF: ["application/pdf"],
     InputFormat.ASCIIDOC: ["text/asciidoc"],

{docling-2.31.2 → docling-2.33.0}/docling/datamodel/document.py RENAMED Viewed

@@ -302,7 +302,7 @@ class _DocumentConversionInput(BaseModel):
                     if ("." in obj.name and not obj.name.startswith("."))
                     else ""
                 )
-                mime = _DocumentConversionInput._mime_from_extension(ext)
+                mime = _DocumentConversionInput._mime_from_extension(ext.lower())
             if mime is not None and mime.lower() == "application/zip":
                 objname = obj.name.lower()
                 if objname.endswith(".xlsx"):
@@ -376,6 +376,13 @@ class _DocumentConversionInput(BaseModel):
             mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
         elif ext in FormatToExtensions[InputFormat.PDF]:
             mime = FormatToMimeType[InputFormat.PDF][0]
+        elif ext in FormatToExtensions[InputFormat.DOCX]:
+            mime = FormatToMimeType[InputFormat.DOCX][0]
+        elif ext in FormatToExtensions[InputFormat.PPTX]:
+            mime = FormatToMimeType[InputFormat.PPTX][0]
+        elif ext in FormatToExtensions[InputFormat.XLSX]:
+            mime = FormatToMimeType[InputFormat.XLSX][0]
         return mime
     @staticmethod

{docling-2.31.2 → docling-2.33.0}/docling/datamodel/pipeline_options.py RENAMED Viewed

@@ -225,6 +225,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
     headers: Dict[str, str] = {}
     params: Dict[str, Any] = {}
     timeout: float = 20
+    concurrency: int = 1
     prompt: str = "Describe this image in a few sentences."
     provenance: str = ""
@@ -295,6 +296,7 @@ class ApiVlmOptions(BaseVlmOptions):
     params: Dict[str, Any] = {}
     scale: float = 2.0
     timeout: float = 60
+    concurrency: int = 1
     response_format: ResponseFormat

{docling-2.31.2 → docling-2.33.0}/docling/datamodel/settings.py RENAMED Viewed

@@ -56,13 +56,15 @@ class DebugSettings(BaseModel):
 class AppSettings(BaseSettings):
-    model_config = SettingsConfigDict(env_prefix="DOCLING_", env_nested_delimiter="_")
+    model_config = SettingsConfigDict(
+        env_prefix="DOCLING_", env_nested_delimiter="_", env_nested_max_split=1
+    )
-    perf: BatchConcurrencySettings
-    debug: DebugSettings
+    perf: BatchConcurrencySettings = BatchConcurrencySettings()
+    debug: DebugSettings = DebugSettings()
     cache_dir: Path = Path.home() / ".cache" / "docling"
     artifacts_path: Optional[Path] = None
-settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
+settings = AppSettings()

{docling-2.31.2 → docling-2.33.0}/docling/models/api_vlm_model.py RENAMED Viewed

@@ -1,4 +1,5 @@
 from collections.abc import Iterable
+from concurrent.futures import ThreadPoolExecutor
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
@@ -27,6 +28,7 @@ class ApiVlmModel(BasePageModel):
                 )
             self.timeout = self.vlm_options.timeout
+            self.concurrency = self.vlm_options.concurrency
             self.prompt_content = (
                 f"This is a page from a document.\n{self.vlm_options.prompt}"
             )
@@ -38,10 +40,10 @@ class ApiVlmModel(BasePageModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-        for page in page_batch:
+        def _vlm_request(page):
             assert page._backend is not None
             if not page._backend.is_valid():
-                yield page
+                return page
             else:
                 with TimeRecorder(conv_res, "vlm"):
                     assert page.size is not None
@@ -63,4 +65,7 @@ class ApiVlmModel(BasePageModel):
                     page.predictions.vlm_response = VlmPrediction(text=page_tags)
-                yield page
+                return page
+        with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
+            yield from executor.map(_vlm_request, page_batch)

{docling-2.31.2 → docling-2.33.0}/docling/models/picture_description_api_model.py RENAMED Viewed

@@ -1,4 +1,5 @@
 from collections.abc import Iterable
+from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from typing import Optional, Type, Union
@@ -37,6 +38,7 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
             accelerator_options=accelerator_options,
         )
         self.options: PictureDescriptionApiOptions
+        self.concurrency = self.options.concurrency
         if self.enabled:
             if not enable_remote_services:
@@ -48,8 +50,8 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
     def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
         # Note: technically we could make a batch request here,
         # but not all APIs will allow for it. For example, vllm won't allow more than 1.
-        for image in images:
-            yield api_image_request(
+        def _api_request(image):
+            return api_image_request(
                 image=image,
                 prompt=self.options.prompt,
                 url=self.options.url,
@@ -57,3 +59,6 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
                 headers=self.options.headers,
                 **self.options.params,
             )
+        with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
+            yield from executor.map(_api_request, images)

{docling-2.31.2 → docling-2.33.0}/docling/models/tesseract_ocr_cli_model.py RENAMED Viewed

@@ -249,7 +249,7 @@ class TesseractOcrCliModel(BaseOcrModel):
                             cell = TextCell(
                                 index=ix,
                                 text=str(text),
-                                orig=text,
+                                orig=str(text),
                                 from_ocr=True,
                                 confidence=conf / 100.0,
                                 rect=BoundingRectangle.from_bounding_box(

{docling-2.31.2 → docling-2.33.0}/docling/pipeline/vlm_pipeline.py RENAMED Viewed

@@ -3,7 +3,7 @@ from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Union, cast
-# from docling_core.types import DoclingDocument
+from docling_core.types import DoclingDocument
 from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
 from docling_core.types.doc.document import DocTagsDocument
 from PIL import Image as PILImage
@@ -133,28 +133,26 @@ class VlmPipeline(PaginatedPipeline):
                 doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
                     doctags_list_c, image_list_c
                 )
-                conv_res.document.load_from_doctags(doctags_doc)
+                conv_res.document = DoclingDocument.load_from_doctags(doctags_doc)
                 # If forced backend text, replace model predicted text with backend one
-                if page.size:
-                    if self.force_backend_text:
-                        scale = self.pipeline_options.images_scale
-                        for element, _level in conv_res.document.iterate_items():
-                            if (
-                                not isinstance(element, TextItem)
-                                or len(element.prov) == 0
-                            ):
-                                continue
-                            crop_bbox = (
-                                element.prov[0]
-                                .bbox.scaled(scale=scale)
-                                .to_top_left_origin(
-                                    page_height=page.size.height * scale
-                                )
-                            )
-                            txt = self.extract_text_from_backend(page, crop_bbox)
-                            element.text = txt
-                            element.orig = txt
+                if self.force_backend_text:
+                    scale = self.pipeline_options.images_scale
+                    for element, _level in conv_res.document.iterate_items():
+                        if not isinstance(element, TextItem) or len(element.prov) == 0:
+                            continue
+                        page_ix = element.prov[0].page_no - 1
+                        page = conv_res.pages[page_ix]
+                        if not page.size:
+                            continue
+                        crop_bbox = (
+                            element.prov[0]
+                            .bbox.scaled(scale=scale)
+                            .to_top_left_origin(page_height=page.size.height * scale)
+                        )
+                        txt = self.extract_text_from_backend(page, crop_bbox)
+                        element.text = txt
+                        element.orig = txt
             elif (
                 self.pipeline_options.vlm_options.response_format
                 == ResponseFormat.MARKDOWN

{docling-2.31.2 → docling-2.33.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.31.2"  # DO NOT EDIT, updated automatically
+version = "2.33.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = [
   "Christoph Auer <cau@zurich.ibm.com>",
@@ -46,7 +46,7 @@ packages = [{ include = "docling" }]
 ######################
 python = "^3.9"
 pydantic = "^2.0.0"
-docling-core = {version = "^2.26.0", extras = ["chunking"]}
+docling-core = {version = "^2.29.0", extras = ["chunking"]}
 docling-ibm-models = "^3.4.0"
 docling-parse = "^4.0.0"
 filetype = "^1.2.0"

{docling-2.31.2 → docling-2.33.0}/LICENSE RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/README.md RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/__init__.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/__init__.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/abstract_backend.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/asciidoc_backend.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/csv_backend.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/docling_parse_backend.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/docling_parse_v2_backend.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/docling_parse_v4_backend.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/docx/__init__.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/docx/latex/__init__.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/docx/latex/latex_dict.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/docx/latex/omml.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/html_backend.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/json/__init__.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/json/docling_json_backend.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/md_backend.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/msexcel_backend.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/mspowerpoint_backend.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/pdf_backend.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/xml/__init__.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/xml/jats_backend.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/backend/xml/uspto_backend.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/chunking/__init__.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/cli/__init__.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/cli/main.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/cli/models.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/cli/tools.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/datamodel/__init__.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/document_converter.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/exceptions.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/__init__.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/base_model.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/base_ocr_model.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/code_formula_model.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/document_picture_classifier.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/easyocr_model.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/factories/__init__.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/factories/base_factory.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/factories/ocr_factory.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/factories/picture_description_factory.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/hf_mlx_model.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/hf_vlm_model.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/layout_model.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/ocr_mac_model.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/page_assemble_model.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/page_preprocessing_model.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/picture_description_base_model.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/picture_description_vlm_model.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/plugins/__init__.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/plugins/defaults.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/rapid_ocr_model.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/readingorder_model.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/table_structure_model.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/models/tesseract_ocr_model.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/pipeline/__init__.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/pipeline/base_pipeline.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/pipeline/simple_pipeline.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/pipeline/standard_pdf_pipeline.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/py.typed RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/utils/__init__.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/utils/accelerator_utils.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/utils/api_image_request.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/utils/export.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/utils/glm_utils.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/utils/layout_postprocessor.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/utils/locks.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/utils/model_downloader.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/utils/ocr_utils.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/utils/profiling.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/utils/utils.py RENAMED Viewed

File without changes

{docling-2.31.2 → docling-2.33.0}/docling/utils/visualization.py RENAMED Viewed

File without changes

docling 2.31.2__tar.gz → 2.33.0__tar.gz

docling 2.31.2__tar.gz → 2.33.0__tar.gz